diff --git a/.github/workflows/job_build_mlir_linux.yml b/.github/workflows/job_build_mlir_linux.yml index 3b19ee601a..523b5f2d3f 100644 --- a/.github/workflows/job_build_mlir_linux.yml +++ b/.github/workflows/job_build_mlir_linux.yml @@ -211,7 +211,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ env.CMAKE_BUILD_TYPE }} \ -DOpenVINODeveloperPackage_DIR=$(realpath ${OPENVINO_BUILD_DIR}) \ -DENABLE_TESTS=${{ env.ENABLE_TESTS_FLAG }} \ - -DENABLE_DEVELOPER_BUILD=OFF \ + -DENABLE_DEVELOPER_BUILD=ON \ -DENABLE_MLIR_COMPILER=ON \ -DBUILD_COMPILER_FOR_DRIVER=OFF \ -DENABLE_DRIVER_COMPILER_ADAPTER=ON \ diff --git a/.github/workflows/job_tests_unit_mlir_linux.yml b/.github/workflows/job_tests_unit_mlir_linux.yml index 7ea8c4d30a..3eb71eb053 100644 --- a/.github/workflows/job_tests_unit_mlir_linux.yml +++ b/.github/workflows/job_tests_unit_mlir_linux.yml @@ -55,10 +55,7 @@ jobs: echo "VPUX_TRANSLATE_PATH=$(realpath "${OPENVINO_INSTALL_DIR}/tools/vpux-translate")" >> $GITHUB_ENV chmod +x ${OPENVINO_INSTALL_DIR}/setupvars.sh chmod +x ${OPENVINO_INSTALL_DIR}/tests/FileCheck - chmod +x ${OPENVINO_INSTALL_DIR}/tests/flatc - chmod +x ${OPENVINO_INSTALL_DIR}/tests/not chmod +x ${OPENVINO_INSTALL_DIR}/tools/prof_parser/prof_parser - chmod +x ${OPENVINO_INSTALL_DIR}/tools/npu-lsp-server/npu-lsp-server chmod +x ${OPENVINO_INSTALL_DIR}/tools/vpux-opt/vpux-opt chmod +x ${OPENVINO_INSTALL_DIR}/tools/vpux-translate/vpux-translate diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c0def1569..7bb441fa23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,44 +24,29 @@ cmake_policy(SET CMP0063 NEW) # Set PROJECT_VERSION* variables by project command only. 
cmake_policy(SET CMP0048 NEW) -project(InferenceEngineVPUXPlugin) +project(VPUX) # # Build properties # set(NPU_DEVICE_NAME "NPU") -string(TOLOWER "${NPU_DEVICE_NAME}" VPUX_PLUGIN_COMPONENT) -set(VPUX_INTERNAL_COMPONENT "${VPUX_PLUGIN_COMPONENT}_internal") -set(VPUX_TESTS_COMPONENT "${VPUX_PLUGIN_COMPONENT}_tests") +set(VPUX_PLUGIN_COMPONENT "npu") +set(VPUX_INTERNAL_COMPONENT "npu_internal") +set(VPUX_TESTS_COMPONENT "npu_tests") set(NPU_CPACK_COMPONENTS_ALL ${VPUX_PLUGIN_COMPONENT} ${VPUX_INTERNAL_COMPONENT}) -set(IE_MAIN_VPUX_PLUGIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - if (CMAKE_BUILD_TYPE STREQUAL "") message(STATUS "CMAKE_BUILD_TYPE not defined, 'Release' will be used") set(CMAKE_BUILD_TYPE "Release") endif() -if (CMAKE_BUILD_TYPE STREQUAL "Debug") - set(CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${MAIN_VPUX_PLUGIN_BINARY_DIR}) - set(CMAKE_PDB_OUTPUT_DIRECTORY ${MAIN_VPUX_PLUGIN_BINARY_DIR}) -endif() - -if(DEFINED ENV{THIRDPARTY_SERVER_PATH}) - set(THIRDPARTY_SERVER_PATH "$ENV{THIRDPARTY_SERVER_PATH}") -elseif(DEFINED THIRDPARTY_SERVER_PATH) - set(THIRDPARTY_SERVER_PATH "${THIRDPARTY_SERVER_PATH}") -endif() - -include(FetchContent) - # TODO remove after migration option(ENABLE_NPU_MONO "Please turn it on if you work under `npu_mono` environment" OFF) if (ENABLE_NPU_MONO) message(AUTHOR_WARNING "Experimental option ENABLE_NPU_MONO enabled") - set (NPU_MONO_ROOT ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/..) + set (NPU_MONO_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) 
endif() find_package(OpenVINODeveloperPackage REQUIRED) @@ -82,7 +67,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cmake/dependencies.cmake") include(cmake/dependencies.cmake) endif() -include(cmake/cross_compilation.cmake) include(cmake/flatbuffers.cmake) include(cmake/bundle_static_library.cmake) include(cmake/embed_bin_file.cmake) @@ -179,14 +163,6 @@ if(ENABLE_SOURCE_PACKAGE) include(cmake/source_package.cmake) endif() -if(CMAKE_SOURCE_DIR STREQUAL OpenVINO_SOURCE_DIR) - # NPU plugin public headers should be a part of common OpenVINO headers - set(dev_component ${OV_CPACK_COMP_CORE_DEV}) -else() - # compatibility mode while NPU plugin is not part of OpenVINO repository and can be built separately - set(dev_component ${VPUX_PLUGIN_COMPONENT}) -endif() - # # CPack # diff --git a/CMakePresets.json b/CMakePresets.json index b786a73982..7356c19045 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -92,7 +92,7 @@ { "name": "vpuxDeveloper", - "description": "Default configuration preset for VPUX Compiler development. Enables VPUX compiler and generic OV tests, and disables plugin specific builds. Also, enables IMD backend", + "description": "Default configuration preset for VPUX Compiler development. Enables VPUX compiler and generic OV tests, and disables plugin specific builds.", "inherits": ["BuildOptimization", "EnableTests", "Disabler"], "binaryDir": "${sourceDir}/build-x86_64/Debug", "displayName": "vpuxDeveloper", @@ -118,7 +118,7 @@ }, { "name": "vpuxRelWithDebInfoDeveloper", - "description": "Release with debug info configuration preset for VPUX Compiler development. Enables VPUX compiler and generic OV tests, and disables plugin specific builds. Also, enables IMD backend", + "description": "Release with debug info configuration preset for VPUX Compiler development. 
Enables VPUX compiler and generic OV tests, and disables plugin specific builds.", "inherits": ["BuildOptimization", "EnableTests", "Disabler"], "binaryDir": "${sourceDir}/build-x86_64/RelWithDebInfo", "displayName": "vpuxRelWithDebInfoDeveloper", @@ -217,7 +217,7 @@ { "name": "cid", "hidden": true, - "description": "cid build necessary options. Please note the path of NPU plugin when use.", + "description": "Default configuration for NPU driver compiler. Make sure NPU_PLUGIN_HOME environment variable is set to the VPUX plugin root directory.", "cacheVariables": { "BUILD_SHARED_LIBS": false, "ENABLE_OV_IR_FRONTEND": true, @@ -235,15 +235,15 @@ } }, { - "name": "npuCidReleaseLinux", - "description": "Default configuration for NPU Driver Compiler target build on Linux. Please note: 1) Please make sure that NPU_PLUGIN_HOME environment variable is set to NPU plugin path. 2) build Cid is a static build which only build related targets and not include other unrelated options and targets to avoid getting big generated files.", - "binaryDir": "${sourceDir}/build-x86_64/Release", - "displayName": "npuCidRelease", + "name": "npuCidLinux", + "description": "Default CMake configuration to build NPU-CiD targets on Linux. Note: (1) Make sure NPU_PLUGIN_HOME and CONFIG environment variables are set. (2) CiD is a static build that includes only the necessary targets. Irrelevant targets/components are excluded to minimize the size of the build output.", + "binaryDir": "${sourceDir}/build_$env{CONFIG}", + "displayName": "npuCid", "inherits": ["cid", "BuildOptimization", "Disabler"], "cacheVariables": { "CMAKE_BUILD_TYPE": { "type": "STRING", - "value": "Release" + "value": "$env{CONFIG}" }, "ENABLE_LTO": false, @@ -291,15 +291,15 @@ } }, { - "name": "npuCidReleaseWindows", - "description": "Default configuration for NPU Driver Compiler target build on Windows. Please note: 1) Please make sure that NPU_PLUGIN_HOME environment variable is set to NPU plugin path. 
2) build Cid is a static build which only build related targets and not include other unrelated options and targets to avoid getting big generated files.", - "binaryDir": "${sourceDir}/build-x86_64/Release", - "displayName": "npuCidRelease", + "name": "npuCidWindows", + "description": "Default CMake configuration to build NPU-CiD targets on Windows. Note: (1) Make sure NPU_PLUGIN_HOME and CONFIG environment variables are set. (2) CiD is a static build that includes only the necessary targets. Irrelevant targets/components are excluded to minimize the size of the build output.", + "binaryDir": "${sourceDir}/build_$env{CONFIG}", + "displayName": "npuCid", "inherits": ["cid", "BuildOptimization", "Disabler"], "cacheVariables": { "CMAKE_BUILD_TYPE": { "type": "STRING", - "value": "Release" + "value": "$env{CONFIG}" }, "CMAKE_TOOLCHAIN_FILE":"${sourceDir}\\cmake\\toolchains\\onecoreuap.toolchain.cmake", "ENABLE_LTO": false, @@ -392,22 +392,6 @@ "VPUX_PLUGIN_HOME": "${sourceDir}" } }, - { - "name": "Simulator", - "hidden": true, - "environment": { - "IE_NPU_USE_IMD_BACKEND": "1", - "IE_NPU_IMD_LAUNCH_MODE": "VPUX_IMD_SIMULATOR" - } - }, - { - "name": "Silicon", - "hidden": true, - "environment": { - "IE_NPU_USE_IMD_BACKEND": "1", - "IE_NPU_IMD_LAUNCH_MODE": "VPUX_IMD_MOVI_DEBUG" - } - }, { "name": "NPU3720/Simulator", "hidden": true, diff --git a/cmake/cross_compilation.cmake b/cmake/cross_compilation.cmake deleted file mode 100644 index ccb1661d00..0000000000 --- a/cmake/cross_compilation.cmake +++ /dev/null @@ -1,152 +0,0 @@ -# -# Copyright (C) 2022-2025 Intel Corporation. 
-# SPDX-License-Identifier: Apache-2.0 -# - -function(vpux_add_native_tool NATIVE_NAME NATIVE_SOURCE_DIR) - if(NOT CMAKE_CROSSCOMPILING) - set(${NATIVE_NAME}_COMMAND ${NATIVE_NAME} CACHE INTERNAL "" FORCE) - set(${NATIVE_NAME}_TARGET ${NATIVE_NAME} CACHE INTERNAL "" FORCE) - return() - endif() - - set(options) - set(oneValueArgs "EXEDIR") - set(multiValueArgs "CMAKE_ARGS") - cmake_parse_arguments(NATIVE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(NATIVE_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NATIVE/${NATIVE_NAME}") - - if(NOT DEFINED NATIVE_EXEDIR) - set(NATIVE_EXEDIR ".") - endif() - - if(CMAKE_CFG_INTDIR STREQUAL ".") - set(NATIVE_CFGDIR ".") - else() - set(NATIVE_CFGDIR "Release") - endif() - - set(${NATIVE_NAME}_COMMAND - "${NATIVE_BINARY_DIR}/${NATIVE_EXEDIR}/${NATIVE_CFGDIR}/${NATIVE_NAME}${CMAKE_EXECUTABLE_SUFFIX}" - CACHE INTERNAL "" FORCE - ) - set(${NATIVE_NAME}_TARGET - NATIVE_${NATIVE_NAME} - CACHE INTERNAL "" FORCE - ) - - add_custom_command( - OUTPUT ${NATIVE_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory ${NATIVE_BINARY_DIR} - COMMENT "[NATIVE] Creating ${NATIVE_BINARY_DIR} ..." - ) - - set(cmake_args -G ${CMAKE_GENERATOR}) - if(CMAKE_GENERATOR_TOOLSET) - list(APPEND cmake_args -T ${CMAKE_GENERATOR_TOOLSET}) - endif() - if(CMAKE_GENERATOR_PLATFORM) - list(APPEND cmake_args -A ${CMAKE_GENERATOR_PLATFORM}) - endif() - list(APPEND cmake_args -S ${NATIVE_SOURCE_DIR}) - list(APPEND cmake_args -B ${NATIVE_BINARY_DIR}) - if(CMAKE_BUILD_TYPE) - list(APPEND cmake_args -D "CMAKE_BUILD_TYPE:STRING=Release") - elseif(CMAKE_CONFIGURATION_TYPES) - list(APPEND cmake_args -D "CMAKE_CONFIGURATION_TYPES:STRING=Release") - endif() - foreach(arg IN LISTS NATIVE_CMAKE_ARGS) - list(APPEND cmake_args -D "${arg}") - endforeach() - - add_custom_command( - OUTPUT "${NATIVE_BINARY_DIR}/CMakeCache.txt" - COMMAND ${CMAKE_COMMAND} ${cmake_args} - DEPENDS ${NATIVE_BINARY_DIR} ${NATIVE_NAME} - COMMENT "[NATIVE] Configuring ${NATIVE_NAME} ..." 
- ) - - add_custom_command( - OUTPUT ${${NATIVE_NAME}_COMMAND} - COMMAND ${CMAKE_COMMAND} --build ${NATIVE_BINARY_DIR} --config Release --target ${NATIVE_NAME} - DEPENDS "${NATIVE_BINARY_DIR}/CMakeCache.txt" - COMMENT "[NATIVE] Building ${NATIVE_NAME} ..." - ) - - add_custom_target(NATIVE_${NATIVE_NAME} - DEPENDS ${${NATIVE_NAME}_COMMAND} - ) -endfunction() - -function(vpux_add_crosscompile_project CROSSCOMPILE_NAME SOURCE_DIR TOOLCHAIN_FILE) - if(CMAKE_CROSSCOMPILING) - set(${CROSSCOMPILE_NAME}_COMMAND ${CROSSCOMPILE_NAME} CACHE INTERNAL "" FORCE) - set(${CROSSCOMPILE_NAME}_TARGET ${CROSSCOMPILE_NAME} CACHE INTERNAL "" FORCE) - return() - endif() - - set(options) - set(oneValueArgs "EXEDIR") - set(multiValueArgs "CMAKE_ARGS") - cmake_parse_arguments(CROSSCOMPILE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(CROSSCOMPILE_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/CROSSCOMPILE/${CROSSCOMPILE_NAME}") - - if(NOT DEFINED CROSSCOMPILE_EXEDIR) - set(CROSSCOMPILE_EXEDIR ".") - endif() - - if(CMAKE_CFG_INTDIR STREQUAL ".") - set(CROSSCOMPILE_CFGDIR ".") - else() - set(CROSSCOMPILE_CFGDIR "Release") - endif() - - set(${CROSSCOMPILE_NAME}_COMMAND - "${CROSSCOMPILE_BINARY_DIR}/${CROSSCOMPILE_EXEDIR}/${CROSSCOMPILE_CFGDIR}/${CROSSCOMPILE_NAME}${CMAKE_EXECUTABLE_SUFFIX}" - CACHE INTERNAL "" FORCE - ) - set(${CROSSCOMPILE_NAME}_TARGET - CROSSCOMPILE_${CROSSCOMPILE_NAME} - CACHE INTERNAL "" FORCE - ) - - add_custom_command( - OUTPUT ${CROSSCOMPILE_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory ${CROSSCOMPILE_BINARY_DIR} - COMMENT "[CROSSCOMPILE] Creating ${CROSSCOMPILE_BINARY_DIR} ..." 
- ) - - set(cmake_args -G ${CMAKE_GENERATOR}) - if(CMAKE_GENERATOR_TOOLSET) - list(APPEND cmake_args -T ${CMAKE_GENERATOR_TOOLSET}) - endif() - if(CMAKE_GENERATOR_PLATFORM) - list(APPEND cmake_args -A ${CMAKE_GENERATOR_PLATFORM}) - endif() - list(APPEND cmake_args -S ${SOURCE_DIR}) - list(APPEND cmake_args -B ${CROSSCOMPILE_BINARY_DIR}) - if(CMAKE_BUILD_TYPE) - list(APPEND cmake_args -D "CMAKE_BUILD_TYPE:STRING=Release") - elseif(CMAKE_CONFIGURATION_TYPES) - list(APPEND cmake_args -D "CMAKE_CONFIGURATION_TYPES:STRING=Release") - endif() - list(APPEND cmake_args -D "CMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE}") - foreach(arg IN LISTS CROSSCOMPILE_CMAKE_ARGS) - list(APPEND cmake_args -D "${arg}") - endforeach() - - add_custom_command( - OUTPUT "${CROSSCOMPILE_BINARY_DIR}/CMakeCache.txt" - COMMAND ${CMAKE_COMMAND} ${cmake_args} - DEPENDS ${CROSSCOMPILE_BINARY_DIR} - COMMENT "[CROSSCOMPILE] Configuring ${CROSSCOMPILE_NAME} ..." - ) - - add_custom_target(CROSSCOMPILE_${CROSSCOMPILE_NAME} ALL - COMMAND ${CMAKE_COMMAND} --build ${CROSSCOMPILE_BINARY_DIR} --config $ - DEPENDS "${CROSSCOMPILE_BINARY_DIR}/CMakeCache.txt" ${CROSSCOMPILE_BINARY_DIR} - COMMENT "[CROSSCOMPILE] Building ${CROSSCOMPILE_NAME} ..." 
- ) -endfunction() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index bdc0296dc1..06933af365 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -8,9 +8,9 @@ cmake_policy(SET CMP0054 NEW) include(ExternalProject) if(NOT BUILD_SHARED_LIBS) - set(TEMP "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/temp") + set(TEMP "${PROJECT_SOURCE_DIR}/temp") else() - ov_set_temp_directory(TEMP "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}") + ov_set_temp_directory(TEMP "${PROJECT_SOURCE_DIR}") endif() # FIXME: Create empty file to avoid errors on CI diff --git a/cmake/embed_bin_file.cmake b/cmake/embed_bin_file.cmake index aeb7bc5830..1c7b450faf 100644 --- a/cmake/embed_bin_file.cmake +++ b/cmake/embed_bin_file.cmake @@ -1,9 +1,8 @@ # -# Copyright (C) 2022-2025 Intel Corporation. +# Copyright (C) 2025 Intel Corporation. # SPDX-License-Identifier: Apache-2.0 # - function(vpux_embed_bin_file) set(options) set(oneValueArgs SOURCE_FILE HEADER_FILE VARIABLE_NAME) diff --git a/cmake/features.cmake b/cmake/features.cmake index cdd6b11dd6..846839fbcb 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -20,7 +20,7 @@ ov_dependent_option(ENABLE_NPU_FUZZ_TESTS "NPU Fuzz tests" OFF "ENABLE_TESTS" OF if(NOT ENABLE_LTO) set(ENABLE_LTO OFF) endif() -ov_dependent_option(ENABLE_LTO "Enable Link Time Optimization" ${ENABLE_LTO} "LINUX OR WIN32;NOT CMAKE_CROSSCOMPILING" OFF) +ov_dependent_option(ENABLE_LTO "Enable Link Time Optimization" ${ENABLE_LTO} "LINUX OR WIN32" OFF) if(NOT ENABLE_FASTER_BUILD) set(ENABLE_FASTER_BUILD OFF) @@ -56,12 +56,8 @@ if(ENABLE_PRIVATE_COMPILER_OPTIONS) add_definitions(-DPRIVATE_COMPILER_OPTIONS_ENABLED) endif() -if(NOT DEFINED MV_TOOLS_PATH AND DEFINED ENV{MV_TOOLS_DIR} AND DEFINED ENV{MV_TOOLS_VERSION}) - set(MV_TOOLS_PATH $ENV{MV_TOOLS_DIR}/$ENV{MV_TOOLS_VERSION}) -endif() - ov_option(ENABLE_NPU_LOADER "Enable npu-loader" OFF) -ov_option(ENABLE_NPU_LSP_SERVER "Enable npu-lsp-server" ON) +ov_option(ENABLE_NPU_LSP_SERVER "Enable 
npu-lsp-server" OFF) get_target_property(ov_linked_libs openvino::runtime IMPORTED_LINK_DEPENDENT_LIBRARIES_RELEASE) if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO" OR "TBB::tbb" IN_LIST ov_linked_libs) @@ -87,8 +83,8 @@ ov_option(ENABLE_NPU_MICRO_BENCHMARKS "NPU micro benchmarks" OFF) if(ENABLE_VPUX_DOCS) find_package(Doxygen) if(DOXYGEN_FOUND) - set(DOXYGEN_IN ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/docs/VPUX_DG/Doxyfile.in) - set(DOXYGEN_OUT ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/docs/VPUX_DG/generated/Doxyfile) + set(DOXYGEN_IN ${PROJECT_SOURCE_DIR}/docs/VPUX_DG/Doxyfile.in) + set(DOXYGEN_OUT ${PROJECT_SOURCE_DIR}/docs/VPUX_DG/generated/Doxyfile) configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) message("Doxygen build started") diff --git a/cmake/flatbuffers.cmake b/cmake/flatbuffers.cmake index d85656deb2..f35fe04d48 100644 --- a/cmake/flatbuffers.cmake +++ b/cmake/flatbuffers.cmake @@ -4,6 +4,9 @@ # function(vpux_add_flatc_target FLATC_TARGET_NAME) + if(NOT TARGET ${flatc_TARGET} OR NOT flatc_COMMAND) + message(FATAL_ERROR "Missing Flatbuffers") + endif() set(options) set(oneValueArgs SRC_DIR DST_DIR) set(multiValueArgs ARGS) @@ -35,7 +38,6 @@ function(vpux_add_flatc_target FLATC_TARGET_NAME) ${flatc_COMMAND} -o "${FLATC_DST_DIR}/schema" --cpp ${FLATC_ARGS} ${FLATC_SOURCES} DEPENDS ${FLATC_SOURCES} - ${flatc_COMMAND} ${flatc_TARGET} COMMENT "[flatc] Generating schema for ${FLATC_SRC_DIR} ..." 
@@ -55,7 +57,7 @@ function(vpux_add_flatc_target FLATC_TARGET_NAME) add_library(${FLATC_TARGET_NAME} INTERFACE) add_dependencies(${FLATC_TARGET_NAME} ${FLATC_GEN_TARGET}) target_include_directories(${FLATC_TARGET_NAME} - INTERFACE + SYSTEM INTERFACE $ ${FLATC_DST_DIR} ) @@ -75,7 +77,7 @@ function(vpux_gf_version_generate SRC_DIR DST_DIR) ) if ("${GIT_DESCRIBE_DIRTY}" STREQUAL "") - message(WARNING "GraphFile version cannot be read from ${SRC_DIR}") + message(WARNING "ELF version cannot be read from ${SRC_DIR}") set(GIT_DESCRIBE_DIRTY "v3.35.2") endif() diff --git a/cmake/lit_tests.cmake b/cmake/lit_tests.cmake index f9dfc00b14..c21f372431 100644 --- a/cmake/lit_tests.cmake +++ b/cmake/lit_tests.cmake @@ -38,7 +38,7 @@ function(vpux_setup_lit_tool) COMPONENT tests EXCLUDE_FROM_ALL) else() - set(extra_tools FileCheck not ${ARGN}) + set(extra_tools FileCheck ${ARGN}) endif() foreach(tool IN LISTS extra_tools) list(APPEND extra_tools_copy_cmd @@ -150,7 +150,7 @@ function(vpux_setup_lit_tests TEST_NAME) " return str((Path(__file__).parent / p).resolve())\n") configure_lit_site_cfg( - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/cmake/lit.site.cfg.py.in" + "${PROJECT_SOURCE_DIR}/cmake/lit.site.cfg.py.in" "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py" @ONLY ) @@ -183,7 +183,7 @@ function(vpux_setup_lit_tests TEST_NAME) "$/lit-tests/${TEST_NAME}" COMMAND ${CMAKE_COMMAND} -E copy - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/cmake/lit.cfg.py" + "${PROJECT_SOURCE_DIR}/cmake/lit.cfg.py" "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py" "$/lit-tests/${TEST_NAME}/" ${tests_copy_cmd} @@ -192,29 +192,27 @@ function(vpux_setup_lit_tests TEST_NAME) ) set_target_properties(copy_${TEST_NAME}_tests PROPERTIES FOLDER "tests") - if(NOT CMAKE_CROSSCOMPILING) - if(NOT Python3_FOUND) - message(WARNING "Python3 is not found, LIT tests ${TEST_NAME} disabled") - else() - add_test(NAME LIT-${TEST_NAME} + if(NOT Python3_FOUND) + message(WARNING "Python3 is not found, LIT tests ${TEST_NAME} disabled") + else() + 
add_test(NAME LIT-${TEST_NAME} + COMMAND + ${Python3_EXECUTABLE} + "$/lit-tests/lit-tool/lit.py" + -v + "$/lit-tests/${TEST_NAME}" + ) + set_tests_properties(LIT-${TEST_NAME} PROPERTIES + LABELS "NPU;LIT" + ) + if(UNIX) + add_test(NAME LIT-${TEST_NAME}-ALL COMMAND - ${Python3_EXECUTABLE} - "$/lit-tests/lit-tool/lit.py" - -v - "$/lit-tests/${TEST_NAME}" + "$/lit-tests/run_all_lit_tests.sh" ) - set_tests_properties(LIT-${TEST_NAME} PROPERTIES - LABELS "NPU;LIT" + set_tests_properties(LIT-${TEST_NAME}-ALL PROPERTIES + LABELS "NPU;LIT;Linux" ) - if(UNIX) - add_test(NAME LIT-${TEST_NAME}-ALL - COMMAND - "$/lit-tests/run_all_lit_tests.sh" - ) - set_tests_properties(LIT-${TEST_NAME}-ALL PROPERTIES - LABELS "NPU;LIT;Linux" - ) - endif() endif() endif() endfunction() diff --git a/cmake/oecore.arm64.toolchain.cmake b/cmake/oecore.arm64.toolchain.cmake deleted file mode 100644 index c511dd40dd..0000000000 --- a/cmake/oecore.arm64.toolchain.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (C) 2022-2025 Intel Corporation. 
-# SPDX-License-Identifier: Apache-2.0 -# - -if(DEFINED OECORE_BASE_DIR) - # OECORE_BASE_DIR was passed via CMake command line, nothing to do -elseif(DEFINED ENV{OECORE_BASE_DIR}) - # User sets OECORE_BASE_DIR environment variable - set(OECORE_BASE_DIR $ENV{OECORE_BASE_DIR}) -elseif(DEFINED ENV{OECORE_NATIVE_SYSROOT}) - # OECORE_NATIVE_SYSROOT is a default environment variable for the ORCore toolchain - set(OECORE_BASE_DIR "$ENV{OECORE_NATIVE_SYSROOT}/../..") -else() - # Use default value - set(OECORE_BASE_DIR "/usr/local/oecore-x86_64") -endif() - -set(OECORE_TARGET_NAME "aarch64-ese-linux") -set(OECORE_TARGET_SYSROOT "${OECORE_BASE_DIR}/sysroots/${OECORE_TARGET_NAME}") -set(OECORE_HOST_SYSROOT "${OECORE_BASE_DIR}/sysroots/x86_64-esesdk-linux") -set(OECORE_HOST_COMPILER_BIN_DIR "${OECORE_HOST_SYSROOT}/usr/bin/${OECORE_TARGET_NAME}") - -set(CMAKE_SYSTEM_NAME "Linux") -set(CMAKE_SYSTEM_PROCESSOR "aarch64") - -set(CMAKE_SYSROOT "${OECORE_TARGET_SYSROOT}") - -set(CMAKE_C_COMPILER "${OECORE_HOST_COMPILER_BIN_DIR}/aarch64-ese-linux-gcc") -set(CMAKE_CXX_COMPILER "${OECORE_HOST_COMPILER_BIN_DIR}/aarch64-ese-linux-g++") - -set(PKG_CONFIG_EXECUTABLE "${OECORE_HOST_SYSROOT}/usr/bin/x86_64-esesdk-linux-gnu-pkg-config" CACHE PATH "Path to yocto pkg-config") -set(PKG_CONFIG_PATH "${OECORE_TARGET_SYSROOT}/usr/lib/pkgconfig:${OECORE_TARGET_SYSROOT}/usr/share/pkgconfig" CACHE STRING "PKG_CONFIG_PATH for yocto") -set(PKG_CONFIG_SYSROOT_DIR "${OECORE_TARGET_SYSROOT}" CACHE STRING "PKG_CONFIG_SYSROOT_DIR for yocto") - -set(ENV{PKG_CONFIG_PATH} "${PKG_CONFIG_PATH}") -set(ENV{PKG_CONFIG_SYSROOT_DIR} "${PKG_CONFIG_SYSROOT_DIR}") - -set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-a53 -mtune=cortex-a53 --sysroot=${OECORE_TARGET_SYSROOT}") -set(CMAKE_CXX_FLAGS_INIT "-mcpu=cortex-a53 -mtune=cortex-a53 --sysroot=${OECORE_TARGET_SYSROOT}") - -set(CMAKE_EXE_LINKER_FLAGS_INIT "-Wl,-O1 -Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") -set(CMAKE_SHARED_LINKER_FLAGS_INIT "-Wl,-O1 
-Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") -set(CMAKE_MODULE_LINKER_FLAGS_INIT "-Wl,-O1 -Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") - -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4f240ab70..0b6c646fd9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,3 +19,6 @@ endif() add_subdirectory(vpux_utils) add_subdirectory(vpux_compiler) + +if((BUILD_SHARED_LIBS AND ENABLE_MLIR_COMPILER)) +endif() diff --git a/src/vpux_compiler/CMakeLists.txt b/src/vpux_compiler/CMakeLists.txt index efe54b6711..09b3e11790 100644 --- a/src/vpux_compiler/CMakeLists.txt +++ b/src/vpux_compiler/CMakeLists.txt @@ -24,12 +24,12 @@ set(gen_base_dst_include_dir "src/vpux_compiler/include/vpux/compiler") # SCHEMA target # if(ENABLE_NPU_MONO) - if(NOT DEFINED GRAPHFILE_SCHEMA_SUBMODULE_PATH) - message(FATAL_ERROR "Graphfile schema submodule path is not set while `npu_mono` was activated") + if(NOT DEFINED ELF_SUBMODULE_PATH) + message(FATAL_ERROR "ELF submodule path is not set while `npu_mono` was activated") endif() - set(SCHEMA_SOURCE_DIR ${GRAPHFILE_SCHEMA_SUBMODULE_PATH}/src/schema) + set(SCHEMA_SOURCE_DIR ${ELF_SUBMODULE_PATH}/src/schema) else() - set(SCHEMA_SOURCE_DIR ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/elf/src/schema) # Legacy path + set(SCHEMA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/thirdparty/elf/src/schema) # Legacy path endif(ENABLE_NPU_MONO) @@ -77,7 +77,7 @@ add_subdirectory(tblgen) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(SYSTEM ${CMAKE_CURRENT_BINARY_DIR}/include) -# GraphFile schema is used in tblgen and therefore in entire compiler +# Elf schema is used in tblgen and therefore in entire compiler include_directories( 
${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/dialect/VPUIP/generated $ @@ -168,7 +168,7 @@ if (ENABLE_DIALECT_SHARED_LIBRARIES) $ $ $ - $) + $) target_link_libraries(NPUReg40XX_dialect PUBLIC mlir-dependencies diff --git a/src/vpux_compiler/cmake/add_npu_library.cmake b/src/vpux_compiler/cmake/add_npu_library.cmake index 6ce6a40e55..b3dc827813 100644 --- a/src/vpux_compiler/cmake/add_npu_library.cmake +++ b/src/vpux_compiler/cmake/add_npu_library.cmake @@ -43,7 +43,7 @@ function(add_npu_library name) # The npu_mlir_compiler_schema and cpp_schema targets are added as dependencies to ensure # that the following generated headers are built before the compiler code: - # - thirdparty/elf/schema.hpp + # - elf/schema.hpp # - vpunn_generated.h add_mlir_library(${name} STATIC ${SRC_FILES} diff --git a/src/vpux_compiler/cmake/vpux_tblgen.cmake b/src/vpux_compiler/cmake/vpux_tblgen.cmake index ac31544033..768cf86141 100644 --- a/src/vpux_compiler/cmake/vpux_tblgen.cmake +++ b/src/vpux_compiler/cmake/vpux_tblgen.cmake @@ -6,13 +6,13 @@ add_custom_target(MLIRVPUXIncGenList) function(add_vpux_dialect dialect_namespace) - set(LLVM_TARGET_DEFINITIONS ops.td) + set(LLVM_TARGET_DEFINITIONS dialect.td) mlir_tablegen(dialect.hpp.inc -gen-dialect-decls -dialect=${dialect_namespace} ) mlir_tablegen(dialect.cpp.inc -gen-dialect-defs -dialect=${dialect_namespace} ) - add_mlir_doc(ops _${dialect_namespace} dialect/ -gen-dialect-doc -dialect=${dialect_namespace}) - add_vpux_ops(${dialect_namespace} GENERIC) + add_public_tablegen_target(MLIRVPUX${dialect_namespace}DialectIncGen) + add_dependencies(MLIRVPUXIncGenList MLIRVPUX${dialect_namespace}DialectIncGen) endfunction() function(add_vpux_ops dialect_namespace arch) @@ -23,6 +23,22 @@ function(add_vpux_ops dialect_namespace arch) ) add_public_tablegen_target(MLIRVPUX${dialect_namespace}${arch}OpsIncGen) add_dependencies(MLIRVPUXIncGenList MLIRVPUX${dialect_namespace}${arch}OpsIncGen) + if(arch STREQUAL GENERIC) + add_mlir_doc(ops 
_${dialect_namespace} dialect/ -gen-dialect-doc -dialect=${dialect_namespace}) + else() + add_mlir_doc(ops _${dialect_namespace}_${arch} dialect/ -gen-dialect-doc -dialect=${dialect_namespace}) + endif() +endfunction() + +function(add_vpux_ops_granular dialect_namespace arch target_dir ops_target) + set(LLVM_TARGET_DEFINITIONS ${target_dir}/${ops_target}.td) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${target_dir}) + mlir_tablegen(${target_dir}/${ops_target}.hpp.inc -gen-op-decls + ) + mlir_tablegen(${target_dir}/${ops_target}.cpp.inc -gen-op-defs + ) + add_public_tablegen_target(MLIRVPUX${dialect_namespace}${arch}${ops_target}OpsIncGen) + add_dependencies(MLIRVPUXIncGenList MLIRVPUX${dialect_namespace}${arch}${ops_target}OpsIncGen) endfunction() function(add_vpux_ops_interface ops_namespace doc_dir) diff --git a/src/vpux_compiler/docs/code_style.md b/src/vpux_compiler/docs/code_style.md index 7d6af31b74..65214e6437 100644 --- a/src/vpux_compiler/docs/code_style.md +++ b/src/vpux_compiler/docs/code_style.md @@ -449,7 +449,7 @@ So we get a dangling pointer: inputShape = computerShape.tiles[0].shape; // 'inputShape' stores a copy of field of local variable } - return isSOHSupportedByDPU(inputShape, _numTiles, false, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputShape, _numTiles, false, config::getArch(nceOp.getOperation())); // BAD: The result from 'computerShape' object is stored in an instance of ShapeRef type that does not own memory auto inputShape = getShape(origOp.input()); // The return type of 'getShape' is ShapeRef @@ -460,7 +460,7 @@ So we get a dangling pointer: } // The lifetime of 'computerShape' is over. 
'inputShape' contains a dangling pointer - return isSOHSupportedByDPU(inputShape, _numTiles, false, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputShape, _numTiles, false, config::getArch(nceOp.getOperation())); ``` ### Working with dynamic shapes diff --git a/src/vpux_compiler/docs/guides/images/1_ir_levels_HostCompile.drawio.svg b/src/vpux_compiler/docs/guides/images/1_ir_levels_HostCompile.drawio.svg new file mode 100644 index 0000000000..d0e7b1f8ca --- /dev/null +++ b/src/vpux_compiler/docs/guides/images/1_ir_levels_HostCompile.drawio.svg @@ -0,0 +1,4 @@ + + + +
OpenVINO IR
IE Dialect
VPU ELF Backend
VPU Dialect
VPUIP Dialect
Async Dialect
VPURT Dialect
Tensor World
Buffer World
Tasks
Scheduling
Level 3 : Topology
Level 2 : Abstract RunTime
Level 1 : VPU RunTime
Level 0 : VPU Blob
VPUIPRegMapped
SCF,
Tensor,
Arith
Dialects
HostCompile Backend
HostExec
Dialect
CPU
NPU
diff --git a/src/vpux_compiler/docs/guides/images/ws-monolithic-compilation-flow.png b/src/vpux_compiler/docs/guides/images/ws-monolithic-compilation-flow.png deleted file mode 100644 index ab0a0edd93..0000000000 Binary files a/src/vpux_compiler/docs/guides/images/ws-monolithic-compilation-flow.png and /dev/null differ diff --git a/src/vpux_compiler/docs/guides/project_structure.md b/src/vpux_compiler/docs/guides/project_structure.md index 65e8ff8f88..4840b50439 100644 --- a/src/vpux_compiler/docs/guides/project_structure.md +++ b/src/vpux_compiler/docs/guides/project_structure.md @@ -86,10 +86,10 @@ Following this approach, the development of a "mixed" pass is similar to a commo ```C++ std::unique_ptr vpux::VPU::createMCStrategyGetter(ArchKind arch, int64_t numClusters) { switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return std::make_unique(); } - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU40XX: { return std::make_unique(numClusters); } case ArchKind::UNKNOWN: @@ -202,7 +202,7 @@ The main advantage of this approach is that we can easily hide the pipeline for ```C++ void MyPass::safeRunOnFunc() { // ... - if (arch != VPU::ArchKind::NPU37XX) { + if (arch != config::ArchKind::NPU37XX) { return mlir::failure(); } // ... @@ -352,7 +352,7 @@ The common approach here is extending `mlir::DialectInlinerInterface` and implem ```cpp struct MyDialectInlinerInterface : public mlir::DialectInlinerInterface { bool isLegalToInline(mlir::Operation*, mlir::Operation*, bool) const final { - return true; + return true; } bool isLegalToInline(mlir::Region*, mlir::Region*, bool, mlir::IRMapping&) const final { @@ -381,7 +381,7 @@ Assume we want to implement a custom `MyDialect.Call` operation. 
It extends `Cal ```cpp struct MyDialectDispatchedInlinerInterface : public mlir::DialectInlinerInterface { bool isLegalToInline(mlir::Operation*, mlir::Operation*, bool) const final { - return true; + return true; } bool isLegalToInline(mlir::Region*, mlir::Region*, bool, mlir::IRMapping&) const final { @@ -440,19 +440,6 @@ void MyDialect::initialize() { Note: If no dispatched inliner interface is provided via `registerDispatchedInlinerInterface`, a fallback implementation which mirrors `mlir/lib/Dialect/Func/Extensions/InlinerExtension.cpp` is used! For a lot of use-cases this is enough as the default inlining behaviour is the desired one. -## Weights Separation - -### Monolithic Mode - -The main motivation of the Monolithic mode is to align as much as possible with "real" weights separation but keeping `@init()` and `@main()` in the same blob and thus being able to use the current CI infrastructure. This eases the debugging of compilation, accuracy and inference issues (IMD). - -A rough sketch of the Monolithic WS pipeline looks like this: - -drawing - -Up until `IntroduceInitFunctionPass`, we have the normal IR structure with a single `@main(...) -> (...)` function. This pass then creates the `@init(...) -> (...)` function. The pass strips away the transformations from `const.Declare` operations and converts them into `IE`-dialect operations in `@init`. The `WSInit` pipeline is then executed only on the `@init` function. After that, the `UnpackNestedModulesPass`, together with the `InlinerPass`, converts the multiple nested functions back into a single network function. Then, the default hardware `VPUIP` pipeline is executed. - - ## HostCompile Compilation Pipeline The HostCompile pipeline is a specialized compilation mode designed to partition a neural network into multiple independently compilable functions. 
Each function contains NPU code, which is subsequently compiled into separate ELF blobs, along with the main function, which contains CPU host code that manages these compiled blobs using the LevelZero API. @@ -468,11 +455,11 @@ In the HostCompile pipeline, the network is divided into kernel functions and ho Consider the following example: ```mlir -module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { +module @StaticEltwiseNHWC attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { module @Module_1 { // function which contains the NPU-specific code and supposed to be compiled into ELF blobs func.func private @main_func0(%arg0: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> { - %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : 
tensor<4xsi64>, order = #NHWC}> return %0 : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> } } @@ -512,7 +499,7 @@ To compile a network end to end with `HostCompile` pipeline use one of the follo `./compile_tool -d NPU.4000 -m ./net.onnx -o ./net.blob -c ./extra_config_net.conf -shape [1,3,4..6,7..10]` -Below is the content of the `extra_config_net.conf` file +Below is the content of the `extra_config_net.conf` file ```plaintext NPU_COMPILATION_MODE HostCompile diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion.hpp index 07bfe50ee3..190f9393ab 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion.hpp @@ -29,6 +29,8 @@ void buildLowerVPUIP2ELFPipeline(mlir::OpPassManager& pm, Logger log = Logger::g void buildLowerVPU2VPUIPPipeline(mlir::OpPassManager& pm, bool enableInPlaceBufferization, bool useMemrefForHostFunctionBufferization, Logger log = Logger::global()); +void buildLowerIE2VPUPipelineReferenceSW(mlir::OpPassManager& pm, Logger log = Logger::global()); + // // Registration // diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp index 565f0af7d3..c565616434 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp @@ -18,7 +18,7 @@ namespace arch37xx { class ConvToNCE final : public mlir::OpRewritePattern { public: - ConvToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + ConvToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { 
setDebugName("ConvToNCE"); } @@ -27,7 +27,7 @@ class ConvToNCE final : public mlir::OpRewritePattern { mlir::LogicalResult matchAndRewrite(IE::ConvolutionOp origOp, mlir::PatternRewriter& rewriter) const final; private: - VPU::ArchKind _arch; + config::ArchKind _arch; Logger _log; }; @@ -37,7 +37,7 @@ class ConvToNCE final : public mlir::OpRewritePattern { class MatMulToNCE final : public mlir::OpRewritePattern { public: - MatMulToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + MatMulToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("MatMulToNCE"); } @@ -46,7 +46,7 @@ class MatMulToNCE final : public mlir::OpRewritePattern { mlir::LogicalResult matchAndRewrite(IE::MatMulOp origOp, mlir::PatternRewriter& rewriter) const final; private: - VPU::ArchKind _arch; + config::ArchKind _arch; Logger _log; }; @@ -56,7 +56,7 @@ class MatMulToNCE final : public mlir::OpRewritePattern { class DepthConvToNCE final : public mlir::OpRewritePattern { public: - DepthConvToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + DepthConvToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { } @@ -64,7 +64,7 @@ class DepthConvToNCE final : public mlir::OpRewritePattern -#include - namespace vpux { namespace IE { namespace arch37xx { @@ -22,8 +20,6 @@ namespace arch37xx { // std::unique_ptr createInsertIdentityPoolBeforeOpPass(Logger log = Logger::global()); -std::unique_ptr createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp = false, - Logger log = Logger::global()); std::unique_ptr createOptimizeSliceExpandPass(Logger log = Logger::global()); std::unique_ptr createPropagateExpandPass(Logger log = Logger::global()); std::unique_ptr createFusePermuteQuantizeExpandPass(Logger log = Logger::global()); @@ -97,7 +93,7 @@ struct DefaultHWOptions : public IE::DefaultHWOptionsDialectBase, virtual vpux:: 
llvm::cl::init(false)}; BoolOption enableRuntimeDequant{*this, "enable-runtime-dequant", - llvm::cl::desc("Enable runtime dequantization of asymmetricly quantized weight"), + llvm::cl::desc("Enable runtime dequantization of asymmetrically quantized weights"), llvm::cl::init(false)}; Int64Option runtimeDequantizationLimit{ *this, "runtime-dequantization-limit", @@ -118,6 +114,7 @@ struct DefaultHWOptions : public IE::DefaultHWOptionsDialectBase, virtual vpux:: }; void buildDefaultHWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); +void buildReferenceSWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); // // AdjustLayout diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp index bb93f9df4f..82c7cb6ae3 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp @@ -5,9 +5,10 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/CMakeLists.txt b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/CMakeLists.txt index 3e9bfd6de4..18cc8c4992 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/CMakeLists.txt +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/CMakeLists.txt @@ -1,6 +1,6 @@ # # Copyright (C) 2023-2025 Intel Corporation. 
-# SPDX-License-Identifier: Apache 2.0 +# SPDX-License-Identifier: Apache-2.0 # add_library(npu_37xx_firmware_headers INTERFACE) diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/npu_37xx_nnrt.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/npu_37xx_nnrt.hpp index d80c4f0d3c..705f8106bb 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/npu_37xx_nnrt.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/npu_37xx_nnrt.hpp @@ -1,6 +1,6 @@ // // Copyright (C) 2023-2025 Intel Corporation. -// SPDX-License-Identifier: Apache 2.0 +// SPDX-License-Identifier: Apache-2.0 // // this header must be used instead direct include of firmware headers diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp index 4fd0f81c9c..3f457b27d2 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp @@ -14,5 +14,6 @@ void registerDDRAccessOpModelInterface(mlir::DialectRegistry& registry); void registerNCEOpInterface(mlir::DialectRegistry& registry); void registerClusterBroadcastingOpInterfaces(mlir::DialectRegistry& registry); void registerUnrollBatchOpInterfaces(mlir::DialectRegistry& registry); +void registerICostModelUtilsInterface(mlir::DialectRegistry& registry); } // namespace vpux::VPU::arch37xx diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/impl/ppe_factory.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/impl/ppe_factory.hpp index 1f7bb886b9..af516e6026 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/impl/ppe_factory.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/impl/ppe_factory.hpp @@ -6,7 +6,6 @@ #pragma once #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/interfaces/ppe_factory.hpp" namespace vpux::VPU::arch37xx { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/transforms/passes.hpp index 9389c4b868..a5cd8fb828 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPU/transforms/passes.hpp @@ -49,6 +49,7 @@ struct DefaultHWOptions : public VPU::DefaultHWOptionsDialectBase, virtual vpux: }; void buildDefaultHWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); +void buildReferenceSWPipeline(mlir::OpPassManager& pm, Logger log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/profiling_info.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/profiling_info.hpp index 9810a17844..6b6e514251 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/profiling_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/profiling_info.hpp @@ -8,8 +8,6 @@ #include #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -using namespace vpux; - namespace vpux::VPUIP::arch37xx { mlir::Type getTimestampType(mlir::MLIRContext* ctx); diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/split_cost_getter.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/split_cost_getter.hpp index dada5d85b2..5a8d47d79e 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/split_cost_getter.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/impl/split_cost_getter.hpp @@ -7,8 +7,6 @@ #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" -using namespace vpux; - namespace vpux::VPUIP::arch37xx { int64_t computeSplitCost(const WorkloadSplit& split, const WorkloadCostParams& params, VPUNN::VPUCostModel& costModel, diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp index 0a24523fba..9c30693800 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp @@ -22,6 +22,7 @@ std::unique_ptr createAddSwKernelCacheHandlingOpsPass(Logger log = L std::unique_ptr createUnrollDistributedOpsPass(Logger log = Logger::global()); std::unique_ptr createUnrollDepthToSpaceDMAPass(Logger log = Logger::global()); std::unique_ptr createUnrollSpaceToDepthDMAPass(Logger log = Logger::global()); +std::unique_ptr createUnrollPermuteDMAPass(Logger log = Logger::global()); // // Optimize copies pipeline @@ -71,14 +72,10 @@ struct DefaultHWOptions : BoolOption enableActivationSwizzling{*this, "enable-activation-swizzling", ::llvm::cl::desc("Enable activation swizzling"), ::llvm::cl::init(true)}; - - BoolOption enableSWKernelPrefetchingReserveMem{ - *this, "enable-sw-kernel-prefetching-reserve-mem", - ::llvm::cl::desc("Reserve memory at the end of CMX for SW Kernel data prefetching"), - ::llvm::cl::init(true)}; }; void buildDefaultHWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); +void buildReferenceSWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp 
b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp new file mode 100644 index 0000000000..a008d16ffe --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp @@ -0,0 +1,117 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" + +namespace vpux::arch37xx { + +NDTypeInterface changeShape(NDTypeInterface originType, ShapeRef shape, ShapeRef offset); +NDTypeInterface getPerClusterInputType(NDTypeInterface inputType, NDTypeInterface outputType, mlir::AffineMap memPerm, + ShapeRef outShape, ShapeRef offset); + +class UnrollMultiClusterPermuteDMA { +public: + static mlir::LogicalResult unrollSegmentedOrOverlappedOutput(VPUIP::PermuteDMAOp permuteOp, + VPUIP::DistributedBufferType distributedType, + mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger); + + static mlir::LogicalResult unrollDuplicatedOutput(VPUIP::PermuteDMAOp permuteOp, + VPUIP::DistributedBufferType distributedType, + mlir::AffineMap memPerm, mlir::PatternRewriter& rewriter, int64_t, + Logger logger); + + static mlir::LogicalResult unrollDuplicatedInputAndOutput(VPUIP::PermuteDMAOp permuteOp, mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t, Logger logger); + + static mlir::LogicalResult unrollDuplicatedInput(VPUIP::PermuteDMAOp permuteOp, mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t, Logger logger); +}; + +class UnrollSingleClusterPermuteDMA { +public: + static mlir::LogicalResult unroll(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger); +}; + +template +mlir::LogicalResult unrollPermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& 
rewriter, int64_t portCount, + Logger logger) { + // Skip PermuteDMA ops which have been unrolled by checking mem_perm attribute + if (permuteOp.getMemPermAttr() == nullptr) { + return mlir::failure(); + } + + logger.trace("Permute rewriter operation '{0}' at '{1}'", permuteOp->getName(), permuteOp->getLoc()); + + const auto input = permuteOp.getInput(); + const auto output = permuteOp.getOutputBuff(); + + const auto inputType = mlir::cast(input.getType()); + const auto outputType = mlir::cast(output.getType()); + + auto inDistributedType = mlir::dyn_cast(inputType); + auto outDistributedType = mlir::dyn_cast(outputType); + + // Dispatch between single cluster and multi-cluster tasks. + // - multi-cluster tasks have at least one distributed buffer and do not have DMADescriptorAttr + // - single cluster tasks either do not have any distributed buffers or have DMADescriptorAttr resulted + // from multi-cluster task unrolling + // - only form of single-cluster tasks with distributed buffers is with DUPLICATED output buffer + if ((inDistributedType || outDistributedType) && !permuteOp.getDmaDescriptorAttr()) { + // Unroll multi-cluster task + logger.trace("process permute with DistributedType at {0}", permuteOp); + + VPUX_THROW_UNLESS(permuteOp.getMemPerm().has_value(), + "Can not get memPerm attribute from PermuteDMA layer at {0}", permuteOp.getLoc()); + const auto memPerm = permuteOp.getMemPerm().value(); + + if (inDistributedType != nullptr && outDistributedType != nullptr) { + return UnrollMultiCluster::unrollDuplicatedInputAndOutput(permuteOp, memPerm, rewriter, portCount, logger); + } else if (inDistributedType != nullptr) { + return UnrollMultiCluster::unrollDuplicatedInput(permuteOp, memPerm, rewriter, portCount, logger); + } + + VPUX_THROW_UNLESS(inputType.getMemoryKind() == VPU::MemoryKind::DDR && + outputType.getMemoryKind() == VPU::MemoryKind::CMX_NN, + "Unexpected memory space. 
Got: input {0}, output {1}", inputType.getMemoryKind(), + outputType.getMemoryKind()); + + VPUX_THROW_WHEN(outDistributedType == nullptr, "Expect distributed type for permute op output, actual: {0}", + outputType); + + VPUX_THROW_UNLESS(VPUIP::doesPermuteDMATileDimSupportWrapInCluster(inputType, outputType, memPerm, + outDistributedType, logger), + "Unsupported PermuteDMA under cluster tiling at '{0}'", permuteOp->getLoc()); + + const auto distributionAttr = outDistributedType.getDistribution(); + const auto mode = distributionAttr.getMode().getValue(); + if (mode == VPU::DistributionMode::SEGMENTED || mode == VPU::DistributionMode::OVERLAPPED) { + return UnrollMultiCluster::unrollSegmentedOrOverlappedOutput(permuteOp, outDistributedType, memPerm, + rewriter, portCount, logger); + } else if (VPU::bitEnumContainsAny(mode, VPU::DistributionMode::DUPLICATED) || + VPU::bitEnumContainsAny(mode, VPU::DistributionMode::MULTICASTED)) { + return UnrollMultiCluster::unrollDuplicatedOutput(permuteOp, outDistributedType, memPerm, rewriter, + portCount, logger); + } else { + VPUX_THROW("Unsupported distributed mode"); + } + } else { + // Unroll single cluster task + return UnrollSingleCluster::unroll(permuteOp, rewriter, portCount, logger); + } + + return mlir::failure(); +} + +mlir::LogicalResult rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger); + +} // namespace vpux::arch37xx diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.hpp index 85ec5a3717..61cdf43f51 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.hpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" #include "vpux/compiler/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/ops_interfaces.hpp" 
+#include "vpux/compiler/dialect/config/IR/ops_interfaces.hpp" // // Generated diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect_pipeline_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect_pipeline_strategy.hpp index 88d02aec98..ec6d442cc5 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect_pipeline_strategy.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect_pipeline_strategy.hpp @@ -34,4 +34,8 @@ template extern std::unique_ptr createDialectPipelineStrategy37XX( const VPU::InitCompilerOptions* initCompilerOptions, const OptionsType* options); +template +extern std::unique_ptr createDialectPipelineStrategy37XXReferenceSW( + const VPU::InitCompilerOptions* initCompilerOptions, const OptionsType* options); + } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/NPU37XX/pipeline_options.hpp b/src/vpux_compiler/include/vpux/compiler/NPU37XX/pipeline_options.hpp index 17db85f0cc..6397f3c9a9 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU37XX/pipeline_options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU37XX/pipeline_options.hpp @@ -10,35 +10,6 @@ #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp" namespace vpux { - -// -// ReferenceSWOptions37XX -// - -struct ReferenceSWOptions37XX final : - public PublicOptions, - public ReferenceSWOptions, - public vpux::BatchCompileOptionsAdapter { - ReferenceSWOptions37XX(): vpux::BatchCompileOptionsAdapter(static_cast(*this)) { - } - ReferenceSWOptions37XX(VPU::ArchKind arch) - : PublicOptions(arch), vpux::BatchCompileOptionsAdapter(static_cast(*this)) { - } - - static std::unique_ptr createFromString(StringRef options, VPU::ArchKind arch) { - auto result = std::make_unique(arch); - if (mlir::failed(result->parseFromString(options))) { - return nullptr; - } - return result; - } - - BoolOption enableConvertFFTToConv{*this, "convert-fft-to-conv", llvm::cl::desc("Enable convert-fft-to-conv pass"), - 
llvm::cl::init(false)}; - BoolOption enableDecomposeGRUSequence{*this, "decompose-gru-sequence", - llvm::cl::desc("Enable decompose-gru-sequence pass"), llvm::cl::init(false)}; -}; - // // DefaultHWOptions37XX // @@ -49,10 +20,10 @@ struct DefaultHWOptions37XX final : VPUIP::arch37xx::DefaultHWOptions, mlir::PassPipelineOptions { DefaultHWOptions37XX() = default; - DefaultHWOptions37XX(VPU::ArchKind arch): PublicOptions(arch) { + DefaultHWOptions37XX(config::ArchKind arch): PublicOptions(arch) { } - static std::unique_ptr createFromString(StringRef options, VPU::ArchKind arch) { + static std::unique_ptr createFromString(StringRef options, config::ArchKind arch) { auto result = std::make_unique(arch); if (mlir::failed(result->parseFromString(options))) { return nullptr; diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/export.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/export.hpp index 2383224bd3..f5334ea0b3 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/export.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/export.hpp @@ -6,7 +6,6 @@ #pragma once #include "vpux/compiler/compiler.hpp" - #include "vpux/utils/logger/logger.hpp" #include @@ -16,9 +15,6 @@ #include -#include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - namespace vpux { namespace ELF { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops.hpp index c8f71ddd81..021a0908ba 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops.hpp @@ -6,16 +6,12 @@ #pragma once #include "vpux/compiler/NPU40XX/dialect/ELF/attributes.hpp" -#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp" -#include 
"vpux/compiler/dialect/ELFNPU37XX/metadata.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" -#include +#include + #include -#include -#include #include // diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp index 73ec55b0db..50a02d54f3 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp @@ -6,13 +6,16 @@ #pragma once #include "vpux/compiler/NPU40XX/dialect/ELF/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" - +#include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/utils/core/dense_map.hpp" +#include "vpux/utils/core/mem_size.hpp" #include +#include +#include + namespace vpux { namespace ELF { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/reloc_manager.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/reloc_manager.hpp index 1426c099d9..eac8d6ddc8 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/reloc_manager.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/ELF/reloc_manager.hpp @@ -29,6 +29,10 @@ class RelocManager { RelocManager& operator=(const RelocManager&) = delete; void createRelocations(ELF::RelocatableOpInterface relocatableOp); + void createRelocations(mlir::Operation* op, ELF::SymbolOp sourceSym, ELF::ElfSectionInterface targetSection, + size_t offset, bool isOffsetRelative, vpux::ELF::RelocationType relocType, size_t addend, + std::string_view description); + ELF::SymbolOp getSymbolOfBinOpOrEncapsulatingSection(mlir::Operation* binOp); private: void createRelocations(mlir::Operation* op, 
ELF::RelocationInfo& relocInfo); @@ -36,8 +40,6 @@ class RelocManager { void constructSymbolMap(ELF::MainOp elfMain); - ELF::SymbolOp getSymbolOfBinOpOrEncapsulatingSection(mlir::Operation* binOp); - ELF::CreateRelocationSectionOp getRelocationSection(ELF::ElfSectionInterface targetSection, ELF::CreateSymbolTableSectionOp symbolTable); diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp new file mode 100644 index 0000000000..649dde7501 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp @@ -0,0 +1,19 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/interfaces/map_bilinear_interpolate_on_dpu_strategy.hpp" + +namespace vpux::IE::arch40xx { +class MapBilinearInterpolateOnDPUStrategy final : public vpux::IE::IMapBilinearInterpolateOnDPUStrategy { +public: + MapBilinearInterpolateOnDPUStrategy(const bool interpolateAsSEOpInStrategy) + : IMapBilinearInterpolateOnDPUStrategy(interpolateAsSEOpInStrategy) { + } + void prepareInterpolate(mlir::ConversionTarget& target, LogCb logCb) const override; +}; + +} // namespace vpux::IE::arch40xx diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp index 7bffa95777..be24fcc860 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp @@ -7,7 +7,6 @@ #include "vpux/compiler/NPU40XX/core/pipelines_options.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/utils/options.hpp" #include 
"vpux/utils/logger/logger.hpp" namespace vpux { @@ -18,9 +17,6 @@ namespace arch40xx { // Passes // -std::unique_ptr createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp = false, - Logger log = Logger::global()); - std::unique_ptr createReduceNumTilesForSmallModelsPass(Logger log = Logger::global()); // @@ -42,13 +38,9 @@ struct DefaultHWOptions : public IE::DefaultHWOptionsDialectBase, virtual vpux:: llvm::cl::desc("Enable swap-convert-with-sw-op pass"), llvm::cl::init(true)}; BoolOption mergeUnrolledMatmul{*this, "merge-unrolled-matmul", llvm::cl::desc("Enable merging urolled Matmul ops"), llvm::cl::init(true)}; - BoolOption enableRuntimeDequant{*this, "enable-runtime-dequant", - llvm::cl::desc("Enable runtime dequantization of asymmetricly quantized weight"), + llvm::cl::desc("Enable runtime dequantization of asymmetrically quantized weights"), llvm::cl::init(true)}; - BoolOption enableApplyDynamicBoundaryCorrection{*this, "enable-apply-dynamic-boundary-correction", - llvm::cl::desc("Enable apply-dynamic-boundary-correction pass"), - llvm::cl::init(false)}; BoolOption enableReduceNumTilesForSmallModelsPass{*this, "reduce-num-tiles-for-small-models", llvm::cl::desc("Enable reduce-num-tiles-for-small-models pass"), llvm::cl::init(false)}; @@ -82,6 +74,8 @@ void buildLowPrecisionPipeline(mlir::OpPassManager& pm, const LowPrecisionOption void buildDefaultHWPipeline(mlir::OpPassManager& pm, const IE::arch40xx::DefaultHWOptions& options, Logger log = Logger::global()); +void buildReferenceSWPipeline(mlir::OpPassManager& pm, const IE::arch40xx::DefaultHWOptions& options, + Logger log = Logger::global()); // // Registration // diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/CMakeLists.txt b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/CMakeLists.txt index 80c556972d..46779ac154 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/CMakeLists.txt +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/CMakeLists.txt @@ -1,6 +1,6 @@ # # Copyright (C) 2023-2025 Intel Corporation. -# SPDX-License-Identifier: Apache 2.0 +# SPDX-License-Identifier: Apache-2.0 # add_library(npu_40xx_firmware_headers INTERFACE) diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/npu_40xx_nnrt.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/npu_40xx_nnrt.hpp index a6e07259e1..38a19dbca6 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/npu_40xx_nnrt.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/npu_40xx_nnrt.hpp @@ -1,6 +1,6 @@ // // Copyright (C) 2023-2025 Intel Corporation. -// SPDX-License-Identifier: Apache 2.0 +// SPDX-License-Identifier: Apache-2.0 // // see comments for 37xx-specific version diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp index a9289d6bfb..d932da852b 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp @@ -5,12 +5,12 @@ #pragma once -#include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/attributes.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPURegMapped/ops_interfaces.hpp" #include 
"vpux/compiler/dialect/VPURegMapped/types.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops_interfaces.hpp index e4219f4a53..dac95624a0 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops_interfaces.hpp @@ -5,15 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" - -#include -#include -#include - #include #include -#include namespace vpux { namespace NPUReg40XX { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.hpp new file mode 100644 index 0000000000..b3eb73d367 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.hpp @@ -0,0 +1,18 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/utils/logger/logger.hpp" + +#include + +#include + +namespace vpux::NPUReg40XX { + +void registerPasses(); + +} // namespace vpux::NPUReg40XX diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp index 48cba895ee..117956b03c 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp @@ -81,7 +81,7 @@ void fillNNrtConfig(npu40xx::nn_public::VpuNNShaveRuntimeConfigs& shv_rt_configs shv_rt_configs.use_schedule_embedded_rt = true; shv_rt_configs.code_window_buffer_size = - checked_cast(actShaveRtOp.getBinarySize(VPU::ArchKind::UNKNOWN)); + checked_cast(actShaveRtOp.getBinarySize(config::ArchKind::UNKNOWN)); shv_rt_configs.runtime_entry = actShaveRtOp.getKernelEntry(); shv_rt_configs.runtime_version = actShaveRtOp.getVersion(); } diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp index e2554fc848..672381044a 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp @@ -25,8 +25,8 @@ std::unique_ptr createComputeTaskStrippingPass( bool shaveDryRun = false); std::unique_ptr createComputeHaloRegionForDPUTaskOpPass(Logger log = Logger::global()); -std::unique_ptr createDMATaskProfilingHwDdrPass( - DMAProfilingMode dmaProfilingMode = DMAProfilingMode::DISABLED, Logger log = Logger::global()); +std::unique_ptr createDMATaskProfilingHwDdrPass(const std::string& enableDMAProfiling = "true", + Logger log = Logger::global()); std::unique_ptr createConstantDpuProfHwpBasePass(Logger log = Logger::global()); std::unique_ptr 
createCompressSpillDmaPass(Logger log = Logger::global()); std::unique_ptr createDMAOutOfOrderOptimizationPass(Logger log = Logger::global()); @@ -34,15 +34,16 @@ std::unique_ptr createDMAOutOfOrderOptimizationPass(Logger log = Log std::unique_ptr createUnrollDistributedOpsPass(Logger log = Logger::global(), bool enableSegmentedDmaFusion = false); std::unique_ptr createOptimizeConvertDMAOpPass(Logger log = Logger::global()); -std::unique_ptr createAddStartBarrierPass(bool compilerBarrierProgramming = false, - Logger log = Logger::global()); +std::unique_ptr createAddStartBarrierPass(Logger log = Logger::global()); std::unique_ptr createDetectDMASplitCandidatePass(Logger log = Logger::global()); std::unique_ptr createSplitDMAToBalanceLoadPass(Logger log = Logger::global()); std::unique_ptr createFuseSegmentedDmaPass(Logger log = Logger::global()); -std::unique_ptr createLegalizeScheduleForWlmFetchDmasPass( +std::unique_ptr createLegalizeScheduleForPartialWlmFetchDmasPass( const int virtualBarrierThreshold = VIRTUAL_BARRIER_THRESHOLD_WLM, Logger log = Logger::global()); +std::unique_ptr createAddPlaceholderFetchDMAsPass(Logger log = Logger::global()); std::unique_ptr createUnrollDepthToSpaceDMAPass(Logger log = Logger::global()); std::unique_ptr createUnrollSpaceToDepthDMAPass(Logger log = Logger::global()); +std::unique_ptr createUnrollPermuteDMAPass(Logger log = Logger::global()); // // Memory allocation pipeline @@ -83,31 +84,20 @@ struct DefaultHWOptions : BoolOption enableActivationSwizzling{*this, "enable-activation-swizzling", ::llvm::cl::desc("Enable activation swizzling"), ::llvm::cl::init(true)}; - BoolOption enableCompressActivationSpill{*this, "compress-activation-spill", - ::llvm::cl::desc("Enable compress-activation-spill feature"), - ::llvm::cl::init(false)}; - // TODO: E#118871 Switch this option to true by default BoolOption enableBarrierSchedWithFunctionOutlining{ *this, "barrier-sched-with-function-outlining", llvm::cl::desc("Enable barrier 
scheduling passes with IR split into multiple functions"), llvm::cl::init(false)}; - BoolOption enableSwKernelsCachePrefetch{*this, "enable-sw-kernels-cache-prefetch", - llvm::cl::desc("Enable SW kernel cache prefetch"), llvm::cl::init(false)}; - BoolOption configureUpdateBarriersForSwPrefetch{ *this, "configure-update-barriers-for-sw-prefetch", llvm::cl::desc("Configure update barrier to block shave execution until prefetch finishes"), llvm::cl::init(true)}; - - BoolOption enableSWKernelPrefetchingReserveMem{ - *this, "enable-sw-kernel-prefetching-reserve-mem", - ::llvm::cl::desc("Reserve memory at the end of CMX for SW Kernel data prefetching"), - ::llvm::cl::init(true)}; }; void buildDefaultHWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); +void buildReferenceSWPipeline(mlir::OpPassManager& pm, const DefaultHWOptions& options, Logger log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/utils/permute_dma.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/utils/permute_dma.hpp new file mode 100644 index 0000000000..162291981d --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIP/utils/permute_dma.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" + +namespace vpux::arch40xx { + +class UnrollSingleClusterPermuteDMA { +public: + static mlir::LogicalResult unroll(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger); +}; + +mlir::LogicalResult rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger); + +} // namespace vpux::arch40xx diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/lower_to_registers.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/lower_to_registers.hpp index 787cc7694e..d5fe7ffe8e 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/lower_to_registers.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/lower_to_registers.hpp @@ -6,6 +6,7 @@ #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/descriptors.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/types.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" @@ -288,9 +289,6 @@ void lowerToRegIDUWeightsOp(VPUIPDPU::IDUWeightsOp op, DpuInvariantDescriptorTyp } else if (wmode == static_cast(nn_public::VpuInputTensorDType::BF16)) { vpux::type::bfloat16 bf16(value); return bf16.to_bits(); - } else if (wmode == static_cast(nn_public::VpuInputTensorDType::FP8)) { - vpux::type::float8_e5m2 bf8(value); - return bf8.to_bits(); } else { VPUX_THROW("getPalletModeBitValue: Unsupported wmode for palletization table {0}", wmode); } diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.hpp 
b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.hpp index d58362a4c2..b878afbd22 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.hpp @@ -5,8 +5,24 @@ #pragma once -#include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" +#include "vpux/utils/core/small_vector.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include +#include + +#include + +namespace vpux { +class NDTypeInterface; +} +namespace vpux::config { +enum class ArchKind : uint64_t; +} +namespace vpux::VPUIP { +enum class NCETaskType : uint64_t; +} namespace vpux::VPUIPDPU::arch40xx::IDU { @@ -78,7 +94,7 @@ mlir::LogicalResult buildIDUConfig(mlir::OpBuilder& builder, const mlir::Locatio mlir::LogicalResult configureEltwiseCfg(const Logger& log, IDUConfig::EltWiseCfg& config, VPUIP::NCETaskType taskType, mlir::Type inActType, mlir::Type weightsType, const PPETask& ppeTask, - VPU::ArchKind arch); + config::ArchKind arch); mlir::LogicalResult configureDepthWiseCfg(const Logger& log, IDUConfig::DepthWiseCfg& config, VPUIP::NCETaskType taskType, std::optional smallKernelOptimization); diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.hpp index 45a0eac0e9..7c37f7bed5 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.hpp @@ -5,8 +5,25 @@ #pragma once -#include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/core/attributes/dims_order.hpp" +#include "vpux/compiler/core/attributes/strides.hpp" +#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include +#include + +#include + +namespace vpux { +class NDTypeInterface; +} +namespace vpux::VPU { +enum class MPEMode : uint64_t; +} +namespace vpux::VPUIP { +enum class NCETaskType : uint64_t; +} namespace vpux::VPUIPDPU::arch40xx::ODU { @@ -46,7 +63,7 @@ mlir::LogicalResult configureDataReuse(const Logger& log, ODUConfig::DataReuse& mlir::LogicalResult configurePermuteMode(const Logger& log, ODUConfig::PermuteData& config, const DimsOrder& outDimsOrder); mlir::LogicalResult configureSparsity(const Logger&, ODUConfig::Sparsity& config, bool outSparsityEnabled, - NDTypeInterface outActType); + int64_t sparseValue); mlir::LogicalResult configureSwizzleData(const Logger& log, ODUConfig::SwizzleData& config, std::optional outSwizzling); mlir::LogicalResult configureOutActivations(const Logger& log, ODUConfig::OutActivations& config, diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.hpp index c22618fdf0..0a719100ac 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.hpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: 
Apache-2.0 // -#include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" + +#include namespace vpux::VPUIPDPU::arch40xx::PPE { diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp index 4ecb06be2a..f252064335 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp @@ -6,7 +6,7 @@ #pragma once #include "vpux/compiler/core/barrier_info.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" namespace vpux { namespace VPURT { @@ -108,6 +108,14 @@ class BarrierPagesSplitHandler { }; SmallVector getAndLegalizeDummyDmaInsertionData(); + struct DummyBarrierData { + size_t pageInd; + size_t producer; + size_t consumer; + size_t insertAfter; + }; + SmallVector getDummyBarriersInsertionData(); + void initPrevPhysBarrierData(SmallVector& barrierToPidVec); void initPrevPhysBarrierData(mlir::func::FuncOp func); SmallVector> prepareEnqueueDmaBarForFullWlm( @@ -122,7 +130,10 @@ class BarrierPagesSplitHandler { void readBarrierPageAssignmentFromIr(); + void updateBoundaryTasksDataForTask(size_t taskInd); + void enforceBoundaryTaskHasUpdateBarrier(size_t pageInd); void initializeBoundaryTasksData(); + void updateBoundaryTasksDataForPage(size_t pageInd); bool isTaskWithNonAdjacentPageDependency(size_t taskInd); bool isDepFromTaskAToTaskB(size_t taskA, size_t taskB); bool isDepFromTaskToBarrier(size_t taskInd, size_t barInd); @@ -170,6 +181,9 @@ class BarrierPagesSplitHandler { // Store information at which page given task start to execute SmallVector 
_taskPageAssignment; + // Store information about first and last task index on each page + SmallVector>> _firstAndLastTaskPerPage; + // For each page index store per HW FIFO boundary task data. // Since for each HW FIFO there can be a sequence of boundary tasks store index of first and last one // Depending on legalization either first task is used, when checking deps to earlier pages diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp index 1b546e78a7..22339e5430 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp @@ -24,6 +24,7 @@ std::unique_ptr createWlmSplitGraphToPagesPass(Logger log = Logger:: std::unique_ptr createWlmLegalizeSplitGraphToPagesPass(Logger log = Logger::global()); std::unique_ptr createWlmLegalizePagesForBarrierDmasPass(Logger log = Logger::global()); std::unique_ptr createWlmInsertDummyDmasInPagesPass(Logger log = Logger::global()); +std::unique_ptr createWlmInsertDummyBarriersInPagesPass(Logger log = Logger::global()); std::unique_ptr createOptimizeBarriersSlotsUsagePass(Logger log = Logger::global()); std::unique_ptr createFindWlmEnqueueDmasBarrierPass(Logger log = Logger::global()); diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect_pipeline_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect_pipeline_strategy.hpp index 6e3c042769..11306ac048 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect_pipeline_strategy.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect_pipeline_strategy.hpp @@ -33,6 +33,11 @@ template std::unique_ptr createDialectPipelineStrategy40XX( const VPU::InitCompilerOptions* initCompilerOptions, const OptionsType* options); +/// @brief This method creates a pipeline strategy for ReferenceSW 
compilation. +template +std::unique_ptr createDialectPipelineStrategy40XXReferenceSW( + const VPU::InitCompilerOptions* initCompilerOptions, const OptionsType* options); + /// @brief This method creates a pipeline strategy for Monolithic WS compilation. template std::unique_ptr createDialectPipelineStrategy40XXWS( diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/pipeline_options.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/pipeline_options.hpp index 8fe6a26112..17ee1ee018 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/pipeline_options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/pipeline_options.hpp @@ -11,37 +11,6 @@ namespace vpux { -// -// ReferenceSWOptions40XX -// - -struct ReferenceSWOptions40XX final : - public PublicOptions, - public ReferenceSWOptions, - public vpux::BatchCompileOptionsAdapter { - ReferenceSWOptions40XX(): vpux::BatchCompileOptionsAdapter(static_cast(*this)) { - } - ReferenceSWOptions40XX(VPU::ArchKind arch) - : PublicOptions(arch), vpux::BatchCompileOptionsAdapter(static_cast(*this)) { - } - - static std::unique_ptr createFromString(StringRef options, VPU::ArchKind arch) { - auto result = std::make_unique(arch); - if (mlir::failed(result->parseFromString(options))) { - return nullptr; - } - return result; - } - - BoolOption enableConvertFFTToConv{*this, "convert-fft-to-conv", llvm::cl::desc("Enable convert-fft-to-conv pass"), - llvm::cl::init(false)}; - BoolOption enableDecomposeGRUSequence{*this, "decompose-gru-sequence", - llvm::cl::desc("Enable decompose-gru-sequence pass"), llvm::cl::init(false)}; -}; - -void buildReferenceSWModePipeline(mlir::OpPassManager& pm, const ReferenceSWOptions40XX& options, - Logger log = Logger::global()); - // // DefaultHWOptions40XX // @@ -52,10 +21,10 @@ struct DefaultHWOptions40XX final : VPUIP::arch40xx::DefaultHWOptions, mlir::PassPipelineOptions { DefaultHWOptions40XX() = default; - DefaultHWOptions40XX(VPU::ArchKind arch): PublicOptions(arch) { + 
DefaultHWOptions40XX(config::ArchKind arch): PublicOptions(arch) { } - static std::unique_ptr createFromString(StringRef options, VPU::ArchKind arch) { + static std::unique_ptr createFromString(StringRef options, config::ArchKind arch) { auto result = std::make_unique(arch); if (mlir::failed(result->parseFromString(options))) { return nullptr; diff --git a/src/vpux_compiler/include/vpux/compiler/NPU40XX/utils.hpp b/src/vpux_compiler/include/vpux/compiler/NPU40XX/utils.hpp index 0ab30ba077..5df1cd74de 100644 --- a/src/vpux_compiler/include/vpux/compiler/NPU40XX/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/NPU40XX/utils.hpp @@ -5,9 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" namespace vpux { @@ -20,8 +20,8 @@ bool isConvertSupportedOnDMA(T convertOp) { auto module = convertOp.getOperation(); // ConvertSWLayers2VPUIPSWKernelPass still rely on arch check logic here // Remove arch check when one-shot enabled, TODO: E#113196 - auto arch = VPU::getArch(module); - if (arch < VPU::ArchKind::NPU40XX) { + auto arch = config::getArch(module); + if (arch < config::ArchKind::NPU40XX) { // Feature is only tested on 40XX+ return false; } diff --git a/src/vpux_compiler/include/vpux/compiler/act_kernels/shave_binary_resources.h b/src/vpux_compiler/include/vpux/compiler/act_kernels/shave_binary_resources.h index 9ee93d89f6..0206181a2d 100644 --- a/src/vpux_compiler/include/vpux/compiler/act_kernels/shave_binary_resources.h +++ b/src/vpux_compiler/include/vpux/compiler/act_kernels/shave_binary_resources.h @@ -57,7 +57,7 @@ class ShaveBinaryResources { static void loadElfData(mlir::ModuleOp module); - static vpux::SmallString getSwKernelArchString(VPU::ArchKind archKind); + 
static vpux::SmallString getSwKernelArchString(config::ArchKind archKind); private: std::vector> _elfPermStorage; diff --git a/src/vpux_compiler/include/vpux/compiler/compilation_options.hpp b/src/vpux_compiler/include/vpux/compiler/compilation_options.hpp index 9df2de6e5f..decd5ec909 100644 --- a/src/vpux_compiler/include/vpux/compiler/compilation_options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/compilation_options.hpp @@ -34,7 +34,7 @@ void checkCompilerOptions(const intel_npu::Config& config); std::tuple parseNextArg(StringRef options); template -std::unique_ptr parseOnlyPublic(StringRef compilationModeParams, VPU::ArchKind arch, bool warnForPrivate, +std::unique_ptr parseOnlyPublic(StringRef compilationModeParams, config::ArchKind arch, bool warnForPrivate, LogLevel logLevel) { Logger log("options-parser", logLevel); @@ -76,7 +76,7 @@ std::unique_ptr parseOnlyPublic(StringRef compilationModeParams, VPU::ArchKin } template -std::unique_ptr parseCompilationModeParams(StringRef compilationModeParams, VPU::ArchKind arch, +std::unique_ptr parseCompilationModeParams(StringRef compilationModeParams, config::ArchKind arch, bool warnForPrivate = false, LogLevel logLevel = LogLevel::None) { if (arePrivateOptionsEnabled()) { return T::createFromString(compilationModeParams, arch); diff --git a/src/vpux_compiler/include/vpux/compiler/conversion.hpp b/src/vpux_compiler/include/vpux/compiler/conversion.hpp index c04419be8b..d078bcee1e 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion.hpp @@ -84,11 +84,14 @@ std::unique_ptr createConvertVPUIP2VPUMI37XXPass(Logger log = Logger std::unique_ptr createConvertVPUMI37XX2VPUASMPass(Logger log = Logger::global()); std::unique_ptr createConvertVPUMI37XX2ELFPass(Logger log = Logger::global()); -// NPUReg40XX ELF specific passes +// NPU40XX ELF specific passes std::unique_ptr createConvertVPUIP2VPUMI40XXPass( Logger log = Logger::global(), bool 
enableMemorySideCache = false, AllocateShaveStackFrames allocateShaveStackFrames = AllocateShaveStackFrames::DISABLED); -std::unique_ptr createConvertVPUMI40XX2VPUASMPass(Logger log = Logger::global(), bool enablePWLM = false, + +std::unique_ptr createConvertVPUMI40XX2VPUASMPass(Logger log = Logger::global(), + bool disableDmaSwFifo = false); +std::unique_ptr createConvertVPUMI40XX2VPUASMPass(bool enablePWLM, Logger log = Logger::global(), bool disableDmaSwFifo = false); std::unique_ptr createConvertVPUIPDPU2NPUReg40XXPass( diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/passes/IE2VPU/convert_layers_to_VPU.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/passes/IE2VPU/convert_layers_to_VPU.hpp index 1d455cbf3e..e05f2accca 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion/passes/IE2VPU/convert_layers_to_VPU.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/passes/IE2VPU/convert_layers_to_VPU.hpp @@ -5,7 +5,16 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include #include diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp index 091aff2046..75d718081d 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp @@ -4,14 +4,9 @@ // #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/logger/logger.hpp" #include #include @@ -118,8 +113,6 @@ mlir::LogicalResult bufferizeOp(mlir::MLIRContext* ctx, VPU::ShapeCastOp origOp, mlir::RewriterBase& rewriter); mlir::LogicalResult bufferizeOp(mlir::MLIRContext* ctx, VPU::LayoutCastOp origOp, VPU::LayoutCastOp::Adaptor newArgs, mlir::RewriterBase& rewriter); -mlir::LogicalResult bufferizeOp(mlir::MLIRContext* ctx, VPU::WorkloadCastOp origOp, - VPU::WorkloadCastOp::Adaptor newArgs, mlir::RewriterBase& rewriter); mlir::LogicalResult bufferizeOp(mlir::MLIRContext* ctx, VPU::UpsamplingOp origOp, VPU::UpsamplingOp::Adaptor newArgs, mlir::RewriterBase& rewriter); mlir::LogicalResult bufferizeOp(mlir::MLIRContext* ctx, VPU::ConcatOp origOp, VPU::ConcatOp::Adaptor newArgs, diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.hpp index a46ca1c2a3..574cc8cafa 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.hpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" 
namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI37XX2VPUASM/symbolization_pattern.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI37XX2VPUASM/symbolization_pattern.hpp index 782b0a78e5..4d0725ae02 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI37XX2VPUASM/symbolization_pattern.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI37XX2VPUASM/symbolization_pattern.hpp @@ -27,23 +27,7 @@ class VPUASMSymbolizationPattern : public SymbolizationPattern { // E#69730: would be cleaner to type-check at template level if Op itself declares the OneResult interface llvm::SmallVector getSymbolicNames(OperationType op, size_t) override { - auto fullName = OperationType::getOperationName(); - - auto opName = fullName.drop_front(VPUMI37XX::VPUMI37XXDialect::getDialectNamespace().size() + 1); - - mlir::Operation* base = op.getOperation(); - VPUX_THROW_UNLESS(base->getResults().size() == 1, - "Default symbolic converter only supports ops with exactly one result. 
For {0} got {1}", - fullName, base->getResults().size()); - auto indexType = mlir::dyn_cast(base->getResult(0).getType()); - - VPUX_THROW_UNLESS(indexType, - " Can't use the generic symbolizer if for an Op that does not return IndexType {0}", - fullName); - - auto index = std::to_string(indexType.getValue()); - auto symName = mlir::StringAttr::get(op.getContext(), opName + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return this->createSymbolicName(op); } protected: diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI40XX2VPUASM/symbolization_pattern.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI40XX2VPUASM/symbolization_pattern.hpp index 9ad997fe2c..9194a59fa9 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI40XX2VPUASM/symbolization_pattern.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/passes/VPUMI40XX2VPUASM/symbolization_pattern.hpp @@ -29,25 +29,7 @@ class VPUASMSymbolizationPattern : public SymbolizationPattern { // E#69730: would be cleaner to type-check at template level if Op itself declares the OneResult interface llvm::SmallVector getSymbolicNames(OperationType op, size_t) override { - auto fullName = OperationType::getOperationName(); - - auto opName = fullName.drop_front(VPUMI40XX::VPUMI40XXDialect::getDialectNamespace().size() + 1); - - mlir::Operation* base = op.getOperation(); - VPUX_THROW_UNLESS(base->getResults().size() == 1, - "Default symbolic converter only supports ops with exactly one result. 
For {0} got {1}", - fullName, base->getResults().size()); - auto indexType = mlir::dyn_cast(base->getResult(0).getType()); - - VPUX_THROW_UNLESS(indexType, - " Can't use the generic symbolizer if for an Op that does not return IndexType {0}", - fullName); - - auto index = std::to_string(indexType.getValue()); - auto tileIdx = std::to_string(indexType.getTileIdx()); - - auto symName = mlir::StringAttr::get(op.getContext(), opName + "_" + tileIdx + "_" + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return this->createSymbolicName(op); } protected: @@ -72,28 +54,6 @@ class VPUASMSymbolizationPattern : public SymbolizationPattern { return mlir::ArrayAttr::get(ctx, barrierVec); }; - llvm::SmallVector getSymbolicNamesByTileListValue(OperationType op) { - auto fullName = OperationType::getOperationName(); - auto opName = fullName.drop_front(VPUMI40XX::VPUMI40XXDialect::getDialectNamespace().size() + 1); - - mlir::Operation* base = op.getOperation(); - VPUX_THROW_UNLESS(base->getResults().size() == 1, - "Default symbolic converter only supports ops with exactly one result. 
For {0} got {1}", - fullName, base->getResults().size()); - auto indexType = mlir::dyn_cast(base->getResult(0).getType()); - - VPUX_THROW_UNLESS(indexType, "Can't use the generic symbolizer for an Op that does not return IndexType: {0}", - fullName); - - auto tileIdx = std::to_string(indexType.getTileIdx()); - auto srcTypeIdx = std::to_string(indexType.getListIdx()); - auto opIdx = std::to_string(indexType.getValue()); - - auto symName = mlir::StringAttr::get(op.getContext(), opName + "_" + tileIdx + "_" + srcTypeIdx + "_" + opIdx); - - return {mlir::FlatSymbolRefAttr::get(symName)}; - } - Logger _log; }; diff --git a/src/vpux_compiler/include/vpux/compiler/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.hpp index b5a42f4cfc..d270140a4c 100644 --- a/src/vpux_compiler/include/vpux/compiler/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.hpp @@ -104,4 +104,16 @@ struct BarrierProgDMARewriter : DMARewriterBase { mlir::ConversionPatternRewriter& rewriter) const override; }; +struct FetchDMARewriter : DMARewriterBase { + using DMARewriterBase::DMARewriterBase; + mlir::LogicalResult matchAndRewrite(VPUIP::FetchDMAOp fetchDMAOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const override; +}; + +struct EnqueueDMARewriter : DMARewriterBase { + using DMARewriterBase::DMARewriterBase; + mlir::LogicalResult matchAndRewrite(VPUIP::EnqueueDMAOp enqueueDMAOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const override; +}; + } // namespace vpux::vpuip2vpumi40xx diff --git a/src/vpux_compiler/include/vpux/compiler/core/act_profiling.hpp b/src/vpux_compiler/include/vpux/compiler/core/act_profiling.hpp index 2bfcabfd24..91e3072892 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/act_profiling.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/core/act_profiling.hpp @@ -5,21 +5,13 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/profiling/common.hpp" - #include "vpux/compiler/core/profiling.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/utils/rewriter.hpp" #include #include #include -#include #include // E#171862: merge NCETiledActShaveProfiler and UniformNonTiledActShaveProfiler diff --git a/src/vpux_compiler/include/vpux/compiler/core/attributes/dim.hpp b/src/vpux_compiler/include/vpux/compiler/core/attributes/dim.hpp index b51c2ad064..e49c0600b1 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/attributes/dim.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/attributes/dim.hpp @@ -11,14 +11,11 @@ #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/error.hpp" -#include "vpux/utils/core/format.hpp" #include "vpux/utils/core/small_vector.hpp" #include "vpux/utils/core/type_traits.hpp" -#include - #include +#include namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/core/attributes/dim_values.hpp b/src/vpux_compiler/include/vpux/compiler/core/attributes/dim_values.hpp index 5285e38a50..7dabb4c8bf 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/attributes/dim_values.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/attributes/dim_values.hpp @@ -5,8 +5,6 @@ #pragma once -#include "vpux/compiler/core/attributes/dim.hpp" - #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/format.hpp" #include "vpux/utils/core/small_vector.hpp" @@ -114,6 +112,9 @@ class DimValuesBase { void insert(iterator pos, ValueType&& val) { _cont.insert(pos, std::move(val)); } + void insert(iterator pos, size_type count, const ValueType& val) { + 
_cont.insert(pos, count, val); + } void erase(iterator pos) { _cont.erase(pos); diff --git a/src/vpux_compiler/include/vpux/compiler/core/attributes/dims_order.hpp b/src/vpux_compiler/include/vpux/compiler/core/attributes/dims_order.hpp index d509d85a73..78acd41387 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/attributes/dims_order.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/attributes/dims_order.hpp @@ -8,12 +8,7 @@ #include "vpux/compiler/core/attributes/dim.hpp" #include "vpux/compiler/core/attributes/dim_values.hpp" #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/core/attributes/strides.hpp" - #include "vpux/utils/core/error.hpp" -#include "vpux/utils/core/format.hpp" -#include "vpux/utils/core/mem_size.hpp" -#include "vpux/utils/core/optional.hpp" #include "vpux/utils/core/range.hpp" #include @@ -68,6 +63,8 @@ class DimsOrder final { static const DimsOrder HNWC; static const DimsOrder CWNH; static const DimsOrder CNHW; + static const DimsOrder CHWN; + static const DimsOrder HCWN; // Orders for 2D Convolution weights static const DimsOrder OIYX; diff --git a/src/vpux_compiler/include/vpux/compiler/core/barrier_info.hpp b/src/vpux_compiler/include/vpux/compiler/core/barrier_info.hpp index cf8bd50874..46f2ec0000 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/barrier_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/barrier_info.hpp @@ -7,18 +7,15 @@ #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" -#include "vpux/compiler/dialect/VPURegMapped/utils.hpp" -#include "vpux/utils/core/func_ref.hpp" #include "vpux/utils/core/small_vector.hpp" #include "vpux/utils/logger/logger.hpp" +#include +#include #include #include #include -#include -#include - namespace vpux { // Declare BarrierInfo utils used for testing purposes @@ -299,6 +296,16 @@ class BarrierInfo { * do not have sync-point. 
*/ std::optional getPreviousBlockSyncPoint(size_t taskInd) const; + + /** + * @brief Get index of sync-task for the next block to which the task taskInd belongs + * + * @param taskInd - task for which next sync-task should be calculated + * @return return index of the next sync point from the taskInd's block, if the sync-task exists and return + * std::nullopt otherwise. (Tasks from last block do not have a sync-point). + * + */ + std::optional getNextBlockSyncPoint(size_t taskInd) const; void splitBarriersWithExceedingVariantCount(size_t availableSlots, size_t maxSlotsSum, size_t maxAvailableSlots); void splitBarrierProducers(size_t availableSlots, size_t maxSlotsSum, bool maxSlotsSumLimitEnabled); void splitBarrierConsumers(size_t availableSlots, size_t maxSlotsSum, bool maxSlotsSumLimitEnabled); diff --git a/src/vpux_compiler/include/vpux/compiler/core/cost_model_utils.hpp b/src/vpux_compiler/include/vpux/compiler/core/cost_model_utils.hpp index ab44229bcb..819a0624a0 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/cost_model_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/cost_model_utils.hpp @@ -5,12 +5,22 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include #include +namespace vpux::VPU { +class SWOpInterface; +} // namespace vpux::VPU +namespace vpux::VPUIP { +class DPUTaskOp; +class NCEClusterTaskOp; +class SwKernelOp; +class SwKernelRun; +} // namespace vpux::VPUIP + namespace vpux { constexpr StringLiteral DPUCost = "minimumHardwareExecutionCost"; @@ -18,7 +28,7 @@ constexpr StringLiteral cycleCostAttrName = "cycleCost"; constexpr StringLiteral cycleBegin = "cycleBegin"; constexpr StringLiteral cycleEnd = "cycleEnd"; -size_t getDMACost(mlir::Value input, mlir::Value output, VPU::ArchKind archKind, +size_t getDMACost(mlir::Value input, mlir::Value output, config::ArchKind archKind, 
const std::shared_ptr& costModel, int64_t numDMAPorts = 1); size_t getDMACost(vpux::NDTypeInterface inTensorType, vpux::NDTypeInterface outTensorType, VPUNN::VPUDevice vpuDevice, const std::shared_ptr& costModel, int64_t numDMAPorts); @@ -27,16 +37,16 @@ size_t getDMACost(vpux::NDTypeInterface tensorType, VPUNN::VPUDevice vpuDevice, size_t getDPUCost(mlir::Operation* op); size_t getAsyncExecuteCycleBegin(mlir::async::ExecuteOp op); size_t getAsyncExecuteCycleEnd(mlir::async::ExecuteOp op); -VPUNN::DPUWorkload getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, VPU::ArchKind arch); -size_t calculateCopyCycles(mlir::Operation* innerOp, VPU::ArchKind archKind, +VPUNN::DPUWorkload getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, config::ArchKind arch); +size_t calculateCopyCycles(mlir::Operation* innerOp, config::ArchKind archKind, const std::shared_ptr& costModel); size_t calculateShaveActCycles(VPUIP::SwKernelOp swKernelOp, const std::shared_ptr& costModel, - VPU::ArchKind arch); + config::ArchKind arch); std::vector> calculateNceVariantCycles(VPUIP::NCEClusterTaskOp nceOp, const std::shared_ptr& costModel, - VPU::ArchKind arch, vpux::Logger log); + config::ArchKind arch, vpux::Logger log); size_t calculateNceCycles(VPUIP::NCEClusterTaskOp nceOp, const std::shared_ptr& costModel, - VPU::ArchKind arch, vpux::Logger log, int64_t numDPU = 1); + config::ArchKind arch, vpux::Logger log, int64_t numDPU = 1); vpux::Byte getSwKernelRunTotalAllocSize(VPUIP::SwKernelRun swKernelRun, ArrayRef inputs, ArrayRef outputBuffs, SmallVector& inputsForKernelRun, SmallVector& outputsForKernelRun); @@ -45,7 +55,7 @@ std::unique_ptr getVPUNNSWKernelOp(VPU::SWOpInterface operat std::unique_ptr getVPUNNSWKernelOp(VPU::SWOpInterface operation, vpux::NDTypeInterface outputNDType, ArrayRef inputTiles); size_t getDPUTaskOpCost(VPUIP::DPUTaskOp dpuTaskOp, const std::shared_ptr& costModel, - VPU::ArchKind arch, vpux::Logger log); + config::ArchKind arch, vpux::Logger log); VPUNN::MemoryLocation 
getMemoryLocation(mlir::Type type); VPUNN::Swizzling getVPUNNSwizzlingKey(mlir::Type type); diff --git a/src/vpux_compiler/include/vpux/compiler/core/cycle_cost_info.hpp b/src/vpux_compiler/include/vpux/compiler/core/cycle_cost_info.hpp index a804eeecae..15c282f318 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/cycle_cost_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/cycle_cost_info.hpp @@ -72,7 +72,7 @@ class CycleCostInfo { return _costModel; } - inline VPU::ArchKind getArchKind() const { + inline config::ArchKind getArchKind() const { return _archKind; } @@ -100,7 +100,7 @@ class CycleCostInfo { std::shared_ptr _costModel; std::set _layersWithInvalidCost; size_t _numOfTasksWithInvalidCost = 0; - VPU::ArchKind _archKind; + config::ArchKind _archKind; CycleCosts _cycleCosts; Logger _log; diff --git a/src/vpux_compiler/include/vpux/compiler/core/feasible_memory_scheduler.hpp b/src/vpux_compiler/include/vpux/compiler/core/feasible_memory_scheduler.hpp index 821d73b376..1e7ee1cf2a 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/feasible_memory_scheduler.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/feasible_memory_scheduler.hpp @@ -6,11 +6,13 @@ #pragma once #include "vpux/compiler/core/async_deps_info.hpp" -#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/linear_scan_handler.hpp" #include "vpux/compiler/core/mem_live_range_info.hpp" +#include "vpux/compiler/utils/linear_scan.hpp" #include "vpux/compiler/utils/partitioner.hpp" +#include + namespace vpux { class FeasibleMemoryScheduler final { @@ -328,8 +330,9 @@ class FeasibleMemoryScheduler final { public: FeasibleMemoryScheduler(VPU::MemoryKind memKind, VPU::MemoryKind secondLvlMemKind, MemLiveRangeInfo& liveRangeInfo, AsyncDepsInfo& depsInfo, Logger log, LinearScan& scan, - VPU::ArchKind arch, std::shared_ptr costModel, int64_t nceClusterCount, - int64_t dmaCount, bool enableScheduleStatistics, bool optimizeFragmentation); + 
config::ArchKind arch, std::shared_ptr costModel, + int64_t nceClusterCount, int64_t dmaCount, bool enableScheduleStatistics, + bool optimizeFragmentation); public: ScheduledOpInfoVec generateSchedule(); @@ -427,7 +430,7 @@ class FeasibleMemoryScheduler final { // allocator class LinearScan& _scan; // architecture kind - VPU::ArchKind _archKind; + config::ArchKind _archKind; // VPUNN cost model std::shared_ptr _costModel; // NCE cluster count diff --git a/src/vpux_compiler/include/vpux/compiler/core/layers.hpp b/src/vpux_compiler/include/vpux/compiler/core/layers.hpp index 4f443fde4a..b4c8398d7d 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/layers.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/layers.hpp @@ -6,7 +6,6 @@ #pragma once #include "vpux/compiler/core/attributes/dim.hpp" - #include "vpux/utils/core/error.hpp" namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/core/linear_scan_handler.hpp b/src/vpux_compiler/include/vpux/compiler/core/linear_scan_handler.hpp index 66fbf0c94a..97f52ded08 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/linear_scan_handler.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/linear_scan_handler.hpp @@ -5,16 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/utils/linear_scan.hpp" - -#include "vpux/utils/core/array_ref.hpp" +#include "vpux/compiler/utils/partitioner.hpp" #include "vpux/utils/core/dense_map.hpp" -#include "vpux/utils/core/string_ref.hpp" - -#include +#include "vpux/utils/core/mem_size.hpp" #include +#include namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/core/mem_live_range_info.hpp b/src/vpux_compiler/include/vpux/compiler/core/mem_live_range_info.hpp index d26dc85655..1712d9dfb1 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/mem_live_range_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/mem_live_range_info.hpp @@ -6,11 +6,7 @@ #pragma once #include 
"vpux/compiler/core/aliases_info.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/utils/stl_extras.hpp" - -#include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/small_vector.hpp" #include "vpux/utils/logger/logger.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/core/pipelines_options.hpp b/src/vpux_compiler/include/vpux/compiler/core/pipelines_options.hpp index d465aaf02c..49c0a6c9ab 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/pipelines_options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/pipelines_options.hpp @@ -19,170 +19,6 @@ #include namespace vpux { - -// -// ReferenceSWMode -// - -template -struct ReferenceSWOptions : mlir::PassPipelineOptions { - BoolOption enableVerifiers{*this, "enable-verifiers", llvm::cl::desc("Enable verifiers execution after each pass"), - llvm::cl::init(isDeveloperBuild())}; - - BoolOption enableMemoryUsageCollector{*this, "enable-memory-usage-collector", - llvm::cl::desc("Enable peak memory usage instrumentation after each pass"), - llvm::cl::init(isDeveloperBuild())}; - - BoolOption enableFunctionStatisticsInstrumentation{ - *this, "enable-function-statistics-instrumentation", - llvm::cl::desc("Enable printing statistics for functions after each pass"), llvm::cl::init(false)}; - - BoolOption allowCustomValues{*this, "allow-custom-values", - ::llvm::cl::desc("[Optional] Allows keep predefined values in IR")}; - - BoolOption enableDummyOpReplacement{*this, "dummy-op-replacement", - llvm::cl::desc("Replace unsupported SW Kernel ops with Dummy ones"), - llvm::cl::init(false)}; - - BoolOption constantFoldingInBackground{*this, "constant-folding-in-background", - llvm::cl::desc("Fold constants in background threads"), - llvm::cl::init(false)}; - - IntOption constantFoldingInBackgroundNumThreads{ - *this, "constant-folding-in-background-num-threads", - llvm::cl::desc("Number of background threads to use for constant folding in background. 
Ignored if " - "`constant-folding-in-background` is disabled."), - llvm::cl::init(1)}; - - BoolOption constantFoldingInBackgroundCollectStatistics{ - *this, "constant-folding-in-background-collect-statistics", - llvm::cl::desc("Toggle for the collection of statistics when folding constants in background. Ignored if " - "`constant-folding-in-background` is disabled."), - llvm::cl::init(false)}; - - IntOption constantFoldingInBackgroundMemoryUsageLimit{ - *this, "constant-folding-in-background-memory-usage-limit", - llvm::cl::desc("Fold constants in background memory usage limit (in MB)"), llvm::cl::init(3 * 1024)}; - - DoubleOption constantFoldingInBackgroundCacheCleanThreshold{ - *this, "constant-folding-in-background-cache-clean-threshold", - llvm::cl::desc("Cache will be cleaned to this threshold when reach the memory usage limit"), - llvm::cl::init(0.8)}; - - BoolOption enableProfiling{*this, "profiling", llvm::cl::desc("Enable profiling"), llvm::cl::init(false)}; - - // This is a temporary option to enable running profiling passes along with outlining passes. - // It will be removed after all profiling engines are updated to support outlined functions. 
- // Ticket: E#159100 - BoolOption enableProfilingWithOutlining{*this, "profiling-with-outlining", - llvm::cl::desc("Enable profiling with outlining"), llvm::cl::init(false)}; - - BoolOption enableMergeFakeQuant{*this, "merge-fake-quant", llvm::cl::desc("Enable merge-fake-quant pass"), - llvm::cl::init(true)}; - - BoolOption enableOptimizeReorders{*this, "optimize-reorders", llvm::cl::desc("Enable optimize-reorders pass"), - llvm::cl::init(false)}; - - BoolOption enableExperimentalSEPtrsOperations{*this, "enable-experimental-se-ptrs-operations", - llvm::cl::desc("Enable the experimental operation of SEP"), - llvm::cl::init(false)}; - - BoolOption enableFuseClampOperations{*this, "enable-fuse-clamp-op", llvm::cl::desc("Enable fuse clamp operations"), - llvm::cl::init(false)}; - - BoolOption enableConvertPrecisionToFP16{*this, "convert-precision-to-fp16", - llvm::cl::desc("Enable convert-precision-to-fp16 pass"), - llvm::cl::init(true)}; - - BoolOption enableConvertNonConstantPadToSliceAndConcat{ - *this, "enable-convert-non-constant-pad-to-slice-and-concat", - llvm::cl::desc("Enable convert-non-constant-pad-to-slice-and-concat pass"), llvm::cl::init(true)}; - - BoolOption enableControlGraphSplit{*this, "enable-control-graph-split", - llvm::cl::desc("Enable split of control graph to simplify barrier scheduling"), - llvm::cl::init(true)}; - IntOption controlGraphSplitBlockSize{ - *this, "control-graph-split-block-size", - llvm::cl::desc("Maximal number of tasks in each block that control graph will be split into. Used to " - "reduce memory consumption of barrier legalization pipeline for big models. 
Memory usage is " - "roughly (control-graph-split-block-size)^2/8"), - llvm::cl::init(CONTROL_GRAPH_SPLIT_BLOCK_SIZE)}; - - BoolOption enableSimpleSchedule{*this, "simple-schedule", llvm::cl::desc("Enable schedule simplification"), - llvm::cl::init(true)}; - - BoolOption reduceParallelControlFlows{*this, "reduce-parallel-control-flows", - llvm::cl::desc("Reduce parallel overlapping control flows where possible"), - llvm::cl::init(true)}; - BoolOption enableColorBinPhysicalBarrierAssignment{ - *this, "enable-color-bin-physical-barrier-assignment", - llvm::cl::desc("Enable physical barrier assignment optimization"), llvm::cl::init(false)}; - - BoolOption enableSWKernelPrefetchingReserveMem{ - *this, "enable-sw-kernel-prefetching-reserve-mem", - ::llvm::cl::desc("Reserve memory at the end of CMX for SW Kernel data prefetching"), - ::llvm::cl::init(true)}; - - BoolOption enableGroupedMatMul{*this, "enable-grouped-matmul", - llvm::cl::desc("Enable execution of grouped MatMul as a single operation."), - llvm::cl::init(false)}; - - BoolOption fuseScalesToAccumulate{ - *this, "fuse-scales-to-accumulate", - llvm::cl::desc("Enable scales fusing to following Accumulate op from GPTQ Matmul unrolling"), - llvm::cl::init(false)}; - - BoolOption enableFP16CompressedConvolution{*this, "enable-fp16-compressed-convolution", - llvm::cl::desc("Enable FP16 Compressed convolution op"), - llvm::cl::init(false)}; - - BoolOption enableVPUNNPreSplit{*this, "enable-vpunn-pre-split", llvm::cl::desc("Enable VPUNN pre-split API"), - llvm::cl::init(false)}; - - BoolOption enableWeightsDynamicDequantization{*this, "enable-weights-dynamic-dequantization", - llvm::cl::desc("Enable weights dequantization for weights as input"), - llvm::cl::init(false)}; - - BoolOption enableRuntimeDequant{*this, "enable-runtime-dequant", - llvm::cl::desc("Enable runtime dequantization of asymmetricly quantized weight"), - llvm::cl::init(false)}; - Int64Option runtimeDequantizationLimit{ - *this, 
"runtime-dequantization-limit", - llvm::cl::desc("Lower limit on weight size for runtime dequantization" - "Weights smaller than the limit will be statically dequantized"), - llvm::cl::init(524'288)}; // 512kb - - BoolOption enableInPlaceBufferization{ - *this, "enable-in-place-bufferization", - llvm::cl::desc("Enable in-place bufferization. Might eliminate some redundant buffer allocations at the " - "cost of longer compile time"), - llvm::cl::init(false)}; - - BoolOption useMemrefForHostFunctionBufferization{ - *this, "use-memref-for-host-function-bufferization", - llvm::cl::desc("Enable memref bufferization for host function ops"), llvm::cl::init(false)}; - - BoolOption enableMatmulMixedPrecisionDecomposition{ - *this, "enable-matmul-mixed-precision-decomposition", - llvm::cl::desc("Enable mixed precision decomposition for matmul"), llvm::cl::init(true)}; - DoubleOption matmulMixedPrecisionDecompositionRatio{ - *this, "matmul-mixed-precision-decomposition-ratio", - llvm::cl::desc("Determines when to enable Matmul Mixed Precision Decomposition" - "Ratio = (MatMul input size)/(Sum of Inputs of newly added ops by decomposition)"), - llvm::cl::init(250.0)}; - - bool enableForceZMajorConcat = false; - bool enableSwapTransposeWithFQ = false; - bool enableAlignScales = false; - bool fuseMvn6ScaleBias = false; - // TODO: remove option after E#-83187 - bool enableFuseClamp = false; - bool enableConvertFCToConv = false; - bool enableAdjustNonZeroFakeQuant = false; - bool enableAdaptiveStripping = false; - bool enableExtraStaticShapeOps = false; -}; - // // DefaultHWOptionsBase // This class must be inherited by all dialect-base options @@ -404,6 +240,25 @@ struct DefaultHWOptionsBase : mlir::PassPipelineOptions, p BoolOption useMemrefForHostFunctionBufferization{ *this, "use-memref-for-host-function-bufferization", llvm::cl::desc("Enable memref bufferization for host function ops"), llvm::cl::init(false)}; + + BoolOption enableMergeFakeQuant{*this, "merge-fake-quant", 
llvm::cl::desc("Enable merge-fake-quant pass"), + llvm::cl::init(true)}; + + BoolOption enableCompressActivationSpill{*this, "compress-activation-spill", + ::llvm::cl::desc("Enable compress-activation-spill feature"), + ::llvm::cl::init(false)}; + + BoolOption enableSWKernelInstructionPrefetch{*this, "enable-sw-kernels-instruction-prefetch", + llvm::cl::desc("Enable SW kernels instruction prefetch"), + llvm::cl::init(true)}; + + // TODO: Ideally, some of the passes in the HostCompile pipeline should not operate on the main function. + // The following option will skip processing of the main function in these passes. This will be refactored in the + // future. Track: E#168311 + BoolOption disablePassOnEntryFunctionForHostCompile{ + *this, "disable-pass-on-entry-function", + llvm::cl::desc("Disable certain passes for entry function operations for HostCompile pipeline"), + llvm::cl::init(false)}; }; // diff --git a/src/vpux_compiler/include/vpux/compiler/core/profiling.hpp b/src/vpux_compiler/include/vpux/compiler/core/profiling.hpp index 5e4abc345e..92fe75ad0e 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/profiling.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/profiling.hpp @@ -5,12 +5,22 @@ #pragma once -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/strings.hpp" #include "vpux/utils/profiling/common.hpp" +namespace vpux::VPUIP { +class DMATypeOpInterface; +class ProfilingSectionOp; +} // namespace vpux::VPUIP + +namespace vpux::VPURT { +class TaskOp; +} // namespace vpux::VPURT + namespace vpux { VPUIP::DpuProfilingMetadataAttr getDpuProfilingMetaAttr(mlir::MLIRContext* ctx, unsigned bufferId, unsigned taskId, @@ -29,7 +39,7 @@ VPUIP::M2IProfilingMetadataAttr getM2IProfilingMetaAttr(mlir::MLIRContext* ctx, 
enum class DMAProfilingMode { DISABLED, SCRATCH, SW, STATIC_HWP, DYNAMIC_HWP }; -DMAProfilingMode getDMAProfilingMode(VPU::ArchKind arch, const std::string& optionValue); +DMAProfilingMode getDMAProfilingMode(config::ArchKind arch, const std::string& optionValue); // Post processing of profiling is relay on uniqueness of locations, but this may be violated. To ensure that all names // are unique this class is used diff --git a/src/vpux_compiler/include/vpux/compiler/core/public_options.hpp b/src/vpux_compiler/include/vpux/compiler/core/public_options.hpp index a92657becd..ebce77602b 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/public_options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/public_options.hpp @@ -6,7 +6,7 @@ #pragma once #include "vpux/compiler/core/developer_build_utils.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/compiler/utils/options.hpp" #include @@ -50,9 +50,9 @@ struct PublicOptions : mlir::PassPipelineOptions { BoolOption enableSEPtrsOperations{*this, "enable-se-ptrs-operations", llvm::cl::desc("Enable storage element pointer operations")}; - static bool getDefaultEnableSEPtrsOperations(VPU::ArchKind arch) { + static bool getDefaultEnableSEPtrsOperations(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return true; default: return false; @@ -100,9 +100,9 @@ struct PublicOptions : mlir::PassPipelineOptions { clEnumValN(WorkloadManagementMode::PWLM_V0_LCA, "PWLM_V0_LCA", "Partial WLM, enqueue barriers search algorithm at VPURT DISABLED. 
Use LCA based " "enqueue algorithm at VPUMI"))}; - static WorkloadManagementMode getDefaultWorkloadManagementMode(VPU::ArchKind arch) { + static WorkloadManagementMode getDefaultWorkloadManagementMode(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return WorkloadManagementMode::PWLM_V0_LCA; default: return WorkloadManagementMode::PWLM_V0_LCA; @@ -115,9 +115,9 @@ struct PublicOptions : mlir::PassPipelineOptions { StrOption enableDMAProfiling{*this, "dma-profiling", llvm::cl::desc("Enable DMA task profiling (true, false, static)"), llvm::cl::init("true")}; - static std::string getDefaultEnableDMAProfiling(VPU::ArchKind arch) { + static std::string getDefaultEnableDMAProfiling(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: // Enable for 40XX once RT will be ready, follow up #E95864 return "false"; default: @@ -148,15 +148,16 @@ struct PublicOptions : mlir::PassPipelineOptions { // PublicOptions() = default; - PublicOptions(VPU::ArchKind arch) { + PublicOptions(config::ArchKind arch) { enableSEPtrsOperations = getDefaultEnableSEPtrsOperations(arch); - if (arch != VPU::ArchKind::NPU40XX) { - workloadManagementMode = getDefaultWorkloadManagementMode(arch); + if (arch != config::ArchKind::NPU40XX) { + workloadManagementMode.setValue(getDefaultWorkloadManagementMode(arch)); } + enableDMAProfiling = getDefaultEnableDMAProfiling(arch); } - static std::unique_ptr createFromString(StringRef options, VPU::ArchKind arch) { + static std::unique_ptr createFromString(StringRef options, config::ArchKind arch) { auto result = std::make_unique(arch); if (mlir::failed(result->parseFromString(options))) { return nullptr; diff --git a/src/vpux_compiler/include/vpux/compiler/core/schedule_analysis_utils.hpp b/src/vpux_compiler/include/vpux/compiler/core/schedule_analysis_utils.hpp index d619ad81b3..faf5f88f2a 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/core/schedule_analysis_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/schedule_analysis_utils.hpp @@ -6,9 +6,6 @@ #pragma once #include "vpux/compiler/core/feasible_memory_scheduler.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" -#include "vpux/compiler/utils/strings.hpp" namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/core/tiling.hpp b/src/vpux_compiler/include/vpux/compiler/core/tiling.hpp index 9ebf54b83a..d4370c40dd 100644 --- a/src/vpux_compiler/include/vpux/compiler/core/tiling.hpp +++ b/src/vpux_compiler/include/vpux/compiler/core/tiling.hpp @@ -7,17 +7,19 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/core/attributes/strides.hpp" #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" - #include "vpux/utils/core/format.hpp" #include "vpux/utils/logger/logger.hpp" -#include - #include +#include -#include +namespace vpux::IE { +enum class InterpolateMode : uint64_t; +enum class InterpolateCoordMode : uint64_t; +enum class InterpolateNearestMode : uint64_t; +} // namespace vpux::IE namespace vpux { @@ -181,41 +183,80 @@ bool isWeightsFirstNestedTiling(mlir::Operation* op, ShapeRef divisors); // struct PadInfo final { - int64_t left = 0; - int64_t right = 0; - int64_t top = 0; - int64_t bottom = 0; + int32_t left = 0; + int32_t right = 0; + int32_t top = 0; + int32_t bottom = 0; + int32_t front = 0; // 5D depth padding + int32_t back = 0; // 5D depth padding + bool is5D = false; // Flag to indicate 5D vs 4D usage PadInfo() = default; + // 4D constructor PadInfo(int64_t left, int64_t right, int64_t top, int64_t bottom) - : left(left), right(right), top(top), bottom(bottom) { + : left(left), right(right), top(top), bottom(bottom), is5D(false) { + } + + // 5D constructor + PadInfo(int64_t 
left, int64_t right, int64_t top, int64_t bottom, int64_t front, int64_t back) + : left(left), right(right), top(top), bottom(bottom), front(front), back(back), is5D(true) { } - PadInfo(mlir::ArrayAttr pads_begin, mlir::ArrayAttr pads_end) { - top = mlir::cast(pads_begin[Dims4D::PadsBegin::Top.ind()]).getValue().getSExtValue(); - bottom = mlir::cast(pads_end[Dims4D::PadsEnd::Bottom.ind()]).getValue().getSExtValue(); - left = mlir::cast(pads_begin[Dims4D::PadsBegin::Left.ind()]).getValue().getSExtValue(); - right = mlir::cast(pads_end[Dims4D::PadsEnd::Right.ind()]).getValue().getSExtValue(); + PadInfo(mlir::ArrayAttr pads_begin, mlir::ArrayAttr pads_end) + : is5D(pads_begin.size() == 3 && pads_end.size() == 3) { + if (is5D) { + top = mlir::cast(pads_begin[Dims5D::PadsBegin::Top.ind()]).getValue().getSExtValue(); + bottom = mlir::cast(pads_end[Dims5D::PadsEnd::Bottom.ind()]).getValue().getSExtValue(); + left = mlir::cast(pads_begin[Dims5D::PadsBegin::Left.ind()]).getValue().getSExtValue(); + right = mlir::cast(pads_end[Dims5D::PadsEnd::Right.ind()]).getValue().getSExtValue(); + front = mlir::cast(pads_begin[Dims5D::PadsBegin::Front.ind()]).getValue().getSExtValue(); + back = mlir::cast(pads_end[Dims5D::PadsEnd::Back.ind()]).getValue().getSExtValue(); + } else { + top = mlir::cast(pads_begin[Dims4D::PadsBegin::Top.ind()]).getValue().getSExtValue(); + bottom = mlir::cast(pads_end[Dims4D::PadsEnd::Bottom.ind()]).getValue().getSExtValue(); + left = mlir::cast(pads_begin[Dims4D::PadsBegin::Left.ind()]).getValue().getSExtValue(); + right = mlir::cast(pads_end[Dims4D::PadsEnd::Right.ind()]).getValue().getSExtValue(); + } } mlir::DenseMap> toPadByDims() const { - return {{Dims4D::Act::H.ind(), {top, bottom}}, {Dims4D::Act::W.ind(), {left, right}}}; + if (is5D) { + return {{Dims5D::Act::H.ind(), {top, bottom}}, + {Dims5D::Act::W.ind(), {left, right}}, + {Dims5D::Act::D.ind(), {front, back}}}; + } else { + return {{Dims4D::Act::H.ind(), {top, bottom}}, {Dims4D::Act::W.ind(), 
{left, right}}}; + } } bool enabled() const { - return left != 0 || right != 0 || top != 0 || bottom != 0; + if (is5D) { + return left != 0 || right != 0 || top != 0 || bottom != 0 || front != 0 || back != 0; + } else { + return left != 0 || right != 0 || top != 0 || bottom != 0; + } } bool operator==(const PadInfo& other) const { - return left == other.left && right == other.right && top == other.top && bottom == other.bottom; + if (is5D != other.is5D) { + return false; + } + bool base = left == other.left && right == other.right && top == other.top && bottom == other.bottom; + return is5D ? base && front == other.front && back == other.back : base; } + bool operator!=(const PadInfo& other) const { return !(*this == other); } void printFormat(llvm::raw_ostream& stream) const { - printTo(stream, "PadInfo [left = {0}, right = {1}, top = {2}, bottom = {3}]", left, right, top, bottom); + if (is5D) { + printTo(stream, "PadInfo5D [left = {0}, right = {1}, top = {2}, bottom = {3}, front = {4}, back = {5}]", + left, right, top, bottom, front, back); + } else { + printTo(stream, "PadInfo [left = {0}, right = {1}, top = {2}, bottom = {3}]", left, right, top, bottom); + } } }; @@ -258,6 +299,13 @@ InputTiling backInferGroupConvTile(const TileInfo& outputTile, ShapeRef origInpu InputTiling backInferMatMulTile(const TileInfo& outputTile, ShapeRef origInputShape, ShapeRef origFilterShape, mlir::ArrayAttr strides, const PadInfo& origPadding); +// +// 5D Pooling tiling +// + +InputTiling backInfer5DPoolTile(const TileInfo& outputTile, ShapeRef origInputShape, mlir::ArrayAttr kernel_size, + mlir::ArrayAttr strides, const PadInfo& origPadding); + // // Pooling tiling // @@ -414,7 +462,8 @@ SmallVector adaptStrides(ShapeRef origShape, StridesRef origStrides, Ar // EltwiseOp // -SmallVector getMaxNumTiles(mlir::Operation* op, bool checkMinimalWidthAndHeight = false); +SmallVector getMaxNumTiles(mlir::Operation* op, bool checkMinimalWidthAndHeight = false, + bool 
checkWorkloadEfficiency = false); InputTiling backInferEltwiseTile(mlir::Operation* op, const vpux::TileInfo& outputTile); // SWLayer @@ -514,6 +563,11 @@ bool isNewTileWithSameCostHasPotentialDMABenefits(mlir::Operation* op, ShapeRef */ SmallVector getNonOneDim(ShapeRef inputShape); +/* + * Get the dimensions greater than 1 with tiling order + */ +SmallVector getTilingOrderedDims(mlir::Operation* operation, ShapeRef tiling); + /* * Get the dimension with the maximum size in all non-one dimensions */ diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/export.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/export.hpp index d36d47cbc4..2bf12d9876 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/export.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/export.hpp @@ -6,7 +6,6 @@ #pragma once #include "vpux/compiler/compiler.hpp" - #include "vpux/utils/logger/logger.hpp" #include @@ -16,9 +15,6 @@ #include -#include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - namespace vpux { namespace ELFNPU37XX { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/metadata.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/metadata.hpp index 1ae2d5e228..b6d8241d6c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/metadata.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/metadata.hpp @@ -5,19 +5,17 @@ #pragma once +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/logger/logger.hpp" #include +#include #include #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include - namespace vpux { namespace ELFNPU37XX { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/utils.hpp index d03b4fd6c8..c8ec4f64fe 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/ELFNPU37XX/utils.hpp @@ -5,13 +5,15 @@ #pragma once -#include #include #include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/small_string.hpp" +#include +#include + using namespace vpux; namespace llvm { @@ -56,7 +58,7 @@ using OffsetCache = mlir::DenseMap -#include -#include -#include -#include -#include -#include - -// -// Generated -// - -#define GET_OP_CLASSES -#include - -// -// Operation verifiers -// - -namespace vpux { -namespace IE { - -bool isActShaveKernel(mlir::Operation* operation); - -// -// Tiling -// - -// Adjust paddings attributes for tiled input -template -void adjustPaddings(ConcreteOp* op, const TilingInfo& inputTiling) { - const auto& inputTilePads = inputTiling.pads; - VPUX_THROW_UNLESS(inputTilePads.has_value(), "Missing tile information for paddings"); - - const std::array padsBegin = {inputTilePads->top, inputTilePads->left}; - const std::array padsEnd = {inputTilePads->bottom, inputTilePads->right}; - - auto newPadsBeginAttr = getIntArrayAttr(op->getContext(), padsBegin); - auto newPadsEndAttr = getIntArrayAttr(op->getContext(), padsEnd); - - op->setPadsBeginAttr(newPadsBeginAttr); - op->setPadsEndAttr(newPadsEndAttr); -} - -} // namespace IE - -} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/activation.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/activation.hpp new file mode 100644 index 0000000000..4c8838718f --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/activation.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp new file mode 100644 index 0000000000..bada2b4178 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/bitwise.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/bitwise.hpp new file mode 100644 index 0000000000..6f6c443fa9 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/bitwise.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/comparison.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/comparison.hpp new file mode 100644 index 0000000000..1dffc5fb6c --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/comparison.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/control_flow.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/control_flow.hpp new file mode 100644 index 0000000000..ceffcf203c --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/control_flow.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include +#include "mlir/Interfaces/ControlFlowInterfaces.h" + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/convolution.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/convolution.hpp new file mode 100644 index 0000000000..95a8c76148 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/convolution.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_movement.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_movement.hpp new file mode 100644 index 0000000000..260af6adf3 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_movement.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_type.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_type.hpp new file mode 100644 index 0000000000..093b04827b --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/data_type.hpp @@ -0,0 +1,23 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/eltwise.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/eltwise.hpp new file mode 100644 index 0000000000..9209fa20e5 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/eltwise.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/image.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/image.hpp new file mode 100644 index 0000000000..c65be996bb --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/image.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/logical.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/logical.hpp new file mode 100644 index 0000000000..180eb95101 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/logical.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/normalization.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/normalization.hpp new file mode 100644 index 0000000000..934a0ac4ef --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/normalization.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/pooling.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/pooling.hpp new file mode 100644 index 0000000000..4c3a67e402 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/pooling.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/recurrent.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/recurrent.hpp new file mode 100644 index 0000000000..f6c40ffa60 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/recurrent.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/reduce.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/reduce.hpp new file mode 100644 index 0000000000..c59c6f0368 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/reduce.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/resources.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/resources.hpp new file mode 100644 index 0000000000..08589d2b6e --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/resources.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp new file mode 100644 index 0000000000..b8492dbe34 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp @@ -0,0 +1,21 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/specialized.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/specialized.hpp new file mode 100644 index 0000000000..7478c2c0d6 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops/specialized.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include +#include + +// +// Generated +// + +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops_interfaces.hpp index 5489283a7b..e4d7c36ecd 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/IR/ops_interfaces.hpp @@ -5,18 +5,12 @@ #pragma once -#include "vpux/compiler/core/attributes/dim.hpp" #include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" - #include "vpux/utils/core/format.hpp" #include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/core/optional.hpp" #include "vpux/utils/core/small_vector.hpp" -#include #include #include #include diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp index 50931a7941..3d82e95f91 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp @@ -6,11 +6,13 @@ // #pragma once -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp" -#include "vpux/compiler/utils/quantization.hpp" -#include "vpux/compiler/utils/rewriter.hpp" + +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" +#include "vpux/compiler/utils/passes.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include namespace vpux::IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_to_palletization_lut.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_to_palletization_lut.hpp index b43eb2cdf0..5c2921beef 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_to_palletization_lut.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_to_palletization_lut.hpp @@ -3,10 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include #include diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp index 4358d9e393..c5f1c92fca 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp @@ -3,8 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/utils/core/array_ref.hpp" +#include "vpux/utils/core/small_vector.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.hpp index 06de10d4a7..fc5b330d1e 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.hpp @@ -3,22 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include 
"vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" -#include "vpux/compiler/utils/attributes_properties_conversion.hpp" -#include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/rewriter.hpp" #include +#include #include #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.hpp index a813a162df..23292b956a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.hpp @@ -5,7 +5,10 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" + +#include namespace vpux { namespace IE { @@ -24,7 +27,7 @@ class D2SToTransposedConvVerifierBase { /* Find right class to verify whether DepthSpace to TransposedConv conversion is beneficial for particular platform */ -std::unique_ptr createD2SToTransposedConvVerifier(vpux::VPU::ArchKind arch); +std::unique_ptr createD2SToTransposedConvVerifier(vpux::config::ArchKind arch); } // namespace IE } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/fuse_convert_to_dpu_checker.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/fuse_convert_to_dpu_checker.hpp index 8a8f8cd003..7e473a923b 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/fuse_convert_to_dpu_checker.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/fuse_convert_to_dpu_checker.hpp @@ -5,7 +5,10 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include 
"vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include namespace vpux { namespace IE { @@ -31,7 +34,7 @@ class FuseConvertToDPUCheckerBase { /* Find right class to verify whether fusion of Convert F16 -> F32 to parent DPU is feasible */ -std::unique_ptr createFuseConvertToDPUChecker(vpux::VPU::ArchKind arch); +std::unique_ptr createFuseConvertToDPUChecker(vpux::config::ArchKind arch); } // namespace IE } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/map_bilinear_interpolate_on_dpu_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/map_bilinear_interpolate_on_dpu_strategy.hpp new file mode 100644 index 0000000000..610617f8d1 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/interfaces/map_bilinear_interpolate_on_dpu_strategy.hpp @@ -0,0 +1,27 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/utils/logger/logger.hpp" + +#include + +namespace vpux::IE { + +class IMapBilinearInterpolateOnDPUStrategy { +public: + IMapBilinearInterpolateOnDPUStrategy(const bool interpolateAsSEOp) + : _interpolateAsSEOpInStrategy(interpolateAsSEOp) { + } + virtual void prepareInterpolate(mlir::ConversionTarget& target, LogCb logCb) const = 0; + + virtual ~IMapBilinearInterpolateOnDPUStrategy() = default; + +protected: + bool _interpolateAsSEOpInStrategy = false; +}; + +} // namespace vpux::IE diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.hpp new file mode 100644 index 0000000000..d68f2df2c9 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.hpp @@ -0,0 +1,17 @@ +// +// Copyright 
(C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/IE/interfaces/map_bilinear_interpolate_on_dpu_strategy.hpp" + +#include + +namespace vpux::IE { + +std::unique_ptr createMapBilinearInterpolateOnDPUStrategy( + mlir::func::FuncOp funcOp, bool interpolateAsSEOpInStrategy); + +} // namespace vpux::IE diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes.hpp index 86af8f86a0..ea72bc1bc0 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes.hpp @@ -6,14 +6,11 @@ #pragma once #include "vpux/compiler/core/pipelines_options.hpp" - +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" #include "vpux/compiler/utils/options.hpp" #include "vpux/utils/logger/logger.hpp" #include -#include - -#include namespace vpux { namespace IE { @@ -95,6 +92,7 @@ std::unique_ptr createFuseReshapeMvnPass(Logger log = Logger::global std::unique_ptr createFuseRMSNormPass(Logger log = Logger::global()); std::unique_ptr createFuseRoPEPass(Logger log = Logger::global()); std::unique_ptr createFuseSDPAPass(Logger log = Logger::global()); +std::unique_ptr createExpandSoftmaxAxisPass(Logger log = Logger::global()); std::unique_ptr createFuseDynamicQuantizePass(Logger log = Logger::global()); std::unique_ptr createOptimizeParallelLayersPass(Logger log = Logger::global()); std::unique_ptr createOptimizeReordersPass(const bool seOpsEnabled = false, @@ -229,7 +227,7 @@ std::unique_ptr createPropagateShapeCastPass(Logger log = Logger::gl std::unique_ptr createPropagateTransposePass(Logger log = Logger::global()); std::unique_ptr createSwapTransposeWithFQPass(Logger log = Logger::global()); std::unique_ptr createPropagateDequantThroughConcatPass(Logger log = Logger::global()); -std::unique_ptr 
createSwapConvertWithTransposeReshapePass(Logger log = Logger::global()); +std::unique_ptr createSwapConvertWithReshapeKindOpsPass(Logger log = Logger::global()); std::unique_ptr createPerAxisFQConcatPass(Logger log = Logger::global()); std::unique_ptr createConvertGatherToSlicePass(Logger log = Logger::global()); std::unique_ptr createConvertToScaleShiftPass(Logger log = Logger::global()); @@ -259,6 +257,8 @@ std::unique_ptr createDumpStatisticsOfIeOpsPass(Logger log = Logger: std::unique_ptr createConvertSDPAToOnlineSDPAPass(Logger log = Logger::global()); std::unique_ptr createDecomposeOnlineSDPAPass(Logger log = Logger::global()); std::unique_ptr createDecomposeIncrementalSDPAPass(Logger log = Logger::global()); +std::unique_ptr createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp = false, + Logger log = Logger::global()); std::unique_ptr createTileIncrementalSDPAPass(Logger log = Logger::global()); std::unique_ptr createTileOnlineSDPAPass(Logger log = Logger::global()); std::unique_ptr createLoadExternalKernelResourcesPass(Logger log = Logger::global()); @@ -586,6 +586,7 @@ std::unique_ptr createConvertDynamicDequantizeToDequantizePass(Logge std::unique_ptr createSwapOperationWithGatherPass(Logger log = Logger::global()); std::unique_ptr createConvertVariadicSplitToStridedSlicePass(Logger log = Logger::global()); std::unique_ptr createAdjustFakeQuantizeParamsPass(Logger log = Logger::global()); +std::unique_ptr createAdjustFakeQdqParamsPass(Logger log = Logger::global()); // // Legalization for NCE @@ -823,6 +824,9 @@ struct DefaultHWOptionsDialectBase : public virtual vpux::DefaultHWOptionsBase { *this, "enable-online-sdpa-conversion", llvm::cl::desc("Convert SDPA layer into OnlineSDPA that implements FlashAttention-2 approach"), llvm::cl::init(false)}; + BoolOption enableApplyDynamicBoundaryCorrection{*this, "enable-apply-dynamic-boundary-correction", + llvm::cl::desc("Enable apply-dynamic-boundary-correction pass"), + llvm::cl::init(false)}; 
}; // diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp index 105f52c749..896b8bfae4 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp @@ -4,9 +4,8 @@ // #pragma once -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp index f61bc5a256..ecff6cc6f1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp @@ -3,15 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/passes.hpp" -#include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/numeric.hpp" 
+#include "vpux/compiler/dialect/config/IR/utils.hpp" #include @@ -200,10 +197,16 @@ mlir::LogicalResult MixedFloatInQuantWeightsRewriter::matchAndRewrit return mlir::isa(op); }); - if (mlir::isa(quantFilterDequantizeType) && - (hasReLUConsumer || IE::hasReLUPostOp(convOp)) && - IE::hasNegativeScales(quantFilterDequantizeType)) { // ReLU post-op with negative scales introduces inaccuracy. - // Tracking number [E#174751] + // Check for problematic combination: per-axis quantization + ReLU postOp + negative quant scales on MTL and LNL + const auto arch = config::getArch(convOp); + const bool isPerAxisQuantized = mlir::isa(quantFilterDequantizeType); + const bool hasReLUConsumerOrPostOp = hasReLUConsumer || IE::hasReLUPostOp(convOp); + const bool hasNegativeQuantScales = IE::hasNegativeScales(quantFilterDequantizeType); + const bool isProblematicPlatform = (arch == config::ArchKind::NPU37XX || arch == config::ArchKind::NPU40XX); + + if (isPerAxisQuantized && hasReLUConsumerOrPostOp && hasNegativeQuantScales && isProblematicPlatform) { + // ReLU post-op with negative quant scales introduces inaccuracy for NPU3720 (MTL) and NPU4000 (LNL) + // Tracking number [E#174751] return mlir::failure(); } diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp index b62c29f480..208a28b37c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp @@ -5,13 +5,15 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/core/numeric.hpp" + namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp index 918faaaee8..683b876453 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp @@ -5,11 +5,11 @@ #pragma once -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" -namespace vpux { -namespace IE { +#include + +namespace vpux::IE { bool isLegalInterpolateOp(IE::InterpolateOp op, bool interpolateAsSEOp, LogCb logCb); @@ -21,22 +21,17 @@ class MapBilinearInterpolateOnDPUBaseRewriter : public mlir::OpRewritePattern(ctx), _log(log) { + setDebugName("MapBilinearInterpolateOnDPURewriter"); } -public: mlir::LogicalResult matchAndRewrite(IE::InterpolateOp origOp, mlir::PatternRewriter& rewriter) const final; -protected: - virtual mlir::Value createIdentityPooling(mlir::PatternRewriter& rewriter, mlir::Location loc, - mlir::Value input) const; - private: + mlir::Value createIdentityPooling(mlir::PatternRewriter& rewriter, mlir::Location loc, mlir::Value input) const; mlir::Value scaleOnAxis(mlir::PatternRewriter& rewriter, mlir::Location loc, mlir::Value input, int64_t inputSize, int64_t outputSize, vpux::Dim axis, IE::MapCoordFuncT mapCoord) const; -private: Logger _log; }; -} // namespace IE -} // namespace vpux +} // namespace 
vpux::IE diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp index 1f838fe83d..ffe736d154 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp @@ -5,8 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux { @@ -173,7 +174,7 @@ class OptimizeSliceMultiInputsExpand : public mlir::OpRewritePatterngetLoc()); rewriter.replaceOp(expandOp, newOp->getResults()); @@ -286,7 +287,7 @@ class OptimizeSliceEltwiseExpand final : public mlir::OpRewritePatterngetLoc()); rewriter.replaceOp(origOp, newOp->getResults()); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp index 50c34382af..cf46e6105f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp @@ -5,7 +5,7 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp index 2a0dde7f60..a05200f71a 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp @@ -3,14 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/passes.hpp" -#include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/numeric.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" + +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp index fb36eeb858..e19d33af1c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp @@ -3,7 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/act_shave_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/act_shave_utils.hpp new file mode 100644 index 0000000000..5982a34e79 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/act_shave_utils.hpp @@ -0,0 +1,16 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace vpux { +namespace IE { + +bool isActShaveKernel(mlir::Operation* operation); + +} // namespace IE +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/broadcast_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/broadcast_utils.hpp index ed4655d5b9..5f2f935565 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/broadcast_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/broadcast_utils.hpp @@ -5,8 +5,7 @@ #pragma once -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/const/utils/content.hpp" namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/concat_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/concat_utils.hpp index f1ac35292f..7cca776eb0 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/concat_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/concat_utils.hpp @@ -5,7 +5,10 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" + +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convert_op_types.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convert_op_types.hpp index 6c942e8404..89919d1323 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convert_op_types.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convert_op_types.hpp @@ -5,9 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/utils/types.hpp" +#include "vpux/utils/core/func_ref.hpp" 
+#include "vpux/utils/logger/logger.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convolution_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convolution_utils.hpp index be055407b3..3ee85c9c06 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convolution_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/convolution_utils.hpp @@ -5,7 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp index cba485a4b2..1d0be0519a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp @@ -19,7 +19,7 @@ bool needsStaticShape(mlir::Operation* op); bool isDynamicDataContiguous(vpux::ShapeRef shape, vpux::DimsOrder order); template -SmallVector replaceDynamicDimsWithValue(const SmallVector& original, T value) { +SmallVector replaceDynamicDimsWithValue(ArrayRef original, T value) { const auto originalRank = static_cast(original.size()); const auto transformDim = [value](auto dim) -> T { return dim != mlir::ShapedType::kDynamic ? static_cast(dim) : value; @@ -39,6 +39,15 @@ Shape extractShape(const Shape& shape); Shape extractShape(const BoundedShape& shape); Shape extractShape(const DimsMaskedShape& shape); +// Helpers for obtaining the reified form of any shape type. Calling this method on a regular Shape with dynamic dims +// will result in an exception. May be used in templated contexts where the shape type is unknown. 
+Shape reifyShape(ShapeRef shape); +Shape reifyShape(BoundedShapeRef shape); +Shape reifyShape(DimsMaskedShapeRef shape); +Shape reifyShape(const Shape& shape); +Shape reifyShape(const BoundedShape& shape); +Shape reifyShape(const DimsMaskedShape& shape); + // Helpers for splitting any shape type into its corresponding static Shape, Bounds and DynamicDimsMask representations. // Dynamic representations other than the one used by the given shape type are left empty. May be used in templated // contexts where the shape type is unknown. @@ -46,6 +55,12 @@ std::tuple splitShapeAndRepresentation(const Sha std::tuple splitShapeAndRepresentation(const BoundedShape& shape); std::tuple splitShapeAndRepresentation(const DimsMaskedShape& shape); +template +constexpr bool isStaticShape = llvm::is_one_of::value; +template +constexpr bool isDynamicShape = + llvm::is_one_of::value; + // Invokes and returns the result of a dynamic-shape-friendly functor on the concrete shape // of a tensor type, eliminating the need for call site type-switching. 
template diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp index adacda5499..0cc8350068 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp @@ -5,6 +5,10 @@ #pragma once +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/expand_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/expand_utils.hpp index 3c5dd430b7..a359a43856 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/expand_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/expand_utils.hpp @@ -5,7 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" + +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp index 5effb4b7b0..50fc357d31 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp @@ -5,14 +5,17 @@ #pragma once -#include -#include "mlir/Support/LogicalResult.h" - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/utils/logger/logger.hpp" +#include +#include "mlir/Support/LogicalResult.h" + namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp index 72ca542850..7e45689aa6 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp @@ -5,12 +5,12 @@ #pragma once -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/type_traits.hpp" + +#include namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp index abcba59c28..e27347e189 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp @@ -5,8 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/utils/factors.hpp" +#include "vpux/utils/logger/logger.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/interpolate_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/interpolate_utils.hpp index 3bbe9fb425..bf6142e4cb 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/interpolate_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/interpolate_utils.hpp @@ -5,9 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include +#include +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/matmul.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/matmul.hpp index 2cfe635852..2c8ac0149c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/matmul.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/matmul.hpp @@ -5,7 +5,7 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pad_extract.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pad_extract.hpp index 9539a3a2af..f2d00a8a95 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pad_extract.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pad_extract.hpp @@ -5,8 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/types.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/utils/logger/logger.hpp" + +#include namespace vpux { namespace IE { @@ -16,5 +20,31 @@ mlir::FailureOr> extractPads(mlir::Location loc, const mlir const std::optional& padAttr, vpux::ShapeRef inputShape); +// Adjust paddings attributes for tiled input +template +void adjustPaddings(ConcreteOp* op, const TilingInfo& inputTiling) { + const auto& inputTilePads = inputTiling.pads; + VPUX_THROW_UNLESS(inputTilePads.has_value(), "Missing tile information for paddings"); + + mlir::ArrayAttr newPadsBeginAttr, 
newPadsEndAttr; + + if (inputTilePads->is5D) { + const std::array padsBegin = {inputTilePads->front, inputTilePads->top, inputTilePads->left}; + const std::array padsEnd = {inputTilePads->back, inputTilePads->bottom, inputTilePads->right}; + + newPadsBeginAttr = getIntArrayAttr(op->getContext(), padsBegin); + newPadsEndAttr = getIntArrayAttr(op->getContext(), padsEnd); + } else { + const std::array padsBegin = {inputTilePads->top, inputTilePads->left}; + const std::array padsEnd = {inputTilePads->bottom, inputTilePads->right}; + + newPadsBeginAttr = getIntArrayAttr(op->getContext(), padsBegin); + newPadsEndAttr = getIntArrayAttr(op->getContext(), padsEnd); + } + + op->setPadsBeginAttr(newPadsBeginAttr); + op->setPadsEndAttr(newPadsEndAttr); +} + } // namespace IE } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_infer.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_infer.hpp index fe8772a514..6afbb89360 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_infer.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_infer.hpp @@ -6,9 +6,10 @@ #pragma once #include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" + +#include using namespace vpux; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp index 08a50fa661..3f0a304824 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp @@ -5,8 +5,7 @@ #pragma once 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" namespace vpux { namespace IE { @@ -18,7 +17,7 @@ bool isLegalReorderLikeToPermuteQuantize(vpux::NDTypeInterface inType, vpux::NDT std::optional> getAdjustHW(int64_t alignment, int64_t width, int64_t height); bool isODUPermuteEffectiveForShape(const ShapeRef shape, const int64_t alignment); bool isShapeCompatibleWithODUPermute(const ShapeRef shape, const int64_t alignment); -bool canConvertToNCHWInOrderWithPermuteCast(vpux::NDTypeInterface inType, vpux::NDTypeInterface outType); +bool canConvertToNCHWInOrderWithPermuteCast(vpux::NDTypeInterface inType, mlir::AffineMap memPerm); bool checkNCEPermuteShapeCompatibility(ShapeRef inShape, ShapeRef outShape, int64_t alignment); } // namespace IE diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pooling_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pooling_utils.hpp index de9b3d7fe9..0482c37b8f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pooling_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/pooling_utils.hpp @@ -5,10 +5,13 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/utils/core/numeric.hpp" +#include + namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/power_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/power_utils.hpp index 37fc13393c..79b3dbc369 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/power_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/power_utils.hpp @@ -5,7 +5,7 @@ #pragma once -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/quantization.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/quantization.hpp index 5d77757614..e4c61c9300 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/quantization.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/quantization.hpp @@ -4,13 +4,10 @@ // #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" + +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/attributes_properties_conversion.hpp" -#include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/algo.hpp" #include #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reify_shape.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reify_shape.hpp index 3bef8b01ce..eab7d8446b 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reify_shape.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reify_shape.hpp @@ -5,11 +5,11 @@ #pragma once +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" + #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - namespace vpux { IE::ConcatOp buildConcat(mlir::Location loc, mlir::OpBuilder& builder, ShapeRef producerShape, diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reshape_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reshape_utils.hpp index 8f5afb32d5..7f96fb9cb3 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reshape_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/reshape_utils.hpp @@ -5,7 +5,9 @@ #pragma once 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" + +#include namespace vpux { namespace IE { @@ -34,5 +36,8 @@ bool isEligibleToFoldStrideKernel(vpux::NDTypeInterface inputType, vpux::NDTypeI Shape getNewShapeAfterStrideFolding(ShapeRef origShape, int64_t SX); +mlir::Value createDynamicReshape(mlir::OpBuilder& builder, mlir::Location loc, mlir::Value input, + BoundedShape outputShape); + } // namespace IE } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/resources.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/resources.hpp index e2043abad2..eccdd22309 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/resources.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/resources.hpp @@ -5,7 +5,13 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" + +#include + +namespace vpux::VPU { +enum class ExecutorKind : uint64_t; +} namespace vpux { namespace IE { @@ -57,6 +63,9 @@ SmallVector getReservedMemoryResources(mlir::ModuleOp main SmallVector> getReservedMemOffsetAndSizeVec(mlir::ModuleOp module, mlir::SymbolRefAttr memSpaceAttr); + +size_t getReservedMemorySize(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace); + // // DMA profiling reserved memory // @@ -112,22 +121,24 @@ memory_resource_if getSWKernelPrefetchingReservedMemory(mlir::ModuleOp mai SmallVector getSWKernelPrefetchingReservedMemory(mlir::ModuleOp mainModule); // -// SW Kernel cache prefetching reserved memory +// Dummy SW kernels for instruction prefetch reserved memory // -static constexpr StringLiteral swKernelCachePrefetchingResMemModuleName = "SWKernelCachePrefetchingReservedMemory"; +static constexpr StringLiteral dummySwKernelsForInstructionPrefetchResMemModuleName = + "DummySWKernelsForInstructionPrefetchReservedMemory"; -IE::MemoryResourceOp 
setSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace, - int64_t size); +IE::MemoryResourceOp setDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule, + mlir::SymbolRefAttr memSpace, int64_t size); -IE::MemoryResourceOp getSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace); +IE::MemoryResourceOp getDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule, + mlir::SymbolRefAttr memSpace); template -memory_resource_if getSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule, Enum kind) { - return getSWKernelCachePrefetchingReservedMemory( +memory_resource_if getDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule, Enum kind) { + return getDummySwKernelsForInstructionPrefetchReservedMemory( mainModule, mlir::SymbolRefAttr::get(mainModule.getContext(), stringifyEnum(kind))); } -SmallVector getSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule); +SmallVector getDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule); // // ExecutorResourceOp diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/roll_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/roll_utils.hpp index eb1436f279..7895d8c42c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/roll_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/roll_utils.hpp @@ -5,8 +5,7 @@ #pragma once -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp index 8287884fcc..ef7858153d 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp @@ -5,7 +5,7 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/shape_infer.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/shape_infer.hpp index c4109be206..f5528e11dd 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/shape_infer.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/shape_infer.hpp @@ -6,8 +6,6 @@ #pragma once #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" - #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/small_vector.hpp" @@ -15,6 +13,10 @@ #include +namespace vpux::IE { +enum class AutoBroadcastType : uint64_t; +} + namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/slice_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/slice_utils.hpp index 862af1b74e..d9bb03b888 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/slice_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/slice_utils.hpp @@ -5,7 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/softmax_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/softmax_utils.hpp index 8bdb2755c4..594613c691 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/softmax_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/softmax_utils.hpp @@ -5,7 +5,7 @@ #pragma once 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp index c493a44c57..5ab6d27856 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp @@ -5,7 +5,7 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/unsqueeze.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/unsqueeze.hpp index be86ecadc5..606921e425 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/unsqueeze.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/IE/utils/unsqueeze.hpp @@ -5,8 +5,6 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/attributes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/attributes.hpp index 24fc8859ca..47871bea58 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/attributes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/attributes.hpp @@ -9,6 +9,7 @@ #include "vpux/compiler/core/attributes/strided_shape.hpp" #include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/VPU/IR/attr_interfaces.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/func_ref.hpp" #include "vpux/utils/core/mem_size.hpp" @@ -23,6 +24,7 @@ namespace vpux { namespace IE { class InterpolateCoordModeAttr; +class 
InterpolateNearestModeAttr; class PadModeAttr; } // namespace IE namespace VPU { @@ -50,13 +52,6 @@ namespace VPU { // This one represents a CMX_NN memory space with fragmentation consideration constexpr StringLiteral CMX_NN_FragmentationAware = "CMX_NN_FragmentationAware"; -// -// Run-time resources -// - -StringLiteral getMemoryDerateAttrName(); -StringLiteral getMemoryBandwidthAttrName(); - /** * @brief Get DPU frequency * @@ -72,7 +67,7 @@ StringLiteral getMemoryBandwidthAttrName(); * @note Values returned by this function are tight to definitions provided by * vpucostmodel. */ -unsigned int getDpuFrequency(vpux::VPU::ArchKind arch, vpux::VPU::RevisionID rev); +unsigned int getDpuFrequency(vpux::config::ArchKind arch, vpux::config::RevisionID rev); /** * @brief Get maximal DMA bandwidth for a given architecture @@ -95,11 +90,11 @@ double getDmaBandwidthGBps(mlir::ModuleOp module); * * See getDmaBandwidthGBps(mlir::ModuleOp module) */ -double getDmaBandwidthGBps(ArchKind arch); +double getDmaBandwidthGBps(config::ArchKind arch); -uint32_t getMaxArchDPUClusterNum(ArchKind arch); +uint32_t getMaxArchDPUClusterNum(config::ArchKind arch); uint32_t getMaxArchDPUClusterNum(mlir::Operation* op); -uint32_t getMaxDMAPorts(ArchKind arch); +uint32_t getMaxDMAPorts(config::ArchKind arch); /** * @brief return DMA bandwidth @@ -108,7 +103,7 @@ uint32_t getMaxDMAPorts(ArchKind arch); * @param revision - platform revision ID * @return DMA bandwidth in bytes per DPU clock cycle */ -double getDMABandwidth(ArchKind arch, VPU::RevisionID rev); +double getDMABandwidth(config::ArchKind arch, config::RevisionID rev); /** * @brief NCE troughput @@ -124,24 +119,6 @@ Byte getTotalCMXFragmentationAwareSize(mlir::Operation* op); Byte getTotalCMXFragmentationAwareSize(mlir::ModuleOp module); Byte getTotalCMXVFPipelineFragmentationAwareSize(mlir::Operation* op); -// -// ArchKind -// - -void setArch(mlir::ModuleOp module, ArchKind kind, int numOfDPUGroups, std::optional numOfDMAPorts = 
std::nullopt, - std::optional availableCMXMemory = std::nullopt, bool allowCustomValues = false); - -ArchKind getArch(mlir::Operation* op); -bool isArchVPUX3XXX(VPU::ArchKind arch); - -// -// RevisionID -// - -void setRevisionID(mlir::ModuleOp module, RevisionID revisionID); -bool hasRevisionID(mlir::ModuleOp module); -RevisionID getRevisionID(mlir::Operation* op); - // // PaddingAttr // diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp index 91ee85c683..778706812d 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp @@ -5,14 +5,20 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp" +#include "vpux/utils/core/array_ref.hpp" +#include "vpux/utils/core/small_vector.hpp" + +namespace vpux::VPU { +enum class DistributionMode : uint64_t; +class DistributionInfoAttr; +} // namespace vpux::VPU namespace vpux { namespace VPU { class DistributionInfo { private: - DistributionMode _distributionMode = DistributionMode::NONE; + DistributionMode _distributionMode = {}; SmallVector _numTiles = {}; SmallVector _kernel = {}; std::optional _pad = std::nullopt; @@ -178,41 +184,7 @@ class DistributionInfo { _pad = padding; } - void printFormat(llvm::raw_ostream& stream) const { - printTo(stream, "\n#VPU.DistributedTensor", _equalMemoryAndComputeView); - } + void printFormat(llvm::raw_ostream& stream) const; }; } // namespace VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp index 32ac8d260a..122ff46f58 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp @@ -5,8 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include +#include + +namespace vpux::VPU { +class PaddingAttr; +} // namespace vpux::VPU namespace vpux { namespace VPU { @@ -41,37 +45,11 @@ class Padding { return right; } - static Padding getClassFromAttr(PaddingAttr paddingAttr) { - if (paddingAttr == nullptr) { - return {}; - } - - auto left = paddingAttr.getLeft().getInt(); - auto right = paddingAttr.getRight().getInt(); - auto top = paddingAttr.getTop().getInt(); - auto bottom = paddingAttr.getBottom().getInt(); - - return Padding(left, right, top, bottom); - } - - static PaddingAttr getAttrFromClass(mlir::MLIRContext* ctx, const Padding& padding) { - auto topAttr = vpux::getIntAttr(ctx, padding.top); - auto bottomAttr = vpux::getIntAttr(ctx, padding.bottom); - auto leftAttr = vpux::getIntAttr(ctx, padding.left); - auto rightAttr = vpux::getIntAttr(ctx, padding.right); + static Padding getClassFromAttr(PaddingAttr paddingAttr); - return PaddingAttr::get(ctx, leftAttr, rightAttr, topAttr, bottomAttr); - }; + static PaddingAttr getAttrFromClass(mlir::MLIRContext* ctx, const Padding& padding); - void printFormat(llvm::raw_ostream& stream) const { - std::unordered_map map; - map["left"] = left; - map["right"] = right; - map["top"] = top; - map["bottom"] = bottom; - printTo(stream, "pads = "); - vpux::MapFormatProvider::format(map, stream, {}); - } + void printFormat(llvm::raw_ostream& stream) const; }; } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops.hpp index 2d1840076a..8a3e16a3e4 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops.hpp @@ -6,16 +6,38 @@ #pragma once #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" - +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/utils/core/error.hpp" #include +#include #include +#include + +// E#173010: remove dependency on IE operations for VPU operations +namespace vpux::IE { +class AvgPoolOp; +class AddOp; +class BatchNormInferenceOp; +class ConvolutionOp; +class GroupConvolutionOp; +class InterpolateOp; +class LSTMCellOp; +class LSTMSequenceOp; +class MatMulOp; +class MaxPoolOp; +class MultiplyOp; +class PermuteQuantizeOp; +class SubtractOp; +class TransposedConvolutionOp; +class YuvToRgbOp; +} // namespace vpux::IE // // Generated @@ -199,6 +221,7 @@ mlir::LogicalResult isDistributedCastCompatible(T inDistributedType, T outDistri bool isNCEWithInt4Weights(mlir::Operation* op); bool isNCEWithSEPActivation(mlir::Operation* op); + std::optional getWeightsChannelsAutopad(mlir::Operation* op); } // namespace VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp index 1d60ea3fb9..9bba0f4c57 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp @@ -130,76 +130,12 @@ vpux::NDTypeInterface getDistributedTypeForOpResult(mlir::Operation* op, mlir::V bool isPureViewOp(mlir::Operation* op); -// -// DefinedInArch -// - -template -struct DefinedInArch { - template - class Impl : public 
mlir::OpTrait::TraitBase { - public: - static mlir::LogicalResult verifyTrait(mlir::Operation* op) { - return verifyArchKind(op, arch); - } - - private: - static mlir::LogicalResult verifyArchKind(mlir::Operation* op, ArchKind definedInArch) { - auto actualArch = getArch(op); - - if (actualArch != ArchKind::UNKNOWN && actualArch < definedInArch) { - auto actualArchStr = stringifyArchKind(actualArch).str(); - auto definedInArchStr = stringifyArchKind(definedInArch).str(); - return vpux::errorAt(op, "Operation {0} not supported in {1}; op has been introduced in {2}", - op->getName(), actualArchStr, definedInArchStr); - } - - return mlir::success(); - } - }; -}; - // // SwOpInterface // bool supportSwOpLoweringAsDMA(mlir::Operation* op); -// -// LimitedToArch -// - -template -struct LimitedToArch { - template - class Impl : public mlir::OpTrait::TraitBase { - public: - static mlir::LogicalResult verifyTrait(mlir::Operation* op) { - return verifyArchKind(op, {archs...}); - } - - private: - static mlir::LogicalResult verifyArchKind(mlir::Operation* op, std::initializer_list supportedArchs) { - auto actualArch = getArch(op); - - if (actualArch != ArchKind::UNKNOWN) { - if (std::find(cbegin(supportedArchs), cend(supportedArchs), actualArch) == cend(supportedArchs)) { - auto actualArchStr = stringifyArchKind(actualArch).str(); - auto archsStr = std::accumulate(cbegin(supportedArchs), cend(supportedArchs), std::string(), - [](const std::string& accu, const ArchKind arch) -> std::string { - return accu + (accu.length() > 0 ? 
"," : "") + - stringifyArchKind(arch).str(); - }); - return vpux::errorAt(op, "Operation {0} not supported in {1}; list of supported archs: {2}", - op->getName(), actualArchStr, archsStr); - } - } - - return mlir::success(); - } - }; -}; - } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/tiling_info.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/tiling_info.hpp index 41d955df54..d387c9f6ed 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/tiling_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/tiling_info.hpp @@ -5,8 +5,11 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/core/tiling.hpp" + +namespace vpux::VPUIP { +class SwKernelOp; +} namespace vpux::VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/type_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/type_interfaces.hpp index 872e1d2f25..03cf3f52c3 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/type_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/IR/type_interfaces.hpp @@ -5,10 +5,11 @@ #pragma once +#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/attributes/strided_shape.hpp" #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/utils/core/small_vector.hpp" // diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_factory.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_factory.hpp index 88e4e80866..e1e2bb9452 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_factory.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_factory.hpp @@ 
-11,7 +11,7 @@ namespace vpux { namespace VPU { -static constexpr unsigned int VPUNN_CACHE_SIZE = 8156; +static constexpr unsigned int VPUNN_CACHE_SIZE = 8192U; /** * @brief Interface for creating VPUNN Cost Models diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_utils.hpp new file mode 100644 index 0000000000..51b12e0e22 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/cost_model_utils.hpp @@ -0,0 +1,31 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/utils/core/error.hpp" + +#include + +namespace vpux { +namespace VPU { + +class ICostModelUtilsInterface : public mlir::DialectInterface::Base { +public: + // required by MLIR's internal type-id infrastructure: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ICostModelUtilsInterface) + + ICostModelUtilsInterface(mlir::Dialect* dialect): Base(dialect) { + } + + // indicate whether the cost model supports NCEOps with int4 weights. + virtual bool isNCEWithInt4WeightsSupported() const = 0; + + // indicate whether the cost model supports NCEOps with multi-dim pipeline tiling. 
+ virtual bool isMultiDimPipelineTilingSupported() const = 0; +}; + +} // namespace VPU +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp index 964cea2286..d9f54efd31 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp @@ -6,6 +6,7 @@ #pragma once #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp" @@ -31,55 +32,32 @@ class SCFTilingCommonModelOp : public mlir::TilingInterface::ExternalModel& tiledOperands, mlir::Operation* operation) const { return static_cast(this)->createTiledOperation( - std::move(opGenerator), std::move(operandsGenerator), builder, inputTiling, outputTile, dim, origShape, - origOperation, tiling); + std::move(opGenerator), std::move(operandsGenerator), builder, inputTiling, outputTile, dims, + tiledOperands, operation); } - mlir::Value generateTile(mlir::Location loc, mlir::OpBuilder& builder, mlir::Value origInput, - const SCFTileInfo& inputTileInfo) const { - auto origType = mlir::cast(origInput.getType()); - - auto staticNewShape = mlir::getConstantIntValues(inputTileInfo.shape); - if (origType.getShape().isStatic() && staticNewShape.has_value() && - llvm::equal(origType.getShape().raw(), staticNewShape.value())) { - return origInput; - } - - SmallVector defaultStrides(inputTileInfo.offsets.size(), builder.getIndexAttr(1)); + void fillInResultTilePositions(mlir::Operation* operation, mlir::OpBuilder& builder, unsigned resultNumber, + ArrayRef offsets, 
ArrayRef sizes, + SmallVector& resultOffsets, + SmallVector& resultSizes) const { + auto outputType = mlir::cast(operation->getResult(resultNumber).getType()); - auto extractTile = builder.create( - appendLoc(loc, "extractSlice"), origInput, inputTileInfo.offsets, inputTileInfo.shape, defaultStrides); + const auto strategy = + Shape(parseIntArrayAttr(mlir::cast(operation->getAttr(tilingStrategy)))); + auto tilingDims = getTilingOrderedDims(operation, strategy); + resultOffsets = SmallVector(outputType.getRank(), builder.getIndexAttr(0)); + resultSizes.reserve(outputType.getRank()); + resultSizes = mlir::getAsIndexOpFoldResult(builder.getContext(), outputType.getShape()); - auto newShape = getShape(extractTile.getResult()); - auto newType = origType.changeShape(ShapeRef(newShape)); - if (auto boundedType = mlir::dyn_cast(newType)) { - newType = boundedType.changeBounds(inputTileInfo.bounds); + for (auto dimIndex : irange(tilingDims.size())) { + auto index = + sizes.size() == resultOffsets.size() ? static_cast(tilingDims[dimIndex].ind()) : dimIndex; + resultOffsets[tilingDims[dimIndex].ind()] = offsets[index]; + resultSizes[tilingDims[dimIndex].ind()] = sizes[index]; } - - // by default output type loses NPU-specific attributes so we have to set it manually - extractTile->getResult(0).setType(newType); - - return extractTile; - } - - mlir::Type extractResultType(mlir::Type origType, SCFShapeRef newShape, BoundsRef bounds) const { - auto ndTensorType = mlir::cast(origType); - auto origElemType = ndTensorType.getElementType(); - - VPUX_THROW_WHEN(mlir::isa(origElemType), - "Per axis quantized types are not supported in scf"); - - const auto tensorDesc = - vpux::getTensorAttr(origElemType.getContext(), ndTensorType.getDimsOrder(), ndTensorType.getMemSpace(), - mlir::isa(origType) ? 
bounds : Bounds{}); - - SmallVector dynamicDims; // unused cause for shape static dims are enough - SmallVector staticDims; - mlir::dispatchIndexOpFoldResults(newShape, dynamicDims, staticDims); - return mlir::RankedTensorType::get(staticDims, origElemType, tensorDesc); } public: @@ -87,7 +65,7 @@ class SCFTilingCommonModelOp : public mlir::TilingInterface::ExternalModel(mlir::cast(operation->getAttr(tilingStrategy)))); - auto tilingDims = getNonOneDim(strategy); + auto tilingDims = getTilingOrderedDims(operation, strategy); auto loc = operation->getLoc(); auto tilingRank = tilingDims.size(); @@ -109,14 +87,11 @@ class SCFTilingCommonModelOp : public mlir::TilingInterface::ExternalModel(mlir::cast(operation->getAttr(tilingStrategy)))); - auto tilingDims = getNonOneDim(strategy); + auto tilingDims = getTilingOrderedDims(operation, strategy); SmallVector results; SmallVector resultValues; @@ -133,69 +108,68 @@ class SCFTilingCommonModelOp : public mlir::TilingInterface::ExternalModel tiledOperands; - tiledOperands.reserve(operation->getNumOperands()); - - OpTilingOperandsFunc createTiledOperands = [&](auto& tiling) { - tiledOperands.clear(); - for (auto p : operation->getOperands() | indexed) { - auto origInput = p.value(); - auto inputIdx = p.index(); + for (auto tileDim : tilingDims) { + axis[tileDim.ind()] = builder.getIndexAttr(strategy[tileDim]); + } + auto outputTile = SCFTileInfo(resultSizes, resultOffsets, axis, resultBounds); + auto inputTiling = backInferSCFTileInfo(operation, builder, outputTile); + SmallVector tiledOperands; + tiledOperands.reserve(operation->getNumOperands()); + + OpTilingOperandsFunc createTiledOperands = [&](auto& tiling) { + tiledOperands.clear(); + llvm::DenseMap sliceMatch; + for (auto p : operation->getOperands() | indexed) { + auto origInput = p.value(); + auto inputIdx = p.index(); + + if (tiling.tiles.size() <= inputIdx) { + tiledOperands.emplace_back(origInput); + continue; + } - if (tiling.size() <= inputIdx) { - 
tiledOperands.emplace_back(origInput); - continue; - } + if (sliceMatch.find(origInput) != sliceMatch.end()) { + tiledOperands.emplace_back(sliceMatch[origInput]); + continue; + } - auto inputTileInfo = tiling[inputIdx]; - auto tiledInput = generateTile(operation->getLoc(), builder, origInput, inputTileInfo); + auto inputTileInfo = tiling.tiles[inputIdx]; + auto tiledInput = generateTile(operation->getLoc(), builder, origInput, inputTileInfo); + sliceMatch[origInput] = tiledInput; - tiledOperands.emplace_back(tiledInput); - } - }; + tiledOperands.emplace_back(tiledInput); + } + }; - OpGeneratorFunc generatorFunc = [&]() { - auto resultDenseTile = extractResultType(operation->getResult(0).getType(), resultSizes, resultBounds); - auto* tiledOp = mlir::cloneWithoutRegions(builder, operation, {resultDenseTile}, tiledOperands); - tiledOp->removeAttr(tilingStrategy); - return tiledOp; - }; + OpGeneratorFunc generatorFunc = [&]() { + auto resultDenseTile = extractResultType(operation->getResult(0).getType(), resultSizes, resultBounds); + auto* tiledOp = mlir::cloneWithoutRegions(builder, operation, {resultDenseTile}, tiledOperands); + tiledOp->removeAttr(tilingStrategy); + return tiledOp; + }; - auto* resultOp = - createTiledOperation(std::move(generatorFunc), std::move(createTiledOperands), builder, inputTiling, - outputTile, tilingDims[index], origShape, operation, strategy); + auto* resultOp = createTiledOperation(std::move(generatorFunc), std::move(createTiledOperands), builder, + inputTiling, outputTile, tilingDims, tiledOperands, operation); - results.emplace_back(resultOp); - resultValues.emplace_back(resultOp->getResult(0)); - } + results.emplace_back(resultOp); + resultValues.emplace_back(resultOp->getResult(0)); return mlir::TilingResult{std::move(results), std::move(resultValues)}; } + mlir::FailureOr generateResultTileValue(mlir::Operation* operation, mlir::OpBuilder& builder, + unsigned, mlir::ArrayRef offsets, + mlir::ArrayRef sizes) const { + return 
getTiledImplementation(operation, builder, offsets, sizes); + } + mlir::LogicalResult getResultTilePosition(mlir::Operation* operation, mlir::OpBuilder& builder, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, SmallVector& resultOffsets, SmallVector& resultSizes) const { - auto outputType = mlir::cast(operation->getResult(resultNumber).getType()); - - const auto strategy = - Shape(parseIntArrayAttr(mlir::cast(operation->getAttr(tilingStrategy)))); - auto tilingDims = getNonOneDim(strategy); - resultOffsets = SmallVector(outputType.getRank(), builder.getIndexAttr(0)); - resultSizes.reserve(outputType.getRank()); - resultSizes = mlir::getAsIndexOpFoldResult(builder.getContext(), outputType.getShape()); - - for (auto dimIndex : irange(tilingDims.size())) { - resultOffsets[tilingDims[dimIndex].ind()] = offsets[dimIndex]; - resultSizes[tilingDims[dimIndex].ind()] = sizes[dimIndex]; - } - - return mlir::success(); + return static_cast(this)->getResultTilePosition(operation, builder, resultNumber, offsets, + sizes, resultOffsets, resultSizes); } mlir::LogicalResult getResultTileBounds(mlir::Operation* operation, mlir::OpBuilder& builder, unsigned resultNumber, @@ -237,9 +211,17 @@ class SCFTilingCommonModelOp : public mlir::TilingInterface::ExternalModel { public: + mlir::LogicalResult getResultTilePosition(mlir::Operation* operation, mlir::OpBuilder& builder, + unsigned resultNumber, ArrayRef offsets, + ArrayRef sizes, + SmallVector& resultOffsets, + SmallVector& resultSizes) const { + fillInResultTilePositions(operation, builder, resultNumber, offsets, sizes, resultOffsets, resultSizes); + return mlir::success(); + } mlir::Operation* createTiledOperation(OpGeneratorFunc opGenerator, OpTilingOperandsFunc operandsGenerator, - mlir::OpBuilder&, SCFTilingInfo& tiling, const SCFTileInfo&, Dim, SCFShapeRef, - mlir::Operation*, ShapeRef) const { + mlir::OpBuilder&, SCFTilingInfo& tiling, const SCFTileInfo&, DimArrRef, + SmallVector&, mlir::Operation*) const { 
operandsGenerator(tiling); return opGenerator(); } @@ -272,42 +254,59 @@ class SCFTilingPoolingModelOp : public SCFTilingCommonModelOp(strides[index]).getValue().getSExtValue(); const auto kernel = mlir::cast(kernelSize[index]).getValue().getSExtValue(); - mlir::Range inputRange = - solutionForOutputRange(loc, builder, outputTile, dim, kernel, stride, padMap[dim.ind()]); - inputTile.offsets[dim.ind()] = inputRange.offset; - inputTile.shape[dim.ind()] = inputRange.size; + auto [inputRange, dimBound] = + solutionForOutputRange(loc, builder, outputTile, dim, kernel, stride, + mlir::getConstantIntValue(origInputShape[dim.ind()]).value(), + padMap[dim.ind()], pads[index], pads[index + 2]); + + if (inputRange.has_value()) { + inputTile.offsets[dim.ind()] = inputRange.value().offset; + inputTile.shape[dim.ind()] = inputRange.value().size; + } + + if (dimBound.has_value()) { + inputTile.bounds[dim] = dimBound.value(); + } } - return {std::move(inputTile)}; + return {std::move(inputTile), std::move(pads)}; } public: + mlir::LogicalResult getResultTilePosition(mlir::Operation* operation, mlir::OpBuilder& builder, + unsigned resultNumber, ArrayRef offsets, + ArrayRef sizes, + SmallVector& resultOffsets, + SmallVector& resultSizes) const { + this->fillInResultTilePositions(operation, builder, resultNumber, offsets, sizes, resultOffsets, resultSizes); + correctPaddedOutput(builder, mlir::cast(operation), resultSizes); + return mlir::success(); + } mlir::Operation* createTiledOperation(OpGeneratorFunc opGenerator, OpTilingOperandsFunc operandsGenerator, - mlir::OpBuilder& builder, SCFTilingInfo& inputTiling, - const SCFTileInfo& outputTile, Dim dim, SCFShapeRef origShape, - mlir::Operation* origOperation, ShapeRef tiling) const { + mlir::OpBuilder& builder, SCFTilingInfo& inputTiling, const SCFTileInfo&, + DimArrRef dims, SmallVector& tiledOperands, + mlir::Operation* operation) const { return createTiledPaddedOperation(std::move(opGenerator), std::move(operandsGenerator), 
builder, - inputTiling, outputTile, dim, origShape, origOperation, tiling); + inputTiling, dims, tiledOperands, operation); } SCFTilingInfo backInferSCFTileInfo(mlir::Operation* operation, mlir::OpBuilder& builder, @@ -324,7 +323,7 @@ class SCFTilingPoolingModelOp : public SCFTilingCommonModelOp(operation); if (nceOp != nullptr && nceOp.getWeightsTableOperand() != nullptr && !mlir::isConstantIntValue(outputTile.axis[Dims4D::Act::C.ind()], 1)) { - inputTiling.emplace_back( + inputTiling.tiles.emplace_back( getWeightsTableSCFTile(nceOp.getWeightsTableOperand().getType(), builder, outputTile)); } @@ -334,36 +333,54 @@ class SCFTilingPoolingModelOp : public SCFTilingCommonModelOp class SCFTilingConvModelOp : public SCFTilingCommonModelOp { +public: + mlir::LogicalResult getResultTilePosition(mlir::Operation* operation, mlir::OpBuilder& builder, + unsigned resultNumber, ArrayRef offsets, + ArrayRef sizes, + SmallVector& resultOffsets, + SmallVector& resultSizes) const { + this->fillInResultTilePositions(operation, builder, resultNumber, offsets, sizes, resultOffsets, resultSizes); + correctPaddedOutput(builder, mlir::cast(operation), resultSizes); + return mlir::success(); + } + protected: - SCFTileInfo backInferConvInputTile(mlir::Location loc, mlir::OpBuilder& builder, const SCFTileInfo& outputTile, - SCFShapeRef origInputShape, const std::array kernel_size, - mlir::ArrayAttr strides, const PadInfo& origPadding) const { + SCFTilingInfo backInferConvInputTile(mlir::Location loc, mlir::OpBuilder& builder, const SCFTileInfo& outputTile, + SCFShapeRef origInputShape, const std::array kernelSize, + mlir::ArrayAttr strides, const PadInfo& origPadding) const { SCFTileInfo inputTile(origInputShape, builder); - - auto axes = outputTile.axis; - inputTile.shape[Dims4D::Act::N.ind()] = outputTile.shape[Dims4D::Act::N.ind()]; inputTile.offsets[Dims4D::Act::N.ind()] = outputTile.offsets[Dims4D::Act::N.ind()]; - auto padMap = origPadding.toPadByDims(); + if 
(!outputTile.bounds.raw().empty()) { + inputTile.bounds = outputTile.bounds; + inputTile.bounds[Dims4D::Act::C] = kernelSize[Dims4D::Filter::IC.ind()]; + } + auto padMap = origPadding.toPadByDims(); + auto pads = mlir::getAsIndexOpFoldResult( + builder.getContext(), {origPadding.left, origPadding.top, origPadding.right, origPadding.bottom}); for (auto index : irange(Dims4D::Act::numSpatialDims)) { const auto dim = Dims4D::Act::getSpatialDim(index); - if (mlir::isConstantIntValue(axes[dim.ind()], 1)) { - continue; - } - const auto stride = mlir::cast(strides[index]).getValue().getSExtValue(); - const auto kernel = kernel_size[index]; + const auto kernel = kernelSize[dim.ind()]; + + auto [inputRange, dimBound] = + solutionForOutputRange(loc, builder, outputTile, dim, kernel, stride, + mlir::getConstantIntValue(origInputShape[dim.ind()]).value(), + padMap[dim.ind()], pads[index], pads[index + 2]); - mlir::Range inputRange = - solutionForOutputRange(loc, builder, outputTile, dim, kernel, stride, padMap[dim.ind()]); + if (inputRange.has_value()) { + inputTile.offsets[dim.ind()] = inputRange.value().offset; + inputTile.shape[dim.ind()] = inputRange.value().size; + } - inputTile.offsets[dim.ind()] = inputRange.offset; - inputTile.shape[dim.ind()] = inputRange.size; + if (dimBound.has_value()) { + inputTile.bounds[dim] = dimBound.value(); + } } - return inputTile; + return SCFTilingInfo(inputTile, pads); } public: @@ -375,11 +392,11 @@ class SCFTilingConvModelOp : public SCFTilingCommonModelOp kernel = {origFilterShape[Dims4D::Filter::KX], - origFilterShape[Dims4D::Filter::KY]}; - - SCFTilingInfo tilingInfo = {backInferConvInputTile(operation->getLoc(), builder, outputTile, origInputShape, - kernel, convOperation.getStrides(), origPadding)}; + const std::array kernelSize = { + origFilterShape[Dims4D::Filter::OC], origFilterShape[Dims4D::Filter::IC], + origFilterShape[Dims4D::Filter::KX], origFilterShape[Dims4D::Filter::KY]}; + SCFTilingInfo tilingInfo = 
backInferConvInputTile(operation->getLoc(), builder, outputTile, origInputShape, + kernelSize, convOperation.getStrides(), origPadding); const auto tileOverChannels = !mlir::isConstantIntValue(outputTile.axis[Dims4D::Act::C.ind()], 1); @@ -389,11 +406,11 @@ class SCFTilingConvModelOp : public SCFTilingCommonModelOp(operation); if (nceOp != nullptr && nceOp.getWeightsTableOperand() != nullptr) { - tilingInfo.emplace_back( + tilingInfo.tiles.emplace_back( getWeightsTableSCFTile(nceOp.getWeightsTableOperand().getType(), builder, outputTile)); } } @@ -403,8 +420,8 @@ class SCFTilingConvModelOp : public SCFTilingCommonModelOp& tiledOperands, mlir::Operation* operation) const { auto generator = opGenerator; auto newChannelValue = mlir::getConstantIntValue(outputTile.shape[Dims4D::Act::C.ind()]); if (!mlir::isConstantIntValue(outputTile.axis[Dims4D::Act::C.ind()], 1) && newChannelValue.has_value()) { @@ -418,7 +435,7 @@ class SCFTilingConvModelOp : public SCFTilingCommonModelOp(std::move(generator), std::move(operandsGenerator), builder, - inputTiling, outputTile, dim, origShape, origOperation, tiling); + inputTiling, dims, tiledOperands, operation); } }; @@ -438,7 +455,7 @@ class SCFTilingDepthConvModelOp : public SCFTilingConvModelOp(operation); const auto origFilterShape = Shape(parseIntArrayAttr(depthOperation.getRawFilterShape())); const auto origInputShape = getShape(depthOperation.getInput()); - auto& inputTiles = inputConvTiling[0]; + auto& inputTiles = inputConvTiling.tiles[0]; mlir::AffineExpr d0; bindDims(builder.getContext(), d0); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_viewlike_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_viewlike_interfaces.hpp new file mode 100644 index 0000000000..b39a149234 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_viewlike_interfaces.hpp @@ -0,0 +1,88 @@ +// +// Copyright (C) 2025 
Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/core/types.hpp" + +#include +#include +#include + +namespace vpux::VPU { + +// +// SCFViewLikeTilingModelOp +// + +template +class SCFViewLikeTilingModelOp : public mlir::TilingInterface::ExternalModel { +protected: + SCFTilingInfo backInferSCFTileInfo(mlir::Operation* operation, mlir::OpBuilder& builder, + const SCFTileInfo& outputTile) const { + return static_cast(this)->backInferSCFTileInfo(operation, builder, outputTile); + } + +public: + SmallVector getIterationDomain(mlir::Operation*, mlir::OpBuilder&) const { + // return empty list to prevent tiling without fusion + return {}; + } + + mlir::FailureOr getTiledImplementation(mlir::Operation*, mlir::OpBuilder&, + ArrayRef, + ArrayRef) const { + // return failure to prevent tiling view like ops without fusion + return mlir::failure(); + } + + mlir::FailureOr generateResultTileValue(mlir::Operation* operation, mlir::OpBuilder& builder, + unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + auto outputTile = SCFTileInfo(sizes, offsets, SCFShape(offsets.size(), builder.getIndexAttr(1))); + auto inputTiling = backInferSCFTileInfo(operation, builder, outputTile); + + SmallVector tiledOperands; + tiledOperands.reserve(operation->getNumOperands()); + + for (auto p : operation->getOperands() | indexed) { + auto origInput = p.value(); + auto inputIdx = p.index(); + + if (inputTiling.tiles.size() <= inputIdx) { + tiledOperands.emplace_back(origInput); + continue; + } + + auto inputTileInfo = inputTiling.tiles[inputIdx]; + auto tiledInput = generateTile(operation->getLoc(), builder, origInput, inputTileInfo); + + tiledOperands.emplace_back(tiledInput); + } + + auto resultDenseTile = extractResultType(operation->getResult(0).getType(), sizes, {}); + auto* tiledOp = mlir::cloneWithoutRegions(builder, operation, {resultDenseTile}, 
tiledOperands); + + return mlir::TilingResult{{tiledOp}, {tiledOp->getResult(resultNumber)}}; + } + + mlir::LogicalResult getResultTilePosition(mlir::Operation*, mlir::OpBuilder&, unsigned, + ArrayRef, ArrayRef, + SmallVector&, + SmallVector&) const { + return mlir::failure(); + } +}; + +class SCFLayoutCastTilingModelOp : public SCFViewLikeTilingModelOp { +public: + SCFTilingInfo backInferSCFTileInfo(mlir::Operation*, mlir::OpBuilder&, const SCFTileInfo& outputTile) const { + return SCFTilingInfo{{outputTile}}; + } +}; + +} // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/barrier_variant_constraint.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/barrier_variant_constraint.hpp index 57f3de42c5..8bc32d823f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/barrier_variant_constraint.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/barrier_variant_constraint.hpp @@ -11,7 +11,7 @@ namespace vpux { namespace VPU { -VPU::PerBarrierVariantConstraint getPerBarrierVariantConstraint(VPU::ArchKind arch, bool workloadManagementEnable); +VPU::PerBarrierVariantConstraint getPerBarrierVariantConstraint(config::ArchKind arch, bool workloadManagementEnable); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp index f372fa04d3..2c501dae21 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp @@ -6,7 +6,7 @@ #pragma once #include 
"vpux/compiler/core/interfaces/rewriter_pattern_strategies.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include @@ -15,6 +15,6 @@ namespace vpux::VPU { /* Find right class to get strategies for particular platform */ -std::unique_ptr createConvertOpToDMAForPerformantExecutionStrategy(ArchKind arch); +std::unique_ptr createConvertOpToDMAForPerformantExecutionStrategy(config::ArchKind arch); } // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/frequency_table.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/frequency_table.hpp index a3e8bc4da9..312b03dfe5 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/frequency_table.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/frequency_table.hpp @@ -13,7 +13,7 @@ namespace VPU { using FrequencyTableCb = VPU::FrequencyTable (*)(); -FrequencyTableCb getFrequencyTable(VPU::ArchKind arch); +FrequencyTableCb getFrequencyTable(config::ArchKind arch); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp index cc4b863e2f..394b5d1b76 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp @@ -17,8 +17,8 @@ constexpr size_t DMA_MAX_INDICES_LIST_LENGTH = constexpr size_t GATHER_DMA_MAX_ELEMENT_SIZE = 4096; } // namespace arch40xx -size_t getGatherDMAMaxIndicesListLength(VPU::ArchKind arch); -size_t getGatherDMAMaxElementSize(VPU::ArchKind arch); +size_t getGatherDMAMaxIndicesListLength(config::ArchKind arch); +size_t 
getGatherDMAMaxElementSize(config::ArchKind arch); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_kernel_size_constant.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_kernel_size_constant.hpp index e9fc214c6f..4f7372aa9c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_kernel_size_constant.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_kernel_size_constant.hpp @@ -11,7 +11,7 @@ namespace vpux { namespace VPU { -VPU::MaxKernelSizeConstant getMaxKernelSizeConstant(VPU::ArchKind arch); +VPU::MaxKernelSizeConstant getMaxKernelSizeConstant(config::ArchKind arch); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp index e44c6c0cc2..3b5394bd19 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp @@ -5,13 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" - +#include "vpux/compiler/dialect/config/IR/attributes.hpp" namespace vpux { namespace VPU { -int64_t getMaxLstmSequenceHiddenSizeConstant(VPU::ArchKind arch); -int64_t getMaxLstmCellHiddenSizeConstant(VPU::ArchKind arch); +int64_t getMaxLstmSequenceHiddenSizeConstant(config::ArchKind arch); +int64_t getMaxLstmCellHiddenSizeConstant(config::ArchKind arch); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/mc_strategy_getter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/mc_strategy_getter.hpp index 
2f8a2c4d2c..c649db75d9 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/mc_strategy_getter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/mc_strategy_getter.hpp @@ -5,14 +5,14 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/interfaces/mc_strategy_getter.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" namespace vpux::VPU { /* Find right class to get strategies for particular platform */ -std::unique_ptr createMCStrategyGetter(ArchKind arch, int64_t numClusters); +std::unique_ptr createMCStrategyGetter(config::ArchKind arch, int64_t numClusters); } // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp index 5e9acd012a..7b4f47655b 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp @@ -5,10 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" - -#include -#include +#include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/utils/core/type/float8_e4m3.hpp" +#include "vpux/utils/core/type/float8_e5m2.hpp" #include #include @@ -21,8 +20,8 @@ using IntOrFloatType = std::variant -#include -#include -#include #include #include #include @@ -28,6 +22,11 @@ #include #include +namespace vpux::VPU { +enum class ActivationSparsityProfile : uint64_t; +enum class WeightsSparsityHeuristic : uint64_t; +} // namespace vpux::VPU + namespace vpux { namespace VPU { @@ -278,16 +277,17 @@ struct InitCompilerOptions : mlir::PassPipelineOptions { // options setup and lit-tests template - InitCompilerOptions(ArchKind archParam, config::CompilationMode 
compilationModeParam, const OtherOptions& options) { - arch = std::string(VPU::stringifyEnum(archParam)); + InitCompilerOptions(config::ArchKind archParam, config::CompilationMode compilationModeParam, + const OtherOptions& options) { + arch = std::string(config::stringifyEnum(archParam)); compilationMode = std::string(config::stringifyEnum(compilationModeParam)); this->matchAndCopyOptionValuesFrom(options); } // PSS tests - InitCompilerOptions(ArchKind archParam, config::CompilationMode compilationModeParam) { - arch = std::string(VPU::stringifyEnum(archParam)); + InitCompilerOptions(config::ArchKind archParam, config::CompilationMode compilationModeParam) { + arch = std::string(config::stringifyEnum(archParam)); compilationMode = std::string(config::stringifyEnum(compilationModeParam)); } @@ -326,6 +326,13 @@ std::unique_ptr createInitResourcesPass(); std::unique_ptr createInitResourcesPass(const InitCompilerOptions& initCompilerOptions, Logger log = Logger::global()); +std::unique_ptr createDMATaskProfilingReserveMemPass(const std::string& enableDMAProfiling = "false", + Logger log = Logger::global()); +std::unique_ptr createCompressDmaReserveMemPass(Logger log = Logger::global()); +std::unique_ptr createSWKernelInstructionPrefetchReserveMemForDummyKernelsPass( + Logger log = Logger::global()); +std::unique_ptr createSWKernelDataPrefetchReserveMemPass(Logger log = Logger::global()); + std::unique_ptr createOptimizeSharedInputCopyForConcatPass(Logger log = Logger::global()); std::unique_ptr createCMXConcatPass(Logger log = Logger::global()); std::unique_ptr createSplitNCEOpsOntoWorkloadsPass(Logger log = Logger::global()); @@ -377,6 +384,7 @@ std::unique_ptr createFuseNCEInterpolateConsumersPass(Logger log = L std::unique_ptr createAddExplicitPaddingBeforeNCEPermutePass(Logger log = Logger::global()); std::unique_ptr createOutputPipelineTilingPass(bool enablePrefetchTiling = true, Logger log = Logger::global()); +std::unique_ptr createSCFVerticalFusionPass(Logger 
log = Logger::global()); std::unique_ptr createLegalizeDynamicShapeConcatForSWLayersPass(Logger log = Logger::global()); std::unique_ptr createConvertConstArgsToMultiConstantsPass(Logger log = Logger::global()); std::unique_ptr createConcatRepeatingBlocksOutliningPass(int64_t minSeqLength = 1, @@ -392,11 +400,12 @@ void buildInitCompilerPipeline(mlir::OpPassManager& pm, const VPU::InitCompilerO // Sparsity // -std::unique_ptr createSparsifyWeightsPass( - VPU::WeightsSparsityHeuristic heuristic = VPU::WeightsSparsityHeuristic::RATIO, - std::optional manualThreshold = std::nullopt, - int64_t largeConstThreshold = (200_MB).to().count(), int64_t computeOpThreshold = 350, - bool enableWeightSwizzling = true, Logger log = Logger::global()); +std::unique_ptr createSparsifyWeightsPass(Logger log = Logger::global()); +std::unique_ptr createSparsifyWeightsPass(VPU::WeightsSparsityHeuristic heuristic, + std::optional manualThreshold = std::nullopt, + int64_t largeConstThreshold = (200_MB).to().count(), + int64_t computeOpThreshold = 350, + bool enableWeightSwizzling = true, Logger log = Logger::global()); std::unique_ptr createRecomputeSparsityPtrsPass(Logger log = Logger::global()); std::unique_ptr createFuseSparsityOpsPass(std::optional fuseSparsify = std::nullopt, Logger log = Logger::global()); @@ -434,7 +443,9 @@ std::unique_ptr createTilingStrategyAssignmentPass(bool enablePrefet StringRef enableShaveDDRAccessOptimization = "true", Logger log = Logger::global()); std::unique_ptr createApplyTilingPass(bool enableSCFTiling = false, Logger log = Logger::global()); -std::unique_ptr createWrapVerticalFusionRegionPass(Logger log = Logger::global()); +std::unique_ptr createWrapVerticalFusionRegionPass( + const WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, + Logger log = Logger::global()); std::unique_ptr createMoveViewOpsToVerticalFusionPass( const WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, Logger 
log = Logger::global()); @@ -458,6 +469,7 @@ std::unique_ptr createFuseClampPass(Logger log = Logger::global()); // If optimizeOnlyOuterConcat is true, only optimize when concat dimension is the highest dimension std::unique_ptr createOptimizeConcatPass(bool optimizeOnlyOuterConcat = false, + bool disablePassOnEntryFunction = false, Logger log = Logger::global()); std::unique_ptr createStrategyManagerImplPass(bool enablePrefetchTiling = true, Logger log = Logger::global()); @@ -504,53 +516,12 @@ std::unique_ptr createSetupMaxKernelSizePass(const InitCompilerOptio Logger log = Logger::global()); // -// Channels Auto Padding -// - -std::unique_ptr createSetupChannelsAutoPaddingPass(); -std::unique_ptr createSetupChannelsAutoPaddingPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// Reduce Operation -// - -std::unique_ptr createSetupIsReduceSupportedPass(); -std::unique_ptr createSetupIsReduceSupportedPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// FP16 Compressed Convolution -// - -std::unique_ptr createSetupEnableFP16CompressedConvPass(); -std::unique_ptr createSetupEnableFP16CompressedConvPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// VPUNN Pre-split +// Target Independent Options // -std::unique_ptr createSetupEnableVPUNNPreSplitPass(); -std::unique_ptr createSetupEnableVPUNNPreSplitPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// Weights table reuse -// - -std::unique_ptr createSetupWeightsTableReuseModePass(); -std::unique_ptr createSetupWeightsTableReuseModePass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// SEPtrs Operations -// - -std::unique_ptr createSetupEnableSEPtrsOperationsPass(); -std::unique_ptr createSetupEnableSEPtrsOperationsPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - 
+std::unique_ptr createSetTargetIndependentPassOptionsPass(); +std::unique_ptr createSetTargetIndependentPassOptionsPass(const InitCompilerOptions& initCompilerOptions, + Logger log = Logger::global()); // // Tiling related contraints // @@ -570,23 +541,10 @@ std::unique_ptr createQueryWSInfoPass(const Logger& log = Logger::gl std::unique_ptr createIntroduceInitFunctionPass(const Logger& log = Logger::global()); std::unique_ptr createIntroduceInitFunctionPass(StringRef wsExtractionModeString, const Logger& log = Logger::global()); +std::unique_ptr createConcatInitInputsPass(const Logger& log = Logger::global()); std::unique_ptr createConcatInitResultsPass(const Logger& log = Logger::global()); - -// -// Adaptive Stripping -// - -std::unique_ptr createSetupEnableAdaptiveStrippingPass(); -std::unique_ptr createSetupEnableAdaptiveStrippingPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); - -// -// Extra StaticShape ops -// - -std::unique_ptr createSetupEnableExtraStaticShapeOpsPass(); -std::unique_ptr createSetupEnableExtraStaticShapeOpsPass(const InitCompilerOptions& initCompilerOptions, - Logger log = Logger::global()); +std::unique_ptr createConcatInitResultsPass(StringRef wsExtractionModeString, + const Logger& log = Logger::global()); // // DefaultHWOptions(for all devices) @@ -642,6 +600,7 @@ struct DefaultHWOptionsDialectBase : public virtual vpux::DefaultHWOptionsBase { std::unique_ptr createScfComputeOpsOutliningPass(Logger log = Logger::global()); std::unique_ptr createFinalizeComputeFunctionBoundariesPass(Logger log = Logger::global()); +std::unique_ptr createConvertDynamicToStaticKernelsPass(Logger log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/clustered_op_interface_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/clustered_op_interface_utils.hpp index 3d4d48aebc..91b15e118e 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/clustered_op_interface_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/clustered_op_interface_utils.hpp @@ -7,10 +7,12 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" - #include "vpux/utils/core/mem_size.hpp" +namespace vpux::VPU { +class SiblingOpsAnalysis; +} + namespace vpux { namespace VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/const_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/const_utils.hpp index 108ff8211b..b840348d74 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/const_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/const_utils.hpp @@ -5,7 +5,6 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" @@ -160,7 +159,7 @@ bool isNullOrConstWithSingleValue(mlir::Value value); * * NOTE: see also vpux::calculateAlignedBuffersMemoryRequirement */ -Byte calculateAlignedBuffersMemoryRequirement(VPU::ArchKind arch, mlir::SmallVector& bufferSizes); +Byte calculateAlignedBuffersMemoryRequirement(config::ArchKind arch, mlir::SmallVector& bufferSizes); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/conv_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/conv_utils.hpp index 833852f62d..e3369073f1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/conv_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/conv_utils.hpp @@ -7,15 +7,19 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include 
"vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/logging.hpp" #include +namespace vpux::VPU { +enum class MultiClusterStrategy : uint64_t; +class NCEConvolutionOp; +class TransposedConvolutionOp; +} // namespace vpux::VPU + namespace vpux::VPU { bool isNCEConvSupported(mlir::Operation* op, NDTypeInterface inputType, NDTypeInterface filterType, diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp index f91c20ce6e..3b5be2cdcc 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp @@ -5,14 +5,21 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/interfaces/cost_model_utils.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include #include +#include + #include +namespace vpux::VPU { +class NCEOpInterface; +} // namespace vpux::VPU + namespace vpux { float getWeightsSparsityRatio(vpux::NDTypeInterface weightsType, int64_t compressedSize); @@ -40,7 +47,7 @@ class LayerCostModelAnalysis { // If the input analysis is empty, create a layer cost model instance and return it. // Otherwise, return the cached layer cost model instance. 
static std::shared_ptr getOrCreateLayerCostModel( - std::optional> analysis, VPU::ArchKind arch, + std::optional> analysis, config::ArchKind arch, Logger log = Logger::global().nest("layer-cost-model-analysis")); private: @@ -71,7 +78,7 @@ class CostModelAnalysis { // If the input analysis is empty, create a cost model instance and return it. // Otherwise, return the cached cost model instance. static std::shared_ptr getOrCreateCostModel( - std::optional> analysis, VPU::ArchKind arch, + std::optional> analysis, config::ArchKind arch, Logger log = Logger::global().nest("cost-model-analysis")); private: @@ -98,22 +105,23 @@ void printLayerSplitInfo(const VPUNN::LayerSplitInfo& info, const Logger& log); VPU::MPEMode getMPEMode(VPUNN::ExecutionMode executionMode); float getWeightsSparsityRatio(mlir::Value weights); -VPUNN::VPUDevice getVPUDeviceType(VPU::ArchKind archKind); +VPUNN::VPUDevice getVPUDeviceType(config::ArchKind archKind); bool isVPUNNSupportedElementType(mlir::Type type); std::optional getVPUNNElementType(mlir::Type type); VPUNN::Layout getVPUNNLayout(vpux::DimsOrder vpuxLayout); VPUNN::VPUTensor getVPUTensor(ShapeRef shape, mlir::Type elemType, vpux::DimsOrder layout = vpux::DimsOrder::NHWC); VPUNN::ExecutionMode getExecutionMode(VPU::MPEMode mpeMode); VPUNN::VPULayerStrategy getVPULayerStrategy(VPU::MultiClusterStrategy mcStrategy, size_t nDPUs, size_t nTiles, - ArchKind arch, size_t nSHVs = 1, bool prefetching = false, + config::ArchKind arch, size_t nSHVs = 1, bool prefetching = false, VPU::DistributionMode distributionMode = DistributionMode::NONE, mlir::Operation* op = nullptr); VPUNN::DPULayer getDPULayer(const VPUIP::WorkloadCostParams& params); std::vector getPerClusterDPULayers(VPU::NCEOpInterface nceOp, const VPUIP::WorkloadCostParams& params, Logger log); VPUNN::DPUWorkload getDPUWorkload(const VPUIP::WorkloadCostParams& tileParams, const VPUIP::WorkloadTile& wl); -VPUIP::WorkloadCostParams getWorkloadCostParam(VPU::NCEOpInterface nceOp, 
VPU::ArchKind arch, int64_t numDPU, +VPUIP::WorkloadCostParams getWorkloadCostParam(VPU::NCEOpInterface nceOp, config::ArchKind arch, int64_t numDPU, int64_t numTiles = 1); +vpux::VPU::ICostModelUtilsInterface* getICostModelUtilsInterface(mlir::MLIRContext* ctx); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp index 53e28fd481..965dc1e24e 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp @@ -17,7 +17,7 @@ namespace vpux::VPU { */ class CostModelConfig { private: - static std::map>& _getFactories(); + static std::map>& _getFactories(); static std::mutex& _getCostModelFactoryMutex() { static std::mutex mtx; @@ -30,7 +30,7 @@ class CostModelConfig { * @param arch Architecture kind * @return const ICostModelFactory& */ - static const ICostModelFactory& getFactory(ArchKind arch); + static const ICostModelFactory& getFactory(config::ArchKind arch); public: /** @@ -38,7 +38,7 @@ class CostModelConfig { * * @param arch Architecture kind */ - static void setFactory(ArchKind arch); + static void setFactory(config::ArchKind arch); /** * @brief Create a cost model for the specified architecture @@ -46,7 +46,7 @@ class CostModelConfig { * @param arch Architecture kind * @return std::shared_ptr */ - static std::shared_ptr createCostModel(ArchKind arch) { + static std::shared_ptr createCostModel(config::ArchKind arch) { return getFactory(arch).createCostModel(); } @@ -56,7 +56,7 @@ class CostModelConfig { * @param arch Architecture kind * @return std::shared_ptr */ - static std::shared_ptr createLayerCostModel(ArchKind arch) { + static std::shared_ptr createLayerCostModel(config::ArchKind arch) { return 
getFactory(arch).createLayerCostModel(); } }; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp index 4562f7090b..b16ef1c62f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp @@ -139,7 +139,7 @@ class LayerVPUNNCost final { StrategyCost correctStrideDMACost(vpux::NDTypeInterface type, StrategyCost cost) const; - VPU::ArchKind _arch; + config::ArchKind _arch; int64_t _numTiles; int64_t _numDPUs; int64_t _numShaveActs; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp index 4c382a25bf..16cd684356 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp @@ -5,23 +5,36 @@ #pragma once -#include -#include -#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/numeric.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include #include #include #include +namespace vpux::VPU { +enum class MultiClusterStrategy : uint64_t; +class ClusteredOpInterface; +class CopyOp; +class 
DistributedTensorType; +class DistributedTypeInterface; +class NCEOpInterface; +class SiblingOpsAnalysis; +class SparseTensorType; +class SWOpInterface; +class UnrolledTypeOp; +struct OverlapDistributionParams; +} // namespace vpux::VPU + +namespace vpux::VPUIP { +enum class NCETaskType : uint64_t; +} // namespace vpux::VPUIP + namespace vpux { namespace VPU { @@ -73,16 +86,19 @@ SmallVector getWeightsTableTensorNumTiles(VPU::ClusteredOpInterface clu VPU::MultiClusterStrategy strategy); DistributionMode getActivationTensorDistributionMode(VPU::ClusteredOpInterface clusteredOp, VPU::MultiClusterStrategy strategy); +DistributionMode getActivationTensorDistributionMode(VPU::GatherDMAOp op, VPU::MultiClusterStrategy strategy, + mlir::Value operand); DistributionMode getWeightsTensorDistributionMode(VPU::MultiClusterStrategy strategy); DistributionMode getOutputTensorDistributionMode(VPU::ClusteredOpInterface clusteredOp, VPU::MultiClusterStrategy strategy, vpux::NDTypeInterface outputType); int64_t getSOHPerClusterHeightAlignment(int64_t inputWidth, bool isInputSparse); -int64_t getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, VPU::ArchKind arch); +int64_t getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, + config::ArchKind arch); bool isSOHSupportedByDPU(vpux::NDTypeInterface inputType, ShapeRef inputShape, int64_t numClusters, bool DWTypeOp, - VPU::ArchKind arch); + config::ArchKind arch); bool isSOGSupportedByDPU(vpux::NDTypeInterface inputType, ShapeRef inputShape, int64_t numClusters, bool DWTypeOp, - VPU::ArchKind arch); + config::ArchKind arch); vpux::VPU::CopyOp createDistributedCopyIn(mlir::PatternRewriter& rewriter, VPU::ClusteredOpInterface clusteredOp, mlir::Value input, vpux::NDTypeInterface inputTensorDistributedTensorType); @@ -139,6 +155,10 @@ VPU::DistributedTensorType createDistributedTensorType(VPU::SWOpInterface swOp, DistributionMode distributionMode, ArrayRef 
numTiles, int64_t numClusters, ArrayRef alignment, bool uniformDistributedSegments); +VPU::DistributedTensorType createDistributedTensorType(VPU::GatherDMAOp gatherDMAOp, vpux::NDTypeInterface inputType, + DistributionMode distributionMode, ArrayRef numTiles, + int64_t numClusters, ArrayRef alignment, + bool uniformDistributedSegments); VPU::DistributedTypeInterface getDistributedActivationTypeFromOp( VPU::ClusteredOpInterface clusteredOp, mlir::Value operand, vpux::NDTypeInterface inputType, @@ -199,7 +219,7 @@ bool isSegmentedLikeDistributionMode(vpux::NDTypeInterface sourceType, const VPU mlir::Type getCompactTypeFromDistributed(mlir::Type originalType); Shape getLargestClusterOutputShape(VPU::ClusteredOpInterface clusteredOp, VPU::MultiClusterStrategy strategy); -bool isDWOpAndNeedsAlign(ArchKind arch, VPUIP::NCETaskType nceTaskType); +bool isDWOpAndNeedsAlign(config::ArchKind arch, VPUIP::NCETaskType nceTaskType); bool isEltwiseOpAndNeedsAlign(VPU::ClusteredOpInterface nceOp); bool isSWOpChannelAlignmentCompatible(VPU::ClusteredOpInterface swOp, vpux::NDTypeInterface inputType, vpux::NDTypeInterface outputType); @@ -257,6 +277,10 @@ VPU::DistributionInfo createDistributionInfo(VPU::SWOpInterface swOp, Distributi ArrayRef numTiles, int64_t optimalNumberOfClusters, ArrayRef alignment, bool uniformDistributedSegments); +VPU::DistributionInfo createDistributionInfo(VPU::GatherDMAOp gatherDMAOp, DistributionMode distributionMode, + ArrayRef numTiles, int64_t optimalNumberOfClusters, + ArrayRef alignment, bool uniformDistributedSegments); + VPU::DistributionInfo composeDistributedAttr(VPU::ClusteredOpInterface permuteOp, VPU::DistributedTensorType distType, vpux::NDTypeInterface ndType, mlir::ArrayAttr tileOverDim, const OverlapDistributionParams& fusedOverlapParams, diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/generate_tiling.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/generate_tiling.hpp index d19ea6981f..68f2896b3c 
100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/generate_tiling.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/generate_tiling.hpp @@ -5,13 +5,15 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include +namespace vpux::VPU { +class NCEOpInterface; +class TilingBuilderOpInterface; +} // namespace vpux::VPU + namespace vpux { namespace VPU { @@ -40,11 +42,11 @@ bool prefetchTilingConditionSatisfied(mlir::Operation* op, Logger log); bool largeConstPipelineConditionSatisfied(mlir::Operation* op, Logger log); bool hasMultiBranches(mlir::Operation* op); -bool archSupportsSwLayerTiling(VPU::ArchKind arch); +bool archSupportsSwLayerTiling(config::ArchKind arch); bool doesNCEOpChannelSatisfyWorkload(mlir::Operation* nceOp, const TileInfo& outputTile); std::optional getSEPConvTilingOrder(mlir::Operation* op); std::optional> getWorkLoadInformationForNCEWithSparseOutput( - VPU::ArchKind arch, ArrayRef perClusterShapes, ArrayRef supportedChannels); + config::ArchKind arch, ArrayRef perClusterShapes, ArrayRef supportedChannels); /** * @brief Get the best hardware layer tiling strategy based on the VPUNN cost model diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp index 9ae522764f..47df3abeb8 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp @@ -4,7 +4,7 @@ // #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/utils/logger/logger.hpp" #pragma once diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layout_utils.hpp 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layout_utils.hpp index deb52e1c80..2277f196c3 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layout_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/layout_utils.hpp @@ -5,6 +5,7 @@ #pragma once +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp index 1a02f61490..806802add4 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp @@ -5,6 +5,8 @@ #pragma once +#include "vpux/compiler/dialect/config/IR/utils.hpp" + namespace vpux::VPU { /* @brief * Static class for generating MPEEngine attributes. 
@@ -12,9 +14,9 @@ namespace vpux::VPU { class MPEEngineConfig { public: static MPEEngineAttr retrieveMPEEngineAttribute(mlir::Operation* operation) { - const auto arch = VPU::getArch(operation); + const auto arch = config::getArch(operation); - VPUX_THROW_WHEN(arch == VPU::ArchKind::UNKNOWN, + VPUX_THROW_WHEN(arch == config::ArchKind::UNKNOWN, "An unknown architecture is associated to the provided operation"); return MPEEngine37XXAttr::get(operation->getContext(), diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp index 8c8afa8a8e..40f1fa4883 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp @@ -5,13 +5,13 @@ #pragma once -#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/utils/core/checked_cast.hpp" +#include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" #include "vpux/utils/core/dense_map.hpp" +#include + namespace vpux { namespace VPU { @@ -143,7 +143,7 @@ class LayerCostModel final { int64_t _numDPUs = 0; // Number of DPUs per cluster int64_t _numShaveActs = 0; // Number of ACT_SHVs per cluster int64_t _numDMAPorts = 1; // Number of the DMA ports - VPU::ArchKind _arch; + config::ArchKind _arch; VPUNN::VPUDevice _vpuDeviceType; std::shared_ptr _layerCostModel; mlir::func::FuncOp _func; @@ -213,6 +213,6 @@ bool setSOKForRuntimeDequantConvolution(VPU::NCEOpInterface nceOp, LayerCostMode bool alignStrategyWithParentRuntimeDequant(VPU::ClusteredOpInterface clusteredOp, LayerCostModel& costModel); -double 
getStrideDMACorrectionThresholdByArch([[maybe_unused]] VPU::ArchKind arch); +double getStrideDMACorrectionThresholdByArch([[maybe_unused]] config::ArchKind arch); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_invariant.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_invariant.hpp index f88233786d..57c6bcc53d 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_invariant.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_invariant.hpp @@ -5,10 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/utils/core/func_ref.hpp" #include "vpux/utils/core/mem_size.hpp" #include "vpux/utils/logger/logger.hpp" @@ -89,7 +88,7 @@ bool verifyPads(int64_t KY, int64_t KX, int64_t padTop, int64_t padBottom, int64 // Common utility for AvgPool, MaxPool, Eltwise and DWConv // -bool checkLayouts(mlir::TypeRange operandTypes, mlir::TypeRange resultTypes, const VPU::ArchKind& arch, +bool checkLayouts(mlir::TypeRange operandTypes, mlir::TypeRange resultTypes, const config::ArchKind& arch, const unsigned numInputOperands, LogCb logCb); mlir::LogicalResult isSupported(mlir::Operation* op, Logger log = Logger::global()); @@ -97,10 +96,10 @@ mlir::LogicalResult isSupported(mlir::Operation* op, Logger log = Logger::global // // Check if small kernel optimization is supported // -bool doesWorkloadSupportSmallKernelOpt([[maybe_unused]] VPU::ArchKind arch, int64_t KX, int64_t SX, +bool doesWorkloadSupportSmallKernelOpt([[maybe_unused]] config::ArchKind arch, int64_t KX, int64_t SX, ArrayRef workloadOutSz, bool isFp16Input, [[maybe_unused]] int64_t KY, [[maybe_unused]] int64_t padLeft); -bool isSmallKernelOptimizationSupported(const VPU::ArchKind arch, mlir::Operation* op); +bool 
isSmallKernelOptimizationSupported(const config::ArchKind arch, mlir::Operation* op); // // Verify kernel utils @@ -117,7 +116,7 @@ mlir::LogicalResult verifyPoolCMX(mlir::Location loc, mlir::ModuleOp module, vpu // Check if given architecture supports Elementwise multiply operation // -bool isEltwiseMultiplySubtractSupported(const VPU::ArchKind arch); +bool isEltwiseMultiplySubtractSupported(const config::ArchKind arch); // // Check whether alignment is beneficial for the operation diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp index 6566276655..9e77f2328b 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp @@ -6,24 +6,22 @@ #pragma once #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/loop.hpp" #include "vpux/compiler/utils/quantization.hpp" - #include "vpux/utils/core/algo.hpp" #include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/enums.hpp" -#include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/core/optional.hpp" #include #include +namespace vpux::VPUIP { +class DPUTaskOp; +} // namespace vpux::VPUIP + namespace vpux { namespace VPU { @@ -32,9 +30,6 @@ namespace NCESparsity { // base_ptr is 9bits size const int BASE_PTR_SIZE = 9; -const VPU::SparsitySupport FULLY_SUPPORTED_SPARSITY_MODE = - SparsitySupport::SPARSE_INPUTS | SparsitySupport::SPARSE_OUTPUTS | SparsitySupport::SPARSE_WEIGHTS; - 
constexpr int32_t SPARSITY_PTR_WHEN_NO_SPARSITY = 0xFFFFFF; const unsigned int DEFAULT_SPARSIFIABLE_INPUT_OPERAND_ID = 0; @@ -589,10 +584,6 @@ double getSparsityRatio(vpux::NDTypeInterface weightsType, ArrayRef num bool isSparsifiableWeightsOperand(mlir::Value operand); bool isSuperdenseRequired(const DimsOrder outOrder, const ShapeRef outShape, const mlir::Type outElemType); -inline VPU::SparsitySupport bitwiseNot(const VPU::SparsitySupport bits) { - static_assert(sizeof(bits) == sizeof(uint32_t), "VPU::SparsitySupport has unexpected size"); - return static_cast(~static_cast(bits)); -} // 5D weights. int32_t get5DWeightPtrStep(mlir::Value weights); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp index 7fb2535931..55012f39d0 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp @@ -6,13 +6,16 @@ #pragma once #include "vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/utils/core/dense_map.hpp" +#include +#include + #include -#include + +namespace vpux::VPU { +class DistributionInfo; +} namespace vpux { namespace VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp index cf1acc9363..2a86dd610a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp @@ -5,22 +5,20 @@ #pragma once -#include -#include -#include "mlir/Dialect/Affine/Utils.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" 
-#include "mlir/Interfaces/TilingInterface.h" - #include "vpux/compiler/NPU40XX/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/core/tiling.hpp" +#include +#include +#include +#include +#include +#include namespace vpux::VPU { @@ -65,9 +63,18 @@ struct SCFTileInfo { } }; -using OpTilingOperandsFunc = std::function&)>; +struct SCFTilingInfo { + SCFTilingInfo(ArrayRef tilesValue): tiles(tilesValue) { + } + SCFTilingInfo(ArrayRef tilesValue, SCFShapeRef padsValue): tiles(tilesValue), pads(padsValue) { + } + + SmallVector tiles; + std::optional pads; +}; + +using OpTilingOperandsFunc = std::function; using OpGeneratorFunc = std::function; -using SCFTilingInfo = SmallVector; // @brief Dim value of input/output/weights shape mlir::OpFoldResult getDimValue(mlir::OpBuilder& builder, mlir::Operation* operation, int64_t dim); @@ -78,12 +85,27 @@ SCFTileInfo getWeightsTableSCFTile(mlir::Type origWeightsTableType, mlir::OpBuil /** @brief Restores input tiling from output tile data - The function calculates input shape and offset based on + The function calculates input shape, offset and bounds based on parameters and shape and offset of output tile */ -mlir::Range solutionForOutputRange(mlir::Location loc, mlir::OpBuilder& builder, const SCFTileInfo& outputTile, Dim dim, - const int64_t kernel, const int64_t stride, - const std::pair& origPadding); +std::pair, std::optional> solutionForOutputRange( + mlir::Location loc, mlir::OpBuilder& builder, const 
SCFTileInfo& outputTile, Dim dim, const int64_t kernel, + const int64_t stride, const int64_t origInputSize, const std::pair& origPadding, + mlir::OpFoldResult& padBefore, mlir::OpFoldResult& padAfter); + +/** @brief Generate slice based on tiling information + + The function generates ExtractSliceOp based on offset and size in tile info +*/ +mlir::Value generateTile(mlir::Location loc, mlir::OpBuilder& builder, mlir::Value origInput, + const SCFTileInfo& inputTileInfo); + +/** @brief Return result type after tiling to new shape + + The function extracts result type of operation + after changing shape +*/ +mlir::Type extractResultType(mlir::Type origType, SCFShapeRef newShape, BoundsRef bounds); /** @brief create operation with padding adjustment @@ -94,99 +116,117 @@ mlir::Range solutionForOutputRange(mlir::Location loc, mlir::OpBuilder& builder, */ template mlir::Operation* createTiledPaddedOperation(OpGeneratorFunc opGenerator, OpTilingOperandsFunc operandsGenerator, - mlir::OpBuilder& builder, SCFTilingInfo& inputTiling, - const SCFTileInfo& outputTile, Dim dim, SCFShapeRef origShape, - mlir::Operation* origOperation, ShapeRef tiling) { - if (dim == Dims4D::Act::C) { + mlir::OpBuilder& builder, SCFTilingInfo& inputTiling, DimArrRef dims, + SmallVector& tiledOperands, mlir::Operation* operation) { + const auto isSpatialDim = [](auto dim) { + return dim.ind() >= static_cast(Dims4D::Act::numSpatialDims); + }; + if (llvm::none_of(dims, isSpatialDim) || !inputTiling.pads.has_value()) { operandsGenerator(inputTiling); return opGenerator(); } - auto padInfo = toPadInfo(mlir::cast(origOperation).getPad()); + auto padInfo = toPadInfo(mlir::cast(operation).getPad()); if (!padInfo.enabled()) { operandsGenerator(inputTiling); return opGenerator(); } - auto numTiles = tiling[dim]; - if (numTiles == 1) { - operandsGenerator(inputTiling); - return opGenerator(); - } - - VPUX_THROW_WHEN(static_cast(dim.ind()) < Dims4D::Act::numSpatialDims, "Incorrect tiling spacial dim {0}", - 
dim); - - const auto spatialDimIdx = dim.ind() - Dims4D::Act::numSpatialDims; - auto loc = origOperation->getLoc(); + auto loc = operation->getLoc(); - auto zeroOffset = builder.create(appendLoc(loc, "zero"), 0); - auto interValue = - mlir::getValueOrCreateConstantIndexOp(builder, appendLoc(loc, "offset"), outputTile.offsets[dim.ind()]); - - auto isFirstIndex = builder.create(appendLoc(loc, "equal"), mlir::arith::CmpIPredicate::eq, - interValue, zeroOffset); - - const auto createOperation = [&](bool trimBegin, bool trimEnd) { - auto poolingOp = mlir::cast(opGenerator()); - - std::array padsBegin = {padInfo.top, padInfo.left}; - std::array padsEnd = {padInfo.bottom, padInfo.right}; + operandsGenerator(inputTiling); + VPUX_THROW_WHEN(tiledOperands.empty(), "Empty tiled operation for operation"); + auto tiledInput = tiledOperands.front(); + auto tiledType = mlir::cast(tiledInput.getType()); + + auto paddingValue = builder.create(loc, builder.getZeroAttr(tiledType.getElementType())); + auto adjustedBounds = Bounds(); + if (auto boundedType = mlir::dyn_cast(tiledType)) { + adjustedBounds = boundedType.getBounds(); + } - if (trimBegin) { - padsBegin[spatialDimIdx] = 0; + SmallVector lows(tiledType.getRank(), builder.getIndexAttr(0)); + SmallVector highs(tiledType.getRank(), builder.getIndexAttr(0)); + + auto padsByDims = padInfo.toPadByDims(); + // bounds are not updated for dynamic dimensions, as the pad value is calculated at runtime based on the loop index + for (auto index : irange(Dims4D::Act::numSpatialDims)) { + const auto spatialDim = Dims4D::Act::getSpatialDim(index); + if (llvm::find(dims, spatialDim) != dims.end()) { + lows[spatialDim.ind()] = inputTiling.pads.value()[index]; + // the order of pads is "left, top, right, bottom" + // so, to get padding of other side, get +2 to current index + highs[spatialDim.ind()] = inputTiling.pads.value()[index + 2]; + } else { + lows[spatialDim.ind()] = builder.getIndexAttr(padsByDims[spatialDim.ind()].first); + 
highs[spatialDim.ind()] = builder.getIndexAttr(padsByDims[spatialDim.ind()].second); + if (!adjustedBounds.raw().empty()) { + adjustedBounds[spatialDim] += padsByDims[spatialDim.ind()].first + padsByDims[spatialDim.ind()].second; + } } + } - if (trimEnd) { - padsEnd[spatialDimIdx] = 0; + tiledOperands[0] = builder.create(loc, /*result=*/mlir::Type(), tiledInput, lows, highs, + paddingValue, /*nofold=*/false); + const auto tensorDesc = vpux::getTensorAttr(tiledType.getContext(), tiledType.getDimsOrder(), + tiledType.getMemSpace(), adjustedBounds); + SmallVector staticDims; + auto rankedType = mlir::cast(tiledOperands[0].getType()); + staticDims.reserve(rankedType.getRank()); + llvm::transform(llvm::seq(0, rankedType.getRank()), std::back_inserter(staticDims), [&](auto i) { + if (rankedType.isDynamicDim(i)) { + return mlir::ShapedType::kDynamic; } - - poolingOp.setPadAttr(getPaddingAttr(builder.getContext(), padsBegin[1], padsEnd[1], padsBegin[0], padsEnd[0])); - builder.create(appendLoc(loc, "yield"), poolingOp.getResult()); - }; - - operandsGenerator(inputTiling); - - const auto firstTileCreator = [&](mlir::OpBuilder&, mlir::Location) { - return createOperation(/*trimBegin=*/false, /*trimEnd=*/true); - }; - - const auto lastTileCreator = [&](mlir::OpBuilder&, mlir::Location) { - return createOperation(/*trimBegin=*/true, /*trimEnd=*/false); - }; - - const auto medianTileCreator = [&](mlir::OpBuilder& opBuilder, mlir::Location opLocation) { - auto newInfo = inputTiling; - auto& inputTile = newInfo[0]; - mlir::AffineExpr d0; - bindDims(opBuilder.getContext(), d0); - std::array padsEnd = {padInfo.bottom, padInfo.right}; - auto addMap = mlir::AffineMap::get(1, 0, {d0 + padsEnd[spatialDimIdx]}, opBuilder.getContext()); - inputTile.shape[dim.ind()] = mlir::affine::makeComposedFoldedAffineApply( - opBuilder, appendLoc(opLocation, "paddedShape"), addMap, {inputTile.shape[dim.ind()]}); - operandsGenerator(newInfo); - return createOperation(/*trimBegin=*/true, 
/*trimEnd=*/true); + return rankedType.getDimSize(i); + }); + + tiledOperands[0].setType(mlir::RankedTensorType::get(staticDims, tiledType.getElementType(), tensorDesc)); + const auto createOperation = [&]() { + auto generatedOp = mlir::cast(opGenerator()); + generatedOp.setPadAttr(getPaddingAttr(builder.getContext(), 0, 0, 0, 0)); + auto outputType = mlir::cast(generatedOp->getResult(0).getType()); + auto outputShape = to_small_vector(outputType.getShape().raw()); + for (auto staticDim : staticDims | indexed) { + if (staticDim.value() == mlir::ShapedType::kDynamic) { + outputShape[staticDim.index()] = mlir::ShapedType::kDynamic; + } + } + outputType = outputType.changeShape(Shape(outputShape)); + generatedOp->getResult(0).setType(outputType); + return generatedOp; }; + return createOperation(); +} - const auto elseBlockCreator = [&](mlir::OpBuilder& opBuilder, mlir::Location opLocation) { - if (numTiles == 2) { - return createOperation(/*trimBegin=*/true, /*trimEnd=*/false); +/** @brief adjust padded output + * In case the PadOp has been added, but operation used to be with static output + * generated by scf tiling functions, result sizes has to be corrected to be dynamic too + * It doesn't affect the result type of the loop, only result size in InsertSliceOp + */ +template +void correctPaddedOutput(mlir::OpBuilder& builder, ConcreteOp operation, SmallVector& resultSizes) { + auto padInfo = toPadInfo(operation.getPad()); + if (padInfo.enabled() && operation->hasAttr(tilingStrategy)) { + auto padsByDims = padInfo.toPadByDims(); + const auto strategy = + Shape(parseIntArrayAttr(mlir::cast(operation->getAttr(tilingStrategy)))); + auto tilingDims = getNonOneDim(strategy); + for (auto dim : tilingDims) { + auto sizeValue = mlir::getConstantIntValue(resultSizes[dim.ind()]); + if (sizeValue.has_value() && padsByDims.contains(dim.ind()) && + (padsByDims[dim.ind()].first != 0 || padsByDims[dim.ind()].second != 0)) { + resultSizes[dim.ind()] = builder.create( + 
operation->getLoc(), builder.getIndexAttr(sizeValue.value())) + .getResult(); + } } + } +} - auto maxValue = mlir::getValueOrCreateConstantIndexOp(opBuilder, appendLoc(opLocation, "maxValue"), - origShape[dim.ind()]); - auto lastIndex = opBuilder.create(appendLoc(opLocation, "sub"), maxValue, interValue); - - auto isLastIndex = opBuilder.create(appendLoc(opLocation, "equal"), - mlir::arith::CmpIPredicate::eq, interValue, lastIndex); - auto innerIfOp = opBuilder.create(appendLoc(opLocation, "innerIf"), isLastIndex, - lastTileCreator, medianTileCreator); - opBuilder.create(appendLoc(opLocation, "yield"), innerIfOp.getResult(0)); - }; - - auto ifOp = builder.create(appendLoc(loc, "outerIf"), isFirstIndex, firstTileCreator, - elseBlockCreator); +/** @brief Checks if two operations might be vertically fused - return ifOp.getOperation(); -} + The function checks if there are some spills already between operations + To be extended to more complex checks +*/ +bool checkFusion(mlir::OpOperand& consumer, mlir::OpResult producerCandidate); } // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp index f7dd692b3c..83dcfda7f6 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp @@ -5,9 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/utils/logging.hpp" namespace vpux::VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp index 66946c08f0..712ddbfe35 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp @@ -5,9 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/utils/logging.hpp" namespace vpux::VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp index e5815f6330..827df7015a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp @@ -6,10 +6,10 @@ #pragma once #include -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/config/IR/ops.hpp" #include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/options.hpp" +#include "vpux/compiler/utils/types.hpp" #include "vpux/utils/core/error.hpp" namespace vpux { @@ -28,14 +28,43 @@ T getConstraint(mlir::Operation* op, StringRef attrName) { auto attrValue = pipelineOptionOp.lookupSymbol(attrName); VPUX_THROW_WHEN(attrValue == nullptr, "Failed to find config.OptionOp attribute: {0}", attrName); - if (auto intAttr = mlir::dyn_cast(attrValue.getOptionValue())) { + if constexpr (std::is_same_v || std::is_same_v) { + auto intAttr = mlir::dyn_cast(attrValue.getOptionValue()); + VPUX_THROW_WHEN(intAttr == nullptr, "Failed to fetch attr: {0}", attrName); + // E-174296: Possible loss of accuracy and Conversion of int64_t to unsigned return static_cast(intAttr.getValue().getSExtValue()); - } else if (auto floatAttr = mlir::dyn_cast(attrValue.getOptionValue())) { - return static_cast(floatAttr.getValueAsDouble()); - } else if (auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue())) { - return 
static_cast(boolAttr.getValue()); + } else if constexpr (std::is_same_v) { + auto floatAttr = mlir::dyn_cast(attrValue.getOptionValue()); + VPUX_THROW_WHEN(floatAttr == nullptr, "Failed to fetch attr: {0}", attrName); + return floatAttr.getValueAsDouble(); + } else if constexpr (std::is_same_v) { + auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); + VPUX_THROW_WHEN(boolAttr == nullptr, "Failed to fetch attr: {0}", attrName); + return boolAttr.getValue(); + } else { + // To have T in error message + static_assert(!sizeof(T), "Unsupported type for constraint"); + } +} + +std::optional tryGetBoolPassOption(mlir::ModuleOp module, StringRef attrName); + +template +mlir::Attribute getAttributeFromOption(mlir::MLIRContext* ctx, mlir::Pass::Option& optionValue) { + if constexpr (std::is_same_v) { + return mlir::BoolAttr::get(ctx, optionValue.getValue()); + } else if constexpr (std::is_same_v) { + return mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 64), optionValue.getValue()); + } else if constexpr (std::is_same_v) { + return mlir::StringAttr::get(ctx, optionValue.getValue()); + } else if constexpr (std::is_same_v) { + return mlir::FloatAttr::get(mlir::FloatType::getF64(ctx), optionValue.getValue()); + } else if constexpr (std::is_same_v) { + return mlir::IntegerAttr::get(getUInt64Type(ctx), static_cast(optionValue.getValue())); + } else { + // To have T in error message + static_assert(!sizeof(T), "Unsupported option type for attribute conversion"); } - VPUX_THROW("Unsupported type for constraint {0}", attrName); } } // namespace VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp index d82ec3f3d5..2eaaece8ce 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp @@ -5,15 +5,16 @@ #pragma once 
+#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" + +#include #include + #include #include -#include namespace vpux::VPU { -class ClusteredOpInterface; - // Analysis which finds clustered op siblings and consumers set. // Siblings and consumers are computed lazily and cached in _siblingGroups. // Be careful not to introduce or remove clustered ops into IR when using this class, if diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_support.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_support.hpp new file mode 100644 index 0000000000..0529b1f2dd --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_support.hpp @@ -0,0 +1,24 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" + +namespace vpux { +namespace VPU { +namespace NCESparsity { + +const VPU::SparsitySupport FULLY_SUPPORTED_SPARSITY_MODE = + SparsitySupport::SPARSE_INPUTS | SparsitySupport::SPARSE_OUTPUTS | SparsitySupport::SPARSE_WEIGHTS; + +inline VPU::SparsitySupport bitwiseNot(const VPU::SparsitySupport bits) { + static_assert(sizeof(bits) == sizeof(uint32_t), "VPU::SparsitySupport has unexpected size"); + return static_cast(~static_cast(bits)); +} + +} // namespace NCESparsity +} // namespace VPU +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp index d260cbecfa..6b9b021501 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp @@ -5,15 +5,20 @@ #pragma once -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/interfaces/sparsity_constraint.hpp" #include "vpux/compiler/utils/options.hpp" +#include 
"vpux/utils/core/array_ref.hpp" +#include "vpux/utils/core/small_vector.hpp" #include #include #include +namespace vpux::config { +enum class ArchKind : uint64_t; +} + namespace vpux { namespace VPU { @@ -49,7 +54,7 @@ enum SparsityRemovalFlag { SparsityRemovalFlag shouldRemoveOutputSparsity(mlir::Operation* op); -bool isSEOnlyWithoutSMSupported(VPU::ArchKind arch); +bool isSEOnlyWithoutSMSupported(config::ArchKind arch); std::pair, SmallVector> getUpdatedSliceOffsetsAndShapesForSETable( int64_t seDepth, mlir::ArrayAttr seSizeAttr, ArrayRef sliceOffsets, ArrayRef sliceSizes); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp index 0eb70de256..a0d21aae54 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp @@ -5,17 +5,9 @@ #pragma once -#include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/const/attributes/content.hpp" - +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/enums.hpp" -#include "vpux/utils/core/func_ref.hpp" -#include "vpux/utils/core/optional.hpp" +#include "vpux/utils/logger/logger.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/subgraph_optimizer.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/subgraph_optimizer.hpp index d6bee48b5b..675fac168c 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/subgraph_optimizer.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/strategy_manager/subgraph_optimizer.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sw_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sw_utils.hpp index 57c24f31ee..25ef756ed1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sw_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/sw_utils.hpp @@ -138,6 +138,9 @@ SmallVector getSWInputTensorNumTiles(VPU::RollOp rollOp, int64_t numClu VPU::MultiClusterStrategy strategy, mlir::Value operand); SmallVector getSWInputTensorNumTiles(VPU::DynamicQuantizeOp op, int64_t numClustersAvailableForCompilation, VPU::MultiClusterStrategy strategy, mlir::Value operand); +SmallVector getSWInputTensorNumTiles(VPU::MemPermuteOp mempermuteOp, + int64_t numClustersAvailableForCompilation, + VPU::MultiClusterStrategy strategy); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp index 3a493979b9..1b4e8c342f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp @@ -16,6 +16,10 @@ namespace vpux::VPU { // Apply tiling using SCF dialect mlir::LogicalResult applySCFTiling(mlir::Operation* operation, mlir::RewriterBase& builder); +// Apply VF tiling using SCF dialect +mlir::FailureOr> 
applySCFVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, Logger log); + SmallVector staticTileSizeComputation(mlir::OpBuilder& builder, mlir::Operation* operation, ShapeRef strategy, ShapeRef outputShape); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_alg_interface.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_alg_interface.hpp index f185149a7c..ea9ff39bdc 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_alg_interface.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_alg_interface.hpp @@ -10,6 +10,8 @@ #include #include +#include "vpux/utils/core/small_vector.hpp" + namespace vpux { namespace VPU { // @@ -22,6 +24,10 @@ class ITilingAlgorithm { virtual ~ITilingAlgorithm() = default; virtual mlir::LogicalResult applyTiling(mlir::Operation* operation, mlir::RewriterBase& builder, Logger log) = 0; + + virtual mlir::FailureOr> applyVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, + Logger log) = 0; }; } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_context.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_context.hpp index 9553cb9c08..f669375223 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_context.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_context.hpp @@ -24,12 +24,14 @@ class TilingContext final { mlir::LogicalResult applyTiling(mlir::RewriterBase& builder, Logger log); + mlir::FailureOr> applyVerticalFusion(mlir::RewriterBase& builder, Logger log); + private: std::unique_ptr _tilingAlgorithm; mlir::Operation* _operation = nullptr; }; // create and configure tiling context -TilingContext createTilingContext(mlir::Operation* 
operation, ShapeRef strategy, bool enableSCFTiling); +TilingContext createTilingContext(mlir::Operation* operation, bool enableSCFTiling); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.hpp index c132078aa0..bac952344f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.hpp @@ -16,6 +16,10 @@ namespace VPU { class TilingGeneralAlgorithm final : public ITilingAlgorithm { public: mlir::LogicalResult applyTiling(mlir::Operation* operation, mlir::RewriterBase& builder, Logger log) override; + + mlir::FailureOr> applyVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, + Logger log) override; }; } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.hpp index e3f0236bc0..2533000fa1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.hpp @@ -16,6 +16,10 @@ namespace VPU { class TilingSCFAlgorithm : public ITilingAlgorithm { public: mlir::LogicalResult applyTiling(mlir::Operation* operation, mlir::RewriterBase& builder, Logger log) override; + + mlir::FailureOr> applyVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, + Logger log) override; }; } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp index 1664b1c6db..b89beb6884 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp @@ -15,6 +15,6 @@ namespace VPU { constexpr StringRef FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS = "VPU.FragmentationAvoidRatioPipeliningLargeWeights"; -double getFragmentationAvoidRatioPipeliningLargeWeights(VPU::ArchKind archKind); +double getFragmentationAvoidRatioPipeliningLargeWeights(config::ArchKind archKind); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.hpp index 9a5f23dacf..164d6fcb32 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.hpp @@ -65,14 +65,8 @@ class MergeVFRegionBaseRewriter : public mlir::OpRewritePattern getOptimalTilingStrategy(const IVFSchedulingPtr& scheduling, const Dim dim, - const int64_t minTiles, int64_t& maxTiles, - TilingOperationStorage::UPtr& minStorage, - TilingOperationStorage::UPtr& maxStorage, - VFConfigType& config) const = 0; virtual bool canMergeVFOpsWithoutCostCheck(VFCaseType& mergedCase) const = 0; virtual bool canSkipMergeVF(VFConfigType& vfConfig, bool opsNeedTiling) const = 0; - virtual std::deque getVFSchedulingChecks(VFConfigType& config) const = 0; virtual IVFSchedulingPtr detectScenario(VFConfigType& vfConfig) const = 0; virtual std::optional findVFTiling(VPU::VerticalFusionOp mergedOp, VPU::VerticalFusionOp prevOp, VPU::VerticalFusionOp currentOp) const = 0; diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp index 475585e081..8deca1b3d6 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp @@ -40,8 +40,8 @@ class MergeVFRegionRewriter final : public MergeVFRegionBaseRewriter { const int64_t minTiles, int64_t& maxTiles, VPU::TilingOperationStorage::UPtr& minStorage, VPU::TilingOperationStorage::UPtr& maxStorage, - VFConfig& config) const override; - std::deque getVFSchedulingChecks(VFConfig& config) const override; + VFConfig& config) const; + std::deque getVFSchedulingChecks(VFConfig& config) const; std::shared_ptr> detectScenario(VFConfig& vfConfig) const override; std::optional findVFTiling(VPU::VerticalFusionOp mergedOp, VPU::VerticalFusionOp prevOp, VPU::VerticalFusionOp currentOp) const override; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp index b0a304f443..65d03f0250 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp @@ -59,6 +59,8 @@ class VFConfig final { private: bool isVFPipelinePattern(); + void validateConfig(); + VPU::VerticalFusionOp _subgraph; mlir::Operation* _largestOp = nullptr; SmallVector _inputOps; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp 
index 8603bb89b4..62cc410237 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp @@ -5,9 +5,24 @@ #pragma once +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_axis_increment.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_scheduler_interface.hpp" namespace vpux::VPU::VF::v1 { // check if whole operation is in CMX bool isCmxOperation(mlir::Operation* operation, const bool checkTilingType); + +// get the maximal valid tiling strategy for VF block between the given range of tiling strategy +mlir::FailureOr> getMaximalValidTilingStrategyFromRange( + VerticalFusionOp subgraph, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log); + +// get the minimal valid tiling strategy for VF block between the given range of tiling strategy +mlir::FailureOr> getMinimalValidTilingStrategyFromRange( + VerticalFusionOp subgraph, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log); + +// if the maxTile is too large, return the cbrt of it if it's a valid max tile candidate +std::optional getCbrtMaxTileCandidate(int64_t minTile, int64_t maxTile, + std::unique_ptr& axisIncrement); } // namespace vpux::VPU::VF::v1 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vf_tiling_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vf_tiling_rewriter.hpp new file mode 100644 index 0000000000..1435b4b3e1 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vf_tiling_rewriter.hpp @@ -0,0 +1,35 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_scheduling_factory.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_tiling_base_rewriter.hpp" + +namespace vpux::VPU::VF::v1 { + +// +// VerticalFusionTilingRewriter +// + +class VerticalFusionTilingRewriter : public VerticalFusionTilingRewriterBase { +public: + VerticalFusionTilingRewriter(mlir::MLIRContext* ctx, bool enableVerticalFusionPipelining, + const std::unique_ptr& costFunction, Logger log) + : VerticalFusionTilingRewriterBase(ctx, enableVerticalFusionPipelining, + costFunction, log) { + } + +protected: + std::pair getDimsData(ArrayRef strategy) const override { + auto dim = getVFTilingDim(strategy); + VPUX_THROW_WHEN(!dim.has_value(), "There is no tiling for VF"); + DimArr dims = {dim.value()}; + return std::make_pair(dims, strategy[dim.value().ind()]); + } +}; +} // namespace vpux::VPU::VF::v1 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.hpp new file mode 100644 index 0000000000..404a9a3955 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp" + +namespace vpux::VPU::VF::v1 { + +// +// WrapVFRewriter +// + +class WrapVFRewriter : public VF::WrapVFRewriterBase { +public: + WrapVFRewriter(mlir::MLIRContext* ctx, Logger log): VF::WrapVFRewriterBase(ctx, log) { + } + + bool opNeedsTobeWrapped(VPU::VerticalFusionOpInterface op) const override; +}; +} // namespace vpux::VPU::VF::v1 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp index 7a6f029983..86ea4d60af 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp @@ -36,14 +36,8 @@ class MergeVFRegionRewriter final : public MergeVFRegionBaseRewriter { bool canMergeVFOpsWithoutCostCheck(VFCase& mergedCase) const override; bool canSkipMergeVF(VFConfig& vfConfig, bool opsNeedTiling) const override; VPU::StrategyCost extractVFCost(VFConfig& vfConfig) const override; - std::optional getOptimalTilingStrategy(const IVFSchedulingPtr& scheduling, const Dim dim, - const int64_t minTiles, int64_t& maxTiles, - TilingOperationStorage::UPtr& minStorage, - TilingOperationStorage::UPtr& maxStorage, - VFConfig& config) const override; bool cmxSizeExceedForEltwiseOpWithSwOpUser(VFConfig& currentConfig, ArrayRef parents, Logger log) const; - std::deque getVFSchedulingChecks(VFConfig& config) const override; std::shared_ptr> detectScenario(VFConfig& vfConfig) const override; std::optional findVFTiling(VPU::VerticalFusionOp mergedOp, VPU::VerticalFusionOp prevOp, VPU::VerticalFusionOp currentOp) const override; diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp index cb63d385ad..55fb50da3e 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp @@ -7,6 +7,7 @@ #include "vpux/compiler/core/attributes/dim.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_scheduler_interface.hpp" namespace vpux::VPU::VF::v2 { @@ -21,7 +22,7 @@ class VFCase final { /* Constructor of VF case */ - explicit VFCase(VFConfigType& config, Dim axis); + VFCase(VFConfigType& config, const VFSplit& split); /* Destructor of VF case @@ -49,9 +50,9 @@ class VFCase final { VFCase& operator=(const VFCase& other); /* - Set number of tiles + Set number of tiles for given dimension */ - void setTilingNumber(int64_t number); + void setTilingNumber(Dim dim, int64_t number); /* Set VF scheduling @@ -81,18 +82,13 @@ class VFCase final { /* Generate VF tiling */ - mlir::ArrayAttr getTiling() const; + mlir::ArrayAttr getTiling(); /* Set Scheduling and tiling to VF */ void approveScheduling(); - /* - Get current tiling number - */ - int64_t getTilingNumber() const; - private: /* Add CMX write spills @@ -115,14 +111,9 @@ class VFCase final { VFConfigType _config; /* - Axis for tiling - */ - Dim _axis; - - /* - Number of tiles + VF Split */ - int64_t _tilingNumber = 1; + VFSplit _split; /* VF Scheduling diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp index fcba92e516..82935a8453 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp @@ -17,6 +17,8 @@ class VFConfig final { bool secondVFNeedsTiling = true); ~VFConfig() = default; + VFConfig(const llvm::SetVector& operations); + // get original subgraph VPU::VerticalFusionOp getSubgraph() const; @@ -30,7 +32,7 @@ class VFConfig final { const SmallVector& getOutputs(); // get all oeprations in the subgraph - const SmallVector& getVFOperations(); + const llvm::SetVector& getVFOperations(); // get all oeprations in the subgraph SmallVector getOperationsForTiling(); @@ -54,12 +56,13 @@ class VFConfig final { private: virtual bool isVFPipelinePattern(); + void validateConfig(); VPU::VerticalFusionOp _subgraph; mlir::Operation* _largestOp = nullptr; SmallVector _inputOps; SmallVector _outputOps; - SmallVector _vfOps; + llvm::SetVector _vfOps; bool _isVFPipelineCandidate = false; bool _isPipelineEnabled = false; bool _firstVFNeedsTiling = true; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp index 036c837e9d..4485b662bd 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp @@ -5,10 +5,13 @@ #pragma once +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_scheduler_interface.hpp" namespace vpux::VPU::VF::v2 { +using VFSplit = std::map>; + // check if 
whole operation is in CMX bool isCmxOperation(mlir::Operation* operation, const bool checkTilingType); @@ -18,10 +21,35 @@ bool hasBeforeDDRUsers(mlir::Operation* prevOp, mlir::Operation* nextOp); // Check if the op has multi view op user with shape changed, which will cause the output to be spilled bool hasOutputSpilledForDifferentDataSizeUses(mlir::Operation* op); -// Check if the op's output is tiled on same axis as the disributed output type's tiling axis +// Check if the op's output is tiled on same axis as the distributed output type's tiling axis bool outputTileAxisIsSameAsMultiClusterStrategy(mlir::Operation* op); -// Check if the op's input is tiled on same axis as the disributed input type's tiling axis +// Check if the op's input is tiled on same axis as the distributed input type's tiling axis bool inputTileAxisIsSameAsMultiClusterStrategy(mlir::Operation* op, mlir::Value operand); +// get the maximal valid tiling strategy for VF block between the given range of tiling strategy +mlir::FailureOr> getMaximalValidTilingStrategyFromRange( + VFConfig& config, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, Dim tilingAxis, + TilingOperationStorage::UPtr& opStorage, Logger log); + +// get the minimal valid tiling strategy for VF block between the given range of tiling strategy +mlir::FailureOr> getMinimalValidTilingStrategyFromRange( + VFConfig& config, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, Dim tilingAxis, + TilingOperationStorage::UPtr& opStorage, Logger log); + +// calculate tiling regions based on particular tiling strategy +mlir::FailureOr calculateTilingRegions(VFConfig& config, ArrayRef tilingStrategy, Logger log, + const TilingOperationStorage::UPtr& opStorage); + +// Restore tiling strategy by VF split +SmallVector restoreTilingBySplit(int64_t rank, const VFSplit& split); + +// Return Vf tiling split from strategy +VFSplit getVFTilingSplit(ArrayRef tilingStrategy); + +// Get number of tiles from split +int64_t 
getVFTilesLen(const VFSplit& vfSplit); + +// if the maxTile is too large, return the cbrt of it if it's a valid max tile candidate +std::optional getCbrtMaxTileCandidate(int64_t minTile, int64_t maxTile); } // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vf_tiling_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vf_tiling_rewriter.hpp new file mode 100644 index 0000000000..4ba4d70332 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vf_tiling_rewriter.hpp @@ -0,0 +1,46 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_scheduling_factory.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_tiling_base_rewriter.hpp" +#include "vpux/compiler/dialect/const/dialect.hpp" + +namespace vpux::VPU::VF::v2 { + +// +// VerticalFusionTilingRewriter +// + +typedef std::function TilingFunction; + +class VerticalFusionTilingRewriter : public VerticalFusionTilingRewriterBase { +public: + VerticalFusionTilingRewriter(mlir::MLIRContext* ctx, bool enableVerticalFusionPipelining, + const std::unique_ptr& costFunction, Logger log) + : VerticalFusionTilingRewriterBase(ctx, enableVerticalFusionPipelining, + costFunction, log) { + } + +protected: + std::pair getDimsData(ArrayRef strategy) const override { + int64_t tilesLen = 1; + DimArr dims; + for (auto item : strategy | indexed) { + auto dim = Dim(item.index()); + auto tileSize = item.value(); + if (tileSize > 1) { + dims.push_back(dim); + tilesLen *= tileSize; + } + } + return std::make_pair(dims, 
tilesLen); + } +}; +} // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.hpp new file mode 100644 index 0000000000..8604131f3d --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.hpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp" + +namespace vpux::VPU::VF::v2 { + +// +// WrapVFRewriter +// + +class WrapVFRewriter : public VF::WrapVFRewriterBase { +public: + WrapVFRewriter(mlir::MLIRContext* ctx, Logger log): VF::WrapVFRewriterBase(ctx, log) { + } + + bool opNeedsTobeWrapped(VPU::VerticalFusionOpInterface op) const override; +}; +} // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp new file mode 100644 index 0000000000..815c752aef --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp @@ -0,0 +1,28 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/utils/logger/logger.hpp" + +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_scheduler_interface.hpp" +namespace vpux::VPU::VF::v2 { + +std::deque>> getSchedulingScenarios(VFCase::VFConfigType& config, + Logger log); + +// find optimal VF configuration based on operations merged in VF +// the algorithm searches for optimal tiling axis, tiling number and scheduling +VPU::VF::v2::VFCase getVFCaseWithTiling( + VFCase::VFConfigType& config, Dim dim, const VFSplit& split, + const std::function& minNumCalc, + const std::function& maxNumCalc, Logger log, + const std::deque>>& vfSchedulingChecks); + +} // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_storage.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_storage.hpp index a7ecec9b90..9499948d34 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_storage.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_storage.hpp @@ -24,8 +24,18 @@ namespace VPU { template > class VFContainer { public: + // constructor + VFContainer() = default; + + // constructor with reserved size for each inner container + VFContainer(ArrayRef vfKeys, int64_t reservedSize) { + for (const auto& vfKey : vfKeys) { + vfContainer[vfKey].reserve(reservedSize); + } + } + // connection between number of tile and info - using VFTileContainer = DenseMap; + using VFTileContainer = std::unordered_map; // pointer to container using UPtr = std::unique_ptr>; @@ -42,16 +52,16 @@ class VFContainer { std::optional get(VFKey key, size_t tile); 
// function returns information gathered together for all tiles - std::vector gatherValue(VFKey key); + const VFTileContainer& gatherValue(VFKey key); // get whole inner container - const llvm::DenseMap& getAll() const { + const std::unordered_map& getAll() const { return vfContainer; }; private: // inner container for storage connection - DenseMap vfContainer; + std::unordered_map vfContainer; // comparator for elements of info Compare vfComparator; @@ -59,11 +69,11 @@ class VFContainer { template void vpux::VPU::VFContainer::merge(const VFContainer& src) { - for (auto item : src.getAll()) { + for (auto& item : src.getAll()) { if (vfContainer.count(item.first) == 0) { vfContainer[item.first] = item.second; } else { - for (auto tileItem : item.second) { + for (auto& tileItem : item.second) { insert(item.first, tileItem.first, tileItem.second); } } @@ -72,12 +82,10 @@ void vpux::VPU::VFContainer::merge(const VFContainer void vpux::VPU::VFContainer::insert(VFKey key, size_t tile, const VFValue& src) { - auto foundTileItem = llvm::find_if(vfContainer[key], [&](const auto& i) { - return tile == i.first; - }); - - if (foundTileItem == vfContainer[key].end()) { - vfContainer[key].try_emplace(tile, src); + auto& tileContainer = vfContainer[key]; + auto foundTileItem = tileContainer.find(tile); + if (foundTileItem == tileContainer.end()) { + tileContainer.insert({tile, src}); } else { foundTileItem->second = std::max(foundTileItem->second, src, vfComparator); } @@ -101,10 +109,9 @@ std::optional vpux::VPU::VFContainer::get(VFKe } template -std::vector vpux::VPU::VFContainer::gatherValue(VFKey key) { - return to_std_vector(vfContainer[key] | transformed([](const auto& item) { - return item.second; - })); +const typename vpux::VPU::VFContainer::VFTileContainer& +vpux::VPU::VFContainer::gatherValue(VFKey key) { + return vfContainer[key]; } } // namespace VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp index a177715fd8..074df5bc15 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp @@ -14,6 +14,8 @@ namespace vpux { namespace VPU { +constexpr vpux::StringLiteral isInPlace = "is_inplace"; // inplace attribute name + // min length of tensor by tiled axis. It limits number of tiles // which we may increase in order to fit in CMX constexpr int64_t MINIMUM_LENGTH_TILING = 4; @@ -42,32 +44,23 @@ using VFContainerPipelineStorage = VFContainer; // for each VF tile TilingStorage restoreTilingRegions(VPU::VerticalFusionOp vfOp, Logger log, const TilingOperationStorage::UPtr& opStorage); -// calculate tiling regions based on particular tiling strategy -mlir::FailureOr calculateTilingRegions(VPU::VerticalFusionOp vfOp, ArrayRef tilingStrategy, - Logger log, const TilingOperationStorage::UPtr& opStorage); -// calculate tiling regions based on known output tiles for last operation in the block -mlir::FailureOr calculateTilingRegions(VPU::VerticalFusionOp vfOp, const OutputTiling& tiles, Logger log, - const TilingOperationStorage::UPtr& opStorage); + // calculate recursively tiling regions for the block starting from last operation and known output tiles for it // function builds connection between block arguments and tiles -// in case TilingOperationStorage pointer was passed, it filles in connection between each operation and +// in case TilingOperationStorage pointer was passed, it fills in connection between each operation and // its input and output tiles mlir::FailureOr calculateTilingRegions(mlir::Operation* operation, const OutputTiling& tiles, Logger log, const TilingOperationStorage::UPtr& opStorage, - std::optional numTile = std::nullopt); + const llvm::SetVector& fusedOps = {}); -// calculate limit for 
number of tiles for set of operations -int64_t getTilingLimit(Dim axis, ArrayRef operations); +mlir::FailureOr calculateTilingRegions(VPU::VerticalFusionOp vfOp, const OutputTiling& tiles, Logger log, + const TilingOperationStorage::UPtr& opStorage); -// get the maximal valid tiling strategy for VF block between the given range of tiling strategy -mlir::FailureOr> getMaximalValidTilingStrategyFromRange( - VPU::VerticalFusionOp op, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, - Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log); +mlir::FailureOr calculateTilingRegions(VPU::VerticalFusionOp vfOp, ArrayRef tilingStrategy, + Logger log, const TilingOperationStorage::UPtr& opStorage); -// get the minimal valid tiling strategy for VF block between the given range of tiling strategy -mlir::FailureOr> getMinimalValidTilingStrategyFromRange( - VPU::VerticalFusionOp op, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, - Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log); +// calculate limit for number of tiles for set of operations +int64_t getTilingLimit(Dim axis, ArrayRef operations, bool tilingOnHW = false); // get the tiling dimension according to the tiling strategy // return nullopt if there is no tiling @@ -116,7 +109,7 @@ mlir::FailureOr> backInferVFTiling( // Check if spilling read and write operations can be overlapped // For DMA ops with different source memory kind, if the HW supports VPUIP.ChannelType, the spilling read and write ops // can be overlapped -bool spillingCopyOpsCanBeOverlapped(VPU::ArchKind arch); +bool spillingCopyOpsCanBeOverlapped(config::ArchKind arch); // Check if the op is tiled or not bool isOpTiled(mlir::Operation* op); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_tiling_base_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_tiling_base_rewriter.hpp new file mode 100644 index 
0000000000..ceb6ed9d1e --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_tiling_base_rewriter.hpp @@ -0,0 +1,373 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" + +namespace vpux::VPU { + +// +// VerticalFusionTilingRewriter +// + +typedef std::function TilingFunction; + +template +class VerticalFusionTilingRewriterBase : public mlir::OpRewritePattern { +public: + VerticalFusionTilingRewriterBase(mlir::MLIRContext* ctx, bool enableVerticalFusionPipelining, + const std::unique_ptr& costFunction, Logger log) + : mlir::OpRewritePattern(ctx), + _enableVerticalFusionPipelining(enableVerticalFusionPipelining), + _vpunnCostFunction(costFunction), + _log(log) { + } + + mlir::LogicalResult matchAndRewrite(VPU::VerticalFusionOp origOp, mlir::PatternRewriter& rewriter) const final; + +protected: + virtual std::pair getDimsData(ArrayRef strategy) const = 0; + +private: + void adjustInputShape(mlir::PatternRewriter& rewriter, mlir::Operation* operation, InputTiling& inputTiling, + mlir::IRMapping& mapper, TilingStorage& tilingStorage, + const TilingOperationStorage::UPtr& opStorage, int64_t tilingIndex, DimArrRef dims) const; + void processOffset(mlir::Value operand, const TilingOperationStorage::UPtr& opStorage, TileInfo& originalTiling, + int64_t tilingIndex, DimArrRef dims, ShapeRef expectedShape) const; + bool processBlockArgument(mlir::BlockArgument blockArg, TilingStorage& tilingStorage, TileInfo& originalTiling, + int64_t tilingIndex, DimArrRef dims) const; + void applyLinearTiling(const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, + SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure) const; + void applyPipelinedTiling(const int64_t numTiles, VFConfigType& config, SmallVector& 
resultTileVals, + SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure, + const TilingOperationStorage::UPtr& storage) const; + + bool _enableVerticalFusionPipelining; + const std::unique_ptr& _vpunnCostFunction; + Logger _log; +}; + +template +bool VerticalFusionTilingRewriterBase::processBlockArgument( + mlir::BlockArgument blockArg, TilingStorage& tilingStorage, TileInfo& originalTiling, int64_t tilingIndex, + DimArrRef dims) const { + auto& offset = originalTiling.offsets; + const auto storageInfo = tilingStorage.get(blockArg.getArgNumber(), tilingIndex); + VPUX_THROW_WHEN(!storageInfo.has_value(), "Tiling info for argument {0} with index {1} not found", blockArg, + tilingIndex); + + auto tileInfo = storageInfo.value(); + VPUX_THROW_UNLESS(dims.size() < tileInfo.shape.size(), "Got invalid tiling shape size {0}", tileInfo.shape.size()); + + const auto inputOffset = tileInfo.offsets; + const auto inputDimShape = tileInfo.shape; + const auto origDimSize = originalTiling.shape; + + _log.trace("Input Offset {0}, shape {1} ==> offset: {2}, shape: {3} ", inputOffset, inputDimShape, offset, + origDimSize); + + for (auto dim : dims) { + if (offset[dim] >= inputOffset[dim] && + (inputOffset[dim] + inputDimShape[dim]) >= (offset[dim] + origDimSize[dim])) { + offset[dim] -= inputOffset[dim]; + continue; + } + _log.trace("invalid offsets: Input Offset {0}, shape {1} ==> offset: {2}, shape: {3} ", inputOffset, + inputDimShape, offset, origDimSize); + return false; + } + + return true; +} + +template +void VerticalFusionTilingRewriterBase::processOffset( + mlir::Value operand, const TilingOperationStorage::UPtr& opStorage, TileInfo& originalTiling, + int64_t tilingIndex, DimArrRef dims, ShapeRef expectedShape) const { + auto& offset = originalTiling.offsets; + auto offsetEqualsToZero = llvm::all_of(dims, [&](Dim dim) { + return offset[dim] == 0; + }); + + if (offsetEqualsToZero) { + return; + } + + auto operandOp = operand.getDefiningOp(); + if (operandOp != 
nullptr) { + auto inputOutputTiling = opStorage->get(operandOp, tilingIndex); + VPUX_THROW_UNLESS(inputOutputTiling.has_value(), "Couldn't find tiling info at {0}", operandOp->getLoc()); + const auto inputOutputTilingPair = inputOutputTiling.value(); + auto& outTile = inputOutputTilingPair.second; + for (auto dim : dims) { + offset[dim] -= outTile.offsets[dim]; + } + return; + } + + for (auto dim : dims) { + offset[dim] = expectedShape[dim] - originalTiling.shape[dim]; + } +} + +/* + This function slice to original tile shape in case bigger tile size was chosen + during backpropagation process. + In this case adjust shapes to original one by slicing +*/ +template +void VerticalFusionTilingRewriterBase::adjustInputShape( + mlir::PatternRewriter& rewriter, mlir::Operation* operation, InputTiling& inputTiling, mlir::IRMapping& mapper, + TilingStorage& tilingStorage, const TilingOperationStorage::UPtr& opStorage, int64_t tilingIndex, + DimArrRef dims) const { + VPUX_THROW_WHEN(inputTiling.tiles.size() < operation->getOperands().size(), + "Number of operands {0} is more than number of operand tiles {1}", operation->getOperands().size(), + inputTiling.tiles.size()); + for (auto op : operation->getOperands() | indexed) { + auto operand = op.value(); + auto opIndex = op.index(); + + auto expectedOp = mapper.lookupOrNull(operand); + if (expectedOp == nullptr) { + continue; + } + + auto originalTiling = inputTiling.tiles[opIndex]; + auto expectedShape = getShape(expectedOp); + auto expectedOpSize = expectedShape.totalSize(); + const auto originalOpSize = originalTiling.shape.totalSize(); + if (expectedOpSize == originalOpSize) { + continue; + } + + // + // For below pattern, the Eltwise3 may be tiled before the Eltwise2. + // Then the Operand has been mapped to the new "SliceOp1" instead of "Eltwise1". + // While tiling "Eltwise2", it throw exception of "expectedOpSize < originalOpSize". + // Need to update this branch operand for this case. 
+ // + // VF tilingStrategy: [1, 1, 1, 4] + // | | + // Eltwise1: 1x64x72x128 Conv: 1x64x72x128 + // | X | + // Eltwise2: 1x64x72x128 Eltwise3: 1x64x72x128 + // | | + // Conv: 1x64x72x128 | + // | | + // Conv: 1x64x72x128 | + // \ / + // Eltwise4: 1x64x72x128 + // | + // + // tiling into: + // + // | | + // Eltwise1: 1x64x72x36 Conv: 1x64x72x36 + // | X | + // | / SliceOp1 SliceOp2 + // | / \ | + // Eltwise2: 1x64x72x36 Eltwise3: 1x64x72x32 + // | | + // Conv: 1x64x72x34 | + // | | + // Conv: 1x64x72x32 | + // \ / + // Eltwise4: 1x64x72x32 + // | + if (expectedOpSize < originalOpSize) { + if (auto insertSliceOp = mlir::dyn_cast(expectedOp.getDefiningOp())) { + expectedOp = insertSliceOp.getInputs().front(); + expectedShape = getShape(expectedOp); + expectedOpSize = expectedShape.totalSize(); + } + } + + VPUX_THROW_WHEN( + expectedOpSize < originalOpSize, + "Original shape size for operand {0} is bigger than current one. Current size {1}, original size {2}", + operand, expectedOpSize, originalOpSize); + + VPUX_THROW_WHEN(expectedShape.size() != originalTiling.shape.size(), + "Expected shape {0} and original one {1} must have same rank", expectedShape, + originalTiling.shape); + + // correct offset of operations based on offsets of block argument + // In case the output of previous operation is bigger than expected + // which might happen when bigger tile was chosen for same block argument + // slice operation is needed after the output with correct offsets + // calculated based on tiling information of current operation and previous one + _log.trace("op {0}, Offset before {1}, shape {2}", operation->getLoc(), originalTiling.offsets, + originalTiling.shape); + + mlir::Value opSlice; + const auto valName = printToString("input {0}", opIndex); + auto blockArg = mlir::dyn_cast(operand); + if (blockArg != nullptr) { + if (!processBlockArgument(blockArg, tilingStorage, originalTiling, tilingIndex, dims)) { + auto sliceOp = 
mlir::dyn_cast_or_null(expectedOp.getDefiningOp()); + VPUX_THROW_WHEN(sliceOp == nullptr || sliceOp.getSource() == operand, + "Can't get the operand from Slice"); + + auto inputOutputTiling = opStorage->get(operation, tilingIndex); + VPUX_THROW_UNLESS(inputOutputTiling.has_value(), "Couldn't find tiling info at {0}", + operation->getLoc()); + + const auto inputTiling = inputOutputTiling.value().first.tiles[blockArg.getArgNumber()]; + opSlice = makeTile(rewriter, operation->getLoc(), sliceOp.getSource(), inputTiling, valName); + } else { + opSlice = makeTile(rewriter, operation->getLoc(), expectedOp, originalTiling, valName); + } + } else { + processOffset(operand, opStorage, originalTiling, tilingIndex, dims, expectedShape); + if (auto sliceOp = mlir::dyn_cast_or_null(expectedOp.getDefiningOp())) { + // correct offsets + for (auto axis : dims) { + auto sliceOffset = parseIntArrayAttr(sliceOp.getStaticOffsets()); + VPUX_THROW_UNLESS(originalTiling.offsets[axis] >= sliceOffset[axis.ind()], + "Slice offset {0} is bigger than original one {1}", sliceOffset[axis.ind()], + originalTiling.offsets[axis]); + originalTiling.offsets[axis] = originalTiling.offsets[axis] - sliceOffset[axis.ind()]; + } + } + opSlice = makeTile(rewriter, operation->getLoc(), expectedOp, originalTiling, valName); + } + + _log.trace("Offset after {0}, shape {1} expectedOp {2}", originalTiling.offsets, originalTiling.shape, + expectedOp); + + mapper.map(operand, opSlice); + } +} + +template +void VerticalFusionTilingRewriterBase::applyLinearTiling( + const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, + SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure) const { + auto operations = config.getVFOperations(); + + for (auto index : irange(numTiles)) { + mlir::Value currentResult; + Shape currentTile; + for (auto* op : operations) { + tilingProcedure(index, op, currentResult, currentTile); + } + + resultTileVals.push_back(currentResult); + 
resultTileOffsets.push_back(currentTile); + } +} + +template +void VerticalFusionTilingRewriterBase::applyPipelinedTiling( + const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, + SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure, + const TilingOperationStorage::UPtr& storage) const { + auto scheduling = config.getSubgraph().getScenario(); + VPUX_THROW_WHEN(!scheduling.has_value(), "Cannot get scheduling scenario from VF {0}", config.getSubgraph()); + + VFSchedulingFactoryType costFactory(/*prefetching=*/true); + auto scenario = costFactory.createVFScenario(scheduling.value(), _log); + + if (auto pipelinedScenario = std::dynamic_pointer_cast>(scenario)) { + auto pipelining = pipelinedScenario->getPipelining(config, numTiles, storage, _vpunnCostFunction); + auto timeline = pipelining.getTimeLine(); + if (!timeline.empty()) { + mlir::Value currentResult; + Shape currentTile; + for (auto& [index, operation] : pipelining.getTimeLine()) { + // currentResult and currentTiles keep result from previous call tilingProcedure + tilingProcedure(index, operation, currentResult, currentTile); + + if (llvm::find(config.getOutputs(), operation) != config.getOutputs().end()) { + resultTileVals.push_back(currentResult); + resultTileOffsets.push_back(currentTile); + } + } + return; + } + } + applyLinearTiling(numTiles, config, resultTileVals, resultTileOffsets, tilingProcedure); +} + +template +mlir::LogicalResult VerticalFusionTilingRewriterBase::matchAndRewrite( + VPU::VerticalFusionOp vfOp, mlir::PatternRewriter& rewriter) const { + const auto tilingStrategy = parseIntArrayAttr(mlir::cast(vfOp.getTilingStrategy())); + + DimArr dims; + int64_t tilesLen = 0; + std::tie(dims, tilesLen) = getDimsData(tilingStrategy); + + if (tilesLen <= 1) { + return mlir::failure(); + } + + auto operationStorage = std::make_unique(); + auto tilingStorage = restoreTilingRegions(vfOp, _log, operationStorage); + + VFConfigType vfConfig(vfOp, 
_enableVerticalFusionPipelining); + + SmallVector resultTileVals; + resultTileVals.reserve(tilesLen); + SmallVector resultTileOffsets; + DenseMap mappers; + + const auto tilingProcedure = [&](int64_t index, mlir::Operation* op, mlir::Value& currentResult, + Shape& currentTile) { + auto& mapper = mappers[index]; + for (auto operand : op->getOperands()) { + if (auto blockArg = mlir::dyn_cast(operand)) { + const auto valName = printToString("ba_input {0}", index); + auto origInput = vfOp.getOperand(blockArg.getArgNumber()); + auto tileInfo = tilingStorage.get(blockArg.getArgNumber(), index); + + VPUX_THROW_WHEN(!tileInfo.has_value(), "Couldn't find tile information for argument {0} and tile {1}", + blockArg.getArgNumber(), index); + auto operandTile = VPU::makeTile(rewriter, op->getLoc(), origInput, tileInfo.value(), valName); + + mapper.map(operand, operandTile); + } + } + + auto inputTiling = operationStorage->get(op, index); + + VPUX_THROW_WHEN(!inputTiling.has_value(), "Couldn't find tile information for operation {0} and tile {1}", *op, + index); + + const auto inputTilingPair = inputTiling.value(); + auto inputTilingInfo = inputTilingPair.first; + adjustInputShape(rewriter, op, inputTilingInfo, mapper, tilingStorage, operationStorage, index, dims); + + auto* copiedOp = rewriter.clone(*op, mapper); + currentResult = copiedOp->getResult(0); + + currentTile = inputTilingPair.second.offsets; + const auto baseResType = mlir::cast(op->getResult(0).getType()); + if (auto tiledBuilderOp = mlir::dyn_cast(copiedOp)) { + tiledBuilderOp.adjustAttrs(inputTilingInfo, inputTilingPair.second); + } else if (auto tiledViewOp = mlir::dyn_cast(copiedOp)) { + tiledViewOp.adjustAttrs(inputTilingInfo, inputTilingPair.second, baseResType.getShape()); + } + const auto tiledResType = + baseResType.extractDenseTile(inputTilingPair.second.offsets, inputTilingPair.second.shape); + + currentResult.setType(tiledResType); + mapper.map(op->getResult(0), currentResult); + }; + + if 
(vfConfig.isPipelined()) { + applyPipelinedTiling(tilesLen, vfConfig, resultTileVals, resultTileOffsets, tilingProcedure, operationStorage); + } else { + applyLinearTiling(tilesLen, vfConfig, resultTileVals, resultTileOffsets, tilingProcedure); + } + + rewriter.replaceOpWithNewOp(vfOp, vfOp->getResult(0).getType(), mlir::ValueRange(resultTileVals), + ArrayRef(resultTileOffsets)); + + return mlir::success(); +} + +} // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp new file mode 100644 index 0000000000..179cf5ff71 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp @@ -0,0 +1,32 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" + +namespace vpux::VPU::VF { + +// +// WrapVFRewriter +// + +class WrapVFRewriterBase : public mlir::OpInterfaceRewritePattern { +public: + WrapVFRewriterBase(mlir::MLIRContext* ctx, Logger log) + : mlir::OpInterfaceRewritePattern(ctx), _log(log) { + } + + mlir::LogicalResult matchAndRewrite(VPU::VerticalFusionOpInterface origOp, + mlir::PatternRewriter& rewriter) const final; + + virtual void wrapIntoVFRegion(VPU::VerticalFusionOpInterface op, mlir::PatternRewriter& rewriter) const; + + virtual bool opNeedsTobeWrapped(VPU::VerticalFusionOpInterface op) const = 0; + +protected: + Logger _log; +}; +} // namespace vpux::VPU::VF diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/weights_separation.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/weights_separation.hpp index c3e90e4535..b5d85829c4 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/weights_separation.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/weights_separation.hpp @@ -14,6 +14,13 @@ namespace vpux::VPU { +/** @brief Internal prefix used for Init output + + This prefix is part of the contract between the compiler and the plugin in the context of weights separation. + Be careful when changing it. "tw" stands for "transformed weights" +*/ +constexpr const char* INIT_OUTPUT_PREFIX = "vpux_tw_"; + using MemPermuteConversionAttributes = std::tuple; @@ -154,6 +161,18 @@ namespace detail { vpux::Byte getResultBufferSizeForInit(const TransformationsSplit& x); } // namespace detail +//! @brief Specifies whether a given constant is "trivial" within the scope of +//! weights separation (e.g. has only view-like transformations). +//! +//! @note Used by isSuitableForWeightsSeparation(). +bool isTrivialForWeightsSeparation(Const::DeclareOp constOp); + +//! @brief Specifies whether a given constant is "move-worthy" within the scope +//! of weights separation. +//! +//! @note Used internally in collectMoveWorthyConstants(). +bool isSuitableForWeightsSeparation(Const::DeclareOp constOp); + //! @brief Collects all constant operations that are worth moving to the Init //! schedule. 
std::vector collectMoveWorthyConstants(const Logger& log, mlir::func::FuncOp mainFunc); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp index add9fa395f..19feec6d76 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp @@ -21,7 +21,7 @@ constexpr StringRef METADATA_MAX_KERNEL_INVOCATION_COUNT = "VPU.MetadataMaxKerne constexpr StringRef METADATA_MAX_KERNEL_RANGE_COUNT = "VPU.MetadataMaxKernelRangeCount"; constexpr StringRef METADATA_MAX_MEDIA_COUNT = "VPU.MetadataMaxMediaCount"; -uint32_t getDefaultTaskListCount(VPU::TaskType taskType, VPU::ArchKind archKind); +uint32_t getDefaultTaskListCount(VPU::TaskType taskType, config::ArchKind archKind); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp new file mode 100644 index 0000000000..a7449577af --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp @@ -0,0 +1,20 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/utils/options.hpp" +#include "vpux/utils/core/string_ref.hpp" + +namespace vpux::VPU { + +constexpr StringRef WORKLOAD_MANAGEMENT_STATUS = "VPU.WorkloadManagementStatus"; + +WorkloadManagementStatus getWorkloadManagementStatus(mlir::ModuleOp module); +void setWorkloadManagementStatus(mlir::ModuleOp module, WorkloadManagementStatus value); + +} // namespace vpux::VPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp index 4598fd7a20..dea37ad9e4 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp @@ -14,7 +14,7 @@ namespace vpux::VPU { * Split pattern is decided by VPUNN L1 API cost */ mlir::LogicalResult genericNCEWorkloadSplit(VPU::NCEOpInterface nceOp, mlir::PatternRewriter& rewriter, - VPU::ArchKind arch, int64_t numDPU, + config::ArchKind arch, int64_t numDPU, std::shared_ptr costModel, Logger log); /** diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUASM/ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUASM/ops.hpp index d394aee031..69d2b5577a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUASM/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUASM/ops.hpp @@ -5,15 +5,13 @@ #pragma once +#include "vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUASM/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUASM/types.hpp" - -#include "vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp" - +#include "vpux/compiler/dialect/VPUMI40XX/attributes.hpp" #include "vpux/compiler/dialect/VPURegMapped/attributes.hpp" #include "vpux/compiler/dialect/VPURegMapped/types.hpp" 
- -#include "vpux/compiler/dialect/VPUMI40XX/attributes.hpp" +#include "vpux/compiler/dialect/const/attributes/attributes.hpp" // // Generated diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/attributes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/attributes.hpp index acefab3a5e..62f2d56d40 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/attributes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/attributes.hpp @@ -12,6 +12,10 @@ #include +namespace vpux::VPU { +class ExecutorKindAttr; +} // namespace vpux::VPU + // // Generated // diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops.hpp index 7cdc7e2a2b..7fdf4e7574 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops.hpp @@ -5,6 +5,7 @@ #pragma once +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" @@ -12,6 +13,7 @@ #include "vpux/compiler/utils/types.hpp" #include +#include #include #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp index 3083c563ca..bada7eb707 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp @@ -5,12 +5,9 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" +#include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" -#include 
"vpux/compiler/utils/attributes.hpp" - +#include "vpux/compiler/dialect/core/IR/indexed_symbol_attr.hpp" #include "vpux/utils/core/small_string.hpp" #include "vpux/utils/core/small_vector.hpp" @@ -20,6 +17,13 @@ #include #include +namespace vpux::IE { +class LayerLayoutInfo; +} // namespace vpux::IE +namespace vpux::VPU { +enum class ExecutorKind : uint64_t; +} // namespace vpux::VPU + namespace vpux { namespace VPUIP { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/impl/capture_workpoint_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/impl/capture_workpoint_strategy.hpp index 9c8465f565..f304971ddd 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/impl/capture_workpoint_strategy.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/impl/capture_workpoint_strategy.hpp @@ -7,6 +7,8 @@ #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include + namespace vpux::VPUIP { class ICaptureWorkpointStrategy { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp new file mode 100644 index 0000000000..c8c864c031 --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp @@ -0,0 +1,48 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "vpux/compiler/dialect/VPU/utils/sprlut_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/logger/logger.hpp" + +namespace vpux { +namespace VPUIP { + +// +// LUTConverterBase +// + +class LUTConverterBase : public mlir::OpRewritePattern { +public: + LUTConverterBase(mlir::MLIRContext* ctx, Logger log, mlir::func::FuncOp netFunc) + : mlir::OpRewritePattern(ctx), _log(log), _netFunc(netFunc) { + } + + mlir::LogicalResult matchAndRewrite(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const final; + +protected: + virtual mlir::Value createLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const = 0; + mlir::Value createCopyDestination(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value LUTConst, + mlir::PatternRewriter& rewriter) const; + VPU::DistributionInfoAttr createDistributionInfoAttr(VPUIP::DistributedBufferType inputDistribType, + VPUIP::NCEClusterTaskOp nceClusterTask) const; + VPUIP::DistributedBufferType createDistributedBufferType(VPU::DistributionInfoAttr distributedInfo, + VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::Value LUTConst) const; + virtual void replaceWithConstInput(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value lutNceInput, + mlir::PatternRewriter& rewriter) const = 0; + + Logger _log; + mutable mlir::func::FuncOp _netFunc; +}; + +} // namespace VPUIP +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp index 15d9db1029..d0f0daaeac 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp @@ -8,15 +8,16 @@ #include 
"vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" #include #include #include +namespace vpux::VPUIP { +enum class NCETaskType : uint64_t; +} // namespace vpux::VPUIP + namespace vpux { namespace VPUIP { @@ -32,7 +33,7 @@ struct WorkloadCostParams { std::optional weightsDataType = std::nullopt; DimsOrder inOrder; DimsOrder outOrder; - VPU::ArchKind arch; + config::ArchKind arch; Shape fullInputShape; Shape inputShape; Shape outputShape; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp index beb97f3b67..0fa1d8532f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp @@ -5,12 +5,15 @@ #pragma once -#include -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include +#include + namespace vpux { namespace VPUIP { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.hpp index 73b9a1595c..7fd358d21f 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.hpp @@ -9,6 +9,6 @@ namespace vpux::VPUIP { -std::unique_ptr createCaptureWorkpointStrategy(VPU::ArchKind arch); +std::unique_ptr createCaptureWorkpointStrategy(config::ArchKind arch); } // namespace vpux::VPUIP diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/profiling_info.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/profiling_info.hpp index 4a34efe5d7..f499d8cf01 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/profiling_info.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/profiling_info.hpp @@ -14,7 +14,7 @@ namespace vpux::VPUIP { using TimestampTypeCb = mlir::Type (*)(mlir::MLIRContext* ctx); using SetWorkloadIdsCb = void (*)(VPUIP::NCEClusterTaskOp nceClusterTaskOp); -TimestampTypeCb getTimestampTypeCb(VPU::ArchKind arch); -SetWorkloadIdsCb setWorkloadsIdsCb(VPU::ArchKind arch); +TimestampTypeCb getTimestampTypeCb(config::ArchKind arch); +SetWorkloadIdsCb setWorkloadsIdsCb(config::ArchKind arch); } // namespace vpux::VPUIP diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp index d545ba8755..1efb84d118 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp @@ -12,6 +12,6 @@ namespace vpux::VPUIP { using SplitCostCb = int64_t (*)(const VPUIP::WorkloadSplit&, const VPUIP::WorkloadCostParams&, VPUNN::VPUCostModel&, LogCb); -SplitCostCb getSplitCostCb(VPU::ArchKind arch); +SplitCostCb getSplitCostCb(config::ArchKind arch); } // namespace vpux::VPUIP diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes.hpp index e35363f80c..13ce3bf4c7 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes.hpp @@ -6,7 +6,6 @@ #pragma once #include "vpux/compiler/core/pipelines_options.hpp" -#include "vpux/compiler/core/profiling.hpp" #include "vpux/utils/logger/logger.hpp" @@ -18,6 +17,10 @@ #include #include +namespace vpux::VPU { +enum class MemoryKind : uint64_t; +} + namespace vpux { namespace VPUIP { @@ -61,16 +64,14 @@ std::unique_ptr createCopyOpTilingPass(Logger log = Logger::global() std::unique_ptr createSetMemorySpacePass(MemKindCreateFunc memKindCb, Logger log = Logger::global()); std::unique_ptr createConvertEltwiseToInPlacePass(Logger log = Logger::global()); std::unique_ptr createConvertSprLUTToConstPass(Logger log = Logger::global()); +std::unique_ptr createConvertPalletLUTToConstPass(Logger log = Logger::global()); std::unique_ptr createConvertDynamicReshapeToInPlacePass(Logger log = Logger::global()); std::unique_ptr createInsertCopyForEltwiseInPlaceInputPass(Logger log = Logger::global()); std::unique_ptr createLinearizationPass(Logger log = Logger::global()); std::unique_ptr createBreakDataFlowPass(Logger log = Logger::global()); std::unique_ptr createPatchWeightsTablePass(Logger log = Logger::global()); std::unique_ptr createPatchPopulateWeightTableWithShavePass(Logger log = Logger::global()); -std::unique_ptr createDMATaskProfilingReserveMemPass( - DMAProfilingMode dmaProfilingMode = DMAProfilingMode::DISABLED, Logger log = Logger::global()); -std::unique_ptr createDMATaskProfilingAfterBarrierSchedPass( - DMAProfilingMode dmaProfilingMode = DMAProfilingMode::DISABLED, Logger log = Logger::global()); +std::unique_ptr createDMATaskProfilingAfterBarrierSchedPass(Logger log = Logger::global()); 
std::unique_ptr createCaptureWorkpointPass(Logger log = Logger::global()); std::unique_ptr createDPUProfilingPass(MemKindCreateFunc memKindCb, Logger log = Logger::global()); std::unique_ptr createM2IProfilingPass(Logger log = Logger::global()); @@ -99,8 +100,6 @@ std::unique_ptr createTileActShaveKernelTaskPass(Logger log = Logger std::unique_ptr createSetZeroOffsetWeightsTablePass(Logger log = Logger::global()); std::unique_ptr createSegmentHalosPass(Logger log = Logger::global()); std::unique_ptr createAdjustSpillSizePass(Logger log = Logger::global()); -std::unique_ptr createCompressDmaReserveMemPass(Logger log = Logger::global()); -std::unique_ptr createSWKernelPrefetchingReserveMemPass(Logger log = Logger::global()); std::unique_ptr createFuseDDRCopiesIntoConcats(Logger log = Logger::global()); std::unique_ptr createLegalizeRepeatingFuncCallsPass(Logger log = Logger::global()); @@ -163,10 +162,11 @@ std::unique_ptr createBatchMatMulToMatMulPass(Logger log = Logger::g std::unique_ptr createUnrollDMAAnalysisPass(Logger log = Logger::global()); std::unique_ptr createUnrollUpsamplingDMAPass(Logger log = Logger::global()); -std::unique_ptr createUnrollPermuteToNNDMAPass(Logger log = Logger::global()); + std::unique_ptr createUnrollExpandDMAPass(Logger log = Logger::global()); std::unique_ptr createUnrollPerAxisTileDMAPass(Logger log = Logger::global()); std::unique_ptr createInvalidateUnrollDMAAnalysisPass(Logger log = Logger::global()); +std::unique_ptr createUnrollGatherDMAPass(Logger log = Logger::global()); // // DefaultHWOptions(for all devices) diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp index 930a4c5e45..6cd3bc5359 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp +++ 
b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp @@ -7,9 +7,17 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" -#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/logger/logger.hpp" +#include + +namespace vpux::Const { +class DeclareOp; +} // namespace vpux::Const +namespace vpux::VPURT { +class TaskOp; +} // namespace vpux::VPURT + namespace vpux { namespace VPUIP { @@ -122,5 +130,9 @@ class ClusterDMARewriter final : public ClusterPerElementDMABaseRewriter { void unrollDistributedOpsCommon40XXPlus(mlir::func::FuncOp funcOp, std::optional maybeDmaFusionHandler, vpux::Logger log); +mlir::Value patchSETableValue(mlir::Location loc, Const::DeclareOp constOp, + VPUIP::DistributedBufferType nceInputDistType, const int64_t targetClusterId, + mlir::OpBuilder& builder); + } // namespace VPUIP } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp index 6505638062..d208d70fef 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp @@ -5,12 +5,24 @@ #pragma once +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" + #include #include -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +namespace vpux::IE { +class ChannelPaddingAttr; +class DepthToSpaceModeAttr; +class SpaceToDepthModeAttr; +} // namespace vpux::IE +namespace vpux::VPU { +class DepthToSpaceOp; +} // namespace vpux::VPU +namespace vpux::VPUIP { +class DMADescriptorAttr; +class DistributedBufferType; +class SwKernelOp; +} // namespace vpux::VPUIP namespace vpux { namespace VPUIP { @@ -22,7 +34,7 @@ std::optional 
getPermuteDMAInputShape(NDTypeInterface inType, NDTypeInter vpux::Logger log); std::optional getPermuteDMAOutputShape(NDTypeInterface inType, NDTypeInterface outType, mlir::AffineMap memPerm, vpux::Logger log); -std::optional> getPermuteDMASubInputShapes(VPU::ArchKind arch, NDTypeInterface inType, +std::optional> getPermuteDMASubInputShapes(config::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType, mlir::AffineMap memPerm, int64_t dmaPortCount, vpux::Logger log); SmallVector getPermuteDMASubOutputShapes(SmallVector subInputShapes, @@ -53,14 +65,14 @@ bool doesPermuteDMATileDimSupportWrapInCluster(vpux::NDTypeInterface inputType, std::optional getMemPermFromSwKernel(VPUIP::SwKernelOp swKernelTask); // Check if MemPermute satisfies the condition of optimal SW implementation -bool satisfiesOptimizedMemPermute(VPU::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType); +bool satisfiesOptimizedMemPermute(config::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType); /** * Cost function to evaluate whether it's beneficial to implement the operation using DMA for * operations like MemPermute. * @return true if it's beneficial for using DMA, otherwise false. 
*/ -bool isBeneficialForUsingPermuteDMA(VPU::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType, +bool isBeneficialForUsingPermuteDMA(config::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType, mlir::AffineMap memPerm, int64_t dmaPortCount, vpux::Logger log); bool isMemPermSwKernel(VPUIP::SwKernelOp swKernelTask); @@ -92,7 +104,7 @@ std::optional getPerAxisTileSwKernelAttr(VPUIP::SwKernel std::pair getPerAxisTileDMAMergedShape(vpux::NDTypeInterface inType, vpux::NDTypeInterface outType, int64_t axis, int64_t tiles); -SmallVector getPerAxisTileDMASubShapes(VPU::ArchKind arch, vpux::ShapeRef shape); +SmallVector getPerAxisTileDMASubShapes(config::ArchKind arch, vpux::ShapeRef shape); // Public interface bool doesSWLayerFitIntoCMX(mlir::Operation* op, vpux::Logger log); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp index 0bd91ad484..c95ea1ef1c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp @@ -5,11 +5,32 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/utils/core/array_ref.hpp" +#include "vpux/utils/core/mem_size.hpp" #include "vpux/utils/core/small_string.hpp" +#include "vpux/utils/core/small_vector.hpp" #include "vpux/utils/logger/logger.hpp" +#include +#include + +namespace vpux::config { +enum class ArchKind : uint64_t; +} // namespace vpux::config +namespace vpux::VPU { +class LayerOpInterface; +} // namespace vpux::VPU +namespace vpux::VPUIP { +struct KernelInfo; +class SwKernelOp; +class SwKernelRun; +} // namespace vpux::VPUIP +namespace vpux::VPURT { +class TaskOp; +} // namespace vpux::VPURT + namespace vpux { 
namespace VPUIP { @@ -101,8 +122,9 @@ const SmallVector SW_KERNELS_SUPPORTING_TILING = {"mvn1", const SmallVector SW_KERNELS_SUPPORTING_STRIDE = {"mvn1", "lstm_cell", "lstm_sequence", "reorder"}; const SmallVector SW_KERNELS_SUPPORTING_SHAVE_BALANCING = { - "softmax", "eltwise_mul", "activation_sin", "activation_cos", "activation_swish", "activation_clamp", - "eltwise_min", "eltwise_max", "round_fp16", "activation_exp", "prelu_fp16", "eltwise_logical_not"}; + "softmax", "eltwise_mul", "activation_sin", "activation_cos", "activation_swish", + "activation_clamp", "convert", "eltwise_min", "eltwise_max", "round_fp16", + "activation_exp", "prelu_fp16", "eltwise_logical_not"}; const SmallVector SW_KERNELS_LAYOUT_AGNOSTIC = { "activation_swish", "activation_gelu", "activation_hswish", "activation_hardsigmoid", @@ -173,11 +195,10 @@ mlir::SymbolRefAttr createBuiltInFunction(mlir::ModuleOp module, VPU::LayerOpInt ArrayRef operands, ArrayRef results, const VPUIP::KernelInfo& kernelInfo, const Logger& log); -void createRuntimeKernelDefinition(mlir::ModuleOp module, const Logger& log, vpux::VPU::ArchKind arch); +void createRuntimeKernelDefinition(mlir::ModuleOp module, const Logger& log, vpux::config::ArchKind arch); void initSwKernel(vpux::VPUIP::SwKernelOp swKernelOp, mlir::ValueRange inputs, mlir::ValueRange outputBuffs, - mlir::ArrayRef args, const vpux::Logger& log, - VPUIP::SwKernelRun swKernelRunOp = nullptr); + mlir::ArrayRef args, const vpux::Logger& log, VPUIP::SwKernelRun swKernelRunOp); void initSwKernel(VPUIP::SwKernelOp swKernelOp, VPUIP::SwKernelRun swKernelRunOp, const vpux::Logger& log); @@ -204,6 +225,10 @@ SmallVector getSwkernelNewAttrsAfterTiling(VPUIP::SwKernelOp sw SmallVector getPopulateWeightTableSwKernelEntries(VPUIP::SwKernelOp swKernelOp); void updatePopulateWeightTableSwKernel(VPUIP::SwKernelOp swKernelOp, int64_t currOffset, Logger log); +int64_t computeReverseMemDim(mlir::Value tensorArg, int64_t dimIdx); +void getQuantParamsAttr(mlir::Value 
qValue, mlir::Type pType, mlir::ArrayAttr& paramsAttr, int64_t tileSize = 0, + int64_t tileOffset = 0); + SmallVector getSwKernelTiledTypes(VPUIP::SwKernelOp swKernelOp, Dim tileDim); bool isCacheOpTaskType(std::optional<::mlir::SymbolRefAttr> kernelTaskType, bool includePrefetch = true); @@ -216,8 +241,8 @@ bool isJitKernelOp(VPUIP::SwKernelOp swKernelOp); mlir::SmallVector getDDRBuffers(mlir::ValueRange buffers); bool hasInputsInDDR(VPUIP::SwKernelOp swKernelTask); -int64_t getSwKernelTilingAddressAlignment(VPUIP::SwKernelOp swkernelOp, VPU::ArchKind arch); -std::pair getSwKernelInstructionPrefetchConfig(VPU::ArchKind arch); +int64_t getSwKernelTilingAddressAlignment(VPUIP::SwKernelOp swkernelOp, config::ArchKind arch); +std::pair getSwKernelInstructionPrefetchConfig(config::ArchKind arch); } // namespace VPUIP } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp index 8438178405..be17cadbb6 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp @@ -5,14 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" - -#include -#include -#include -#include -#include +#include +#include namespace vpux { namespace VPUIP { @@ -21,9 +15,10 @@ enum class UnrollDMAAnalysisNeeded { UnrollDepthToSpaceDMAPass, UnrollSpaceToDepthDMAPass, UnrollUpsamplingDMAPass, - UnrollPermuteToNNDMAPass, + UnrollPermuteDMAPass, UnrollExpandDMAPass, UnrollPerAxisTileDMAPass, + UnrollGatherDMAPass, NumberOfAnalyzedPasses }; diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/utils.hpp index d3ddc8dcf2..be2df4e69c 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIP/utils/utils.hpp @@ -5,28 +5,23 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/utils/core/enums.hpp" +#include "vpux/utils/core/numeric.hpp" +#include #include #include #include #include #include -#include namespace vpux { namespace VPUIP { -// -// Wlm status utils -// - -void setWlmStatus(mlir::ModuleOp module, vpux::VPUIP::WlmStatus status); -vpux::VPUIP::WlmStatus getWlmStatus(mlir::ModuleOp module); - // // Profiling // @@ -61,15 +56,20 @@ constexpr uint32_t HW_M2I_PROFILING_MAX_BUFFER_SIZE = 128; constexpr int64_t MAX_SW_KERNEL_PREFETCH_DATA_SIZE_37XX = 256; constexpr int64_t MAX_SW_KERNEL_PREFETCH_DATA_SIZE_40XX = 1024; +// Reserved memory for buffers required by dummy kernels. 
+// Used to prefetch SW kernels instructions on architectures +// which do not support dedicated operation +constexpr int64_t MAX_SW_KERNEL_DUMMY_KERNELS_DATA_SIZE = 8; + // PLL WORKPOINT_CONFIG_MIRROR ADDRESS constexpr uint32_t NUM_CAPTURED_WORKPOINTS = 2; constexpr uint32_t HW_PLL_WORKPOINT_ABSOLUTE_ADDR = 0x20082020; constexpr uint16_t HW_PLL_WORKPOINT_SIZE = 4; // TODO: E#78647 refactor to use api/vpu_cmx_info_{arch}.h -const EnumMap firmwareVariantCount = { - {VPU::ArchKind::NPU37XX, 256}, - {VPU::ArchKind::NPU40XX, 128}, +const EnumMap firmwareVariantCount = { + {config::ArchKind::NPU37XX, 256}, + {config::ArchKind::NPU40XX, 128}, }; uint16_t getProfWorkloadSize(mlir::ModuleOp module); @@ -190,7 +190,8 @@ SmallVector getSplitBuffers(mlir::MLIRContext* ctx, mlir::Location // MovePureViewOpBeforeCopy Utilities // -int64_t getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, VPU::ArchKind arch); +int64_t getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, + config::ArchKind arch); int64_t getSpecificAxisFromAttr(mlir::ArrayAttr attr); @@ -204,11 +205,11 @@ bool areDistributedTypePerClusterDataCompatible(DistType inDistType, DistType ou const auto inStrides = inDistType.getStrides(); const auto outStrides = outDistType.getStrides(); const auto calcBufferOffset = [](ShapeRef shapeOffset, Strides strides) { - Byte bufOffset{0}; + Bit bufOffset{0}; for (size_t axis = 0; axis < strides.size(); axis++) { - bufOffset += shapeOffset[Dim(axis)] * static_cast(strides[Dim(axis)]); + bufOffset += shapeOffset[Dim(axis)] * strides[Dim(axis)]; } - return bufOffset.count(); + return bufOffset.to().count(); }; const auto isPerClusterCompatible = [&](ShapeRef inShape, ShapeRef outShape, ShapeRef inShapeOffset, ShapeRef outShapeOffset) { @@ -226,7 +227,7 @@ bool areDistributedTypePerClusterDataCompatible(DistType inDistType, DistType ou template VPU::DistributionInfoAttr 
getSOHDistAttrWithNewShape(mlir::MLIRContext* ctx, DistType origDistType, ShapeRef newShape, - VPU::ArchKind arch) { + config::ArchKind arch) { const auto origDistAttr = origDistType.getDistribution(); VPUX_THROW_UNLESS(VPU::isSegmentedOverH(origDistAttr), "Input dist type is not SEGMENTED over H"); @@ -321,7 +322,7 @@ bool isDistributedCompatibleAfterShapeChangeForViewOps(DistType inDistType, Dist template bool isDistributedCompatibleAfterShapeChangeForViewOps(DistType inDistType, ShapeRef shape, DimsOrder outOrder, - VPU::ArchKind arch) { + config::ArchKind arch) { const auto mode = inDistType.getDistribution().getMode().getValue(); VPUX_THROW_UNLESS(VPU::bitEnumContainsAny(mode, VPU::DistributionMode::DUPLICATED) || VPU::bitEnumContainsAny(mode, VPU::DistributionMode::SEGMENTED), @@ -537,7 +538,7 @@ vpux::Dim getCopyDMATilingDimForLargePlaneNum(mlir::Operation* op); int64_t getStridingLevel(const vpux::NDTypeInterface& type); int64_t getStridingLevel(const mlir::Value val); bool hasLegalStridingLevel(mlir::Operation* op); -bool isSplitNeededForLargePlanesNum(const VPU::ArchKind arch, const vpux::NDTypeInterface& type, ShapeRef shape); +bool isSplitNeededForLargePlanesNum(const config::ArchKind arch, const vpux::NDTypeInterface& type, ShapeRef shape); bool isSplitNeededForLargePlanesNum(mlir::Operation* op); // @@ -589,13 +590,18 @@ VPURT::TaskOp createBarProgDMA(mlir::OpBuilder& builder, mlir::Value input, mlir mlir::ValueRange waitBarriers, mlir::ValueRange updateBarriers, VPUIP::PhysicalBarrierRangeAttr physicalBarrierRangeAttr, llvm::StringLiteral opName = "bar_prog_dma"); + +VPURT::TaskOp createEnqueueDMA(mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, int port, + mlir::ValueRange waitBarriers, mlir::ValueRange updateBarriers, + VPUIP::EnqueueDMAAttr enqueueDMAAttr, llvm::StringLiteral opName = "enqueue_dma"); + // // Distributed Type utils // template VPU::DistributionInfoAttr getDistributedAttrAfterShapeCast(VPU::DistributedTypeInterface 
origDistrType, - ArrayRef origOutShape, VPU::ArchKind arch) { + ArrayRef origOutShape, config::ArchKind arch) { const auto ndTypeIf = mlir::cast(origDistrType); const auto origInShape = ndTypeIf.getShape().raw(); const auto distributedType = mlir::cast(origDistrType.getDistributedTypes().front()); @@ -743,8 +749,6 @@ std::pair getSplitPartSizes(NDTypeInterface bufferType, vpux:: // Check user utils // -bool hasOneOrSameUser(mlir::Operation* op); - std::unordered_set getConcatAxes(VPUIP::ConcatViewOp concatViewOp); template diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp index 9426dabf36..73525cbfdf 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp @@ -5,13 +5,16 @@ #pragma once -#include -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/utils/logger/logger.hpp" +#include #include +namespace vpux::VPUIPDPU { +enum class ODUDataBitWidth : uint32_t; +} // namespace vpux::VPUIPDPU + namespace vpux { namespace VPUIPDPU { @@ -84,18 +87,14 @@ void computeLsbAndMsbFromTargetWidth(int64_t targetWidth, uint64_t& msbWidth, ui msbWidth = (targetWidth & bitMaskMsb) >> lsbBitWidth; } -// Helper function to calculate zero point offset for input activations and weights template -auto getZeroPoint(vpux::NDTypeInterface type) { +SmallVector getZeroPoints(mlir::Type type) { static_assert(std::is_integral::value, "DataType must be an integer type"); - // Get also ZP - auto elementType = type.getElementType(); SmallVector quantZeroPoints; - if (const auto uniformQuantType = mlir::dyn_cast(elementType)) { + if (const auto uniformQuantType = mlir::dyn_cast(type)) { 
quantZeroPoints.push_back(checked_cast(uniformQuantType.getZeroPoint())); - } else if (const auto uniformQuantPerAxisType = - mlir::dyn_cast(elementType)) { + } else if (const auto uniformQuantPerAxisType = mlir::dyn_cast(type)) { auto zp = uniformQuantPerAxisType.getZeroPoints(); quantZeroPoints.resize(zp.size()); std::transform(zp.begin(), zp.end(), quantZeroPoints.begin(), [](int64_t a) { @@ -105,10 +104,12 @@ auto getZeroPoint(vpux::NDTypeInterface type) { quantZeroPoints.push_back(0); } - // Return only the first element as the zero point bias - return quantZeroPoints[0]; + return quantZeroPoints; } +// Helper function to calculate zero point offset for input/output activations and weights +int64_t getZeroPoint(vpux::NDTypeInterface type); + int64_t getRangeSize(int64_t start, int64_t end); } // namespace VPUIPDPU diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops.hpp index 964799fc9f..3ecd9416f1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops.hpp @@ -12,6 +12,7 @@ #include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" #include "vpux/compiler/dialect/VPURegMapped/attributes.hpp" #include "vpux/compiler/dialect/VPURegMapped/types.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops_interfaces.hpp index b87d888273..c8554eaf2b 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI37XX/ops_interfaces.hpp @@ -5,31 +5,8 @@ #pragma once -#include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/core/attributes/strides.hpp" 
-#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPUMI37XX/attributes.hpp" -#include "vpux/compiler/dialect/VPURegMapped/types.hpp" -#include "vpux/compiler/dialect/const/attributes/content.hpp" - -#include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/optional.hpp" -#include "vpux/utils/core/range.hpp" -#include "vpux/utils/core/string_ref.hpp" -#include "vpux/utils/logger/logger.hpp" - -#include -#include -#include - -#include "vpux/compiler/utils/attributes.hpp" - -#include "vpux/utils/core/small_vector.hpp" - #include #include -#include namespace vpux { namespace VPUMI37XX { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/passes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/passes.hpp index 995486e2af..0bda5f803d 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/passes.hpp @@ -5,7 +5,6 @@ #pragma once -#include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/utils/options.hpp" #include "vpux/utils/logger/logger.hpp" @@ -21,8 +20,8 @@ namespace VPUMI40XX { // Passes // -std::unique_ptr createSetupProfilingVPUMI40XXPass( - DMAProfilingMode dmaProfilingMode = DMAProfilingMode::DISABLED, Logger log = Logger::global()); +std::unique_ptr createSetupProfilingVPUMI40XXPass(const std::string& enableDmaProfiling = "false", + Logger log = Logger::global()); std::unique_ptr createBarrierComputationPass(Logger log = Logger::global()); std::unique_ptr reorderMappedInferenceOpsPass(Logger log = Logger::global()); std::unique_ptr createResolveTaskLocationPass(Logger log = Logger::global()); @@ -30,19 +29,25 @@ std::unique_ptr createBarrierTopologicalMappingPass(Logger log = Log std::unique_ptr createGroupExecutionOpsPass(Logger log = Logger::global()); std::unique_ptr createUnGroupExecutionOpsPass(Logger log = Logger::global()); std::unique_ptr createAddFetchOpsPass(Logger 
log = Logger::global()); +std::unique_ptr createConvertFetchDmasToFetchTaskOpsPass(Logger log = Logger::global()); std::unique_ptr createResolveWLMTaskLocationPass(Logger log = Logger::global()); std::unique_ptr createPropagateFinalBarrierPass(Logger log = Logger::global()); std::unique_ptr createAddEnqueueOpsPass( WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, Logger log = Logger::global()); std::unique_ptr createUnrollFetchTaskOpsPass(Logger log = Logger::global()); -std::unique_ptr createLinkEnqueueTargetsPass(Logger log = Logger::global()); +std::unique_ptr createLinkEnqueueTargetsPass( + WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, + Logger log = Logger::global()); std::unique_ptr createLinkAllOpsPass(Logger log = Logger::global()); std::unique_ptr createUnrollEnqueueOpsPass(Logger log = Logger::global()); std::unique_ptr createLinkEnqueueOpsForSameBarrierPass(Logger log = Logger::global()); std::unique_ptr createSplitEnqueueOpsPass(Logger log = Logger::global()); +std::unique_ptr createSplitEnqueueDmaOpsPass(Logger log = Logger::global()); std::unique_ptr createAddBootstrapBarriersPass(Logger log = Logger::global()); -std::unique_ptr createAddBootstrapWorkItemsPass(Logger log = Logger::global()); +std::unique_ptr createAddBootstrapWorkItemsPass( + WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, + Logger log = Logger::global()); std::unique_ptr createNextSameIdAssignmentPass(Logger log = Logger::global()); std::unique_ptr createAddPlatformInfoPass(Logger log = Logger::global()); std::unique_ptr createDumpStatisticsOfWlmOpsPass(Logger log = Logger::global()); @@ -55,9 +60,8 @@ std::unique_ptr createAddBarrierConfigurationOps( WorkloadManagementBarrierProgrammingMode::LEGACY, Logger log = Logger::global()); -std::unique_ptr createAddEnqueueDMAOps( - WorkloadManagementMode workloadManagementMode = WorkloadManagementMode::FWLM_V1_PAGES, - Logger log = 
Logger::global()); +std::unique_ptr createAddEnqueueDMAOps(Logger log = Logger::global()); +std::unique_ptr createUpdateEnqueueDMAInputAndOutput(Logger log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/utils.hpp index 823f5d98b3..f041eb3986 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/utils.hpp @@ -122,7 +122,9 @@ void reindexTaskLinkAttrForDMA(VPURegMapped::TaskOpInterface head); // // Resolve Task Location utils // -size_t getTaskBinarySize(VPURegMapped::TaskType taskType, VPU::ArchKind arch); +size_t getTaskBinarySize(VPURegMapped::TaskType taskType, config::ArchKind arch); + +VPURegMapped::TaskType convertExecutorKindToExecutableTaskType(VPU::ExecutorKind kind); } // namespace VPUMI40XX } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp index 1767420529..d986b400fc 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" @@ -118,6 +117,15 @@ struct FetchTaskDetails { VPUMI40XX::NNDMAOp getPreviousDMAWithBarriers(VPURegMapped::TaskOpInterface taskOpInterface); void logFetchOpsDetails(mlir::func::FuncOp netFunc, Logger log); +struct EnqDmaInfo { + int64_t startTaskIdx; + int64_t endTaskIdx; + VPUMI40XX::NNDMAOp enqDmaOp; +}; + +mlir::DenseMap> getEnqueueDmaData( + VPUMI40XX::NNDMAOp firstDmaTile0List0Op, Logger log); + } // 
namespace VPUMI40XX } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/ops.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/ops.hpp index f608480ca4..21ae228bc3 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/ops.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/ops.hpp @@ -6,11 +6,18 @@ #pragma once #include "vpux/compiler/dialect/ELFNPU37XX/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" #include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPURT/IR/types.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" + +#include + +namespace vpux::VPUIP { +class DistributedBufferType; +} // namespace vpux::VPUIP // // Generated diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/task.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/task.hpp index f32956f489..1301cddb16 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/task.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/IR/task.hpp @@ -7,7 +7,9 @@ #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include #include +#include namespace vpux { namespace VPURT { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/transforms/passes.hpp index 0edcdc260e..9c5947bcf5 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/transforms/passes.hpp @@ -56,6 +56,7 @@ std::unique_ptr createInferenceExecutionAnalysisPass( bool enableActivityFactor = true, Logger log = Logger::global()); std::unique_ptr 
createInsertBarrierToMarkTheEndOfDescriptorGroupPass( std::optional virtualBarrierThresholdForWlm = VIRTUAL_BARRIER_THRESHOLD_WLM, + std::optional workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA, Logger log = Logger::global()); // diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/utils/color_bin_barrier_assignment.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/utils/color_bin_barrier_assignment.hpp index 4342b1a818..68fe0d9251 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/utils/color_bin_barrier_assignment.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPURT/utils/color_bin_barrier_assignment.hpp @@ -16,7 +16,7 @@ namespace VPURT { class BarrierColorBin final { public: using BinType = VPURT::TaskQueueType; - BarrierColorBin(size_t numBarriers, VPU::ArchKind arch, Logger log); + BarrierColorBin(size_t numBarriers, config::ArchKind arch, Logger log); bool calculateBinSize(BarrierGraphInfo& BarrierGraphInfo); mlir::LogicalResult assignPhysicalBarrier(BarrierGraphInfo& BarrierGraphInfo, BarrierSimulator& simulator); size_t getPhysicalBarrier(size_t virtualBarrierInd); diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPURegMapped/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPURegMapped/utils.hpp index eb05280070..d8263050f1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/VPURegMapped/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPURegMapped/utils.hpp @@ -5,15 +5,16 @@ #pragma once -#include -#include +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" +#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include - #include -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include +#include + namespace vpux { namespace VPURegMapped { diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/ops_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/ops_interfaces.hpp new file mode 100644 index 0000000000..37301a059d --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/ops_interfaces.hpp @@ -0,0 +1,86 @@ + + +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/error.hpp" + +#include +#include +#include + +// +// DefinedInArch +// +namespace vpux { +namespace config { +template +struct DefinedInArch { + template + class Impl : public mlir::OpTrait::TraitBase { + public: + static mlir::LogicalResult verifyTrait(mlir::Operation* op) { + return verifyArchKind(op, arch); + } + + private: + static mlir::LogicalResult verifyArchKind(mlir::Operation* op, config::ArchKind definedInArch) { + auto actualArch = config::getArch(op); + + if (actualArch != config::ArchKind::UNKNOWN && actualArch < definedInArch) { + auto actualArchStr = stringifyArchKind(actualArch).str(); + auto definedInArchStr = stringifyArchKind(definedInArch).str(); + return vpux::errorAt(op, "Operation {0} not supported in {1}; op has been introduced in {2}", + op->getName(), actualArchStr, definedInArchStr); + } + + return mlir::success(); + } + }; +}; + +// +// LimitedToArch +// + +template +struct LimitedToArch { + template + class Impl : public mlir::OpTrait::TraitBase { + public: + static mlir::LogicalResult verifyTrait(mlir::Operation* op) { + return verifyArchKind(op, {archs...}); + } + + private: + static mlir::LogicalResult verifyArchKind(mlir::Operation* op, + std::initializer_list supportedArchs) { + auto actualArch = config::getArch(op); + + if (actualArch != config::ArchKind::UNKNOWN) { + if (std::find(cbegin(supportedArchs), cend(supportedArchs), actualArch) == 
cend(supportedArchs)) { + auto actualArchStr = stringifyArchKind(actualArch).str(); + auto archsStr = std::accumulate( + cbegin(supportedArchs), cend(supportedArchs), std::string(), + [](const std::string& accu, const config::ArchKind arch) -> std::string { + return accu + (accu.length() > 0 ? "," : "") + stringifyArchKind(arch).str(); + }); + return vpux::errorAt(op, "Operation {0} not supported in {1}; list of supported archs: {2}", + op->getName(), actualArchStr, archsStr); + } + } + + return mlir::success(); + } + }; +}; +} // namespace config +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/utils.hpp new file mode 100644 index 0000000000..87376be7cc --- /dev/null +++ b/src/vpux_compiler/include/vpux/compiler/dialect/config/IR/utils.hpp @@ -0,0 +1,40 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/utils/core/mem_size.hpp" + +// +// Run-time resources +// +namespace vpux { + +namespace config { +llvm::StringLiteral getMemoryDerateAttrName(); +llvm::StringLiteral getMemoryBandwidthAttrName(); + +// +// ArchKind +// +void setArch(mlir::ModuleOp module, config::ArchKind kind, int numOfDPUGroups, + std::optional numOfDMAPorts = std::nullopt, + std::optional availableCMXMemory = std::nullopt, bool allowCustomValues = false); + +config::ArchKind getArch(mlir::Operation* op); +bool isArchVPUX3XXX(config::ArchKind arch); + +// +// RevisionID +// + +void setRevisionID(mlir::ModuleOp module, config::RevisionID revisionID); +bool hasRevisionID(mlir::ModuleOp module); +config::RevisionID getRevisionID(mlir::Operation* op); + +} // namespace config +} // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/const/attributes/content.hpp 
b/src/vpux_compiler/include/vpux/compiler/dialect/const/attributes/content.hpp index 04bf8da80a..777aca07ef 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/const/attributes/content.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/const/attributes/content.hpp @@ -177,8 +177,13 @@ namespace vpux::Const { mlir::ParseResult parseContentAttr(mlir::AsmParser& parser, ContentAttr& content); void printContentAttr(mlir::AsmPrinter& printer, const ContentAttr& content); -/// @brief External constant prefix used for OpenVino constants. -constexpr const char* OPENVINO_CONST_PREFIX = "ov"; +/** @brief External constant prefix. + + This prefix is used also in the context of weights separation. Be careful when changing it. + "ow" stands for "original weights" +*/ + +constexpr const char* IMPORTED_WEIGHT_PREFIX = "vpux_ow_"; /** @brief Returns new dense_resource<> "base" content. diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/content.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/content.hpp index aba0de6cb3..35519fe4b1 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/content.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/content.hpp @@ -216,7 +216,16 @@ class Content final { template auto getSplatValue() const { VPUX_THROW_UNLESS(isSplat(), "Expected the attribute to be a splat value"); - return *getValues().begin(); + return read([](auto values) { + if constexpr (std::is_same::value) { + // E#160869: checked_cast works poorly due to MSVC warning + // C4804. fixing checked_cast<> overload is also not simple, so + // for now this could act as a workaround. 
+ return static_cast(values[0]); + } else { + return checked_cast(values[0]); + } + }); } public: diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/utils.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/utils.hpp index fe2fe956fb..c13d78792a 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/const/utils/utils.hpp @@ -96,4 +96,7 @@ mlir::FailureOr getSplatValue(mlir::Value input) { return getSplatValue(input.getDefiningOp()); } +/// Returns whether constant has a transformation that sparsifies the content. +bool hasSparsifyTransformation(const Const::DeclareOp& constOp); + } // namespace vpux::Const diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/core/IR/dynamic_attrs.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/core/IR/dynamic_attrs.hpp index fb995bf72a..93442dd2f3 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/core/IR/dynamic_attrs.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/core/IR/dynamic_attrs.hpp @@ -81,32 +81,43 @@ class BoundedDim { } public: - friend BoundedDim operator+(const BoundedDim& x, const BoundedDim& y); - friend BoundedDim operator-(const BoundedDim& x, const BoundedDim& y); - friend BoundedDim operator*(const BoundedDim& x, const BoundedDim& y); + BoundedDim operator+(const BoundedDim& other) const; + BoundedDim operator-(const BoundedDim& other) const; + BoundedDim operator*(const BoundedDim& other) const; BoundedDim& operator+=(const BoundedDim& other); BoundedDim& operator-=(const BoundedDim& other); BoundedDim& operator*=(const BoundedDim& other); - friend bool operator==(const BoundedDim& x, const BoundedDim& y); - friend bool operator!=(const BoundedDim& x, const BoundedDim& y); - friend bool operator<(const BoundedDim& x, const BoundedDim& y); - friend bool operator>(const BoundedDim& x, const BoundedDim& y); - friend bool operator<=(const BoundedDim& x, const 
BoundedDim& y); - friend bool operator>=(const BoundedDim& x, const BoundedDim& y); + bool operator==(const BoundedDim& other) const; + bool operator!=(const BoundedDim& other) const; + bool operator<(const BoundedDim& other) const; + bool operator>(const BoundedDim& other) const; + bool operator<=(const BoundedDim& other) const; + bool operator>=(const BoundedDim& other) const; + + friend BoundedDim operator+(int64_t x, const BoundedDim& y); + friend BoundedDim operator-(int64_t x, const BoundedDim& y); + friend BoundedDim operator*(int64_t x, const BoundedDim& y); + + friend bool operator==(int64_t x, const BoundedDim& y); + friend bool operator!=(int64_t x, const BoundedDim& y); + friend bool operator<(int64_t x, const BoundedDim& y); + friend bool operator>(int64_t x, const BoundedDim& y); + friend bool operator<=(int64_t x, const BoundedDim& y); + friend bool operator>=(int64_t x, const BoundedDim& y); }; -BoundedDim operator+(const BoundedDim& x, const BoundedDim& y); -BoundedDim operator-(const BoundedDim& x, const BoundedDim& y); -BoundedDim operator*(const BoundedDim& x, const BoundedDim& y); +BoundedDim operator+(int64_t x, const BoundedDim& y); +BoundedDim operator-(int64_t x, const BoundedDim& y); +BoundedDim operator*(int64_t x, const BoundedDim& y); -bool operator==(const BoundedDim& x, const BoundedDim& y); -bool operator!=(const BoundedDim& x, const BoundedDim& y); -bool operator<(const BoundedDim& x, const BoundedDim& y); -bool operator>(const BoundedDim& x, const BoundedDim& y); -bool operator<=(const BoundedDim& x, const BoundedDim& y); -bool operator>=(const BoundedDim& x, const BoundedDim& y); +bool operator==(int64_t x, const BoundedDim& y); +bool operator!=(int64_t x, const BoundedDim& y); +bool operator>(int64_t x, const BoundedDim& y); +bool operator<(int64_t x, const BoundedDim& y); +bool operator>=(int64_t x, const BoundedDim& y); +bool operator<=(int64_t x, const BoundedDim& y); // Represents a possibly dynamic dimension size. 
For dynamic sizes the bound is stored as the size and isDynamic is set // to true. @@ -138,32 +149,43 @@ class MaskedDim { } public: - friend MaskedDim operator+(const MaskedDim& x, const MaskedDim& y); - friend MaskedDim operator-(const MaskedDim& x, const MaskedDim& y); - friend MaskedDim operator*(const MaskedDim& x, const MaskedDim& y); + MaskedDim operator+(const MaskedDim& other) const; + MaskedDim operator-(const MaskedDim& other) const; + MaskedDim operator*(const MaskedDim& other) const; MaskedDim& operator+=(const MaskedDim& other); MaskedDim& operator-=(const MaskedDim& other); MaskedDim& operator*=(const MaskedDim& other); - friend bool operator==(const MaskedDim& x, const MaskedDim& y); - friend bool operator!=(const MaskedDim& x, const MaskedDim& y); - friend bool operator<(const MaskedDim& x, const MaskedDim& y); - friend bool operator>(const MaskedDim& x, const MaskedDim& y); - friend bool operator<=(const MaskedDim& x, const MaskedDim& y); - friend bool operator>=(const MaskedDim& x, const MaskedDim& y); + bool operator==(const MaskedDim& other) const; + bool operator!=(const MaskedDim& other) const; + bool operator<(const MaskedDim& other) const; + bool operator>(const MaskedDim& other) const; + bool operator<=(const MaskedDim& other) const; + bool operator>=(const MaskedDim& other) const; + + friend MaskedDim operator+(int64_t x, const MaskedDim& y); + friend MaskedDim operator-(int64_t x, const MaskedDim& y); + friend MaskedDim operator*(int64_t x, const MaskedDim& y); + + friend bool operator==(int64_t x, const MaskedDim& y); + friend bool operator!=(int64_t x, const MaskedDim& y); + friend bool operator<(int64_t x, const MaskedDim& y); + friend bool operator>(int64_t x, const MaskedDim& y); + friend bool operator<=(int64_t x, const MaskedDim& y); + friend bool operator>=(int64_t x, const MaskedDim& y); }; -MaskedDim operator+(const MaskedDim& x, const MaskedDim& y); -MaskedDim operator-(const MaskedDim& x, const MaskedDim& y); -MaskedDim 
operator*(const MaskedDim& x, const MaskedDim& y); +MaskedDim operator+(int64_t x, const MaskedDim& y); +MaskedDim operator-(int64_t x, const MaskedDim& y); +MaskedDim operator*(int64_t x, const MaskedDim& y); -bool operator==(const MaskedDim& x, const MaskedDim& y); -bool operator!=(const MaskedDim& x, const MaskedDim& y); -bool operator<(const MaskedDim& x, const MaskedDim& y); -bool operator>(const MaskedDim& x, const MaskedDim& y); -bool operator<=(const MaskedDim& x, const MaskedDim& y); -bool operator>=(const MaskedDim& x, const MaskedDim& y); +bool operator==(int64_t x, const MaskedDim& y); +bool operator!=(int64_t x, const MaskedDim& y); +bool operator>(int64_t x, const MaskedDim& y); +bool operator<(int64_t x, const MaskedDim& y); +bool operator>=(int64_t x, const MaskedDim& y); +bool operator<=(int64_t x, const MaskedDim& y); namespace details { diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/core/interfaces/type_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/core/interfaces/type_interfaces.hpp index b0707f1211..53f43efe4c 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/core/interfaces/type_interfaces.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/core/interfaces/type_interfaces.hpp @@ -8,18 +8,17 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/attributes/strides.hpp" -#include "vpux/compiler/dialect/core/IR/attributes.hpp" - -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/utils/attributes.hpp" - -#include "vpux/utils/core/array_ref.hpp" +#include "vpux/compiler/dialect/core/IR/dynamic_attrs.hpp" +#include "vpux/compiler/dialect/core/IR/indexed_symbol_attr.hpp" #include "vpux/utils/core/mem_size.hpp" -#include "vpux/utils/core/optional.hpp" #include #include +namespace vpux::VPU { +enum class MemoryKind : uint64_t; +} + namespace vpux { // diff --git 
a/src/vpux_compiler/include/vpux/compiler/dialect/core/transforms/passes.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/core/transforms/passes.hpp index 60d82a5faa..fdc4df9a81 100644 --- a/src/vpux_compiler/include/vpux/compiler/dialect/core/transforms/passes.hpp +++ b/src/vpux_compiler/include/vpux/compiler/dialect/core/transforms/passes.hpp @@ -29,7 +29,11 @@ std::unique_ptr createStartLocationVerifierPass( std::unique_ptr createStopLocationVerifierPass(vpux::Logger log); std::unique_ptr createPackNestedModulesPass(Logger log = Logger::global()); std::unique_ptr createUnpackNestedModulesPass(const Logger& log = Logger::global()); -std::unique_ptr createAddNetInfoToModulePass(Logger log = Logger::global()); +std::unique_ptr createAddNetInfoToModulePass(Logger log = Logger::global(), + bool hasTensorSemantics = false); + +// special pass +std::unique_ptr createWsFoldReinterpretCastIntoConstPass(const Logger& log = Logger::global()); // // Registration diff --git a/src/vpux_compiler/include/vpux/compiler/frontend/IE.hpp b/src/vpux_compiler/include/vpux/compiler/frontend/IE.hpp index 4f20e0840e..7346d991d3 100644 --- a/src/vpux_compiler/include/vpux/compiler/frontend/IE.hpp +++ b/src/vpux_compiler/include/vpux/compiler/frontend/IE.hpp @@ -5,10 +5,14 @@ #pragma once -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/init.hpp" +#include "vpux/utils/IE/hash.hpp" +#include "vpux/utils/core/array_ref.hpp" +#include "vpux/utils/core/small_vector.hpp" #include "vpux/utils/logger/logger.hpp" +#include #include #include #include @@ -31,19 +35,22 @@ #include #include -// Utils -#include "vpux/compiler/utils/range_bound.hpp" -#include "vpux/utils/IE/hash.hpp" - namespace vpux { namespace IE { +struct ImportNetworkConfig { + bool sharedConstants = false; + bool enableProfiling = false; + DummyOpMode stubLayers = DummyOpMode::DISABLED; + bool dynamicShapeToStatic = false; + bool 
enableWeightsSeparationPath = false; +}; + // TODO Get rid of this function (importNetwork), move logic to compiler.cpp mlir::OwningOpRef importNetwork(mlir::MLIRContext* ctx, const std::shared_ptr& model, const std::vector>& originalParameters, const std::vector>& originalResults, - bool sharedConstants, mlir::TimingScope& rootTiming, - bool enableProfiling, DummyOpMode stubLayers, bool dynamicShapeToStatic, + mlir::TimingScope& rootTiming, const ImportNetworkConfig& importCfg, Logger log = Logger::global()); std::vector> buildOVParams(const std::shared_ptr& model); @@ -52,7 +59,8 @@ std::vector> buildOVResults(const std::shared_pt // TODO Move to separate file NGraphPasses class NGraphPasses final { public: - static void runNGraphPasses(const std::shared_ptr& netGraph, mlir::TimingScope& rootTiming); + static void runNGraphPasses(const std::shared_ptr& netGraph, mlir::TimingScope& rootTiming, + bool enableWeightsSeparationPath = false); }; class NGraphImporter final { diff --git a/src/vpux_compiler/include/vpux/compiler/init.hpp b/src/vpux_compiler/include/vpux/compiler/init.hpp index fb1d5a366d..a4c8b83d17 100644 --- a/src/vpux_compiler/include/vpux/compiler/init.hpp +++ b/src/vpux_compiler/include/vpux/compiler/init.hpp @@ -6,7 +6,6 @@ #pragma once #include -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" namespace vpux { diff --git a/src/vpux_compiler/include/vpux/compiler/interfaces_registry.hpp b/src/vpux_compiler/include/vpux/compiler/interfaces_registry.hpp index 6f09426941..48b33c1b2c 100644 --- a/src/vpux_compiler/include/vpux/compiler/interfaces_registry.hpp +++ b/src/vpux_compiler/include/vpux/compiler/interfaces_registry.hpp @@ -5,8 +5,10 @@ #pragma once +#include "vpux/compiler/dialect/config/IR/attributes.hpp" + +#include #include -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" namespace vpux { @@ -24,6 +26,6 @@ class IInterfaceRegistry { // createInterface // -std::unique_ptr createInterfacesRegistry(VPU::ArchKind arch); 
+std::unique_ptr createInterfacesRegistry(config::ArchKind arch); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/passes_register.hpp b/src/vpux_compiler/include/vpux/compiler/passes_register.hpp index 62c6a4e251..6d630e7e72 100644 --- a/src/vpux_compiler/include/vpux/compiler/passes_register.hpp +++ b/src/vpux_compiler/include/vpux/compiler/passes_register.hpp @@ -30,6 +30,6 @@ class IPassesRegistry { // createPassesRegistry // -std::unique_ptr createPassesRegistry(VPU::ArchKind archKind); +std::unique_ptr createPassesRegistry(config::ArchKind archKind); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/pipelines/dialect_pipeline_strategy.hpp b/src/vpux_compiler/include/vpux/compiler/pipelines/dialect_pipeline_strategy.hpp index 6446270c00..35a38329fa 100644 --- a/src/vpux_compiler/include/vpux/compiler/pipelines/dialect_pipeline_strategy.hpp +++ b/src/vpux_compiler/include/vpux/compiler/pipelines/dialect_pipeline_strategy.hpp @@ -39,11 +39,6 @@ class IDialectPipelineStrategy { VPUX_THROW("Not implemented!"); } - // E#160424: refactor ReferenceSW pipeline similarly to the rest - virtual void buildReferenceSWPipeline(mlir::OpPassManager&, Logger) { - VPUX_THROW("Not implemented!"); - } - virtual ~IDialectPipelineStrategy() = default; }; diff --git a/src/vpux_compiler/include/vpux/compiler/pipelines/options_mapper.hpp b/src/vpux_compiler/include/vpux/compiler/pipelines/options_mapper.hpp index a74a88f995..6ad20fc870 100644 --- a/src/vpux_compiler/include/vpux/compiler/pipelines/options_mapper.hpp +++ b/src/vpux_compiler/include/vpux/compiler/pipelines/options_mapper.hpp @@ -16,7 +16,7 @@ namespace vpux { -VPU::ArchKind getArchKind(const intel_npu::Config& config); +config::ArchKind getArchKind(const intel_npu::Config& config); config::CompilationMode getCompilationMode(const intel_npu::Config& config); std::optional getRevisionID(const intel_npu::Config& config); std::optional getNumberOfDPUGroups(const 
intel_npu::Config& config); diff --git a/src/vpux_compiler/include/vpux/compiler/pipelines/options_setup.hpp b/src/vpux_compiler/include/vpux/compiler/pipelines/options_setup.hpp index d278be9206..55c0796566 100644 --- a/src/vpux_compiler/include/vpux/compiler/pipelines/options_setup.hpp +++ b/src/vpux_compiler/include/vpux/compiler/pipelines/options_setup.hpp @@ -190,8 +190,6 @@ class WSMonolithicSetupBase : public OptionsSetup createPipelineRegistry(VPU::ArchKind archKind); +std::unique_ptr createPipelineRegistry(config::ArchKind archKind); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/tools/options.hpp b/src/vpux_compiler/include/vpux/compiler/tools/options.hpp index 4c419e9463..1fba4a612f 100644 --- a/src/vpux_compiler/include/vpux/compiler/tools/options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/tools/options.hpp @@ -10,6 +10,6 @@ namespace vpux { -vpux::VPU::ArchKind parseArchKind(int argc, char* argv[], StringRef helpHeader = ""); +vpux::config::ArchKind parseArchKind(int argc, char* argv[], StringRef helpHeader = ""); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/ELF/utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/ELF/utils.hpp index 34eddc67fb..b142e44298 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/ELF/utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/ELF/utils.hpp @@ -10,6 +10,8 @@ #include "vpux/compiler/dialect/VPURegMapped/types.hpp" #include "vpux_headers/platform.hpp" +#include + using namespace vpux; namespace llvm { @@ -56,7 +58,7 @@ size_t lcm(size_t a, size_t b); // // Platform Information // -elf::platform::ArchKind mapVpuArchKindToElfArchKind(const VPU::ArchKind& archKind); +elf::platform::ArchKind mapVpuArchKindToElfArchKind(const config::ArchKind& archKind); ArrayRef getKernelELF(mlir::Operation* operation, StringRef kernelPath, ArrayRef sectionNames = {}); ArrayRef getDataAndSizeOfElfSection(ArrayRef elfBlob, ArrayRef possibleSecNames); 
diff --git a/src/vpux_compiler/include/vpux/compiler/utils/IE/transposed_convolution_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/IE/transposed_convolution_utils.hpp index 028326a242..1589f6aa1d 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/IE/transposed_convolution_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/IE/transposed_convolution_utils.hpp @@ -5,7 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/error.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/utils/VPU/tile_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/VPU/tile_utils.hpp index 5708b212ee..4a86525781 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/VPU/tile_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/VPU/tile_utils.hpp @@ -7,12 +7,11 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/utils/core/small_string.hpp" -#include "vpux/utils/logger/logger.hpp" - +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" +#include "vpux/utils/logger/logger.hpp" namespace vpux { namespace VPU { diff --git a/src/vpux_compiler/include/vpux/compiler/utils/adjust_layout_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/adjust_layout_utils.hpp index fd302babe5..2ca9cf469a 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/adjust_layout_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/adjust_layout_utils.hpp @@ -5,15 +5,16 @@ #pragma once -#include 
-#include -#include #include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/factors.hpp" -#include "vpux/utils/core/numeric.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/logger/logger.hpp" +#include +#include +#include + namespace vpux { struct AdjustConvShapeParams { diff --git a/src/vpux_compiler/include/vpux/compiler/utils/analysis.hpp b/src/vpux_compiler/include/vpux/compiler/utils/analysis.hpp index 0929c2c4f1..6a477c270b 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/analysis.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/analysis.hpp @@ -9,7 +9,9 @@ #include #include -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +namespace vpux::VPUIP { +class LayerOpInterface; +} namespace vpux { @@ -19,6 +21,12 @@ namespace vpux { mlir::Operation* getFirstUser(mlir::Value output); +// +// hasOneUniqueUser +// + +bool hasOneUniqueUser(mlir::Operation* op); + // // isBufAllocOp // diff --git a/src/vpux_compiler/include/vpux/compiler/utils/async_dialect_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/async_dialect_utils.hpp index aeee2e735a..824e4bf75e 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/async_dialect_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/async_dialect_utils.hpp @@ -5,12 +5,18 @@ #pragma once -#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" - #include #include #include +namespace vpux::VPU { +enum class ExecutorKind : uint64_t; +} + +namespace vpux::VPUIP { +class DMATypeOpInterface; +} // namespace vpux::VPUIP + namespace vpux { // Get the type of the async value. If the value is not an async value, return its original type. 
diff --git a/src/vpux_compiler/include/vpux/compiler/utils/bit_compactor_codec.hpp b/src/vpux_compiler/include/vpux/compiler/utils/bit_compactor_codec.hpp index a4d07321da..f6c4efc36d 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/bit_compactor_codec.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/bit_compactor_codec.hpp @@ -13,7 +13,7 @@ namespace vpux { class BitCompactorCodec final : public ICodec { public: - BitCompactorCodec(VPU::ArchKind arch_kind); + BitCompactorCodec(config::ArchKind arch_kind); bool supportsFP16compression() const override; mlir::FailureOr> compress(std::vector& data, const CompressionMode mode, const Logger& _log) const override; diff --git a/src/vpux_compiler/include/vpux/compiler/utils/codec_factory.hpp b/src/vpux_compiler/include/vpux/compiler/utils/codec_factory.hpp index 413a712440..e87b59a84b 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/codec_factory.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/codec_factory.hpp @@ -25,5 +25,6 @@ class ICodec { static std::string compressionModeToStr(ICodec::CompressionMode mode); }; -std::unique_ptr makeCodec(const ICodec::CompressionAlgorithm algo, VPU::ArchKind arch = VPU::ArchKind::UNKNOWN); +std::unique_ptr makeCodec(const ICodec::CompressionAlgorithm algo, + config::ArchKind arch = config::ArchKind::UNKNOWN); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/convert_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/convert_utils.hpp index a8e3ee6e1e..3947113c54 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/convert_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/convert_utils.hpp @@ -5,11 +5,8 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/attributes/content.hpp" -#include "vpux/compiler/utils/subspaces.hpp" -#include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/format.hpp" +#include 
"vpux/compiler/dialect/const/utils/content.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include diff --git a/src/vpux_compiler/include/vpux/compiler/utils/dma.hpp b/src/vpux_compiler/include/vpux/compiler/utils/dma.hpp index 5cf5d3c781..32e8cf7f44 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/dma.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/dma.hpp @@ -5,30 +5,29 @@ #pragma once +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" + #include #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" - namespace vpux { int64_t getDMAPortValue(mlir::Operation* wrappedTaskOp); -SmallVector getDMAChannelsWithIndependentLinkAgents(VPU::ArchKind arch); +SmallVector getDMAChannelsWithIndependentLinkAgents(config::ArchKind arch); // Encode DMA port and channel setting into a single integer for convenient usage by scheduling modules int64_t getDMAQueueIdEncoding(int64_t port, int64_t channelIdx); int64_t getDMAQueueIdEncoding(int64_t port, std::optional channel); int64_t getDMAQueueIdEncoding(std::optional channel); -int64_t getDMAQueueIdEncoding(VPU::MemoryKind srcMemKind, VPU::ArchKind arch); +int64_t getDMAQueueIdEncoding(VPU::MemoryKind srcMemKind, config::ArchKind arch); int64_t getDMAPortFromEncodedId(int64_t dmaQueueIdEncoding); -VPUIP::DmaChannelType getDMAChannelTypeFromEncodedId(int64_t dmaQueueIdEncoding, VPU::ArchKind arch); -std::string getDMAChannelTypeAsString(VPUIP::DmaChannelType channelType, VPU::ArchKind arch); -std::string getDMAChannelTypeAsString(int64_t dmaQueueIdEncoding, VPU::ArchKind arch); +VPUIP::DmaChannelType getDMAChannelTypeFromEncodedId(int64_t dmaQueueIdEncoding, config::ArchKind arch); +std::string getDMAChannelTypeAsString(VPUIP::DmaChannelType channelType, config::ArchKind arch); +std::string 
getDMAChannelTypeAsString(int64_t dmaQueueIdEncoding, config::ArchKind arch); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/dma_limits.hpp b/src/vpux_compiler/include/vpux/compiler/utils/dma_limits.hpp index 0f97efcb19..dd39f7a16e 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/dma_limits.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/dma_limits.hpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - +#include #include #include +#include + namespace vpux::VPUIP::DMA { template @@ -212,6 +213,6 @@ class EngineLimits { } }; -const EngineLimits& getEngineLimits(VPU::ArchKind arch); +const EngineLimits& getEngineLimits(config::ArchKind arch); } // namespace vpux::VPUIP::DMA diff --git a/src/vpux_compiler/include/vpux/compiler/utils/dynamic_shape_propagation.hpp b/src/vpux_compiler/include/vpux/compiler/utils/dynamic_shape_propagation.hpp index 22181409ca..98935763a1 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/dynamic_shape_propagation.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/dynamic_shape_propagation.hpp @@ -5,9 +5,14 @@ #pragma once -#include #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include + +namespace vpux::VPU { +enum class BoundsRepresentation : uint64_t; +} // namespace vpux::VPU + namespace vpux { void assignDynamicTypeComponents(TypeComponents& typeComponents, VPU::BoundsRepresentation boundsRepresentation, diff --git a/src/vpux_compiler/include/vpux/compiler/utils/function_outlining_splitter.hpp b/src/vpux_compiler/include/vpux/compiler/utils/function_outlining_splitter.hpp index a6bda49e2d..afd6993d90 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/function_outlining_splitter.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/function_outlining_splitter.hpp @@ -5,11 +5,12 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/utils/core/array_ref.hpp" #include 
"vpux/utils/core/small_vector.hpp" #include "vpux/utils/logger/logger.hpp" #include +#include #include #include diff --git a/src/vpux_compiler/include/vpux/compiler/utils/infer_output_shape.hpp b/src/vpux_compiler/include/vpux/compiler/utils/infer_output_shape.hpp index a09d6a1471..2824485e4e 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/infer_output_shape.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/infer_output_shape.hpp @@ -5,12 +5,9 @@ #pragma once +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/attributes.hpp" - #include "vpux/utils/core/array_ref.hpp" -#include "vpux/utils/core/range.hpp" namespace vpux { @@ -187,6 +184,13 @@ ShapeInfo inferGroupConvolutionOutputShapeInfo(ShapeInfo& inShapeInfo, ShapeInfo ArrayRef dataPaddingAbove, ArrayRef windowDilations, std::optional maybeGroups, bool hasOutputPadding); +ShapeInfo inferTransposedConvBackpropOutputShapeInfo(const ShapeInfo& inShapeInfo, const ShapeInfo& filterShapeInfo, + ArrayRef windowStrides, + ArrayRef dataPaddingBelow, + ArrayRef dataPaddingAbove, + ArrayRef windowDilations, + ArrayRef outputPadding); + // // Tensor Reifiers // @@ -210,11 +214,12 @@ mlir::FailureOr> reifyMatMulTensors(mlir::OpBuil /** * @brief Reify tensors for convolution or pooling operations. Currently, it supports only convolution with dilation - * equal to 1 and pooling. + * equal to 1 and pooling. 
kernel size is passed along with tensor for maxpool operations * * @param builder - builder to create new operations * @param input - input tensor * @param output - output tensor + * @param kernel - kernel tensor * @param kernelSize - kernel size * @param strides - strides * @param padBegin - padding begin @@ -223,7 +228,8 @@ mlir::FailureOr> reifyMatMulTensors(mlir::OpBuil * @return reified shapes for output tensor */ mlir::FailureOr> reifyConvPoolTensors(mlir::OpBuilder& builder, mlir::Value input, - mlir::Value output, ArrayRef kernelSize, + mlir::Value output, mlir::Value kernel, + ArrayRef kernelSize, ArrayRef strides, ArrayRef padBegin, ArrayRef padEnd, mlir::Location loc); diff --git a/src/vpux_compiler/include/vpux/compiler/utils/net/network_info_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/net/network_info_utils.hpp index 382b1a06da..f152bfd93e 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/net/network_info_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/net/network_info_utils.hpp @@ -6,6 +6,8 @@ #pragma once #include "vpux/compiler/dialect/net/IR/ops.hpp" +#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/utils/logger/logger.hpp" namespace vpux::net { @@ -23,4 +25,14 @@ void setupSections(net::NetworkInfoOp netInfo, bool enableProfiling = false); */ void eraseSectionEntries(mlir::Region& section, size_t begin = 0); +// Remove the utility function after pipeline issues are resolved in E#168311 + +/** @brief Utility function for HostCompile pipeline to fetch entry point function. + + The function safely returns an entry point function from network info object. + If the entry point function is not found, it returns nullptr. 
+ Track: E#168311 + */ +mlir::func::FuncOp findEntryPointFunc(mlir::Operation* op, Logger& log); + } // namespace vpux::net diff --git a/src/vpux_compiler/include/vpux/compiler/utils/options.hpp b/src/vpux_compiler/include/vpux/compiler/utils/options.hpp index c1854098d4..b59e7f1dd2 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/options.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/options.hpp @@ -27,6 +27,7 @@ enum class WorkloadManagementBarrierProgrammingMode { UNKNOWN = 255 }; enum class DMAFifoType { SW = 0, HW = 1 }; + /** * @brief This enum is used to specify the mode of weights table reuse. * @@ -41,6 +42,6 @@ StringLiteral stringifyEnum(WorkloadManagementBarrierProgrammingMode val); StringLiteral stringifyEnum(DMAFifoType val); StringLiteral stringifyEnum(WeightsTableReuseMode val); std::optional convertToOptional(const StrOption& strOption); +StringLiteral stringifyEnum(WorkloadManagementMode val); bool isOptionEnabled(const BoolOption& option); - } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/permute_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/permute_utils.hpp index 8e8ee8664b..4747c127b8 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/permute_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/permute_utils.hpp @@ -5,13 +5,15 @@ #pragma once -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/attributes/dims_order.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" -#include "vpux/compiler/core/attributes/dims_order.hpp" -#include 
"vpux/compiler/core/attributes/shape.hpp" +#include namespace vpux { @@ -84,4 +86,7 @@ std::optional tryToFindPermuteCastOp(mlir::Location loc, mlir ShapeRef outShape, mlir::PatternRewriter& rewriter); Dim inferDimAfterPermutation(Dim dim, DimsOrder srcOrder, DimsOrder dstOrder, mlir::AffineMap perm); + +bool isSuitableToAdjustMemPermuteShape(vpux::NDTypeInterface inType, vpux::NDTypeInterface outType, + mlir::AffineMap memPerm); } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/platform_resources.hpp b/src/vpux_compiler/include/vpux/compiler/utils/platform_resources.hpp index 9abc24d4f1..1505ad9119 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/platform_resources.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/platform_resources.hpp @@ -25,6 +25,9 @@ constexpr Byte VPUX40XX_CMX_WORKSPACE_FRAGMENTATION_AWARE_SIZE = Byte( constexpr int VPUX37XX_MAX_DPU_GROUPS = 2; constexpr int VPUX40XX_MAX_DPU_GROUPS = 6; +constexpr int VPUX37XX_MAX_SHAVES_PER_TILE = 2; +constexpr int VPUX40XX_MAX_SHAVES_PER_TILE = 2; + constexpr int VPUX37XX_MAX_DMA_PORTS = 2; constexpr int VPUX40XX_MAX_DMA_PORTS = 2; diff --git a/src/vpux_compiler/include/vpux/compiler/utils/quantization.hpp b/src/vpux_compiler/include/vpux/compiler/utils/quantization.hpp index c40488a7ee..5aa3be1cbf 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/quantization.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/quantization.hpp @@ -6,11 +6,10 @@ #pragma once #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/core/numeric.hpp" - -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" +#include "vpux/utils/logger/logger.hpp" #include #include @@ -21,6 +20,10 @@ #include #include +namespace vpux::VPU { +enum class EltwiseType : uint64_t; +} // namespace 
vpux::VPU + namespace vpux { struct QuantizationLevels final { @@ -45,7 +48,9 @@ mlir::Type normalizeQuantStorageType(mlir::quant::QuantizedType qType); mlir::Type expandScalesAndZP(mlir::Type perAxisQType, ShapeRef padBefore, ShapeRef padAfter); -mlir::Type tileScalesAndZP(mlir::Type perAxisQType, ShapeRef shape, ShapeRef offsets); +mlir::Type tileScalesAndZP(mlir::Type perAxisQType, ShapeRef shape, ShapeRef offsets, ShapeRef strides = Shape()); + +mlir::Type tileScalesAndZP(mlir::Type perAxisQType, ArrayRef offsets, ArrayRef sizes); mlir::Type changeAxis(mlir::Type perAxisQType, int32_t axis); diff --git a/src/vpux_compiler/include/vpux/compiler/utils/shave.hpp b/src/vpux_compiler/include/vpux/compiler/utils/shave.hpp index 41d29e8fa7..0a790eb42e 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/shave.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/shave.hpp @@ -5,15 +5,13 @@ #pragma once +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/utils/core/string_ref.hpp" + #include #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/utils/logger/logger.hpp" - namespace vpux { // Encode tile index and index of SHAVE unit inside the tile into a single integer for a convenient use in scheduling @@ -24,6 +22,6 @@ int64_t getShaveQueueIdEncoding(int64_t numTiles, int64_t tileIndex, int64_t lis namespace VPU { constexpr StringRef USE_DEDICATED_FIFO_PER_SHAVE_ENGINE = "VPU.UseDedicatedFifoPerShaveEngine"; bool isFifoPerShaveEngineEnabled(mlir::Operation* op); -bool hasSupportForFifoPerShaveEngine(VPU::ArchKind arch, bool enableWorkloadManagement); +bool hasSupportForFifoPerShaveEngine(config::ArchKind arch, bool enableWorkloadManagement); } // namespace VPU } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/utils/swizzle_transform.hpp 
b/src/vpux_compiler/include/vpux/compiler/utils/swizzle_transform.hpp index 3ce3ef42b3..da66bae70b 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/swizzle_transform.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/swizzle_transform.hpp @@ -23,7 +23,7 @@ const uint32_t MAX_SWIZZLE_KEY{5}; class AddressTransform { public: - AddressTransform(uint32_t staggerBits, VPU::ArchKind archKind) + AddressTransform(uint32_t staggerBits, config::ArchKind archKind) : _staggerAddressBits{staggerBits}, _archKind{archKind} { setStaggerBits(staggerBits); } @@ -43,13 +43,13 @@ class AddressTransform { uint32_t _staggerAddressMask{}; uint32_t _shift{}; uint32_t _log2RamCutDataWidth{LOG2_RAM_CUT_DATA_WIDTH}; - VPU::ArchKind _archKind{}; + config::ArchKind _archKind{}; uint32_t _ramCutAddressMask{RAM_CUT_ADDRESS_MASK}; }; class BufferSwizzleTransform { public: - BufferSwizzleTransform(uint32_t swizzleKey = 5, VPU::ArchKind archKind = VPU::ArchKind::NPU37XX); + BufferSwizzleTransform(uint32_t swizzleKey = 5, config::ArchKind archKind = config::ArchKind::NPU37XX); uint32_t getSwizzlePatternStride(); template diff --git a/src/vpux_compiler/include/vpux/compiler/utils/swizzling_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/swizzling_utils.hpp index 7d47b6f68d..68130b7cce 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/swizzling_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/swizzling_utils.hpp @@ -26,15 +26,15 @@ constexpr uint64_t SWIZZLING_KEY_5 = 5; constexpr int64_t SWIZZLING_SIZE_ALIGNMENT_VPUX40XX = 1024; constexpr int64_t SWIZZLING_SIZE_ALIGNMENT_VPUX37XX = 512; -int64_t getSizeAlignmentForSwizzling(VPU::ArchKind arch); +int64_t getSizeAlignmentForSwizzling(config::ArchKind arch); /// @brief Required alignment of buffers in CMX memory required swizzling operations /// @param swizzlingKey /// @param archKind /// @return alignment [bytes] -int64_t getAddressAlignmentForSwizzling(int64_t swizzlingKey, VPU::ArchKind archKind); 
+int64_t getAddressAlignmentForSwizzling(int64_t swizzlingKey, config::ArchKind archKind); -VPUIP::SwizzlingSchemeAttr createSwizzlingSchemeAttr(mlir::MLIRContext* ctx, VPU::ArchKind archKind, +VPUIP::SwizzlingSchemeAttr createSwizzlingSchemeAttr(mlir::MLIRContext* ctx, config::ArchKind archKind, int64_t swizzlingKey); // For swizzling buffer size needs to be aligned to 512/1024 as dictated by arch @@ -59,9 +59,9 @@ VPUIP::SwizzlingSchemeAttr getSwizzlingSchemeAttr(mlir::Type type); // Retrieve swizzling key setting embedded in layout with buffer types int64_t getSwizzlingKey(mlir::Type type); -mlir::Type setSwizzlingKey(mlir::Type type, mlir::IntegerAttr swizzlingKeyAttr, VPU::ArchKind archKind); +mlir::Type setSwizzlingKey(mlir::Type type, mlir::IntegerAttr swizzlingKeyAttr, config::ArchKind archKind); -mlir::Type setSwizzlingKey(mlir::Type type, int64_t swizzlingKey, VPU::ArchKind archKind); +mlir::Type setSwizzlingKey(mlir::Type type, int64_t swizzlingKey, config::ArchKind archKind); SmallVector getPerClusterBytesAddedForSwizzling(VPUIP::DistributedBufferType distributedBuffer); diff --git a/src/vpux_compiler/include/vpux/compiler/utils/symbolization.hpp b/src/vpux_compiler/include/vpux/compiler/utils/symbolization.hpp index 501f2950fb..449e9d1dba 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/symbolization.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/symbolization.hpp @@ -83,6 +83,42 @@ class SymbolizationPattern : public mlir::OpConversionPattern { } } + // helper to unify approach to build unique symbolic names + llvm::SmallVector createSymbolicName( + SourceOp op, std::optional taskTypeString = std::nullopt, + std::optional counter = std::nullopt) { + auto fullName = SourceOp::getOperationName(); + auto opName = op->getName().stripDialect(); + + mlir::Operation* base = op.getOperation(); + VPUX_THROW_UNLESS(base->getResults().size() == 1, + "Default symbolic converter only supports ops with exactly one result. 
For {0} got {1}", + fullName, base->getResults().size()); + + llvm::SmallVector suffixes; + if (taskTypeString.has_value()) { + suffixes.push_back(taskTypeString.value()); + } + if (auto indexType = mlir::dyn_cast(base->getResult(0).getType())) { + suffixes.push_back(std::to_string(indexType.getTileIdx())); + suffixes.push_back(std::to_string(indexType.getListIdx())); + suffixes.push_back(std::to_string(indexType.getValue())); + VPUX_THROW_WHEN(counter.has_value(), + "Unexpected counter value ({0}) was provided which is of no use for the " + "current operation {1}", + counter.value(), op); + } else if (counter.has_value()) { + suffixes.push_back(std::to_string(counter.value())); + } + + std::stringstream opSuffixStream; + for (auto& suffix : suffixes) { + opSuffixStream << "_" << suffix; + } + auto symName = mlir::StringAttr::get(op.getContext(), opName + opSuffixStream.str()); + return {mlir::FlatSymbolRefAttr::get(symName)}; + } + std::pair processDynamicShapes(mlir::MLIRContext* context, mlir::OperandRangeRange inputShapes, mlir::OperandRangeRange outputShapes) const; diff --git a/src/vpux_compiler/include/vpux/compiler/utils/types.hpp b/src/vpux_compiler/include/vpux/compiler/utils/types.hpp index cfad32b80f..726e930b57 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/types.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/types.hpp @@ -7,20 +7,22 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/core/IR/dynamic_attrs.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" - #include "vpux/utils/core/enums.hpp" #include "vpux/utils/core/mem_size.hpp" +#include "vpux/utils/logger/logger.hpp" #include #include #include #include +namespace vpux::IE { +enum class 
TypeComparisonMode : uint64_t; +} // namespace vpux::IE + namespace vpux { template diff --git a/src/vpux_compiler/include/vpux/compiler/utils/wlm_legalization_utils.hpp b/src/vpux_compiler/include/vpux/compiler/utils/wlm_legalization_utils.hpp index b06c919731..0d49016897 100644 --- a/src/vpux_compiler/include/vpux/compiler/utils/wlm_legalization_utils.hpp +++ b/src/vpux_compiler/include/vpux/compiler/utils/wlm_legalization_utils.hpp @@ -16,6 +16,12 @@ namespace vpux { using TaskQueue = std::map>; enum class MinMaxOption { Min, Max }; +enum class Type : int { Dummy = 0, Real = 1 }; + +// IndexType is used to represent an entry in _barrierAddConsumerProducerMap +// size_t represents index of barrier/DMA and Type represents the type of index i.e. Dummy or Real +// We need to store both as we need to know the true index in barrierInfo to be able to add/remove dependencies +using IndexType = std::pair; template bool compareVPURTOpPosition(const T& lhs, const T& rhs, const BarrierInfo& barrierInfo, bool useIROrder = false) { @@ -69,4 +75,11 @@ void addElementsToSet(BarrierInfo::TaskSet& targetSet, const BarrierInfo::TaskSe bool lastTaskInGroupHasMandatoryUpdateBarrier(const ExecutionGroup& executionGroup, BarrierInfo& barrierInfo); bool inSameTaskBlock(size_t task1, size_t task2, const BlockRange& blockRange); +size_t getIndexOfTask(IndexType indexType, ArrayRef dummyDMAs, BarrierInfo& barrierInfo); +size_t getIndexOfBarrier(IndexType indexType, ArrayRef dummyBarriers, + BarrierInfo& barrierInfo); +VPURT::TaskOp createFetchDMA(mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, int port, + mlir::ValueRange waitBarriers, mlir::ValueRange updateBarriers, + VPUIP::FetchDMAAttr fetchDMAAttr, llvm::StringLiteral opName = "fetch_dma"); + } // namespace vpux diff --git a/src/vpux_compiler/include/vpux/compiler/version.hpp b/src/vpux_compiler/include/vpux/compiler/version.hpp index 517cd7469f..20c42d8d41 100644 --- 
a/src/vpux_compiler/include/vpux/compiler/version.hpp +++ b/src/vpux_compiler/include/vpux/compiler/version.hpp @@ -10,12 +10,15 @@ // This version is exposed via L0 API and reported as (read-only) plugin property NPU_COMPILER_VERSION // #define NPU_COMPILER_VERSION_MAJOR 7 -#define NPU_COMPILER_VERSION_MINOR 21 +#define NPU_COMPILER_VERSION_MINOR 22 /* Change Log: ----------- +NPU Compiler 7.22.0 + - UD32 + NPU Compiler 7.21.0 - UD28 diff --git a/src/vpux_compiler/src/NPU37XX/CMakeLists.txt b/src/vpux_compiler/src/NPU37XX/CMakeLists.txt index 286682ed97..53a9e38daa 100644 --- a/src/vpux_compiler/src/NPU37XX/CMakeLists.txt +++ b/src/vpux_compiler/src/NPU37XX/CMakeLists.txt @@ -47,13 +47,13 @@ find_package(Git REQUIRED) execute_process( COMMAND ${GIT_EXECUTABLE} lfs pull - WORKING_DIRECTORY "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel") + WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel") vpux_embed_bin_file( - SOURCE_FILE "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_2_7_159.vpunn" + SOURCE_FILE "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_2_7_159.vpunn" HEADER_FILE "${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/dialect/VPU/generated/cost_model_data_2_7.hpp.inc" VARIABLE_NAME "COST_MODEL_2_7") vpux_embed_bin_file( - SOURCE_FILE "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_2_7_159.fast.vpunn" + SOURCE_FILE "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_2_7_159.fast.vpunn" HEADER_FILE "${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/dialect/VPU/generated/cost_model_data_2_7_fast.hpp.inc" VARIABLE_NAME "COST_MODEL_2_7_FAST") diff --git a/src/vpux_compiler/src/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.cpp b/src/vpux_compiler/src/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.cpp index adf9eb9b41..993ae2c975 100644 --- a/src/vpux_compiler/src/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.cpp +++ 
b/src/vpux_compiler/src/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.cpp @@ -5,6 +5,8 @@ #include "vpux/compiler/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp" #include "vpux/compiler/NPU37XX/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE.hpp" +#include "vpux/compiler/dialect/VPU/utils/sep_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" #include #include @@ -15,14 +17,19 @@ #include "vpux/compiler/NPU37XX/conversion.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_matmul_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/ppe_version_config.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" namespace vpux::arch37xx { @@ -408,7 +415,7 @@ void ConvertIEToVPUNCEPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); mlir::ConversionTarget target(ctx); @@ -436,9 +443,12 @@ void ConvertIEToVPUNCEPass::safeRunOnFunc() { /*checkChannelAlignment=*/true); }); target.addDynamicallyLegalOp([&](IE::GroupConvolutionOp op) { + if (VPU::isDilatedGroupConv(op)) { + return true; + } + return 
!VPU::NCEDepthConvolutionOp::isSupported(op, logCb, /*checkLayout=*/true, - /*checkChannelAlignment=*/true) || - VPU::isDilatedGroupConv(op); + /*checkChannelAlignment=*/true); }); target.addDynamicallyLegalOp([&](IE::MaxPoolOp op) { return !VPU::NCEMaxPoolOp::isSupported(op, logCb, /*checkLayout=*/true, diff --git a/src/vpux_compiler/src/NPU37XX/conversion/pipelines.cpp b/src/vpux_compiler/src/NPU37XX/conversion/pipelines.cpp index 46e083b17a..cd6017ae7a 100644 --- a/src/vpux_compiler/src/NPU37XX/conversion/pipelines.cpp +++ b/src/vpux_compiler/src/NPU37XX/conversion/pipelines.cpp @@ -65,6 +65,12 @@ void vpux::arch37xx::buildLowerVPUIP2ELFPipeline(mlir::OpPassManager& pm, Logger pm.addPass(ELFNPU37XX::createUpdateELFSectionFlagsPass(log)); } +void vpux::arch37xx::buildLowerIE2VPUPipelineReferenceSW(mlir::OpPassManager& pm, Logger log) { + const auto grc = getDefaultGreedyRewriteConfig(); + pm.addPass(createConvertLayers2VPUPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); +} + // // registerConversionPipelines // @@ -75,6 +81,11 @@ void vpux::arch37xx::registerConversionPipeline() { vpux::arch37xx::buildLowerIE2VPUPipeline(pm); }); + mlir::PassPipelineRegistration<>("lower-IE-to-VPU-reference-sw", + "Performs full lowering from the IE Dialect to VPU Dialect", + [](mlir::OpPassManager& pm) { + vpux::arch37xx::buildLowerIE2VPUPipelineReferenceSW(pm); + }); mlir::PassPipelineRegistration( "lower-VPU-to-VPUIP", "Performs full lowering from the VPU Dialect to VPUIP Dialect, SW operations are converted to SWKernelOp", diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.cpp index c73850195d..6127e2e29c 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.cpp @@ -4,6 +4,7 @@ //
#include "vpux/compiler/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp" #include "vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp index 38882228db..82a4489524 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_convert_to_dpu_checker.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_convert_to_dpu_checker.cpp index 6dd18952b2..283442bd4b 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_convert_to_dpu_checker.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_convert_to_dpu_checker.cpp @@ -4,6 +4,9 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/impl/fuse_convert_to_dpu_checker.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" using namespace vpux::IE::arch37xx; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.cpp index cf9dbf22e0..f587ef3174 100644 --- 
a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp" namespace vpux::IE::arch37xx { diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp new file mode 100644 index 0000000000..2b0f1e8ae2 --- /dev/null +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp @@ -0,0 +1,49 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU37XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp" + +namespace vpux::IE::arch37xx { + +void MapBilinearInterpolateOnDPUStrategy::prepareInterpolate(mlir::ConversionTarget& target, LogCb logCb) const { + target.addDynamicallyLegalOp([this, logCb](IE::InterpolateOp op) { + // For interpolation on axes H & W, and C <= 4, + // SW kernel performance is bigger than DPU decomposition performance for floating scale factors + const auto inputShape = getShape(op.getInput()); + if (inputShape.size() != 4 || inputShape[Dims4D::Act::C] > 4) { + return isLegalInterpolateOp(op, _interpolateAsSEOpInStrategy, logCb); + } + + const auto outputShape = getShape(op.getOutput()); + const auto attr = op.getAttr(); + const auto coordModeAttr = attr.getCoordMode(); + bool isAlignCorners =
coordModeAttr.getValue() == IE::InterpolateCoordMode::ALIGN_CORNERS; + auto isIntegerRatio = [&](const auto& dim) -> bool { + auto outputDim = outputShape[dim]; + auto inputDim = inputShape[dim]; + + if (isAlignCorners) { + outputDim = outputDim == 1 ? 1 : (outputDim - 1); + inputDim = inputDim == 1 ? 1 : (inputDim - 1); + } + + return (outputDim % inputDim == 0) || (inputDim % outputDim == 0); + }; + + const bool isInterpOnHW = inputShape[Dims4D::Act::N] == 1 && outputShape[Dims4D::Act::N] == 1 && + inputShape[Dims4D::Act::H] != outputShape[Dims4D::Act::H] && + inputShape[Dims4D::Act::W] != outputShape[Dims4D::Act::W] && + inputShape[Dims4D::Act::C] == outputShape[Dims4D::Act::C]; + + if (isInterpOnHW && !isIntegerRatio(Dims4D::Act::H) && !isIntegerRatio(Dims4D::Act::W)) { + return true; + } + return isLegalInterpolateOp(op, _interpolateAsSEOpInStrategy, logCb); + }); +} +} // namespace vpux::IE::arch37xx diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/interfaces/elem_type_info_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/interfaces/elem_type_info_ops.cpp index 98a7c3cfa9..04e47e6032 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/interfaces/elem_type_info_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/interfaces/elem_type_info_ops.cpp @@ -4,13 +4,13 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/IR/ops_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include 
"vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" using namespace vpux; using namespace IE; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_d2s_to_transpose_conv.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_d2s_to_transpose_conv.cpp index a9345272e7..9a98bd3f0d 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_d2s_to_transpose_conv.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_d2s_to_transpose_conv.cpp @@ -3,18 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -279,7 +276,7 @@ void ConvertDepth2SpaceToTransposedConvPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - auto benefitVerifier = IE::createD2SToTransposedConvVerifier(VPU::getArch(module)); + auto benefitVerifier = IE::createD2SToTransposedConvVerifier(config::getArch(module)); mlir::RewritePatternSet patterns(&ctx); patterns.insert(&ctx, std::move(benefitVerifier), _log); diff --git 
a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_deformable_conv_to_conv.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_deformable_conv_to_conv.cpp index 472bdee5c1..f8de2d91b5 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_deformable_conv_to_conv.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_deformable_conv_to_conv.cpp @@ -2,13 +2,18 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // + #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" - #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_fft_to_conv.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_fft_to_conv.cpp index caac1bdf53..4aa846ef9e 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_fft_to_conv.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_fft_to_conv.cpp @@ -5,14 +5,14 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include 
"vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_sub_gru_sequence_to_conv.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_sub_gru_sequence_to_conv.cpp index fa4aadfe09..d6f017d5ad 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_sub_gru_sequence_to_conv.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_sub_gru_sequence_to_conv.cpp @@ -4,10 +4,11 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/adjust_layout_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp index 516d2b7f4f..11f0d4d44b 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp 
@@ -5,10 +5,8 @@ #include "vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" - #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_weights_to_i8.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_weights_to_i8.cpp index bb562b9b03..af8a92afee 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_weights_to_i8.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/convert_weights_to_i8.cpp @@ -5,14 +5,15 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" #include #include + #include namespace vpux::IE::arch37xx { diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/expand_activation_channels.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/expand_activation_channels.cpp index af0eb58e99..39b36ffd90 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/expand_activation_channels.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/expand_activation_channels.cpp @@ -6,9 +6,8 @@ #include "vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_outstanding_dequant.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_outstanding_dequant.cpp index 678c2fb2d9..6a8da6e7eb 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_outstanding_dequant.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_outstanding_dequant.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_permute_quantize_expand.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_permute_quantize_expand.cpp index 87a3768071..8820572293 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_permute_quantize_expand.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_permute_quantize_expand.cpp @@ -5,14 +5,13 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include 
"vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/enums.hpp" - #include #include #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_reorders.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_reorders.cpp index 4cb2ad41eb..1146923100 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_reorders.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_reorders.cpp @@ -5,7 +5,10 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/utils/act_shave_utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_static_scale.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_static_scale.cpp index 3ddfa2bece..917e371219 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_static_scale.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/fuse_static_scale.cpp @@ -4,9 +4,12 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include 
"vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp index c05a0c68c8..c1564cade0 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp @@ -6,9 +6,10 @@ #include "vpux/compiler/dialect/IE/transforms/passes/insert_identity_pool_before_op.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" - #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp deleted file mode 100644 index ebef930675..0000000000 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// Copyright (C) 2023-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp" -#include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" - -#include "vpux/compiler/utils/rewriter.hpp" - -#include - -namespace vpux::IE::arch37xx { -#define GEN_PASS_DECL_MAPBILINEARINTERPOLATEONDPUPASS -#define GEN_PASS_DEF_MAPBILINEARINTERPOLATEONDPUPASS -#include "vpux/compiler/NPU37XX/dialect/IE/passes.hpp.inc" -} // namespace vpux::IE::arch37xx - -using namespace vpux; - -namespace { - -// -// MapBilinearInterpolateOnDPUPass -// - -class MapBilinearInterpolateOnDPUPass final : - public IE::arch37xx::impl::MapBilinearInterpolateOnDPUPassBase { -public: - explicit MapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, Logger log) - : _interpolateAsSEOp(interpolateAsSEOp) { - Base::initLogger(log, Base::getArgumentName()); - } - - mlir::LogicalResult initialize(mlir::MLIRContext* ctx) final; - -public: - class MapBilinearInterpolateOnDPURewriter; - -private: - void safeRunOnFunc() final; - -private: - bool _interpolateAsSEOp; -}; - -class MapBilinearInterpolateOnDPUPass::MapBilinearInterpolateOnDPURewriter final : - public vpux::IE::MapBilinearInterpolateOnDPUBaseRewriter { -public: - MapBilinearInterpolateOnDPURewriter(mlir::MLIRContext* ctx, Logger log) - : vpux::IE::MapBilinearInterpolateOnDPUBaseRewriter(ctx, log) { - setDebugName("MapBilinearInterpolateOnDPURewriterVPUX37XX"); - } -}; - -mlir::LogicalResult MapBilinearInterpolateOnDPUPass::initialize(mlir::MLIRContext* ctx) { - if (mlir::failed(Base::initialize(ctx))) { - return mlir::failure(); - } - - // When this parameter has a value, it probably comes from LIT test. 
- // Override the default - if (interpolateAsSEOp.hasValue()) { - _interpolateAsSEOp = interpolateAsSEOp.getValue(); - } - - return mlir::success(); -} - -void MapBilinearInterpolateOnDPUPass::safeRunOnFunc() { - auto& ctx = getContext(); - auto func = getOperation(); - const auto logCb = [&](const formatv_object_base& msg) { - _log.trace("{0}", msg.str()); - }; - - mlir::ConversionTarget target(ctx); - target.addDynamicallyLegalOp([&](IE::InterpolateOp op) { - // For interpolation on axes H & W, and C <= 4, - // SW kernel performance is bigger that DPU decomposition performance for floating scale factors - const auto inputShape = getShape(op.getInput()); - if (inputShape.size() != 4 || inputShape[Dims4D::Act::C] > 4) { - return isLegalInterpolateOp(op, _interpolateAsSEOp, logCb); - } - - const auto outputShape = getShape(op.getOutput()); - const auto attr = op.getAttr(); - const auto coordModeAttr = attr.getCoordMode(); - bool isAlignCorners = coordModeAttr.getValue() == IE::InterpolateCoordMode::ALIGN_CORNERS ? true : false; - auto isIntegerRatio = [&](const auto& dim) -> bool { - auto outputDim = outputShape[dim]; - auto inputDim = inputShape[dim]; - - if (isAlignCorners) { - outputDim = outputDim == 1 ? 1 : (outputDim - 1); - inputDim = inputDim == 1 ? 
1 : (inputDim - 1); - } - - return (outputDim % inputDim == 0) || (inputDim % outputDim == 0); - }; - - const bool isInterpOnHW = inputShape[Dims4D::Act::N] == 1 && outputShape[Dims4D::Act::N] == 1 && - inputShape[Dims4D::Act::H] != outputShape[Dims4D::Act::H] && - inputShape[Dims4D::Act::W] != outputShape[Dims4D::Act::W] && - inputShape[Dims4D::Act::C] == outputShape[Dims4D::Act::C]; - - if (isInterpOnHW && !isIntegerRatio(Dims4D::Act::H) && !isIntegerRatio(Dims4D::Act::W)) { - return true; - } - return isLegalInterpolateOp(op, _interpolateAsSEOp, logCb); - }); - - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - - mlir::RewritePatternSet patterns(&ctx); - patterns.insert(&ctx, _log); - - if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { - signalPassFailure(); - } -} - -} // namespace - -// -// createMapBilinearInterpolateOnDPUPass -// - -std::unique_ptr vpux::IE::arch37xx::createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, - Logger log) { - return std::make_unique(interpolateAsSEOp, log); -} diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_network_input_convert.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_network_input_convert.cpp index 8c07dadc9d..ba297ff105 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_network_input_convert.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_network_input_convert.cpp @@ -6,10 +6,12 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" - #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_slice_expand.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_slice_expand.cpp index d79cb5ba86..a2572c5d40 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_slice_expand.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/optimize_slice_expand.cpp @@ -5,8 +5,13 @@ #include "vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE::arch37xx { diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_convolution.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_convolution.cpp index 77ce8ddeb1..77b6937d5e 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_convolution.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_convolution.cpp @@ -4,9 +4,10 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_matmul.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_matmul.cpp index 96c7202cdf..312d2f72f9 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_matmul.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/process_asymmetric_zero_points_for_matmul.cpp @@ -6,13 +6,15 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" - -#include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/range.hpp" @@ -186,21 +188,32 @@ IE::FakeQuantizeOp 
getMatchingFakeQuantizeOp(IE::ConvolutionOp convOp) { // Do pattern checks and quantization checks // We want to reach FakeQuantize there may be multiple optional different ops (reshape/transpose) + auto maybeFQ = checkOp(convOp, getConvWeights, checkFQ); + if (maybeFQ != nullptr) { + return maybeFQ; + } + auto maybeReshapeToNxCx1x1 = checkOp(convOp, getConvWeights, checkReshapeToNxCx1x1); - auto maybeFQ = checkOp(maybeReshapeToNxCx1x1, getReshapeInput, checkFQ); - if (maybeFQ == nullptr) { - // TransposeOp may or may not be in the graph depending on transpose_b. - auto maybeTransposeOp = - checkOp(maybeReshapeToNxCx1x1, getReshapeInput, checkTranspose); - maybeFQ = checkOp(maybeTransposeOp, getTransposeInput, checkFQ); - if (maybeFQ == nullptr) { - // Some graphs have reshape to 1xHx1xW - auto maybeReshapeTo1xHx1xW = checkOp( - maybeTransposeOp, getTransposeInput, checkReshapeTo1xHx1xW); - maybeFQ = checkOp(maybeReshapeTo1xHx1xW, getReshapeInput, checkFQ); - } + maybeFQ = checkOp(maybeReshapeToNxCx1x1, getReshapeInput, checkFQ); + + if (maybeFQ != nullptr) { + return maybeFQ; + } + + // TransposeOp may or may not be in the graph depending on transpose_b. 
+ auto maybeTransposeOp = + checkOp(maybeReshapeToNxCx1x1, getReshapeInput, checkTranspose); + maybeFQ = checkOp(maybeTransposeOp, getTransposeInput, checkFQ); + if (maybeFQ != nullptr) { + return maybeFQ; } + + // Some graphs have reshape to 1xHx1xW + auto maybeReshapeTo1xHx1xW = + checkOp(maybeTransposeOp, getTransposeInput, checkReshapeTo1xHx1xW); + maybeFQ = checkOp(maybeReshapeTo1xHx1xW, getReshapeInput, checkFQ); + return maybeFQ; } @@ -385,16 +398,19 @@ class FixMatmulZeroPointRewriter final : public mlir::OpRewritePatterngetName(), convOp->getLoc()); + + auto nestedLog = _log.nest(); auto asymmetricFQ = getMatchingFakeQuantizeOp(convOp); if (asymmetricFQ == nullptr) { - return mlir::failure(); + return matchFailed(nestedLog, rewriter, convOp, "Could not find asymmetric FQ"); } // Most of the time runtime dequantization is more efficient than this method, however for some layers new ops added // with this are really small compared to original matmul, so if new ops are smaller // 1/_decompositionEnablementRatio(250) of original matmul we use this method if (!isConversionBeneficial(convOp, _decompositionEnablementRatio)) { - return mlir::failure(); + return matchFailed(nestedLog, rewriter, convOp, "Conversion is not beneficial."); } // We change outputHigh/Low of FQ so original matmul runs as if it was symmetric, then we will apply a fix later diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_expand.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_expand.cpp index 6d431bee35..b8be834469 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_expand.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_expand.cpp @@ -3,28 +3,24 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include -#include -#include -#include -#include #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include #include +#include +#include #include #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_reorder_to_nce.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_reorder_to_nce.cpp index dac058176d..b1d965428b 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_reorder_to_nce.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/propagate_reorder_to_nce.cpp @@ -5,11 +5,12 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/utils/act_shave_utils.hpp" #include "vpux/compiler/dialect/IE/utils/softmax_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/utils/error.hpp" #include 
"vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_d2s_scale_shift.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_d2s_scale_shift.cpp index 61b34a81be..177f8942be 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_d2s_scale_shift.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_d2s_scale_shift.cpp @@ -4,13 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" - #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" - -#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_maxpool_with_activation.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_maxpool_with_activation.cpp index 03ada28c72..6bcb81607a 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_maxpool_with_activation.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/swap_maxpool_with_activation.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/weights_quant_fused_into_task.cpp 
b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/weights_quant_fused_into_task.cpp index c60747701c..7434c57a6f 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/weights_quant_fused_into_task.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/passes/weights_quant_fused_into_task.cpp @@ -5,7 +5,7 @@ #include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/const/ops.hpp" namespace vpux::IE::arch37xx { diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/pipelines.cpp index b455437f00..bbe3c0e167 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/transforms/pipelines.cpp @@ -80,6 +80,7 @@ void vpux::IE::arch37xx::buildExpandAndOptimizeActivationChannelsPipeline( } pm.addPass(IE::arch37xx::createExpandActivationChannelsPass( /*seOpsEnabled=*/isOptionEnabled(options.enableSEPtrsOperations), log)); + pm.addPass(IE::createExpandSoftmaxAxisPass(log)); pm.addPass(mlir::createCanonicalizerPass(grc)); if (options.enableOptimizeSliceExpand) { @@ -160,6 +161,7 @@ void vpux::IE::arch37xx::buildInitialTransformationsPipeline(mlir::OpPassManager pm.addPass(IE::createDecomposeNormalizeL2Pass(log)); pm.addPass(IE::createReshapeMatMulInputsPass(options.enableGroupedMatMul, log)); pm.addPass(IE::createAdjustFakeQuantizeParamsPass(log)); + pm.addPass(IE::createAdjustFakeQdqParamsPass(log)); pm.addPass(IE::createFuseFQAndMulPass(options.fuseFQAndMulWithNonConstInput, log)); pm.addPass(IE::createHandleU16FakeQuantizePass(log)); pm.addPass(IE::createSwishFusionPass(log)); @@ -197,7 +199,7 @@ void vpux::IE::arch37xx::buildLowPrecisionPipeline(mlir::OpPassManager& pm, cons 
pm.addPass(IE::createOptimizeUnalignedQDQSeqPass(log)); pm.addPass(IE::createSwapFakeQuantWithReshapeAndStridedSlicePass(log)); - pm.addPass(IE::createSwapConvertWithTransposeReshapePass(log)); + pm.addPass(IE::createSwapConvertWithReshapeKindOpsPass(log)); if (options.enableAlignScales) { pm.addPass(IE::createAlignScalesPass(isOptionEnabled(options.enableSEPtrsOperations), log)); } @@ -213,6 +215,9 @@ void vpux::IE::arch37xx::buildLowPrecisionPipeline(mlir::OpPassManager& pm, cons } pm.addPass(IE::createSplitFakeQuantPass(log)); + if (options.enablePropagateQuantDequant) { + pm.addPass(IE::createPropagateQuantizeDequantizePass(isOptionEnabled(options.enableSEPtrsOperations), log)); + } pm.addPass(IE::createFuseOpWithQuantizePass(log)); pm.addPass(IE::createConvertToDequantizePass(options, log)); if (options.enablePropagateQuantDequant) { @@ -481,8 +486,7 @@ void vpux::IE::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const I } if (options.enableBilinearInterpolateOnDPU) { - pm.addPass(IE::arch37xx::createMapBilinearInterpolateOnDPUPass(isOptionEnabled(options.enableSEPtrsOperations), - log)); + pm.addPass(IE::createMapBilinearInterpolateOnDPUPass(isOptionEnabled(options.enableSEPtrsOperations), log)); } pm.addPass(IE::createConvertBatchedLayerTo1NPass(log)); @@ -544,6 +548,56 @@ void vpux::IE::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const I } } +void vpux::IE::arch37xx::buildReferenceSWPipeline(mlir::OpPassManager& pm, + const IE::arch37xx::DefaultHWOptions& options, Logger log) { + const auto grc = getDefaultGreedyRewriteConfig(); + // No passes should be run before this pipeline, with very few exceptions. 
+ IE::buildPostImportPipeline(pm, log); + + // Level 3 : Topology + IE::arch37xx::buildInitialLowPrecisionTransformationsPipeline(pm, IE::LowPrecisionTransformOptions(options), log); + IE::arch37xx::buildInitialTransformationsPipeline(pm, IE::TransformOptions(options), log); + IE::buildAdjustPrecisionPipeline(pm, IE::AdjustPrecisionOptions(options), log); + + // Resolve group quant MatMul pattern + pm.addPass(IE::createUniquifyOpsPass(log)); + pm.addPass(IE::createMergeParallelFullyConnectedPass(log)); + pm.addPass(IE::createUnrollGroupQuantizePass(log)); + pm.addPass(IE::createUnrollFullyConnectedPass(log)); + if (options.fuseScalesToAccumulate) { + pm.addPass(IE::createFuseScalesToAccumulatePass(log)); + } + pm.addPass(IE::createConvertMatMulToConvPass(log)); + if (options.enableConvertFCToConv) { + pm.addPass(IE::createConvertFCToConvPass(log)); + } + + pm.addPass(IE::createResolveStridedSlicePass(log)); + pm.addPass(IE::createConvertStridedSlice2ConvPass(log)); + pm.addPass(IE::createConvertNceOpsTo4DPass(log)); + pm.addPass(IE::createConvertShapeTo4DPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + pm.addPass(IE::createConvertToSpatialOpPass(false, isOptionEnabled(options.enableSEPtrsOperations), log)); + pm.addPass(IE::createConvertGRNToNormalizeL2Pass(log)); + pm.addPass(IE::createResolveScatterUpdateByTransposePass(log)); + IE::buildAdjustForVPUPipeline(pm, IE::AdjustForVPUOptions(options), log); + + pm.addPass(IE::createSplitFakeQuantPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + pm.addPass(IE::createDequantizeConstPass(options.runtimeDequantizationLimit, + isOptionEnabled(options.enableRuntimeDequant), log)); + if (options.enableMergeFakeQuant) { + pm.addPass(IE::createMergeFakeQuantPass(log)); + } + pm.addPass(mlir::createCanonicalizerPass(grc)); + + IE::arch37xx::buildAdjustLayoutPipeline(pm, IE::AdjustLayoutOptions(options), log); + pm.addPass(IE::createConvertAssignReadValueToReturnsAndInputs(log)); + + 
pm.addPass(IE::createConvertToMemPermutePass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); +} + // // registerIEPipelines // @@ -625,4 +679,10 @@ void vpux::IE::arch37xx::registerIEPipelines() { [](mlir::OpPassManager& pm) { IE::arch37xx::buildDynamicShapeTransformationsPipeline(pm, DynamicShapeTransformOptions()); }); + + mlir::PassPipelineRegistration( + "reference-sw-mode-ie", "IE dialect part of Reference SW pipeline", + [](mlir::OpPassManager& pm, const IE::arch37xx::DefaultHWOptions& options) { + IE::arch37xx::buildReferenceSWPipeline(pm, options); + }); } diff --git a/src/vpux_compiler/src/NPU37XX/dialect/IE/utils/quantization.cpp b/src/vpux_compiler/src/NPU37XX/dialect/IE/utils/quantization.cpp index 5f3f95c910..0b3ca654a3 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/IE/utils/quantization.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/IE/utils/quantization.cpp @@ -4,7 +4,12 @@ // #include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_permute_ie.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_permute_ie.cpp index 9599ba52ef..52697657f1 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_permute_ie.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_permute_ie.cpp @@ -6,8 +6,11 @@ #include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/VPU/interfaces/common_utils/layer_permute_ie.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_post_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_post_ops.cpp index c17ee511d6..0cfb21c2c4 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_post_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layer_post_ops.cpp @@ -4,16 +4,17 @@ // #include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/custom_float.hpp" #include "vpux/utils/core/numeric.hpp" -#include "vpux/utils/core/type/float16.hpp" #include @@ -101,7 +102,7 @@ class LayerWithPostOpModel final : return false; } - return VPU::NCEInvariant::verifyKernel(mlir::cast(mainOp)).succeeded(); + return VPU::NCEInvariant::isSupported(mlir::cast(mainOp)).succeeded(); } bool isSupportedClampOp(mlir::Operation* mainOp, mlir::Operation* clampOp, const LogCb& logCb) const { @@ -113,7 +114,7 @@ class LayerWithPostOpModel final : return false; } - return VPU::NCEInvariant::verifyKernel(mlir::cast(mainOp)).succeeded(); + return VPU::NCEInvariant::isSupported(mlir::cast(mainOp)).succeeded(); } void 
setLayerClampOp(mlir::Operation* mainOp, mlir::Operation* activationOp) const { diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layout_info.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layout_info.cpp index afa8079de6..017b0ed7ce 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layout_info.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/IR/layout_info.cpp @@ -4,16 +4,29 @@ // #include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/shave_kernel_info.hpp" #include "vpux/compiler/dialect/VPU/utils/layout_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include 
"vpux/compiler/utils/permute_utils.hpp" using namespace vpux; @@ -220,8 +233,8 @@ class MVNLayoutInfoOpModelForSW final : public IE::LayoutInfoOpInterface::Fallba return false; } - mlir::Operation* reshapeBefore = mvnOp->getOperand(0).getDefiningOp(); - if (!mlir::isa_and_nonnull(reshapeBefore)) { + auto reshapeBefore = mvnOp->getOperand(0).getDefiningOp(); + if (!reshapeBefore) { return false; } mlir::Operation* reshapeAfter = *mvnOp->getResult(0).getUsers().begin(); @@ -229,14 +242,8 @@ class MVNLayoutInfoOpModelForSW final : public IE::LayoutInfoOpInterface::Fallba return false; } - // Check Reshapes are symmetrical - const auto inType = mlir::cast(reshapeBefore->getOperand(0).getType()); - const auto inC = inType.getShape()[Dims4D::Act::C]; - const auto outType = mlir::cast(reshapeAfter->getResult(0).getType()); - const auto outC = outType.getShape()[Dims4D::Act::C]; - if (inC != outC) { - // Check pattern Reshape1 -> MVN -> Reshape2 -> GroupConv -> Reshape3 - // To be removed after E#123528 gets implemented + // To be removed after E#123528 gets implemented + if (mlir::isa_and_nonnull(reshapeAfter)) { mlir::Operation* groupConv = *reshapeAfter->getResult(0).getUsers().begin(); if (!mlir::isa_and_nonnull(groupConv)) { return false; @@ -245,11 +252,16 @@ class MVNLayoutInfoOpModelForSW final : public IE::LayoutInfoOpInterface::Fallba if (!mlir::isa_and_nonnull(finReshape)) { return false; } - const auto newOutType = mlir::cast(finReshape->getResult(0).getType()); - const auto newOutC = newOutType.getShape()[Dims4D::Act::C]; - if (inC != newOutC) { - return false; - } + reshapeAfter = finReshape; + } + + // Check Reshapes are symmetrical + const auto inType = mlir::cast(reshapeBefore->getOperand(0).getType()); + const auto outType = mlir::cast(reshapeAfter->getResult(0).getType()); + const auto inC = inType.getShape()[Dims4D::Act::C]; + const auto outC = outType.getShape()[Dims4D::Act::C]; + if (inC != outC) { + return false; } // Check channel constraints diff 
--git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/nce_op_interfaces.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/nce_op_interfaces.cpp index 9542c89899..7a0a6d0822 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/nce_op_interfaces.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/nce_op_interfaces.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -22,7 +23,7 @@ std::vector getNTHWNTKGrid(VPU::MPEMode mode) { return {16, 16, 64}; } } -VPU::MPEMode getMpeModeForConv([[maybe_unused]] VPU::ArchKind arch, ShapeRef shape) { +VPU::MPEMode getMpeModeForConv([[maybe_unused]] config::ArchKind arch, ShapeRef shape) { std::vector> MPECost = {{0.0, VPU::MPEMode::CUBOID_4x16}, {0.0, VPU::MPEMode::CUBOID_8x16}, {0.0, VPU::MPEMode::CUBOID_16x16}}; @@ -48,7 +49,7 @@ VPU::MPEMode getMpeModeForConv([[maybe_unused]] VPU::ArchKind arch, ShapeRef sha class ConvMpeModeModel { public: VPU::MPEMode getMpeModeImpl(mlir::Operation* op, mlir::Type, mlir::Type, ShapeRef shape) const { - auto archKind = VPU::getArch(op); + auto archKind = config::getArch(op); return getMpeModeForConv(archKind, shape); } }; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/ppe_factory.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/ppe_factory.cpp index e4e346e95b..81287a16d6 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/ppe_factory.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/ppe_factory.cpp @@ -3,14 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/custom_float.hpp" -#include "vpux/utils/core/numeric.hpp" - #include "vpux/compiler/NPU37XX/dialect/VPU/impl/ppe_factory.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/ppe_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" +#include "vpux/utils/core/checked_cast.hpp" +#include "vpux/utils/core/custom_float.hpp" +#include "vpux/utils/core/numeric.hpp" #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/shave_kernel_info.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/shave_kernel_info.cpp index 9c4ebdb90c..4be2b8eb62 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/shave_kernel_info.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/shave_kernel_info.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU37XX/dialect/VPU/impl/shave_kernel_info.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/sparsity_constraint.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/sparsity_constraint.cpp index 2307c1fa42..13308b3f3a 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/sparsity_constraint.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/impl/sparsity_constraint.cpp @@ -4,8 +4,8 @@ // #include "vpux/compiler/NPU37XX/dialect/VPU/impl/sparsity_constraint.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - #include "vpux/utils/core/numeric.hpp" using namespace vpux::VPU::arch37xx; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/cost_model_utils.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/cost_model_utils.cpp new file mode 100644 index 0000000000..7f700fe81a --- /dev/null +++ 
b/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/cost_model_utils.cpp @@ -0,0 +1,40 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" + +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/interfaces/cost_model_utils.hpp" + +namespace vpux { +namespace VPU { +namespace arch37xx { +class ICostModelUtilsInterface final : public vpux::VPU::ICostModelUtilsInterface { +public: + ICostModelUtilsInterface(mlir::Dialect* dialect): vpux::VPU::ICostModelUtilsInterface(dialect) { + } + + bool isNCEWithInt4WeightsSupported() const override { + return false; + } + + bool isMultiDimPipelineTilingSupported() const override { + return false; + } +}; +} // namespace arch37xx +} // namespace VPU +} // namespace vpux + +// +// setupExtraInterfaces +// + +void vpux::VPU::arch37xx::registerICostModelUtilsInterface(mlir::DialectRegistry& registry) { + registry.addExtension(+[](mlir::MLIRContext*, vpux::VPU::VPUDialect* dialect) { + dialect->addInterfaces(); + }); +} diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/unroll_batch_op.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/unroll_batch_op.cpp index 69d93f968c..63458169ed 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/unroll_batch_op.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/interfaces/unroll_batch_op.cpp @@ -6,11 +6,15 @@ #include "vpux/compiler/NPU37XX/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" 
+#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/apply_tiling_mvn1sum.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/apply_tiling_mvn1sum.cpp index 05f31546cf..b70c453e01 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/apply_tiling_mvn1sum.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/apply_tiling_mvn1sum.cpp @@ -14,6 +14,7 @@ #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" +#include "vpux/utils/core/numeric.hpp" #include #include diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp index a2100c1fb4..448ab134a1 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp @@ -11,6 +11,7 @@ #include "vpux/compiler/dialect/VPU/interfaces/workload_splitter_base.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU::arch37xx { #define GEN_PASS_DECL_CORRECTNCEWORKLOADS @@ -64,7 +65,7 @@ class CorrectNCEWorkloadsPass final : public VPU::arch37xx::impl::CorrectNCEWork void CorrectNCEWorkloadsPass::safeRunOnFunc() { auto func = getOperation(); - const auto arch = 
getArch(func); + const auto arch = config::getArch(func); auto sparsityConstraint = VPU::getSparsityConstraint(arch); WorkloadSplitter37XX splitter(func, _log); diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/pipelines.cpp index 24b1b7e138..60fb9d8f7c 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPU/transforms/pipelines.cpp @@ -46,6 +46,28 @@ void vpux::VPU::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const VPU::arch37xx::DefaultHWOptions& options, Logger log) { const auto grc = getDefaultGreedyRewriteConfig(); + /* + Memory reservation for CMX has to happen as early in VPU as possible. It is required because memory reservation + decreases usable CMX size which can result in different tiling decisions. If different passes see different + effective CMX size different failures which can be hard to diagnose can happen. Examples of such failures include: + - Fail during compilation if additional memory was reserved after tiling but before scheduling since tiles + selected by tiling pipeline won't fit CMX anymore + - Memory corruption if additional memory is reserved after scheduler since additional memory will overlap + addresses allocated by the scheduler Currently there is no validation if memory is not reserved before the first + call to getTotalCMXSize. + */ + if (options.enableProfiling) { + pm.addPass(VPU::createDMATaskProfilingReserveMemPass(options.enableDMAProfiling.getValue(), log)); + } + + /* + Call this pass after all other memory reservation has already been done. This pass checks if there is 1KiB + of reserved memory at the end of CMX and extends it if some is missing. So to not waste CMX memory make sure + as much as possible is allocated in that 1KiB region. 
Exception to this rule is memory reserved for SW kernel IO + for such memory make sure to reserve it after this pass to allow data prefetching. + */ + pm.addPass(VPU::createSWKernelDataPrefetchReserveMemPass(log)); + // TODO: E#140041 enable profiling with outlining if (options.enableConcatRepeatingBlockOutlining && !options.enableProfiling) { pm.addPass(VPU::createConcatRepeatingBlocksOutliningPass(options.concatRepeatingBlockOutliningSeqLength, log)); @@ -70,8 +92,8 @@ void vpux::VPU::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPU::createFuseClampPass(log)); pm.addPass(VPU::createEnsureNCEOpsSizeRequirementsPass(true, log)); - pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, log)); - + pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, + /*disablePassOnEntryFunctionForHostCompile=*/false, log)); if (options.enableWeightsSparsity) { VPU::buildWeightsSparsityPipeline(pm, VPU::WeightsSparsityOptions(options), log); } @@ -94,7 +116,8 @@ void vpux::VPU::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPU::createAdjustMemorySpacePass(log)); pm.addPass(VPU::createOptimizeSharedInputCopyForConcatPass(log)); - pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, log)); + pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, + /*disablePassOnEntryFunctionForHostCompile=*/false, log)); pm.addPass(mlir::createCanonicalizerPass(grc)); pm.addPass(VPU::createCMXConcatPass(log)); @@ -107,6 +130,26 @@ void vpux::VPU::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(mlir::createCanonicalizerPass(grc)); } +void vpux::VPU::arch37xx::buildReferenceSWPipeline(mlir::OpPassManager& pm, Logger log) { + // Create DMA HWP scratch buffer + pm.addPass(VPU::createDMATaskProfilingReserveMemPass("false", log)); + pm.addPass(VPU::createSWKernelDataPrefetchReserveMemPass(log)); + 
pm.addPass(VPU::createDetectionOutputDecompositionPass(log)); + pm.addPass(VPU::arch37xx::createSplitRealDFTOpsPass(log)); + pm.addPass(VPU::createSplitGRUSequencePass(log)); + pm.addPass(VPU::arch37xx::createDecomposeMVNPass(log)); + pm.addPass(VPU::createAddSwOpAuxiliaryBufferPass(log)); + + pm.addPass(VPU::createTilingStrategyAssignmentPass( + /*enablePrefetchTiling=*/false, /*enableVPUNNCostForTiling*/ false, + /*enableShaveDDRAccessOptimization*/ "true", log)); + pm.addPass(VPU::arch37xx::createApplyTilingMVN1SumPass(/*enablePrefetchTiling=*/false, log)); + pm.addPass(VPU::createApplyTilingPass(/*enableSCFTiling=*/false, log)); + pm.addPass(VPU::createComputeInterpolateCoordinatesPass(/*enableExplicitDistributionInfoAttr*/ false, log)); + + pm.addPass(VPU::createBoundedTensorsToDynamicDimsMaskPass(log)); +} + void vpux::VPU::arch37xx::registerVPUPipelines() { mlir::PassPipelineRegistration( "default-hw-mode-vpu", "VPU dialect part of Default HW pipeline", @@ -125,4 +168,10 @@ void vpux::VPU::arch37xx::registerVPUPipelines() { [](mlir::OpPassManager& pm, const vpux::arch37xx::MCAndTilingOptionsDevice& options) { VPU::buildSMPipeline(pm, vpux::MCAndTilingOptionsBase(options)); }); + + mlir::PassPipelineRegistration( + "reference-sw-mode-vpu", "VPU dialect part of Reference SW pipeline", + [](mlir::OpPassManager& pm, const VPU::arch37xx::DefaultHWOptions&) { + VPU::arch37xx::buildReferenceSWPipeline(pm); + }); } diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/profiling_info.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/profiling_info.cpp index 02c244d3c7..2be0f81791 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/profiling_info.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/profiling_info.cpp @@ -5,6 +5,7 @@ // #include "vpux/compiler/NPU37XX/dialect/VPUIP/impl/profiling_info.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/types.hpp" using namespace vpux; diff --git 
a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/split_cost_getter.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/split_cost_getter.cpp index d9610a0f47..01cf3c277f 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/split_cost_getter.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/impl/split_cost_getter.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/NPU37XX/dialect/VPUIP/impl/split_cost_getter.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_channels_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_channels_ops.cpp index 780fa87aac..a006ec9625 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_channels_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_channels_ops.cpp @@ -4,12 +4,13 @@ // #include "vpux/compiler/NPU37XX/dialect/VPUIP/IR/ops_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/compressed_convolution_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp index e37f0c1335..2fe73f9b03 100644 --- 
a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/interfaces/workload_splitter_base.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -21,7 +22,7 @@ class AlignedWorkloadChannelsOpModel final : SmallVector getSupportedWorkLoadChannels(mlir::Operation* nceOp) const { auto func = nceOp->getParentOfType(); auto log = Logger::global(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); auto sparsityConstraint = VPU::getSparsityConstraint(arch); VPU::WorkloadSplitter37XX splitter(func, log); diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/add_sw_kernel_cache_handling_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/add_sw_kernel_cache_handling_ops.cpp index 0a056583ef..fe5de9ad82 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/add_sw_kernel_cache_handling_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/add_sw_kernel_cache_handling_ops.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" @@ -65,7 +66,8 @@ mlir::async::ExecuteOp createCacheHandlingSwKernel(mlir::OpBuilder builder, OpBu auto cacheHandlingSwKernel = builder.create(loc, buffersRange, buffersRange, nullptr, functionSymbol, getIntAttr(builder, tileIndex)); const SmallVector 
args = {}; - vpux::VPUIP::initSwKernel(cacheHandlingSwKernel, buffersRange, buffersRange, args, log.nest()); + vpux::VPUIP::initSwKernel(cacheHandlingSwKernel, buffersRange, buffersRange, args, log.nest(), + /*swKernelRunOp=*/nullptr); builder.create(loc, std::nullopt); }; diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp index 28195f905a..83c8899846 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp @@ -7,12 +7,12 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dma_descriptor_generator.hpp" #include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dma_limits.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -82,7 +82,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De const auto getDistModeAttr = [&](VPUIP::DistributedBufferType distType) { const auto distAttr = distType.getDistribution(); - VPUX_THROW_WHEN(distAttr == nullptr, "Failed to extract distributon tensor from distributed type"); + VPUX_THROW_WHEN(distAttr == nullptr, "Failed to extract distribution tensor from distributed type"); return distAttr.getMode(); }; @@ -117,7 +117,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De mlir::SmallVector outputBuffers; if (distributedInputType 
!= nullptr && distributedOutputType != nullptr) { - _log.nest().trace("Got multi-cluster to multi-clutser case"); + _log.nest().trace("Got multi-cluster to multi-cluster case"); const auto inputPerClusterShapes = distributedInputType.getPerClusterMemoryShapes(); const auto outputPerClusterShapes = distributedOutputType.getPerClusterMemoryShapes(); @@ -135,7 +135,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De } if (distributedInputType != nullptr && distributedOutputType == nullptr) { - _log.nest().trace("Got multi-cluster to single-clutser case"); + _log.nest().trace("Got multi-cluster to single-cluster case"); const auto outputShapes = SmallVector( llvm::map_range(distributedInputType.getPerClusterMemoryShapes(), inferOutputShape)); const auto outputShapeOffsets = SmallVector( @@ -149,7 +149,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De } if (distributedInputType == nullptr && distributedOutputType != nullptr) { - _log.nest().trace("Got single-cluster to multi-clutser case"); + _log.nest().trace("Got single-cluster to multi-cluster case"); const auto inputShapes = SmallVector( llvm::map_range(distributedOutputType.getPerClusterMemoryShapes(), [&](ShapeRef outShape) { @@ -280,7 +280,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewrite(VPUIP::DepthToSpace auto dmaDescriptorGenerator = VPUIP::DepthToSpaceDmaDescriptorGenerator(ctx, _log); - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(depthToSpaceDMAOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(depthToSpaceDMAOp)); const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes() - 1; // inputH is the planes number, need to split if it exceed the max number. 
diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp index a4b9a3eeb8..352312e4f8 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp @@ -6,15 +6,14 @@ #include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" -#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/profiling.hpp" #include @@ -131,8 +130,28 @@ void VPUIP::arch37xx::ClusterSWRewriter::matchAndRewrite(VPUIP::SwKernelOp swTas // For overlapped input, the Swkernel's attr need to be updated according to its input/output tiles const auto kernelEntryName = getSwKernelEntryName(swTask); + + // Dequantize needs attr updates only if tiling_axis == quantization_axis + auto isDequantizeTiledOverQuantAxis = [&]() { + if (kernelEntryName == "dequantize") { + const auto input = swTask.getInputs()[0]; + const auto inType = mlir::cast(input.getType()); + const auto elementType = inType.getElementType(); + + if (auto quantParams = mlir::dyn_cast(elementType)) { + auto tilingDimIdx = VPUIP::getTilingDimIndex(outputType); + if (tilingDimIdx.has_value()) { + auto quantAxis = quantParams.getQuantizedDimension(); + return 
tilingDimIdx.value() == quantAxis; + } + } + } + return false; + }; + auto needUpdateAttrs = inDistributionMode == VPU::DistributionMode::OVERLAPPED || kernelEntryName == "lstm_sequence" || kernelEntryName == "lstm_dpu" || + isDequantizeTiledOverQuantAxis() || (inDistributionMode == VPU::DistributionMode::SEGMENTED && kernelEntryName == "gatherND"); if (needUpdateAttrs) { @@ -245,7 +264,7 @@ void VPUIP::arch37xx::ClusterSWRewriter::matchAndRewrite(VPUIP::SwKernelOp swTas inputTypes.push_back(type); } - VPUIP::createRuntimeKernelDefinition(_module, _log.nest(), VPU::getArch(swTask.getOperation())); + VPUIP::createRuntimeKernelDefinition(_module, _log.nest(), config::getArch(swTask.getOperation())); auto module = swTask->getParentOfType(); auto kernelFunc = module.lookupSymbol(swTask.getKernelFunctionAttr()); @@ -284,7 +303,8 @@ void VPUIP::arch37xx::ClusterSWRewriter::matchAndRewrite(VPUIP::SwKernelOp swTas newTask.setListIndexAttr(listIndexAttr); } - initSwKernel(newTask, inputBuffs[clusterId], outputBuffs[clusterId], newArgs, _log.nest()); + initSwKernel(newTask, inputBuffs[clusterId], outputBuffs[clusterId], newArgs, _log.nest(), + /*swKernelRunOp=*/nullptr); _log.trace("Task created: {0}", newTask); } @@ -316,7 +336,7 @@ void VPUIP::arch37xx::ClusterNCERewriter::getInputBuffers( inputSETableBuffs = VPUIP::getPerClusterMemoryBuffers(_ctx, loc, "inputSETable", nceTask.getInputStorageElementTable(), numClusters, builder); - auto arch = VPU::getArch(nceTask); + auto arch = config::getArch(nceTask); bool isDWOpAndNeedsAlign = VPU::isDWOpAndNeedsAlign(arch, nceTask.getTaskType()); for (int64_t clusterId = 0; clusterId < numClusters; ++clusterId) { // For 37XX arch, ensure we have H_per_cluster x W as a multiple of 4 (or 8 for sparse inputs). 
diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp new file mode 100644 index 0000000000..90a3f2c4e8 --- /dev/null +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp @@ -0,0 +1,90 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include + +namespace vpux::VPUIP::arch37xx { +#define GEN_PASS_DECL_UNROLLPERMUTEDMA +#define GEN_PASS_DEF_UNROLLPERMUTEDMA +#include "vpux/compiler/NPU37XX/dialect/VPUIP/passes.hpp.inc" +} // namespace vpux::VPUIP::arch37xx + +using namespace vpux; + +namespace { + +// +// PermuteDMARewriter +// + +class PermuteDMARewriter final : public mlir::OpRewritePattern { +public: + PermuteDMARewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log) + : mlir::OpRewritePattern(ctx), _dmaPortCount(dmaPortCount), _log(log) { + setDebugName("PermuteDMARewriter"); + } + + mlir::LogicalResult matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter) const final; + +private: + int64_t _dmaPortCount; + Logger _log; +}; + +mlir::LogicalResult PermuteDMARewriter::matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, + mlir::PatternRewriter& rewriter) const { + return arch37xx::unrollPermuteDMA( + permuteOp, rewriter, _dmaPortCount, _log); +} + +// +// UnrollPermuteDMAPass +// + +class UnrollPermuteDMAPass final : public VPUIP::arch37xx::impl::UnrollPermuteDMABase { +public: + explicit UnrollPermuteDMAPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() 
final; +}; + +void UnrollPermuteDMAPass::safeRunOnFunc() { + auto& ctx = getContext(); + auto func = getOperation(); + markAnalysesPreserved(); + auto analysis = getAnalysis(); + if (!analysis.passNeeded(VPUIP::UnrollDMAAnalysisNeeded::UnrollPermuteDMAPass)) { + return; + } + auto module = func->getParentOfType(); + auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); + auto dmaPortCount = dmaOp.getCount(); + + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, dmaPortCount, _log.nest()); + if (mlir::failed( + mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), vpux::getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createUnrollPermuteDMAPass +// + +std::unique_ptr vpux::VPUIP::arch37xx::createUnrollPermuteDMAPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp index 189e08f8e4..31332d0322 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp @@ -472,7 +472,7 @@ mlir::LogicalResult SpaceToDepthDMARewriter::matchAndRewriteClusterDMA(VPUIP::Sp outputType); const auto distributionAttr = distributedType.getDistribution(); - VPUX_THROW_WHEN(distributionAttr == nullptr, "Failed to extract distributon attribute from distributed type."); + VPUX_THROW_WHEN(distributionAttr == nullptr, "Failed to extract distribution attribute from distributed type."); const auto modeAttr = distributionAttr.getMode(); VPUX_THROW_WHEN(modeAttr == nullptr, "Failed to extract mode from distribution attribute."); diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/pipelines.cpp index 
0e068ca400..e319a4b8c3 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/transforms/pipelines.cpp @@ -5,19 +5,15 @@ #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/NPU37XX/dialect/VPURT/transforms/passes.hpp" -#include "vpux/compiler/core/profiling.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/dialect/const/passes.hpp" #include "vpux/compiler/dialect/core/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" - #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/profiling/common.hpp" - #include #include @@ -138,15 +134,6 @@ void vpux::VPUIP::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, VPUIP::buildAsyncSchedulingPipeline(pm, log); - if (options.enableProfiling) { - auto dmaProfilingMode = getDMAProfilingMode(VPU::ArchKind::NPU37XX, options.enableDMAProfiling.getValue()); - pm.addPass(VPUIP::createDMATaskProfilingReserveMemPass(dmaProfilingMode, log)); - } - - if (options.enableSWKernelPrefetchingReserveMem) { - pm.addPass(VPUIP::createSWKernelPrefetchingReserveMemPass(log)); - } - pm.addPass(VPUIP::createCalculateAsyncRegionCycleCostPass(log)); VPUIP::arch37xx::buildMemoryAllocationPipeline(pm, VPUIP::arch37xx::MemoryAllocationOptions(options), log); @@ -203,7 +190,7 @@ void vpux::VPUIP::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPURT::createSimplifySchedulePass(options.reduceParallelControlFlows, std::nullopt, log)); } - pm.addPass(VPURT::createInsertBarrierToMarkTheEndOfDescriptorGroupPass(std::nullopt, log)); + 
pm.addPass(VPURT::createInsertBarrierToMarkTheEndOfDescriptorGroupPass(std::nullopt, std::nullopt, log)); pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); @@ -219,8 +206,9 @@ void vpux::VPUIP::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, } if (options.enableProfiling) { - auto dmaProfilingMode = getDMAProfilingMode(VPU::ArchKind::NPU37XX, options.enableDMAProfiling.getValue()); - pm.addPass(VPUIP::createDMATaskProfilingAfterBarrierSchedPass(dmaProfilingMode, log)); + if (options.enableDMAProfiling == "true") { + pm.addPass(VPUIP::createDMATaskProfilingAfterBarrierSchedPass(log)); + } pm.addPass(VPUIP::createCaptureWorkpointPass(log)); pm.addPass(VPUIP::createGroupProfilingBuffersPass(log)); pm.addPass(Core::createMoveDeclarationsToTopPass(log)); @@ -254,6 +242,54 @@ void vpux::VPUIP::arch37xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, } } +void vpux::VPUIP::arch37xx::buildReferenceSWPipeline(mlir::OpPassManager& pm, + const VPUIP::arch37xx::DefaultHWOptions& options, Logger log) { + const auto grc = getDefaultGreedyRewriteConfig(); + pm.addPass(VPUIP::createSetMemorySpacePass(VPU::getMemKind, log)); + + pm.addPass(VPUIP::createAddCopyBetweenSWKernelsAndNetworkIOPass(log)); + + pm.addPass(VPUIP::createCopyOpTilingPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + + if (options.enableProfiling && options.enableSWProfiling) { + pm.addPass(VPUIP::createActShaveProfilingPass(VPU::getMemKind, log)); + } + + pm.addPass(VPUIP::createUngroupBoundedBuffersPass(log)); + + pm.addPass(VPUIP::createConvertTransferOpsToDMAsPass(log)); + + VPUIP::buildAsyncSchedulingPipeline(pm, log); + + pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); + pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); + pm.addPass(VPUIP::createLinearizationPass(log)); + pm.addPass(VPUIP::createOptimizeAsyncDepsPass(log)); + + pm.addPass(VPUIP::arch37xx::createAddSwKernelCacheHandlingOpsPass(log)); + + 
VPUIP::buildHardwareAdaptationPipeline(pm, log); + + pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); + + // Level 1 : VPU RunTime + + if (options.enableProfiling) { + pm.addPass(VPUIP::createCaptureWorkpointPass(log)); + pm.addPass(VPUIP::createGroupProfilingBuffersPass(log)); + pm.addPass(Core::createMoveDeclarationsToTopPass(log)); + } + + pm.addPass(VPURT::createAssignPhysicalBarriersPass(options.enableColorBinPhysicalBarrierAssignment, std::nullopt, + std::nullopt, log)); + pm.addPass(VPURT::createBarrierSimulationPass(log)); + pm.addPass(VPUIP::createUpdateSwKernelParamsPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + pm.addPass(Const::createConstantFoldingPass()); + pm.addPass(VPUIP::createDumpStatisticsOfTaskOpsPass(log)); +} + // // DMAUnrollingPipeline // @@ -262,11 +298,12 @@ void vpux::VPUIP::arch37xx::buildDMAUnrollingPipeline(mlir::OpPassManager& pm, L pm.addPass(VPUIP::createUnrollDMAAnalysisPass(log)); pm.addPass(VPUIP::arch37xx::createUnrollDepthToSpaceDMAPass(log)); pm.addPass(VPUIP::arch37xx::createUnrollSpaceToDepthDMAPass(log)); - pm.addPass(VPUIP::createUnrollPermuteToNNDMAPass(log)); + pm.addPass(VPUIP::arch37xx::createUnrollPermuteDMAPass(log)); pm.addPass(VPUIP::createUnrollUpsamplingDMAPass(log)); pm.addPass(VPUIP::createUnrollExpandDMAPass(log)); pm.addPass(VPUIP::createUnrollPerAxisTileDMAPass(log)); + pm.addPass(VPUIP::createUnrollGatherDMAPass(log)); pm.addPass(VPUIP::createInvalidateUnrollDMAAnalysisPass(log)); } @@ -292,4 +329,10 @@ void vpux::VPUIP::arch37xx::registerVPUIPPipelines() { [](mlir::OpPassManager& pm, const VPUIP::arch37xx::DefaultHWOptions& options) { VPUIP::arch37xx::buildDefaultHWPipeline(pm, options); }); + + mlir::PassPipelineRegistration( + "reference-sw-mode-vpuip", "VPUIP dialect part of Reference SW pipeline", + [](mlir::OpPassManager& pm, const VPUIP::arch37xx::DefaultHWOptions& options) { + VPUIP::arch37xx::buildReferenceSWPipeline(pm, options); + }); } diff --git 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_permute_to_nndma.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/utils/permute_dma.cpp similarity index 73% rename from src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_permute_to_nndma.cpp rename to src/vpux_compiler/src/NPU37XX/dialect/VPUIP/utils/permute_dma.cpp index b2a954b6e4..d914dc0612 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_permute_to_nndma.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIP/utils/permute_dma.cpp @@ -1,36 +1,20 @@ // -// Copyright (C) 2022-2025 Intel Corporation. +// Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dma_descriptor_generator.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include -#include +namespace vpux::arch37xx { -namespace vpux::VPUIP { -#define GEN_PASS_DECL_UNROLLPERMUTETONNDMA -#define GEN_PASS_DEF_UNROLLPERMUTETONNDMA -#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" -} // namespace vpux::VPUIP - -using namespace vpux; - -namespace { - -vpux::NDTypeInterface changeShape(vpux::NDTypeInterface originType, ShapeRef shape, ShapeRef offset) { +NDTypeInterface 
changeShape(NDTypeInterface originType, ShapeRef shape, ShapeRef offset) { const auto elemType = originType.getElementType(); if (auto qType = mlir::dyn_cast(elemType)) { const auto newQType = tileScalesAndZP(qType, shape, offset); @@ -40,11 +24,11 @@ vpux::NDTypeInterface changeShape(vpux::NDTypeInterface originType, ShapeRef sha return originType.changeShape(shape); } -vpux::NDTypeInterface getPerClusterInputType(vpux::NDTypeInterface inputType, vpux::NDTypeInterface outputType, - mlir::AffineMap memPerm, ShapeRef outShape, ShapeRef offset) { +NDTypeInterface getPerClusterInputType(NDTypeInterface inputType, NDTypeInterface outputType, mlir::AffineMap memPerm, + ShapeRef outShape, ShapeRef offset) { auto inputShape = inputType.getShape(); - // Back infer the input shape from output shape and mem_Perm attribution + // Back infer the input shape from output shape and mem_perm attribute // For example: Input: 1x8x1x32xfp16, #NHWC -> 1x32x1x8xfp16, #NHWC, memPerm: [0, 1, 3, 2] // If want get right input shape from per cluster output shape. 
There are three step: // 1) Get the output real physical shape: 1x32x1x8xfp16, #NHWC -> 1x1x8x32 @@ -84,273 +68,9 @@ vpux::NDTypeInterface getPerClusterInputType(vpux::NDTypeInterface inputType, vp return changeShape(inputType, backInferInputShape(outShape), offset); } -// -// PermuteRewriter -// - -class PermuteRewriter final : public mlir::OpRewritePattern { -public: - PermuteRewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log) - : mlir::OpRewritePattern(ctx), _dmaPortCount(dmaPortCount), _log(log) { - setDebugName("PermuteRewriter"); - } - - mlir::LogicalResult matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter) const final; - -private: - mlir::LogicalResult unrollSegmentedOrOverlappedOutput(VPUIP::PermuteDMAOp permuteOp, - VPUIP::DistributedBufferType distributedType, - mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const; - - mlir::LogicalResult unrollDuplicatedOutput(VPUIP::PermuteDMAOp permuteOp, - VPUIP::DistributedBufferType distributedType, mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const; - - mlir::LogicalResult unrollDuplicatedInputAndOutput(VPUIP::PermuteDMAOp permuteOp, mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const; - mlir::LogicalResult unrollDuplicatedInput(VPUIP::PermuteDMAOp permuteOp, mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const; - mlir::LogicalResult rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter) const; - int64_t _dmaPortCount; - Logger _log; -}; - -mlir::LogicalResult PermuteRewriter::matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, - mlir::PatternRewriter& rewriter) const { - // Skip PermuteDMA ops which have been unrolled by checking mem_perm attribute - if (permuteOp.getMemPermAttr() == nullptr) { - return mlir::failure(); - } - - const auto input = permuteOp.getInput(); - const auto output = permuteOp.getOutputBuff(); - - const auto inputType = mlir::cast(input.getType()); - const auto outputType 
= mlir::cast(output.getType()); - - auto inDistributedType = mlir::dyn_cast(inputType); - auto outDistributedType = mlir::dyn_cast(outputType); - - // Unroll by distributed type of input/output - if (inDistributedType != nullptr || outDistributedType != nullptr) { - _log.trace("process permute with DistributedType at {0}", permuteOp); - - VPUX_THROW_UNLESS(permuteOp.getMemPerm().has_value(), - "Can not get memPerm attribute from PermuteDMA layer at {0}", permuteOp.getLoc()); - const auto memPerm = permuteOp.getMemPerm().value(); - - if (inDistributedType != nullptr && outDistributedType != nullptr) { - return unrollDuplicatedInputAndOutput(permuteOp, memPerm, rewriter); - } else if (inDistributedType != nullptr) { - return unrollDuplicatedInput(permuteOp, memPerm, rewriter); - } - - VPUX_THROW_UNLESS(inputType.getMemoryKind() == VPU::MemoryKind::DDR && - outputType.getMemoryKind() == VPU::MemoryKind::CMX_NN, - "Unexpected memory space. Got: input {0}, output {1}", inputType.getMemoryKind(), - outputType.getMemoryKind()); - - VPUX_THROW_WHEN(outDistributedType == nullptr, "Expect distributed type for permute op output, actual: {0}", - outputType); - - VPUX_THROW_UNLESS(VPUIP::doesPermuteDMATileDimSupportWrapInCluster(inputType, outputType, memPerm, - outDistributedType, _log), - "Unsupported PermuteDMA under cluster tiling at '{0}'", permuteOp->getLoc()); - - const auto distributionAttr = outDistributedType.getDistribution(); - const auto mode = distributionAttr.getMode().getValue(); - if (mode == VPU::DistributionMode::SEGMENTED || mode == VPU::DistributionMode::OVERLAPPED) { - return unrollSegmentedOrOverlappedOutput(permuteOp, outDistributedType, memPerm, rewriter); - } else if (VPU::bitEnumContainsAny(mode, VPU::DistributionMode::DUPLICATED) || - VPU::bitEnumContainsAny(mode, VPU::DistributionMode::MULTICASTED)) { - return unrollDuplicatedOutput(permuteOp, outDistributedType, memPerm, rewriter); - } else { - VPUX_THROW("Unsupported distributed mode"); - } - } - 
- _log.trace("Permute rewriter operation '{0}' at '{1}'", permuteOp->getName(), permuteOp->getLoc()); - - // Rewrite the Permute operation itself - return rewritePermuteDMA(permuteOp, rewriter); -} - -/// @brief Rewrites PermuteDMAOp using its mem_perm attribute to update dma_descriptor attr value -mlir::LogicalResult PermuteRewriter::rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, - mlir::PatternRewriter& rewriter) const { - auto vpurtTask = permuteOp->getParentOfType(); - VPUX_THROW_UNLESS(vpurtTask != nullptr, "Can't get VPURT task operation"); - rewriter.setInsertionPointAfter(vpurtTask); - - auto srcDeclBuff = permuteOp.getInput().getDefiningOp(); - VPUX_THROW_UNLESS(srcDeclBuff != nullptr, "Can't get buffer for operand: {0}", permuteOp.getInput()); - - auto dstDeclBuff = permuteOp.getOutputBuff().getDefiningOp(); - - auto inType = mlir::cast(permuteOp.getInput().getType()); - auto outType = mlir::cast(permuteOp.getOutput().getType()); - Byte elemTypeSize = inType.getElemTypeSize(); - - auto srcType = mlir::cast(srcDeclBuff.getType()); - auto dstType = mlir::cast(dstDeclBuff.getType()); - auto srcOffset = srcDeclBuff.getByteOffset(); - auto dstOffset = dstDeclBuff.getByteOffset(); - - // For unrolled DMA which is inside of cluster tiling, the dma descriptor is already calculated - auto dmaDescriptorAttr = permuteOp.getDmaDescriptorAttr(); - const auto memPerm = permuteOp.getMemPerm().value(); - auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inType, memPerm); - auto numPlaneDim = VPUIP::getPermuteDMANumPlaneDim(inType, memPerm); - - auto portIsAlreadyAssigned = true; - if (dmaDescriptorAttr == nullptr) { - auto ctx = permuteOp->getContext(); - auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inType, outType, memPerm, _log).value(); - auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inType, outType, memPerm, _log).value(); - auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, _log); - dmaDescriptorAttr = 
dmaDescriptorGenerator.generate(mergedInputShape, mergedOutputShape, elemTypeSize); - portIsAlreadyAssigned = false; - } - - auto subInput = - VPUIP::getPermuteDMASubInputShapes(VPU::getArch(permuteOp), inType, outType, memPerm, _dmaPortCount, _log); - VPUX_THROW_UNLESS(subInput.has_value(), "Cannot get unrolled subInputShapes for PermuteDMA op {0}", permuteOp); - auto subInputShapes = subInput.value(); - auto subOutputShapes = VPUIP::getPermuteDMASubOutputShapes(subInputShapes, inType, outType, memPerm); - - _log.trace("Unrolling PermuteDMAOp '{0}' at '{1}'", permuteOp->getName(), permuteOp->getLoc()); - - int64_t dmaPort = 0; - SmallVector firstPermuteDMAsOnPorts; - SmallVector lastPermuteDMAsOnPorts; - SmallVector newPermuteDMAs; - for (auto idx = 0; idx < checked_cast(subInputShapes.size()); idx++) { - auto newDMADescriptorAttr = VPUIP::updateNumPlanes(dmaDescriptorAttr, subInputShapes[idx][numPlaneDim]); - - const auto dimOrder = (subInputShapes[0].size() == 2) ? DimsOrder::NC : DimsOrder::CHW; - auto newSrcStrides = - (subInputShapes[idx].size() == 2) - ? SmallVector{Bit(subInputShapes[idx].back() * Bit(elemTypeSize).count()), - Bit(Bit(elemTypeSize).count())} - : SmallVector{Bit(subInputShapes[idx][Dim(1)] * subInputShapes[idx][Dim(2)] * - Bit(elemTypeSize).count()), - Bit(subInputShapes[idx].back() * Bit(elemTypeSize).count()), - Bit(Bit(elemTypeSize).count())}; - - auto newSrcMemRef = vpux::getMemRefType(subInputShapes[idx], srcType.getElementType(), dimOrder, - srcType.getMemSpace(), Strides(newSrcStrides)); - - auto newSrcBuff = - srcType.getMemSpace().getIndex().has_value() - ? VPURT::createOp(rewriter, srcDeclBuff, vpurtTask.getLoc(), - newSrcMemRef, srcDeclBuff.getSection(), - srcType.getMemSpace().getIndex().value(), srcOffset) - : srcDeclBuff.getSectionIndex().has_value() - ? 
VPURT::createOp( - rewriter, srcDeclBuff, vpurtTask.getLoc(), newSrcMemRef, srcDeclBuff.getSection(), - parseIntArrayAttr(srcDeclBuff.getSectionIndex().value()), srcOffset) - : VPURT::createOp(rewriter, srcDeclBuff, vpurtTask.getLoc(), - newSrcMemRef, srcDeclBuff.getSection(), srcOffset); - - auto newDstStrides = - (subOutputShapes[idx].size() == 2) - ? SmallVector{Bit(subOutputShapes[idx].back() * Bit(elemTypeSize).count()), - Bit(Bit(elemTypeSize).count())} - : SmallVector{Bit(subOutputShapes[idx][Dim(1)] * subOutputShapes[idx][Dim(2)] * - Bit(elemTypeSize).count()), - Bit(subOutputShapes[idx][Dim(2)] * Bit(elemTypeSize).count()), - Bit(Bit(elemTypeSize).count())}; - mlir::Type newDstType; - if (auto dstDistributedType = mlir::dyn_cast(dstType)) { - auto ctx = permuteOp->getContext(); - auto distributionAttr = dstDistributedType.getDistribution(); - VPUX_THROW_WHEN( - distributionAttr.getMode().getValue() != VPU::DistributionMode::DUPLICATED, - "Issues with unrolling PermuteNNDMA; Buffer has distributed type != DUPLICATED after unroll"); - if (VPU::isDistributedAttrWithExplicitShapesAndOffsets(distributionAttr)) { - distributionAttr = VPU::getNonOverlappedDistributedAttr( - subOutputShapes[idx], distributionAttr.getMode(), nullptr, distributionAttr.getNumClusters(), - nullptr, distributionAttr.getUniformDistributedSegments(), dstDeclBuff.getContext()); - } - - const auto layout = mlir::AffineMapAttr::get(dimOrder.toAffineMap(ctx)); - newDstType = VPUIP::DistributedBufferType::get(ctx, subOutputShapes[idx].raw(), dstType.getElementType(), - layout, dstType.getMemSpace(), distributionAttr); - } else { - newDstType = vpux::getMemRefType(subOutputShapes[idx], dstType.getElementType(), dimOrder, - dstType.getMemSpace(), Strides(newDstStrides)); - } - - VPUX_THROW_UNLESS(dstType.getMemSpace().getIndex().has_value() || dstDeclBuff.getSectionIndex().has_value(), - "No section index find at '{}'", dstDeclBuff.getLoc()); - auto newDstBuff = - 
dstType.getMemSpace().getIndex().has_value() - ? VPURT::createOp(rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, - dstDeclBuff.getSection(), - dstType.getMemSpace().getIndex().value(), dstOffset) - : dstDeclBuff.getSectionIndex().has_value() - ? VPURT::createOp( - rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, dstDeclBuff.getSection(), - parseIntArrayAttr(dstDeclBuff.getSectionIndex().value()), dstOffset) - : VPURT::createOp(rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, - dstDeclBuff.getSection(), dstOffset); - - _log.trace("Create unrolled PermuteDMA operation with input/output shape: {0}/{1}, SrcMemory at {2}, " - "DstMemory at {3}", - subInputShapes[idx], subOutputShapes[idx], newSrcBuff.getSection(), newDstBuff.getSection()); - - const auto newLoc = appendLoc(vpurtTask->getLoc(), "_unrolled_permuteDMA"); - auto newDmaPort = portIsAlreadyAssigned ? permuteOp.getPort().value() : dmaPort; - auto newPermuteDMAOp = VPURT::wrapIntoTaskOp( - rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), newLoc, newSrcBuff, newDstBuff, - vpux::getIntAttr(rewriter, newDmaPort), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), - /*mem_perm*/ nullptr, newDMADescriptorAttr, permuteOp.getDmaHwpIdAttr(), - permuteOp.getProfilingMetadataAttr()); - - newPermuteDMAs.push_back(newPermuteDMAOp); - - // find the first and last DMAs on different ports - if (firstPermuteDMAsOnPorts.size() < static_cast(_dmaPortCount)) { - firstPermuteDMAsOnPorts.push_back(newPermuteDMAOp); - lastPermuteDMAsOnPorts.push_back(newPermuteDMAOp); - } else { - lastPermuteDMAsOnPorts[newDmaPort] = newPermuteDMAOp; - } - - dmaPort = (dmaPort + 1) % _dmaPortCount; - - auto numPlaneValue = newDMADescriptorAttr.getNumPlanes().getInt(); - auto srcPlaneStrideValue = newDMADescriptorAttr.getSrcPlaneStride().getInt(); - auto dstPlaneStrideValue = newDMADescriptorAttr.getDstPlaneStride().getInt(); - srcOffset += numPlaneValue * srcPlaneStrideValue; - dstOffset += 
numPlaneValue * dstPlaneStrideValue; - } - - for (auto& dmaOp : newPermuteDMAs) { - auto vpurtTask = dmaOp->getParentOfType(); - - // remove wait barrier dependency for these new permute DMA except first ones on each port - if (std::find(firstPermuteDMAsOnPorts.begin(), firstPermuteDMAsOnPorts.end(), dmaOp) == - firstPermuteDMAsOnPorts.end()) { - vpurtTask.getWaitBarriersMutable().clear(); - } - - // remove update barrier dependency for these new permute DMA except last ones on each port - if (std::find(lastPermuteDMAsOnPorts.begin(), lastPermuteDMAsOnPorts.end(), dmaOp) == - lastPermuteDMAsOnPorts.end()) { - vpurtTask.getUpdateBarriersMutable().clear(); - } - } - - rewriter.eraseOp(vpurtTask); - return mlir::success(); -} - -mlir::LogicalResult PermuteRewriter::unrollSegmentedOrOverlappedOutput(VPUIP::PermuteDMAOp permuteOp, - VPUIP::DistributedBufferType distributedType, - mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const { +mlir::LogicalResult UnrollMultiClusterPermuteDMA::unrollSegmentedOrOverlappedOutput( + VPUIP::PermuteDMAOp permuteOp, VPUIP::DistributedBufferType distributedType, mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t portCount, Logger logger) { auto loc = permuteOp->getLoc(); auto ctx = permuteOp->getContext(); @@ -427,10 +147,10 @@ mlir::LogicalResult PermuteRewriter::unrollSegmentedOrOverlappedOutput(VPUIP::Pe return VPUIP::createNewDeclareBuffer(rewriter, insertionPoint, declBuff, newType, ddrOffset); }; - auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, _log).value(); - auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, _log).value(); + auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, logger).value(); + auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, logger).value(); auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inputType, memPerm); - auto 
dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, _log); + auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, logger); auto elemTypeSize = Byte(inputType.getElemTypeSize()); // calculate the dma descriptors and ddr offsets @@ -440,7 +160,7 @@ mlir::LogicalResult PermuteRewriter::unrollSegmentedOrOverlappedOutput(VPUIP::Pe const auto mergedOutputDimList = VPUIP::getPermuteDMAOutputMergedDimList(outputType, mergedOutputShape); auto tileDimForMergedOutput = - VPUIP::getTileDimForPermuteDMA(inputType, outputType, memPerm, distributedType, _log).value(); + VPUIP::getTileDimForPermuteDMA(inputType, outputType, memPerm, distributedType, logger).value(); const auto numTileSize = parseIntArrayAttr(distributionAttr.getNumTiles()); const auto tileDimIter = std::find_if(numTileSize.begin(), numTileSize.end(), [](const int64_t dim) { @@ -465,7 +185,7 @@ mlir::LogicalResult PermuteRewriter::unrollSegmentedOrOverlappedOutput(VPUIP::Pe for (int64_t clusterId = 0; clusterId < numClusters; ++clusterId) { auto mergedSubOutputShape = - VPUIP::getPermuteDMAOutputShape(inTypes[clusterId], outTypes[clusterId], memPerm, _log).value(); + VPUIP::getPermuteDMAOutputShape(inTypes[clusterId], outTypes[clusterId], memPerm, logger).value(); ddrOffsets.push_back(getSrcOffset(perClusterShapeOffsets[clusterId])); subMergedOutputShapes.push_back(mergedSubOutputShape); } @@ -475,47 +195,41 @@ mlir::LogicalResult PermuteRewriter::unrollSegmentedOrOverlappedOutput(VPUIP::Pe int64_t dmaPort = 0; auto inputInsertionPoint = input.getDefiningOp(); auto outputInsertionPoint = output.getDefiningOp(); - SmallVector newPermuteDMAs; + for (int64_t clusterId = 0; clusterId < numClusters; ++clusterId) { const auto newInputType = inTypes[clusterId]; const auto newOutType = outTypes[clusterId]; const auto inputBuffer = getOperand(clusterId, input, newInputType, inputInsertionPoint, ddrOffsets[clusterId]); inputInsertionPoint = 
inputBuffer.getDefiningOp(); - _log.trace("Insert new input buffer declaration: '{0}'", inputBuffer); + logger.trace("Insert new input buffer declaration: '{0}'", inputBuffer); const auto outBuffer = getOperand(clusterId, output, newOutType, outputInsertionPoint, Byte(0)); outputInsertionPoint = outBuffer.getDefiningOp(); - _log.trace("Insert new output buffer declaration: '{0}'", outBuffer); + logger.trace("Insert new output buffer declaration: '{0}'", outBuffer); const auto newLoc = appendLoc(loc, "_cluster_{0}", clusterId); auto newPermuteDMAOp = VPURT::wrapIntoTaskOp( rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), newLoc, inputBuffer, outBuffer, vpux::getIntAttr(rewriter, dmaPort), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), permuteOp.getMemPermAttr(), subDmaDescriptors[clusterId], permuteOp.getDmaHwpIdAttr(), - permuteOp.getProfilingMetadataAttr()); - - dmaPort = (dmaPort + 1) % _dmaPortCount; + permuteOp.getProfilingMetadataAttr(), /*internalDataFlow=*/nullptr); - _log.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); + dmaPort = (dmaPort + 1) % portCount; - newPermuteDMAs.push_back(newPermuteDMAOp); + logger.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); } + rewriter.eraseOp(vpurtTask); - // unrolling per distributed type is done, now rewrite PermuteOp itself - for (const auto& permuteDMA : newPermuteDMAs) { - if (rewritePermuteDMA(permuteDMA, rewriter).failed()) { - return mlir::failure(); - } - } return mlir::success(); } -mlir::LogicalResult PermuteRewriter::unrollDuplicatedOutput(VPUIP::PermuteDMAOp permuteOp, - VPUIP::DistributedBufferType distributedType, - mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const { +mlir::LogicalResult UnrollMultiClusterPermuteDMA::unrollDuplicatedOutput(VPUIP::PermuteDMAOp permuteOp, + VPUIP::DistributedBufferType distributedType, + mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t, + Logger logger) { auto loc = 
permuteOp->getLoc(); auto ctx = permuteOp->getContext(); @@ -561,10 +275,10 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedOutput(VPUIP::PermuteDMAOp return VPUIP::createNewDeclareBuffer(rewriter, insertionPoint, declBuff, newType, Byte(0)); }; - auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, _log).value(); - auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, _log).value(); + auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, logger).value(); + auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, logger).value(); auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inputType, memPerm); - auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, _log); + auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, logger); auto elemTypeSize = Byte(inputType.getElemTypeSize()); // calculate the dma descriptor @@ -592,25 +306,26 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedOutput(VPUIP::PermuteDMAOp distributedType.getElementType()); const auto inputBuffer = getOperand(input, newInputType); - _log.trace("Insert new input buffer declaration: '{0}'", inputBuffer); + logger.trace("Insert new input buffer declaration: '{0}'", inputBuffer); const auto outBuffer = getOperand(output, newOutType); - _log.trace("Insert new output buffer declaration: '{0}'", outBuffer); + logger.trace("Insert new output buffer declaration: '{0}'", outBuffer); auto newPermuteDMAOp = VPURT::wrapIntoTaskOp( rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), loc, inputBuffer, outBuffer, vpux::getIntAttr(rewriter, 0), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), permuteOp.getMemPermAttr(), subDmaDescriptor, permuteOp.getDmaHwpIdAttr(), - permuteOp.getProfilingMetadataAttr()); + permuteOp.getProfilingMetadataAttr(), 
/*internalDataFlow=*/nullptr); - _log.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); + logger.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); rewriter.eraseOp(vpurtTask); - return rewritePermuteDMA(newPermuteDMAOp, rewriter); + return mlir::success(); } -mlir::LogicalResult PermuteRewriter::unrollDuplicatedInputAndOutput(VPUIP::PermuteDMAOp permuteOp, - mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const { +mlir::LogicalResult UnrollMultiClusterPermuteDMA::unrollDuplicatedInputAndOutput(VPUIP::PermuteDMAOp permuteOp, + mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, + int64_t, Logger logger) { auto loc = permuteOp->getLoc(); auto ctx = permuteOp->getContext(); @@ -639,15 +354,12 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedInputAndOutput(VPUIP::Permu auto vpurtTask = permuteOp->getParentOfType(); VPUX_THROW_WHEN(vpurtTask == nullptr, "Can not get VPURT.TaskOp for {0}", permuteOp); - const auto mode = distributionAttr.getMode().getValue(); - VPUX_THROW_UNLESS(mode == VPU::DistributionMode::DUPLICATED, "Unsupported distributed mode"); - rewriter.setInsertionPointAfter(vpurtTask); - auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, _log).value(); - auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, _log).value(); + auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, logger).value(); + auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, logger).value(); auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inputType, memPerm); - auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, _log); + auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, logger); auto elemTypeSize = Byte(inputType.getElemTypeSize()); // calculate the dma descriptor @@ -664,7 +376,7 @@ mlir::LogicalResult 
PermuteRewriter::unrollDuplicatedInputAndOutput(VPUIP::Permu auto inputBuffer = VPURT::createOp( rewriter, inDeclBuff, loc, newInType, VPURT::BufferSection::CMX_NN, getIntArrayAttr(ctx, ArrayRef({0})), inDeclBuff.getByteOffset(), inDeclBuff.getSwizzlingKeyAttr()); - _log.trace("Insert new input buffer declaration: '{0}'", inputBuffer); + logger.trace("Insert new input buffer declaration: '{0}'", inputBuffer); // create new output buffer auto outDeclBuff = output.getDefiningOp(); @@ -677,16 +389,18 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedInputAndOutput(VPUIP::Permu rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), loc, inputBuffer, outBuffer, vpux::getIntAttr(rewriter, 0), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), permuteOp.getMemPermAttr(), subDmaDescriptor, permuteOp.getDmaHwpIdAttr(), - permuteOp.getProfilingMetadataAttr()); + permuteOp.getProfilingMetadataAttr(), /*internalDataFlow=*/nullptr); - _log.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); + logger.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); rewriter.eraseOp(vpurtTask); - return rewritePermuteDMA(newPermuteDMAOp, rewriter); + return mlir::success(); } -mlir::LogicalResult PermuteRewriter::unrollDuplicatedInput(VPUIP::PermuteDMAOp permuteOp, mlir::AffineMap memPerm, - mlir::PatternRewriter& rewriter) const { +mlir::LogicalResult UnrollMultiClusterPermuteDMA::unrollDuplicatedInput(VPUIP::PermuteDMAOp permuteOp, + mlir::AffineMap memPerm, + mlir::PatternRewriter& rewriter, int64_t, + Logger logger) { auto loc = permuteOp->getLoc(); auto ctx = permuteOp->getContext(); @@ -705,10 +419,10 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedInput(VPUIP::PermuteDMAOp p rewriter.setInsertionPointAfter(vpurtTask); - auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, _log).value(); - auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, _log).value(); + auto 
mergedInputShape = VPUIP::getPermuteDMAInputShape(inputType, outputType, memPerm, logger).value(); + auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inputType, outputType, memPerm, logger).value(); auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inputType, memPerm); - auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, _log); + auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, logger); auto elemTypeSize = Byte(inputType.getElemTypeSize()); // calculate the dma descriptor @@ -724,7 +438,7 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedInput(VPUIP::PermuteDMAOp p auto inputBuffer = VPURT::createOp( rewriter, inDeclBuff, loc, newInType, VPURT::BufferSection::CMX_NN, getIntArrayAttr(ctx, ArrayRef({0})), inDeclBuff.getByteOffset(), inDeclBuff.getSwizzlingKeyAttr()); - _log.trace("Insert new input buffer declaration: '{0}'", inputBuffer); + logger.trace("Insert new input buffer declaration: '{0}'", inputBuffer); // create new output buffer auto outDeclBuff = output.getDefiningOp(); @@ -733,54 +447,190 @@ mlir::LogicalResult PermuteRewriter::unrollDuplicatedInput(VPUIP::PermuteDMAOp p rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), loc, inputBuffer, outDeclBuff, vpux::getIntAttr(rewriter, 0), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), permuteOp.getMemPermAttr(), subDmaDescriptor, permuteOp.getDmaHwpIdAttr(), - permuteOp.getProfilingMetadataAttr()); + permuteOp.getProfilingMetadataAttr(), /*internalDataFlow=*/nullptr); - _log.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); + logger.trace("Insert new permute dma : '{0}'", newPermuteDMAOp); rewriter.eraseOp(vpurtTask); - return rewritePermuteDMA(newPermuteDMAOp, rewriter); + return mlir::success(); } -// -// UnrollPermuteToNNDMAPass -// +mlir::LogicalResult UnrollSingleClusterPermuteDMA::unroll(VPUIP::PermuteDMAOp permuteOp, + mlir::PatternRewriter& rewriter, int64_t portCount, 
+ Logger logger) { + auto vpurtTask = permuteOp->getParentOfType(); + VPUX_THROW_UNLESS(vpurtTask != nullptr, "Can't get VPURT task operation"); + rewriter.setInsertionPointAfter(vpurtTask); + + auto srcDeclBuff = permuteOp.getInput().getDefiningOp(); + VPUX_THROW_UNLESS(srcDeclBuff != nullptr, "Can't get buffer for operand: {0}", permuteOp.getInput()); + + auto dstDeclBuff = permuteOp.getOutputBuff().getDefiningOp(); + + auto inType = mlir::cast(permuteOp.getInput().getType()); + auto outType = mlir::cast(permuteOp.getOutput().getType()); + Byte elemTypeSize = inType.getElemTypeSize(); + + auto srcType = mlir::cast(srcDeclBuff.getType()); + auto dstType = mlir::cast(dstDeclBuff.getType()); + auto srcOffset = srcDeclBuff.getByteOffset(); + auto dstOffset = dstDeclBuff.getByteOffset(); -class UnrollPermuteToNNDMAPass final : public VPUIP::impl::UnrollPermuteToNNDMABase { -public: - explicit UnrollPermuteToNNDMAPass(Logger log) { - Base::initLogger(log, Base::getArgumentName()); + // For unrolled DMA which is inside of cluster tiling, the dma descriptor is already calculated + auto dmaDescriptorAttr = permuteOp.getDmaDescriptorAttr(); + const auto memPerm = permuteOp.getMemPerm().value(); + auto mergedMemPerm = VPUIP::getPermuteDMAMergedMemPerm(inType, memPerm); + auto numPlaneDim = VPUIP::getPermuteDMANumPlaneDim(inType, memPerm); + + auto portIsAlreadyAssigned = true; + if (dmaDescriptorAttr == nullptr) { + auto ctx = permuteOp->getContext(); + auto mergedInputShape = VPUIP::getPermuteDMAInputShape(inType, outType, memPerm, logger).value(); + auto mergedOutputShape = VPUIP::getPermuteDMAOutputShape(inType, outType, memPerm, logger).value(); + auto dmaDescriptorGenerator = VPUIP::PermuteDmaDescriptorGenerator(ctx, mergedMemPerm, logger); + dmaDescriptorAttr = dmaDescriptorGenerator.generate(mergedInputShape, mergedOutputShape, elemTypeSize); + portIsAlreadyAssigned = false; } -private: - void safeRunOnFunc() final; -}; - -void 
UnrollPermuteToNNDMAPass::safeRunOnFunc() { - auto& ctx = getContext(); - auto func = getOperation(); - markAnalysesPreserved(); - auto analysis = getAnalysis(); - if (!analysis.passNeeded(VPUIP::UnrollDMAAnalysisNeeded::UnrollPermuteToNNDMAPass)) { - return; + auto subInput = + VPUIP::getPermuteDMASubInputShapes(config::getArch(permuteOp), inType, outType, memPerm, portCount, logger); + VPUX_THROW_UNLESS(subInput.has_value(), "Cannot get unrolled subInputShapes for PermuteDMA op {0}", permuteOp); + auto subInputShapes = subInput.value(); + auto subOutputShapes = VPUIP::getPermuteDMASubOutputShapes(subInputShapes, inType, outType, memPerm); + + logger.trace("Unrolling PermuteDMAOp '{0}' at '{1}'", permuteOp->getName(), permuteOp->getLoc()); + + int64_t dmaPort = 0; + SmallVector firstPermuteDMAsOnPorts; + SmallVector lastPermuteDMAsOnPorts; + SmallVector newPermuteDMAs; + for (auto idx = 0; idx < checked_cast(subInputShapes.size()); idx++) { + auto newDMADescriptorAttr = VPUIP::updateNumPlanes(dmaDescriptorAttr, subInputShapes[idx][numPlaneDim]); + + const auto dimOrder = (subInputShapes[0].size() == 2) ? DimsOrder::NC : DimsOrder::CHW; + auto newSrcStrides = + (subInputShapes[idx].size() == 2) + ? SmallVector{Bit(subInputShapes[idx].back() * Bit(elemTypeSize).count()), + Bit(Bit(elemTypeSize).count())} + : SmallVector{Bit(subInputShapes[idx][Dim(1)] * subInputShapes[idx][Dim(2)] * + Bit(elemTypeSize).count()), + Bit(subInputShapes[idx].back() * Bit(elemTypeSize).count()), + Bit(Bit(elemTypeSize).count())}; + + auto newSrcMemRef = vpux::getMemRefType(subInputShapes[idx], srcType.getElementType(), dimOrder, + srcType.getMemSpace(), Strides(newSrcStrides)); + + auto newSrcBuff = + srcType.getMemSpace().getIndex().has_value() + ? VPURT::createOp(rewriter, srcDeclBuff, vpurtTask.getLoc(), + newSrcMemRef, srcDeclBuff.getSection(), + srcType.getMemSpace().getIndex().value(), srcOffset) + : srcDeclBuff.getSectionIndex().has_value() + ? 
VPURT::createOp( + rewriter, srcDeclBuff, vpurtTask.getLoc(), newSrcMemRef, srcDeclBuff.getSection(), + parseIntArrayAttr(srcDeclBuff.getSectionIndex().value()), srcOffset) + : VPURT::createOp(rewriter, srcDeclBuff, vpurtTask.getLoc(), + newSrcMemRef, srcDeclBuff.getSection(), srcOffset); + + auto newDstStrides = + (subOutputShapes[idx].size() == 2) + ? SmallVector{Bit(subOutputShapes[idx].back() * Bit(elemTypeSize).count()), + Bit(Bit(elemTypeSize).count())} + : SmallVector{Bit(subOutputShapes[idx][Dim(1)] * subOutputShapes[idx][Dim(2)] * + Bit(elemTypeSize).count()), + Bit(subOutputShapes[idx][Dim(2)] * Bit(elemTypeSize).count()), + Bit(Bit(elemTypeSize).count())}; + mlir::Type newDstType; + if (auto dstDistributedType = mlir::dyn_cast(dstType)) { + auto ctx = permuteOp->getContext(); + auto distributionAttr = dstDistributedType.getDistribution(); + VPUX_THROW_WHEN( + distributionAttr.getMode().getValue() != VPU::DistributionMode::DUPLICATED, + "Issues with unrolling PermuteNNDMA; Buffer has distributed type != DUPLICATED after unroll"); + if (VPU::isDistributedAttrWithExplicitShapesAndOffsets(distributionAttr)) { + distributionAttr = VPU::getNonOverlappedDistributedAttr( + subOutputShapes[idx], distributionAttr.getMode(), nullptr, distributionAttr.getNumClusters(), + nullptr, distributionAttr.getUniformDistributedSegments(), dstDeclBuff.getContext()); + } + + const auto layout = mlir::AffineMapAttr::get(dimOrder.toAffineMap(ctx)); + newDstType = VPUIP::DistributedBufferType::get(ctx, subOutputShapes[idx].raw(), dstType.getElementType(), + layout, dstType.getMemSpace(), distributionAttr); + } else { + newDstType = vpux::getMemRefType(subOutputShapes[idx], dstType.getElementType(), dimOrder, + dstType.getMemSpace(), Strides(newDstStrides)); + } + + VPUX_THROW_UNLESS(dstType.getMemSpace().getIndex().has_value() || dstDeclBuff.getSectionIndex().has_value(), + "No section index find at '{}'", dstDeclBuff.getLoc()); + auto newDstBuff = + 
dstType.getMemSpace().getIndex().has_value() + ? VPURT::createOp(rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, + dstDeclBuff.getSection(), + dstType.getMemSpace().getIndex().value(), dstOffset) + : dstDeclBuff.getSectionIndex().has_value() + ? VPURT::createOp( + rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, dstDeclBuff.getSection(), + parseIntArrayAttr(dstDeclBuff.getSectionIndex().value()), dstOffset) + : VPURT::createOp(rewriter, dstDeclBuff, vpurtTask.getLoc(), newDstType, + dstDeclBuff.getSection(), dstOffset); + + logger.trace("Create unrolled PermuteDMA operation with input/output shape: {0}/{1}, SrcMemory at {2}, " + "DstMemory at {3}", + subInputShapes[idx], subOutputShapes[idx], newSrcBuff.getSection(), newDstBuff.getSection()); + + const auto newLoc = appendLoc(vpurtTask->getLoc(), "_unrolled_permuteDMA"); + auto newDmaPort = portIsAlreadyAssigned ? permuteOp.getPort().value() : dmaPort; + auto newPermuteDMAOp = VPURT::wrapIntoTaskOp( + rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), newLoc, newSrcBuff, newDstBuff, + vpux::getIntAttr(rewriter, newDmaPort), permuteOp.getIsOutOfOrderAttr(), permuteOp.getIsCriticalAttr(), + /*mem_perm*/ nullptr, newDMADescriptorAttr, permuteOp.getDmaHwpIdAttr(), + permuteOp.getProfilingMetadataAttr(), /*internalDataFlow=*/nullptr); + + newPermuteDMAs.push_back(newPermuteDMAOp); + + // find the first and last DMAs on different ports + if (firstPermuteDMAsOnPorts.size() < static_cast(portCount)) { + firstPermuteDMAsOnPorts.push_back(newPermuteDMAOp); + lastPermuteDMAsOnPorts.push_back(newPermuteDMAOp); + } else { + lastPermuteDMAsOnPorts[newDmaPort] = newPermuteDMAOp; + } + + dmaPort = (dmaPort + 1) % portCount; + + auto numPlaneValue = newDMADescriptorAttr.getNumPlanes().getInt(); + auto srcPlaneStrideValue = newDMADescriptorAttr.getSrcPlaneStride().getInt(); + auto dstPlaneStrideValue = newDMADescriptorAttr.getDstPlaneStride().getInt(); + srcOffset += numPlaneValue * 
srcPlaneStrideValue; + dstOffset += numPlaneValue * dstPlaneStrideValue; } - auto module = func->getParentOfType(); - auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); - auto dmaPortCount = dmaOp.getCount(); - - mlir::RewritePatternSet patterns(&ctx); - patterns.add(&ctx, dmaPortCount, _log.nest()); - if (mlir::failed( - mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), vpux::getDefaultGreedyRewriteConfig()))) { - signalPassFailure(); + + for (auto& dmaOp : newPermuteDMAs) { + auto vpurtTask = dmaOp->getParentOfType(); + + // remove wait barrier dependency for these new permute DMA except first ones on each port + if (std::find(firstPermuteDMAsOnPorts.begin(), firstPermuteDMAsOnPorts.end(), dmaOp) == + firstPermuteDMAsOnPorts.end()) { + vpurtTask.getWaitBarriersMutable().clear(); + } + + // remove update barrier dependency for these new permute DMA except last ones on each port + if (std::find(lastPermuteDMAsOnPorts.begin(), lastPermuteDMAsOnPorts.end(), dmaOp) == + lastPermuteDMAsOnPorts.end()) { + vpurtTask.getUpdateBarriersMutable().clear(); + } } -} -} // namespace + rewriter.eraseOp(vpurtTask); -// -// createUnrollPermuteToNNDMAPass -// + return mlir::success(); +} -std::unique_ptr vpux::VPUIP::createUnrollPermuteToNNDMAPass(Logger log) { - return std::make_unique(log); +mlir::LogicalResult rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger) { + return arch37xx::unrollPermuteDMA( + permuteOp, rewriter, portCount, logger); } + +} // namespace vpux::arch37xx diff --git a/src/vpux_compiler/src/NPU37XX/dialect/VPUIPDPU/ops.cpp b/src/vpux_compiler/src/NPU37XX/dialect/VPUIPDPU/ops.cpp index b4f3409eb1..08a667b0df 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect/VPUIPDPU/ops.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect/VPUIPDPU/ops.cpp @@ -174,7 +174,7 @@ mlir::LogicalResult ODUSparsityOp::verify() { } mlir::LogicalResult ODUOutActivationsOp::verify() { - auto 
arch = VPU::getArch(*this); + auto arch = config::getArch(*this); auto dataTypeExists = getDataType().has_value(); auto dataWidthExists = getDataWidth().has_value(); @@ -182,10 +182,10 @@ mlir::LogicalResult ODUOutActivationsOp::verify() { return ::mlir::success(); } - if ((arch == VPU::ArchKind::NPU37XX) && !(dataTypeExists && !dataWidthExists)) { + if ((arch == config::ArchKind::NPU37XX) && !(dataTypeExists && !dataWidthExists)) { return errorAt(getLoc(), "Operation {0}: use data_type attr to specify data type", getOperationName()); } - if ((arch > VPU::ArchKind::NPU37XX) && !(!dataTypeExists && dataWidthExists)) { + if ((arch > config::ArchKind::NPU37XX) && !(!dataTypeExists && dataWidthExists)) { return errorAt(getLoc(), "Operation {0}: use data_width attr to specify data type", getOperationName()); } diff --git a/src/vpux_compiler/src/NPU37XX/dialect_pipeline_strategy.cpp b/src/vpux_compiler/src/NPU37XX/dialect_pipeline_strategy.cpp index b09d708c06..b81e89c186 100644 --- a/src/vpux_compiler/src/NPU37XX/dialect_pipeline_strategy.cpp +++ b/src/vpux_compiler/src/NPU37XX/dialect_pipeline_strategy.cpp @@ -19,6 +19,7 @@ #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/pipelines/options_setup.hpp" +#include "vpux/compiler/utils/rewriter.hpp" using namespace vpux; @@ -53,10 +54,49 @@ class ShaveCodeGenSetup37XX : public OptionsSetupBase { +class ReferenceSWSetup37XX : public OptionsSetupBase { public: - using Base = OptionsSetupBase; + using Base = OptionsSetupBase; using Base::Base; + + static void setupOptionsImpl(DefaultHWOptions37XX& options, const intel_npu::Config& config) { + Base::setupOptionsImpl(options, config); + setupOptionsCommon(options); + } + + static void setupOptionsCommon(DefaultHWOptions37XX& options) { + overwriteIfUnset(options.enableDummyOpReplacement, false); + overwriteIfUnset(options.constantFoldingInBackground, false); + overwriteIfUnset(options.enableMergeFakeQuant, true); + 
overwriteIfUnset(options.enableOptimizeReorders, false); + overwriteIfUnset(options.enableExperimentalSEPtrsOperations, false); + overwriteIfUnset(options.enableFuseClampOperations, false); + overwriteIfUnset(options.enableConvertPrecisionToFP16, true); + overwriteIfUnset(options.enableConvertNonConstantPadToSliceAndConcat, true); + overwriteIfUnset(options.enableSimpleSchedule, true); + overwriteIfUnset(options.reduceParallelControlFlows, true); + overwriteIfUnset(options.enableGroupedMatMul, false); + overwriteIfUnset(options.fuseScalesToAccumulate, false); + overwriteIfUnset(options.enableFP16CompressedConvolution, false); + overwriteIfUnset(options.enableVPUNNPreSplit, false); + overwriteIfUnset(options.enableWeightsDynamicDequantization, false); + overwriteIfUnset(options.enableInPlaceBufferization, false); + overwriteIfUnset(options.useMemrefForHostFunctionBufferization, false); + overwriteIfUnset(options.enableRuntimeDequant, false); + + // ReferenceSW specific values + overwriteIfUnset(options.enableForceZMajorConcat, false); + overwriteIfUnset(options.enableSwapTransposeWithFQ, false); + overwriteIfUnset(options.enableAlignScales, false); + overwriteIfUnset(options.fuseMvn6ScaleBias, false); + overwriteIfUnset(options.enableConvertFCToConv, false); + overwriteIfUnset(options.enableAdjustNonZeroFakeQuant, false); + overwriteIfUnset(options.enableAdaptiveStripping, false); + overwriteIfUnset(options.enableExtraStaticShapeOps, false); + + overwriteIfUnset(options.enableConvertFFTToConv, false); + overwriteIfUnset(options.enableDecomposeGRUSequence, false); + } }; class WSMonolithicSetup37XX final : public WSMonolithicSetupBase { @@ -111,22 +151,17 @@ class DialectPipelineStrategy37XX final : public IDialectPipelineStrategy { }; // -// DialectPipelineStrategy37XX: [ReferenseSW] -// This implementation will be chosen if OptionsContainerType contains ReferenceSWOptions +// DialectPipelineStrategy37XX: [ReferenceSW] +// This implementation will be chosen if we 
have ReferenceSW setup // -template -using Has37XXSWOptions = typename std::enable_if_t>; - -template -class DialectPipelineStrategy37XX> final : - public IDialectPipelineStrategy { +class DialectPipelineStrategyReferenceSW37XX final : public IDialectPipelineStrategy { public: - explicit DialectPipelineStrategy37XX(const intel_npu::Config& config) - : _optionsContainer(std::make_unique(config)) { + explicit DialectPipelineStrategyReferenceSW37XX(const intel_npu::Config& config) + : _optionsContainer(std::make_unique(config)) { } - explicit DialectPipelineStrategy37XX(std::unique_ptr optionsContainer) + explicit DialectPipelineStrategyReferenceSW37XX(std::unique_ptr optionsContainer) : _optionsContainer(std::move(optionsContainer)) { } @@ -134,131 +169,30 @@ class DialectPipelineStrategy37XXgetInitCompilerOptions(), log.nest()); } - void buildReferenceSWPipeline(mlir::OpPassManager& pm, Logger log) override { - auto& options = _optionsContainer->getPipelineOptions(); - const auto grc = getDefaultGreedyRewriteConfig(); - - // No passes should be run before this pipeline, with very few exceptions. 
- IE::buildPostImportPipeline(pm, log); - - // Level 3 : Topology - IE::arch37xx::buildInitialLowPrecisionTransformationsPipeline(pm, IE::LowPrecisionTransformOptions(options), - log); - IE::arch37xx::buildInitialTransformationsPipeline(pm, IE::TransformOptions(options), log); - IE::buildAdjustPrecisionPipeline(pm, IE::AdjustPrecisionOptions(options), log); - - // Resolve group quant MatMul pattern - pm.addPass(IE::createUniquifyOpsPass(log)); - pm.addPass(IE::createMergeParallelFullyConnectedPass(log)); - pm.addPass(IE::createUnrollGroupQuantizePass(log)); - pm.addPass(IE::createUnrollFullyConnectedPass(log)); - if (options.fuseScalesToAccumulate) { - pm.addPass(IE::createFuseScalesToAccumulatePass(log)); - } - pm.addPass(IE::createConvertMatMulToConvPass(log)); - if (options.enableConvertFCToConv) { - pm.addPass(IE::createConvertFCToConvPass(log)); - } - - pm.addPass(IE::createResolveStridedSlicePass(log)); - pm.addPass(IE::createConvertStridedSlice2ConvPass(log)); - pm.addPass(IE::createConvertNceOpsTo4DPass(log)); - pm.addPass(IE::createConvertShapeTo4DPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - pm.addPass(IE::createConvertToSpatialOpPass(false, isOptionEnabled(options.enableSEPtrsOperations), log)); - pm.addPass(IE::createConvertGRNToNormalizeL2Pass(log)); - pm.addPass(IE::createResolveScatterUpdateByTransposePass(log)); - IE::buildAdjustForVPUPipeline(pm, IE::AdjustForVPUOptions(options), log); - - pm.addPass(IE::createSplitFakeQuantPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - pm.addPass(IE::createDequantizeConstPass(options.runtimeDequantizationLimit, - isOptionEnabled(options.enableRuntimeDequant), log)); - if (options.enableMergeFakeQuant) { - pm.addPass(IE::createMergeFakeQuantPass(log)); - } - pm.addPass(mlir::createCanonicalizerPass(grc)); - - IE::arch37xx::buildAdjustLayoutPipeline(pm, IE::AdjustLayoutOptions(options), log); - pm.addPass(IE::createConvertAssignReadValueToReturnsAndInputs(log)); - - 
pm.addPass(IE::createConvertToMemPermutePass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - - // Lowering to VPU - pm.addPass(createConvertLayers2VPUPass(log)); - pm.addPass(VPU::createDetectionOutputDecompositionPass(log)); - pm.addPass(VPU::arch37xx::createSplitRealDFTOpsPass(log)); - pm.addPass(VPU::createSplitGRUSequencePass(log)); - pm.addPass(VPU::arch37xx::createDecomposeMVNPass(log)); - pm.addPass(VPU::createAddSwOpAuxiliaryBufferPass(log)); - - pm.addPass(VPU::createTilingStrategyAssignmentPass( - /*enablePrefetchTiling=*/false, /*enableVPUNNCostForTiling*/ false, - /*enableShaveDDRAccessOptimization*/ "true", log)); - pm.addPass(VPU::arch37xx::createApplyTilingMVN1SumPass(/*enablePrefetchTiling=*/false, log)); - pm.addPass(VPU::createApplyTilingPass(/*enableSCFTiling=*/false, log)); - pm.addPass(VPU::createComputeInterpolateCoordinatesPass(/*enableExplicitDistributionInfoAttr*/ false, log)); - - pm.addPass(VPU::createBoundedTensorsToDynamicDimsMaskPass(log)); - - // Lowering to VPUIP - vpux::arch37xx::buildLowerVPU2VPUIPPipeline(pm, options.enableInPlaceBufferization, - /*useMemrefForHostFunctionBufferization*/ false, log); - - // Level 2 : Abstract RunTime - - pm.addPass(VPUIP::createSetMemorySpacePass(VPU::getMemKind, log)); - - pm.addPass(VPUIP::createAddCopyBetweenSWKernelsAndNetworkIOPass(log)); - - pm.addPass(VPUIP::createCopyOpTilingPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - - if (options.enableProfiling && options.enableSWProfiling) { - pm.addPass(VPUIP::createActShaveProfilingPass(VPU::getMemKind, log)); - } - - pm.addPass(VPUIP::createUngroupBoundedBuffersPass(log)); - - pm.addPass(VPUIP::createConvertTransferOpsToDMAsPass(log)); - - VPUIP::buildAsyncSchedulingPipeline(pm, log); - - if (options.enableSWKernelPrefetchingReserveMem) { - pm.addPass(VPUIP::createSWKernelPrefetchingReserveMemPass(log)); - } - - pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); - 
pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); - pm.addPass(VPUIP::createLinearizationPass(log)); - pm.addPass(VPUIP::createOptimizeAsyncDepsPass(log)); - - pm.addPass(VPUIP::arch37xx::createAddSwKernelCacheHandlingOpsPass(log)); - - VPUIP::buildHardwareAdaptationPipeline(pm, log); + void buildIEPipeline(mlir::OpPassManager& pm, Logger log) override { + IE::arch37xx::buildReferenceSWPipeline(pm, _optionsContainer->getPipelineOptions(), log); + } - pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); + void buildLowerIE2VPUPipeline(mlir::OpPassManager& pm, Logger log) override { + vpux::arch37xx::buildLowerIE2VPUPipelineReferenceSW(pm, log); + } - // Level 1 : VPU RunTime + void buildVPUPipeline(mlir::OpPassManager& pm, Logger log) override { + VPU::arch37xx::buildReferenceSWPipeline(pm, log); + } - if (options.enableProfiling) { - pm.addPass(VPUIP::createCaptureWorkpointPass(log)); - pm.addPass(VPUIP::createGroupProfilingBuffersPass(log)); - pm.addPass(Core::createMoveDeclarationsToTopPass(log)); - } + void buildLowerVPU2VPUIPPipeline(mlir::OpPassManager& pm, Logger log) override { + vpux::arch37xx::buildLowerVPU2VPUIPPipeline(pm, + _optionsContainer->getPipelineOptions().enableInPlaceBufferization, + /*useMemrefForHostFunctionBufferization*/ false, log); + } - pm.addPass(VPURT::createAssignPhysicalBarriersPass(options.enableColorBinPhysicalBarrierAssignment, - std::nullopt, std::nullopt, log)); - pm.addPass(VPURT::createBarrierSimulationPass(log)); - pm.addPass(VPUIP::createUpdateSwKernelParamsPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - pm.addPass(Const::createConstantFoldingPass()); - pm.addPass(VPUIP::createDumpStatisticsOfTaskOpsPass(log)); + void buildVPUIPPipeline(mlir::OpPassManager& pm, Logger log) override { + VPUIP::arch37xx::buildReferenceSWPipeline(pm, _optionsContainer->getPipelineOptions(), log); } private: - std::unique_ptr _optionsContainer; + std::unique_ptr _optionsContainer; }; } // namespace @@ 
-277,7 +211,7 @@ std::unique_ptr vpux::createDialectPipelineStrategy37X return std::make_unique>(config); } case config::CompilationMode::ReferenceSW: { - return std::make_unique>(config); + return std::make_unique(config); } case config::CompilationMode::WSMonolithic: { return std::make_unique>(config); @@ -299,8 +233,8 @@ std::unique_ptr vpux::createDialectPipelineStrategy37X } template <> -std::unique_ptr vpux::createDialectPipelineStrategy37XX( - const VPU::InitCompilerOptions* initCompilerOptions, const ReferenceSWOptions37XX* options) { +std::unique_ptr vpux::createDialectPipelineStrategy37XXReferenceSW( + const VPU::InitCompilerOptions* initCompilerOptions, const DefaultHWOptions37XX* options) { auto wrapper = std::make_unique(initCompilerOptions, options); return std::make_unique>(std::move(wrapper)); } diff --git a/src/vpux_compiler/src/NPU37XX/interfaces_registry.cpp b/src/vpux_compiler/src/NPU37XX/interfaces_registry.cpp index 943399b8c7..9e92296aaf 100644 --- a/src/vpux_compiler/src/NPU37XX/interfaces_registry.cpp +++ b/src/vpux_compiler/src/NPU37XX/interfaces_registry.cpp @@ -27,6 +27,7 @@ void InterfacesRegistry37XX::registerInterfaces(mlir::DialectRegistry& registry) VPUIP::arch37xx::registerAlignedWorkloadChannelsOpInterfaces(registry); vpux::arch37xx::registerBufferizableOpInterfaces(registry); VPUIPDPU::arch37xx::registerVerifiersOpInterfaces(registry); + VPU::arch37xx::registerICostModelUtilsInterface(registry); } } // namespace vpux diff --git a/src/vpux_compiler/src/NPU37XX/pipelines_register.cpp b/src/vpux_compiler/src/NPU37XX/pipelines_register.cpp index bbe4ac7b6f..4b7c259b0b 100644 --- a/src/vpux_compiler/src/NPU37XX/pipelines_register.cpp +++ b/src/vpux_compiler/src/NPU37XX/pipelines_register.cpp @@ -29,7 +29,7 @@ void PipelineRegistry37XX::registerPipelines() { mlir::PassPipelineRegistration( "ShaveCodeGen", "Compile both from IE to VPUIP for NPU37XX", [](mlir::OpPassManager& pm, const DefaultHWOptions37XX& options) { - 
VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU37XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU37XX, config::CompilationMode::ShaveCodeGen, options}; auto createPipelineStartegy = [&](config::CompilationMode) { return createDialectPipelineStrategy37XX(&initCompilerOptions, &options); @@ -38,13 +38,14 @@ void PipelineRegistry37XX::registerPipelines() { factory.buildPipeline(pm); }); - mlir::PassPipelineRegistration( + mlir::PassPipelineRegistration( "reference-sw-mode", "Compile IE Network in Reference Software mode (SW only execution) for NPU37XX", - [](mlir::OpPassManager& pm, const ReferenceSWOptions37XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU37XX, + [](mlir::OpPassManager& pm, const DefaultHWOptions37XX& options) { + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU37XX, config::CompilationMode::ReferenceSW, options}; auto createPipelineStartegy = [&](config::CompilationMode) { - return createDialectPipelineStrategy37XX(&initCompilerOptions, &options); + return createDialectPipelineStrategy37XXReferenceSW(&initCompilerOptions, + &options); }; ReferenceSWStrategy factory(createPipelineStartegy, Logger::global()); factory.buildPipeline(pm); @@ -53,8 +54,8 @@ void PipelineRegistry37XX::registerPipelines() { mlir::PassPipelineRegistration( "default-hw-mode", "Compile IE Network in Default Hardware mode (HW and SW execution) for NPU37XX", [](mlir::OpPassManager& pm, const DefaultHWOptions37XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU37XX, config::CompilationMode::DefaultHW, - options}; + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU37XX, + config::CompilationMode::DefaultHW, options}; auto createPipelineStartegy = [&](config::CompilationMode) { return createDialectPipelineStrategy37XX(&initCompilerOptions, &options); }; @@ -65,7 +66,7 @@ void PipelineRegistry37XX::registerPipelines() { 
mlir::PassPipelineRegistration( "ws-monolithic", "Compile IE Network in Weights separation Monolithic mode for NPU37XX", [](mlir::OpPassManager& pm, const DefaultHWOptions37XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU37XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU37XX, config::CompilationMode::WSMonolithic, options}; auto createPipelineStartegy = [&](config::CompilationMode) { return createDialectPipelineStrategy37XX(&initCompilerOptions, &options); diff --git a/src/vpux_compiler/src/NPU40XX/CMakeLists.txt b/src/vpux_compiler/src/NPU40XX/CMakeLists.txt index 88524de248..fe275f0422 100644 --- a/src/vpux_compiler/src/NPU40XX/CMakeLists.txt +++ b/src/vpux_compiler/src/NPU40XX/CMakeLists.txt @@ -51,14 +51,14 @@ find_package(Git REQUIRED) execute_process( COMMAND ${GIT_EXECUTABLE} lfs pull - WORKING_DIRECTORY "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel") + WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel") # Embed NPU40XX VPUNN models vpux_embed_bin_file( - SOURCE_FILE "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_40_159_strict.vpunn" + SOURCE_FILE "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_40_159_strict.vpunn" HEADER_FILE "${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/dialect/VPU/generated/cost_model_data_4_0.hpp.inc" VARIABLE_NAME "COST_MODEL_4_0") vpux_embed_bin_file( - SOURCE_FILE "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_40_159_strict.fast.vpunn" + SOURCE_FILE "${PROJECT_SOURCE_DIR}/thirdparty/vpucostmodel/models/vpu_40_159_strict.fast.vpunn" HEADER_FILE "${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/dialect/VPU/generated/cost_model_data_4_0_fast.hpp.inc" VARIABLE_NAME "COST_MODEL_4_0_FAST") diff --git a/src/vpux_compiler/src/NPU40XX/backend_pipeline_strategy.cpp b/src/vpux_compiler/src/NPU40XX/backend_pipeline_strategy.cpp index 0baf13d1ba..42fd185d07 100644 --- 
a/src/vpux_compiler/src/NPU40XX/backend_pipeline_strategy.cpp +++ b/src/vpux_compiler/src/NPU40XX/backend_pipeline_strategy.cpp @@ -47,7 +47,9 @@ void BackendPipelineStrategy40XX::buildELFPipeline(mlir::OpPassManager& pm, cons setupParamsAccordingToOptimizationLevel(options->optimizationLevel, *options, useWlm); setupPWLMParams(*options); dpuDryRunMode = VPU::getDPUDryRunMode(options->dpuDryRun); - backendCompilationOptions->enableDMAProfiling = options->enableDMAProfiling.getValue(); + auto enableProfiling = config.get(); + backendCompilationOptions->enableDMAProfiling = + enableProfiling ? options->enableDMAProfiling.getValue() : "false"; backendCompilationOptions->enableShaveDDRAccessOptimization = options->enableShaveDDRAccessOptimization; backendCompilationOptions->enableDumpStatisticsOfWlmOps = options->enableDumpTaskStats; backendCompilationOptions->workloadManagementBarrierCountThreshold = diff --git a/src/vpux_compiler/src/NPU40XX/conversion/pipelines.cpp b/src/vpux_compiler/src/NPU40XX/conversion/pipelines.cpp index 13b7e96928..e44910c0aa 100644 --- a/src/vpux_compiler/src/NPU40XX/conversion/pipelines.cpp +++ b/src/vpux_compiler/src/NPU40XX/conversion/pipelines.cpp @@ -8,7 +8,6 @@ #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/conversion.hpp" -#include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/VPUASM/passes.hpp" #include "vpux/compiler/dialect/VPUIPDPU/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" @@ -66,9 +65,7 @@ void vpux::arch40xx::buildLowerVPUIP2ELFPipeline(mlir::OpPassManager& pm, pm.addPass(createConvertVPUIP2VPUMI40XXPass(log, backendCompilationOptions.enableMemorySideCache, backendCompilationOptions.allocateShaveStackFrames)); - auto dmaProfilingMode = - getDMAProfilingMode(VPU::ArchKind::NPU40XX, backendCompilationOptions.enableDMAProfiling.getValue()); - 
pm.addPass(VPUMI40XX::createSetupProfilingVPUMI40XXPass(dmaProfilingMode, log)); + pm.addPass(VPUMI40XX::createSetupProfilingVPUMI40XXPass(backendCompilationOptions.enableDMAProfiling, log)); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(ELF::createAddABIVersionPass(log, NPUReg40XX::ABI_VERSION_MAJOR, NPUReg40XX::ABI_VERSION_MINOR, NPUReg40XX::ABI_VERSION_PATCH)); @@ -123,12 +120,17 @@ void vpux::arch40xx::elfSubsetPipelineVPUMI( pm.addPass(VPUMI40XX::createBarrierTopologicalMappingPass(log)); pm.addPass(VPUMI40XX::createGroupExecutionOpsPass(log)); - pm.addPass(VPUMI40XX::createAddFetchOpsPass(log)); + if (workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { + pm.addPass(VPUMI40XX::createConvertFetchDmasToFetchTaskOpsPass(log)); + } else { + pm.addPass(VPUMI40XX::createAddFetchOpsPass(log)); + } pm.addPass(VPUMI40XX::createResolveWLMTaskLocationPass(log)); pm.addPass(VPUMI40XX::createUnGroupExecutionOpsPass(log)); pm.addPass(VPUMI40XX::createPropagateFinalBarrierPass(log)); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(VPUMI40XX::createNextSameIdAssignmentPass(log)); + // TODO: Skip AddEnqueueOps for FWLM_V1_PAGES once E#170833 is done pm.addPass(VPUMI40XX::createAddEnqueueOpsPass(workloadManagementMode, log)); pm.addPass(VPUMI40XX::createUnrollFetchTaskOpsPass(log)); if (workloadManagementMode != WorkloadManagementMode::PWLM_V0_LCA) { @@ -138,11 +140,15 @@ void vpux::arch40xx::elfSubsetPipelineVPUMI( if (workloadManagementMode != WorkloadManagementMode::FWLM_V1_PAGES) { pm.addPass(VPUMI40XX::createAddBootstrapBarriersPass(log)); } - pm.addPass(VPUMI40XX::createAddBootstrapWorkItemsPass(log)); + pm.addPass(VPUMI40XX::createAddBootstrapWorkItemsPass(workloadManagementMode, log)); + // TODO: For FWLM_V1_PAGES skip SplitEnqueueOps and use SplitEnqueueDmaOps once E#170833 is done pm.addPass(VPUMI40XX::createSplitEnqueueOpsPass(log)); - pm.addPass(VPUMI40XX::createLinkEnqueueTargetsPass(log)); - 
pm.addPass(VPUMI40XX::createAddEnqueueDMAOps(workloadManagementMode, log)); + pm.addPass(VPUMI40XX::createLinkEnqueueTargetsPass(workloadManagementMode, log)); + // TODO: For FWLM_V1_PAGES remove AddEnqueueDMAOps and use UpdateEnqueueDMAInputAndOutput once E#170833 is done + if (workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { + pm.addPass(VPUMI40XX::createAddEnqueueDMAOps(log)); + } pm.addPass(VPUMI40XX::createUnrollEnqueueOpsPass(log)); if (workloadManagementMode != WorkloadManagementMode::PWLM_V0_LCA) { pm.addPass(VPUMI40XX::createLinkEnqueueOpsForSameBarrierPass(log)); @@ -163,7 +169,7 @@ void vpux::arch40xx::elfSubsetPipelineVPUMI( void vpux::arch40xx::elfSubsetPipelineVPUASM(mlir::OpPassManager& pm, bool workloadManagementEnable, bool disableDmaSwFifo, const Logger& log) { - pm.addPass(createConvertVPUMI40XX2VPUASMPass(log, workloadManagementEnable, disableDmaSwFifo)); + pm.addPass(createConvertVPUMI40XX2VPUASMPass(workloadManagementEnable, log, disableDmaSwFifo)); pm.addPass(ELF::createAddELFSymbolTablePass(log)); pm.addPass(ELF::createSetEntryPointPass(log)); pm.addPass(ELF::createAddNetworkMetadataPass(log)); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops.cpp index 8323bb1acf..965a960ecf 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops.cpp @@ -5,11 +5,12 @@ #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/utils/stl_extras.hpp" - #include "vpux/utils/core/optional.hpp" #include +#include + using namespace vpux; // diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/abi_version.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/abi_version.cpp index 9267f47b96..0c426649f2 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/abi_version.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/abi_version.cpp @@ -28,14 +28,14 @@ void 
vpux::ELF::ABIVersionOp::serialize(elf::writer::BinaryDataSection& std::memcpy(abiVersionStruct.n_desc, desc, descSize); auto ptrCharTmp = reinterpret_cast(&abiVersionStruct); - binDataSection.appendData(ptrCharTmp, getBinarySize(VPU::ArchKind::UNKNOWN)); + binDataSection.appendData(ptrCharTmp, getBinarySize(config::ArchKind::UNKNOWN)); } -size_t vpux::ELF::ABIVersionOp::getBinarySize(VPU::ArchKind) { +size_t vpux::ELF::ABIVersionOp::getBinarySize(config::ArchKind) { return sizeof(LoaderAbiVersionNote); } -size_t vpux::ELF::ABIVersionOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::ELF::ABIVersionOp::getAlignmentRequirements(config::ArchKind) { return alignof(LoaderAbiVersionNote); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_data_section.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_data_section.cpp index e20240dc29..db3ff506da 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_data_section.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_data_section.cpp @@ -74,9 +74,9 @@ void vpux::ELF::DataSectionOp::preserialize(elf::Writer& writer, vpux::ELF::Sect for (auto& op : block->getOperations()) { if (op.hasTrait()) { auto binaryOp = mlir::cast(op); - // getting the BinarySize using VPU::ArchKind::UNKNOWN is OK at this point because the binaryOps should + // getting the BinarySize using config::ArchKind::UNKNOWN is OK at this point because the binaryOps should // already all be in their arch-specific form or are arch-independent - sectionSize += binaryOp.getBinarySizeCached(symRefMap, VPU::ArchKind::UNKNOWN); + sectionSize += binaryOp.getBinarySizeCached(symRefMap, config::ArchKind::UNKNOWN); } } section->setSize(sectionSize); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_logical_section.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_logical_section.cpp index 5bc61e62e4..c67aa5f5c5 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_logical_section.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_logical_section.cpp @@ -23,9 +23,9 @@ size_t ELF::LogicalSectionOp::getTotalSize(vpux::ELF::SymbolReferenceMap& symRef auto binarySizeOp = mlir::dyn_cast(&op); if (binarySizeOp) { - // getting the BinarySize using VPU::ArchKind::UNKNOWN is OK at this point because the binarySizeOps should - // already all be in their arch-specific form or are arch-independent - auto span = binarySizeOp.getBinarySizeCached(symRefMap, VPU::ArchKind::UNKNOWN) + + // getting the BinarySize using config::ArchKind::UNKNOWN is OK at this point because the binarySizeOps + // should already all be in their arch-specific form or are arch-independent + auto span = binarySizeOp.getBinarySizeCached(symRefMap, config::ArchKind::UNKNOWN) + binarySizeOp.getMemoryOffset(); totalSize = std::max(totalSize, span); } @@ -72,7 +72,7 @@ ELF::SymbolSignature ELF::LogicalSectionOp::getSymbolSignature() { ioBindings.walk([&symSize, §ion, &index](VPUASM::DeclareBufferOp ioBuffer) { auto ioBuffLoc = ioBuffer.getBufferType().getLocation(); if (ioBuffLoc.getSection() == section && ioBuffLoc.getSectionIndex() == index) { - symSize = ioBuffer.getBinarySize(VPU::ArchKind::UNKNOWN); + symSize = ioBuffer.getBinarySize(config::ArchKind::UNKNOWN); } }); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_metadata_section.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_metadata_section.cpp index a085477928..6b7c8a47b0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_metadata_section.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_metadata_section.cpp @@ -48,7 +48,7 @@ void vpux::ELF::CreateMetadataSectionOp::preserialize(elf::Writer& writer, vpux: VPUX_THROW_UNLESS(!isMetadataSerialized, "There should be only 1 metadata op in an ELF metadata section"); if (auto metadata_op = mlir::dyn_cast(op)) { isMetadataSerialized = true; - 
sectionSize = metadata_op.getBinarySize(VPU::ArchKind::UNKNOWN); + sectionSize = metadata_op.getBinarySize(config::ArchKind::UNKNOWN); } } VPUX_THROW_UNLESS(isMetadataSerialized, "No metadata defined in the ELF metadata section"); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_performance_metrics_section.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_performance_metrics_section.cpp index 3804c60d23..c00eb53f6e 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_performance_metrics_section.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_performance_metrics_section.cpp @@ -44,14 +44,14 @@ void vpux::ELF::PerformanceMetricsOp::serialize(elf::writer::BinaryDataSection(&perf); - binDataSection.appendData(ptrCharTmp, getBinarySize(VPU::ArchKind::UNKNOWN)); + binDataSection.appendData(ptrCharTmp, getBinarySize(config::ArchKind::UNKNOWN)); } -size_t vpux::ELF::PerformanceMetricsOp::getBinarySize(VPU::ArchKind) { +size_t vpux::ELF::PerformanceMetricsOp::getBinarySize(config::ArchKind) { return sizeof(VpuPerformanceMetrics); } -size_t vpux::ELF::PerformanceMetricsOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::ELF::PerformanceMetricsOp::getAlignmentRequirements(config::ArchKind) { return alignof(VpuPerformanceMetrics); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_profiling_section.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_profiling_section.cpp index f3d36b37e4..4b5996ed9c 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_profiling_section.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/create_profiling_section.cpp @@ -44,7 +44,7 @@ void vpux::ELF::CreateProfilingSectionOp::preserialize(elf::Writer& writer, vpux size_t sectionSize = 0; for (auto& op : block->getOperations()) { if (auto metadataOp = mlir::dyn_cast(op)) { - sectionSize = metadataOp.getBinarySize(VPU::ArchKind::UNKNOWN); + sectionSize = 
metadataOp.getBinarySize(config::ArchKind::UNKNOWN); } } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/pad.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/pad.cpp index 012c5a884c..dd265cc199 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/pad.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/pad.cpp @@ -16,6 +16,6 @@ void vpux::ELF::PadOp::serialize(elf::writer::BinaryDataSection& binDat binDataSection.appendData(padding.data(), padSize); } -size_t vpux::ELF::PadOp::getBinarySize(VPU::ArchKind) { +size_t vpux::ELF::PadOp::getBinarySize(config::ArchKind) { return getPaddingSize(); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/reloc.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/reloc.cpp index 978d5e6031..eb68b42f8f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/reloc.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/reloc.cpp @@ -3,9 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" +#include + +#include + using namespace vpux; void vpux::ELF::RelocOp::serialize(elf::writer::Relocation* relocation, vpux::ELF::SymbolMapType& symbolMap) { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/symbol.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/symbol.cpp index 75f4ac3056..9b1e9f90dc 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/symbol.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/ops/symbol.cpp @@ -3,9 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include #include #include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_abi_version.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_abi_version.cpp index 350cee8e55..6ac093f158 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_abi_version.cpp +++ 
b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_abi_version.cpp @@ -2,9 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" +#include + namespace vpux::ELF::arch40xx { #define GEN_PASS_DECL_ADDABIVERSION #define GEN_PASS_DEF_ADDABIVERSION diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_network_metadata.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_network_metadata.cpp index bc14979c0e..cd0580f0e0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_network_metadata.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/add_network_metadata.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::ELF::arch40xx { #define GEN_PASS_DECL_ADDNETWORKMETADATA @@ -42,7 +44,7 @@ void AddNetworkMetadata::safeRunOnFunc() { auto metadataOp = builder.create(netFunc.getLoc(), "NetworkMetadata"); auto actualAlignment = builder.getIntegerAttr(builder.getIntegerType(64, false), - metadataOp.getAlignmentRequirements(VPU::getArch(netFunc))); + metadataOp.getAlignmentRequirements(config::getArch(netFunc))); metadataSection.setSecAddrAlignAttr(actualAlignment); } } // namespace diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/cleanup_elf.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/cleanup_elf.cpp index bdbc83143d..2349ca4049 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/cleanup_elf.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/cleanup_elf.cpp @@ -5,6 +5,7 @@ // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" 
#include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_relocations.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_relocations.cpp index 58dcb9ba79..0dcd80f05e 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_relocations.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_relocations.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/reloc_manager.hpp" #include "vpux/compiler/utils/options.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_symbol_tables.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_symbol_tables.cpp index a4767bfcd8..77001afa9e 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_symbol_tables.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/create_symbol_tables.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/handle_alignment_requirements.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/handle_alignment_requirements.cpp index efa9079825..3bae72aadc 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/handle_alignment_requirements.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/handle_alignment_requirements.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/ELF/utils.hpp" #include @@ -37,7 +39,7 @@ void 
HandleAlignmentRequirementsPass::safeRunOnFunc() { auto& ctx = getContext(); auto moduleOp = netFunc.getOperation()->getParentOfType(); VPUX_THROW_UNLESS(moduleOp, "The top-level module is missing"); - const auto arch = VPU::getArch(moduleOp); + const auto arch = config::getArch(moduleOp); auto mainOps = to_small_vector(netFunc.getOps()); VPUX_THROW_UNLESS(mainOps.size() == 1, "Expected exactly one ELF mainOp. Got {0}", mainOps.size()); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_cmx_symbol_value.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_cmx_symbol_value.cpp index 9a44c8518c..5909d76d40 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_cmx_symbol_value.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_cmx_symbol_value.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_elf_entry_point.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_elf_entry_point.cpp index 28df78f926..43d273c99f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_elf_entry_point.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_elf_entry_point.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_op_offsets.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_op_offsets.cpp index 2e2e427dba..deabadf6a1 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_op_offsets.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/set_op_offsets.cpp @@ -3,8 +3,10 @@ 
// SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/ELF/utils.hpp" #include @@ -45,7 +47,7 @@ mlir::LogicalResult SetOpOffsetsPass::initialize(mlir::MLIRContext* ctx) { void SetOpOffsetsPass::safeRunOnFunc() { auto netFunc = getOperation(); mlir::MLIRContext* ctx = &getContext(); - const auto arch = VPU::getArch(netFunc); + const auto arch = config::getArch(netFunc); auto mainOps = to_small_vector(netFunc.getOps()); VPUX_THROW_UNLESS(mainOps.size() == 1, "Expected exactly one ELF mainOp. Got {0}", mainOps.size()); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/update_elf_section_flags.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/update_elf_section_flags.cpp index 3617bfb85e..291d189dea 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/update_elf_section_flags.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/passes/update_elf_section_flags.cpp @@ -5,6 +5,7 @@ // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/ELF/reloc_manager.cpp b/src/vpux_compiler/src/NPU40XX/dialect/ELF/reloc_manager.cpp index 15ed6f0a4c..b5dae478e0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/ELF/reloc_manager.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/ELF/reloc_manager.cpp @@ -89,45 +89,48 @@ ELF::SymbolOp ELF::RelocManager::getSymbolOfBinOpOrEncapsulatingSection(mlir::Op VPUX_THROW("No ELF Symbol found for the provided operation"); } -void ELF::RelocManager::createRelocations(mlir::Operation* op, ELF::RelocationInfo& relocInfo) { - auto sourceOp = symRefMap_.lookupSymbol(relocInfo.source); - - ELF::SymbolOp sourceSym = 
getSymbolOfBinOpOrEncapsulatingSection(sourceOp); - +void ELF::RelocManager::createRelocations(mlir::Operation* op, ELF::SymbolOp sourceSym, + ELF::ElfSectionInterface targetSection, size_t offset, bool isOffsetRelative, + vpux::ELF::RelocationType relocType, size_t addend, + std::string_view description) { // we can only modify ops for which the binary format is known auto targetOp = mlir::dyn_cast(op); if (sourceSym.getValue().has_value() && targetOp) { - auto relocFunc = relocationMap.find(relocInfo.relocType); + auto relocFunc = relocationMap.find(relocType); VPUX_THROW_UNLESS(relocFunc != relocationMap.end(), "Relocation type {0} not known!", - stringifyRelocationType(relocInfo.relocType)); + stringifyRelocationType(relocType)); // reloc offset at this point is expressed only inside of the operation specific descriptor auto descriptor = targetOp.getDescriptorStorage(); - VPUX_THROW_UNLESS(relocInfo.offset < descriptor.size(), "Offset is outside of descriptor!"); + VPUX_THROW_UNLESS(offset < descriptor.size(), "Offset is outside of descriptor!"); - relocFunc->second(reinterpret_cast(descriptor.begin() + relocInfo.offset), sourceSym.getValue().value(), - relocInfo.addend); + relocFunc->second(reinterpret_cast(descriptor.begin() + offset), sourceSym.getValue().value(), addend); } else { ELF::CreateSymbolTableSectionOp symTab = mlir::dyn_cast(sourceSym->getParentOp()); auto symForReloc = ELF::composeSectionObjectSymRef(symTab, sourceSym.getOperation()); - ELF::CreateRelocationSectionOp relocSection = getRelocationSection(relocInfo.targetSection, symTab); + ELF::CreateRelocationSectionOp relocSection = getRelocationSection(targetSection, symTab); auto relocBuilder = mlir::OpBuilder::atBlockEnd(relocSection.getBlock()); - auto offset = relocInfo.offset; - // here we set the actual offset from the beginning of the final ELF file - if (relocInfo.isOffsetRelative) { + if (isOffsetRelative) { auto baseBinarySizeOp = mlir::cast(op); offset += 
baseBinarySizeOp.getMemoryOffset(); } - relocBuilder.create(relocSection.getLoc(), offset, symForReloc, relocInfo.relocType, - relocInfo.addend, relocInfo.description); + relocBuilder.create(relocSection.getLoc(), offset, symForReloc, relocType, addend, description); } } +void ELF::RelocManager::createRelocations(mlir::Operation* op, ELF::RelocationInfo& relocInfo) { + auto sourceOp = symRefMap_.lookupSymbol(relocInfo.source); + + ELF::SymbolOp sourceSym = getSymbolOfBinOpOrEncapsulatingSection(sourceOp); + createRelocations(op, sourceSym, relocInfo.targetSection, relocInfo.offset, relocInfo.isOffsetRelative, + relocInfo.relocType, relocInfo.addend, relocInfo.description); +} + void ELF::RelocManager::createRelocations(mlir::Operation* op, std::vector& relocInfo) { for (auto& reloc : relocInfo) { createRelocations(op, reloc); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.cpp index 3c456e53b1..803a36f647 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.cpp @@ -4,10 +4,10 @@ // #include "vpux/compiler/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_to_palletization_lut.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" namespace vpux::IE::arch40xx { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp index 58aaba18a5..a5aba3ac52 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU40XX/dialect/IE/impl/d2s_to_transposed_conv_verifier.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp new file mode 100644 index 0000000000..700be9aa8b --- /dev/null +++ b/src/vpux_compiler/src/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.cpp @@ -0,0 +1,42 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/utils/core/numeric.hpp" + +namespace vpux::IE::arch40xx { + +void MapBilinearInterpolateOnDPUStrategy::prepareInterpolate(mlir::ConversionTarget& target, LogCb logCb) const { + target.addDynamicallyLegalOp([this, logCb](IE::InterpolateOp op) { + const auto inputShape = getShape(op.getInput()); + const auto outputShape = getShape(op.getOutput()); + + const auto attr = op.getAttr(); + const auto coordModeAttr = attr.getCoordMode(); + bool isAlignCorners = coordModeAttr.getValue() == IE::InterpolateCoordMode::ALIGN_CORNERS; + + const auto axesValue = parseIntArrayAttr(op.getAxesAttrAttr()); + const bool isIntegerRatioOnly = std::all_of(axesValue.begin(), axesValue.end(), [&](const auto& axis) { + auto outputDim = outputShape[Dim(axis)]; + auto inputDim = inputShape[Dim(axis)]; + + if (isAlignCorners && !isDoubleEqual(axis, 1.0f)) { 
+ outputDim = outputDim == 1 ? 1 : (outputDim - 1); + inputDim = inputDim == 1 ? 1 : (inputDim - 1); + } + + return (outputDim % inputDim == 0) || (inputDim % outputDim == 0); + }); + // SW kernel performance is bigger than DPU decomposition performance for floating scale factors. + if (!isIntegerRatioOnly) { + return true; + } + return isLegalInterpolateOp(op, _interpolateAsSEOpInStrategy, logCb); + }); +} +} // namespace vpux::IE::arch40xx diff --git a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp deleted file mode 100644 index f3291f9e78..0000000000 --- a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// Copyright (C) 2023-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp" -#include "vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" - -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/numeric.hpp" - -#include - -namespace vpux::IE::arch40xx { -#define GEN_PASS_DECL_MAPBILINEARINTERPOLATEONDPUPASS -#define GEN_PASS_DEF_MAPBILINEARINTERPOLATEONDPUPASS -#include "vpux/compiler/NPU40XX/dialect/IE/passes.hpp.inc" -} // namespace vpux::IE::arch40xx - -using namespace vpux; - -namespace { - -// -// MapBilinearInterpolateOnDPUPass -// - -class MapBilinearInterpolateOnDPUPass final : - public IE::arch40xx::impl::MapBilinearInterpolateOnDPUPassBase { -public: - explicit MapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, Logger log) - : _interpolateAsSEOp(interpolateAsSEOp) { - Base::initLogger(log, Base::getArgumentName()); - } - - mlir::LogicalResult initialize(mlir::MLIRContext* ctx) final; - 
-public: - class MapBilinearInterpolateOnDPURewriter; - -private: - void safeRunOnFunc() final; - -private: - bool _interpolateAsSEOp; -}; - -class MapBilinearInterpolateOnDPUPass::MapBilinearInterpolateOnDPURewriter final : - public vpux::IE::MapBilinearInterpolateOnDPUBaseRewriter { -public: - MapBilinearInterpolateOnDPURewriter(mlir::MLIRContext* ctx, Logger log) - : vpux::IE::MapBilinearInterpolateOnDPUBaseRewriter(ctx, log) { - setDebugName("MapBilinearInterpolateOnDPURewriterVPUX40XX"); - } -}; - -mlir::LogicalResult MapBilinearInterpolateOnDPUPass::initialize(mlir::MLIRContext* ctx) { - if (mlir::failed(Base::initialize(ctx))) { - return mlir::failure(); - } - - // When this parameter has a value, it probably comes from LIT test. - // Override the default - if (interpolateAsSEOp.hasValue()) { - _interpolateAsSEOp = interpolateAsSEOp.getValue(); - } - - return mlir::success(); -} - -void MapBilinearInterpolateOnDPUPass::safeRunOnFunc() { - auto& ctx = getContext(); - auto func = getOperation(); - const auto logCb = [&](const formatv_object_base& msg) { - _log.trace("{0}", msg.str()); - }; - - mlir::ConversionTarget target(ctx); - target.addDynamicallyLegalOp([&](IE::InterpolateOp op) { - const auto inputShape = getShape(op.getInput()); - const auto outputShape = getShape(op.getOutput()); - - const auto attr = op.getAttr(); - const auto coordModeAttr = attr.getCoordMode(); - bool isAlignCorners = coordModeAttr.getValue() == IE::InterpolateCoordMode::ALIGN_CORNERS ? true : false; - - const auto axesValue = parseIntArrayAttr(op.getAxesAttrAttr()); - const bool isIntegerRatioOnly = std::all_of(axesValue.begin(), axesValue.end(), [&](const auto& axis) { - auto outputDim = outputShape[Dim(axis)]; - auto inputDim = inputShape[Dim(axis)]; - - if (isAlignCorners && !isDoubleEqual(axis, 1.0f)) { - outputDim = outputDim == 1 ? 1 : (outputDim - 1); - inputDim = inputDim == 1 ? 
1 : (inputDim - 1); - } - - return (outputDim % inputDim == 0) || (inputDim % outputDim == 0); - }); - // SW kernel performance is bigger that DPU decomposition performance for floating scale factors. - if (!isIntegerRatioOnly) { - return true; - } - return isLegalInterpolateOp(op, _interpolateAsSEOp, logCb); - }); - - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - - mlir::RewritePatternSet patterns(&ctx); - patterns.insert(&ctx, _log); - - if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { - signalPassFailure(); - } -} - -} // namespace - -// -// createMapBilinearInterpolateOnDPUPass -// - -std::unique_ptr vpux::IE::arch40xx::createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, - Logger log) { - return std::make_unique(interpolateAsSEOp, log); -} diff --git a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/reduce_num_tiles_for_small_models.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/reduce_num_tiles_for_small_models.cpp index 2d8903d8c8..e0bee0ce94 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/reduce_num_tiles_for_small_models.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/passes/reduce_num_tiles_for_small_models.cpp @@ -5,9 +5,12 @@ #include "vpux/compiler/NPU40XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include diff --git 
a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/pipelines.cpp index e6045fedcf..e4a03ca2f7 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/IE/transforms/pipelines.cpp @@ -24,7 +24,7 @@ void vpux::IE::arch40xx::buildLowPrecisionPipeline(mlir::OpPassManager& pm, cons pm.addPass(IE::createOptimizeUnalignedQDQSeqPass(log)); pm.addPass(IE::createSwapFakeQuantWithReshapeAndStridedSlicePass(log)); - pm.addPass(IE::createSwapConvertWithTransposeReshapePass(log)); + pm.addPass(IE::createSwapConvertWithReshapeKindOpsPass(log)); if (options.enableAlignScales) { pm.addPass(IE::createAlignScalesPass(isOptionEnabled(options.enableSEPtrsOperations), log)); } @@ -40,6 +40,9 @@ void vpux::IE::arch40xx::buildLowPrecisionPipeline(mlir::OpPassManager& pm, cons } pm.addPass(IE::createSplitFakeQuantPass(log)); + if (options.enablePropagateQuantDequant) { + pm.addPass(IE::createPropagateQuantizeDequantizePass(isOptionEnabled(options.enableSEPtrsOperations), log)); + } pm.addPass(IE::createFuseOpWithQuantizePass(log)); pm.addPass(IE::createConvertToDequantizePass(options, log)); if (options.enablePropagateQuantDequant) { @@ -298,8 +301,7 @@ void vpux::IE::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const I } if (options.enableBilinearInterpolateOnDPU) { - pm.addPass(IE::arch40xx::createMapBilinearInterpolateOnDPUPass(isOptionEnabled(options.enableSEPtrsOperations), - log)); + pm.addPass(IE::createMapBilinearInterpolateOnDPUPass(isOptionEnabled(options.enableSEPtrsOperations), log)); } pm.addPass(IE::createConvertBatchedLayerTo1NPass(log)); @@ -369,6 +371,59 @@ void vpux::IE::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const I pm.addPass(IE::createLoadExternalKernelResourcesPass(log)); } +void vpux::IE::arch40xx::buildReferenceSWPipeline(mlir::OpPassManager& pm, + const IE::arch40xx::DefaultHWOptions& options, 
Logger log) { + const auto grc = getDefaultGreedyRewriteConfig(); + + // No passes should be run before this pipeline, with very few exceptions. + IE::buildPostImportPipeline(pm, log); + + // Level 3 : Topology + + IE::arch37xx::buildInitialLowPrecisionTransformationsPipeline(pm, IE::LowPrecisionTransformOptions(options), log); + IE::arch37xx::buildInitialTransformationsPipeline(pm, IE::TransformOptions(options), log); + IE::buildAdjustPrecisionPipeline(pm, IE::AdjustPrecisionOptions(options), log); + + // Resolve group quant MatMul pattern + pm.addPass(IE::createUniquifyOpsPass(log)); + pm.addPass(IE::createMergeParallelFullyConnectedPass(log)); + pm.addPass(IE::createUnrollGroupQuantizePass(log)); + pm.addPass(IE::createUnrollFullyConnectedPass(log)); + pm.addPass(IE::createMergeFullyConnectedPass(log)); + if (options.fuseScalesToAccumulate) { + pm.addPass(IE::createFuseScalesToAccumulatePass(log)); + } + pm.addPass(IE::createConvertMatMulToConvPass(log)); + if (options.enableConvertFCToConv) { + pm.addPass(IE::createConvertFCToConvPass(log)); + } + + pm.addPass(IE::createResolveStridedSlicePass(log)); + pm.addPass(IE::createConvertStridedSlice2ConvPass(log)); + pm.addPass(IE::createConvertNceOpsTo4DPass(log)); + pm.addPass(IE::createConvertShapeTo4DPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + pm.addPass(IE::createConvertToSpatialOpPass(false, isOptionEnabled(options.enableSEPtrsOperations), log)); + pm.addPass(IE::createConvertGRNToNormalizeL2Pass(log)); + pm.addPass(IE::createResolveScatterUpdateByTransposePass(log)); + IE::buildAdjustForVPUPipeline(pm, IE::AdjustForVPUOptions(options), log); + + pm.addPass(IE::createSplitFakeQuantPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + pm.addPass(IE::createDequantizeConstPass(options.runtimeDequantizationLimit, + isOptionEnabled(options.enableRuntimeDequant), log)); + if (options.enableMergeFakeQuant) { + pm.addPass(IE::createMergeFakeQuantPass(log)); + } + 
pm.addPass(mlir::createCanonicalizerPass(grc)); + + IE::arch37xx::buildAdjustLayoutPipeline(pm, IE::AdjustLayoutOptions(options), log); + pm.addPass(IE::createConvertAssignReadValueToReturnsAndInputs(log)); + + pm.addPass(IE::createConvertToMemPermutePass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); +} + // // registerIEPipelines // @@ -450,4 +505,10 @@ void vpux::IE::arch40xx::registerIEPipelines() { [](mlir::OpPassManager& pm, const DynamicShapeTransformOptions& options) { IE::arch37xx::buildDynamicShapeTransformationsPipeline(pm, options); }); + + mlir::PassPipelineRegistration( + "reference-sw-mode-ie", "IE dialect part of Reference SW pipeline", + [](mlir::OpPassManager& pm, const IE::arch40xx::DefaultHWOptions& options) { + IE::arch40xx::buildReferenceSWPipeline(pm, options); + }); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/composers/dma_composer.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/composers/dma_composer.cpp index 41dec77720..956b433b2a 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/composers/dma_composer.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/composers/dma_composer.cpp @@ -37,7 +37,9 @@ uint64_t getTensorMode(mlir::Type type) { void setDMAConversionMode(DMARegister& initValues, mlir::Type inputType, uint64_t srcSize, mlir::Type outputType, uint64_t dstSize) { uint64_t conversionCfg = 0; - if (inputType != outputType) { + auto isQuantizedType = mlir::dyn_cast(inputType) && + mlir::dyn_cast(outputType); + if (inputType != outputType && !isQuantizedType) { if (inputType.isF32() && outputType.isF16()) { conversionCfg = DMA_DATA_CONV_FP32_FP16; } else if (inputType.isF32() && outputType.isBF16()) { @@ -289,7 +291,10 @@ DMARegister compose(VPUASM::NNDMAOp origOp, ELF::SymbolReferenceMap& symRefMap) // DMA only does FP32 -> FP16/BF16 conversions, // Because of this, dstDimSize1 will always be half of the original value - if (inputType.getElementType() != 
outputType.getElementType() && transactionConfig.dstDimSizes[1]) { + auto isQuantizedType = mlir::dyn_cast(inputType) && + mlir::dyn_cast(outputType); + if (inputType.getElementType() != outputType.getElementType() && !isQuantizedType && + transactionConfig.dstDimSizes[1]) { VPUX_THROW_UNLESS(elemInSize == elemOutSize * 2, "Element sizes in conversion are not supported"); long newDstDimSize1 = ((transactionConfig.dstDimSizes[1] + 1) / 2) - 1; descriptor.write(newDstDimSize1); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_invo.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_invo.cpp index 1a5b8b7efd..8d69af3c1d 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_invo.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_invo.cpp @@ -24,14 +24,14 @@ void vpux::NPUReg40XX::ActKernelInvocationOp::serialize(elf::writer::BinaryDataS sizeof(nn_public::VpuActKernelInvocation), actKernInvoDescriptor.size()); auto serializedActKernInvoDesc = actKernInvoDescriptor.getStorage(); - binDataSection.appendData(serializedActKernInvoDesc.data(), getBinarySize(VPU::ArchKind::NPU40XX)); + binDataSection.appendData(serializedActKernInvoDesc.data(), getBinarySize(config::ArchKind::NPU40XX)); } -size_t vpux::NPUReg40XX::ActKernelInvocationOp::getBinarySize(VPU::ArchKind) { +size_t vpux::NPUReg40XX::ActKernelInvocationOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuActKernelInvocation); } -size_t vpux::NPUReg40XX::ActKernelInvocationOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::NPUReg40XX::ActKernelInvocationOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuActKernelInvocation); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_range.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_range.cpp index c68195de2d..99b8402470 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_range.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_kernel_range.cpp @@ -23,14 +23,14 @@ void vpux::NPUReg40XX::ActKernelRangeOp::serialize(elf::writer::BinaryDataSectio sizeof(nn_public::VpuActKernelRange), actKernRangeDescriptor.size()); auto serializedActKernRangeDesc = actKernRangeDescriptor.getStorage(); - binDataSection.appendData(serializedActKernRangeDesc.data(), getBinarySize(VPU::ArchKind::NPU40XX)); + binDataSection.appendData(serializedActKernRangeDesc.data(), getBinarySize(config::ArchKind::NPU40XX)); } -size_t vpux::NPUReg40XX::ActKernelRangeOp::getBinarySize(VPU::ArchKind) { +size_t vpux::NPUReg40XX::ActKernelRangeOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuActKernelRange); } -size_t vpux::NPUReg40XX::ActKernelRangeOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::NPUReg40XX::ActKernelRangeOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuActKernelRange); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_shave_rt.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_shave_rt.cpp index c86c671df6..a7d7a6e5f5 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_shave_rt.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/act_shave_rt.cpp @@ -21,14 +21,14 @@ void vpux::NPUReg40XX::ActShaveRtOp::serialize(elf::writer::BinaryDataSection& bin binDataSection.appendData(serializedDmaDesc.data(), serializedDmaDesc.size()); } -size_t NPUReg40XX::NNDMAOp::getBinarySize(VPU::ArchKind) { +size_t NPUReg40XX::NNDMAOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuDMATask); } -size_t NPUReg40XX::NNDMAOp::getAlignmentRequirements(VPU::ArchKind) { +size_t NPUReg40XX::NNDMAOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuDMATask); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/dpu.cpp 
b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/dpu.cpp index 0bf665e09b..ad6ea7a0a2 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/dpu.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/dpu.cpp @@ -32,7 +32,7 @@ void DPUInvariantOp::serialize(elf::writer::BinaryDataSection& binDataS binDataSection.appendData(serializedInvariantDesc.data(), serializedInvariantDesc.size()); } -size_t DPUInvariantOp::getBinarySize(VPU::ArchKind) { +size_t DPUInvariantOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuDPUInvariant); } @@ -192,7 +192,7 @@ std::vector DPUInvariantOp::getRelocationInfo(ELF::SymbolRe return relocs; } -size_t DPUInvariantOp::getAlignmentRequirements(VPU::ArchKind) { +size_t DPUInvariantOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuDPUInvariant); } @@ -207,7 +207,7 @@ void DPUVariantOp::serialize(elf::writer::BinaryDataSection& binDataSec binDataSection.appendData(serializedVariantDesc.data(), serializedVariantDesc.size()); } -size_t DPUVariantOp::getBinarySize(VPU::ArchKind) { +size_t DPUVariantOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuDPUVariant); } @@ -270,7 +270,7 @@ std::vector DPUVariantOp::getRelocationInfo(ELF::SymbolRefe return relocs; } -size_t DPUVariantOp::getAlignmentRequirements(VPU::ArchKind) { +size_t DPUVariantOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuDPUVariant); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/m2i.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/m2i.cpp index 5f3b03fe4e..3b706f4b31 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/m2i.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/m2i.cpp @@ -67,11 +67,11 @@ void NPUReg40XX::M2IOp::serialize(elf::writer::BinaryDataSection& binDa binDataSection.appendData(serializedM2iDesc.data(), serializedM2iDesc.size()); } -size_t NPUReg40XX::M2IOp::getBinarySize(VPU::ArchKind) { 
+size_t NPUReg40XX::M2IOp::getBinarySize(config::ArchKind) { return sizeof(nn_public::VpuMediaTask); } -size_t NPUReg40XX::M2IOp::getAlignmentRequirements(VPU::ArchKind) { +size_t NPUReg40XX::M2IOp::getAlignmentRequirements(config::ArchKind) { return alignof(nn_public::VpuMediaTask); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/managed_barrier.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/managed_barrier.cpp index 55b43f364e..551b67de7e 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/managed_barrier.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/managed_barrier.cpp @@ -22,14 +22,14 @@ void NPUReg40XX::ManagedBarrierOp::serialize(elf::writer::BinaryDataSection(&MIVersionStruct); - binDataSection.appendData(ptrCharTmp, getBinarySize(VPU::ArchKind::NPU40XX)); + binDataSection.appendData(ptrCharTmp, getBinarySize(config::ArchKind::NPU40XX)); } -size_t vpux::NPUReg40XX::MappedInferenceVersionOp::getBinarySize(VPU::ArchKind) { +size_t vpux::NPUReg40XX::MappedInferenceVersionOp::getBinarySize(config::ArchKind) { return sizeof(MIVersionNote); } -size_t vpux::NPUReg40XX::MappedInferenceVersionOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::NPUReg40XX::MappedInferenceVersionOp::getAlignmentRequirements(config::ArchKind) { return alignof(MIVersionNote); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/nnrt_config.cpp b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/nnrt_config.cpp index ac775dcf2c..35945dbcb9 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/nnrt_config.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/NPUReg40XX/ops/nnrt_config.cpp @@ -27,14 +27,14 @@ void vpux::NPUReg40XX::NNrtConfigOp::serialize(elf::writer::BinaryDataSection attr) { auto kernelTextOp = getOpFrom(_symRefMap, attr); - return kernelTextOp ? kernelTextOp.getBinarySize(VPU::ArchKind::NPU40XX) : 0; + return kernelTextOp ? 
kernelTextOp.getBinarySize(config::ArchKind::NPU40XX) : 0; } llvm::StringRef vpux::NPUReg40XX::getKernelPath(vpux::ELF::SymbolReferenceMap& _symRefMap, diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/convert_op_to_dma_for_performant_execution_strategy.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/convert_op_to_dma_for_performant_execution_strategy.cpp index 69c63d19ae..b9cc6baa34 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/convert_op_to_dma_for_performant_execution_strategy.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/convert_op_to_dma_for_performant_execution_strategy.cpp @@ -94,9 +94,9 @@ mlir::LogicalResult MovetoDMAGather::matchAndRewrite(VPU::GatherOp origOp, mlir: auto convertIndicesOp = rewriter.createOrFold(origOp->getLoc(), reshapeIndicesOp, mlir::TypeAttr::get(requiredType64)); - auto gatherDMAOp = - rewriter.create(origOp.getLoc(), origOp.getInput(), convertIndicesOp, origOp.getAxis(), - origOp.getAxisValueAttr(), origOp.getBatchDims()); + auto gatherDMAOp = rewriter.create(origOp.getLoc(), origOp.getInput(), convertIndicesOp, + origOp.getAxis(), origOp.getAxisValueAttr(), + origOp.getBatchDims(), /*multiClusterStrategy*/ nullptr); auto reshapeOutOp = reshapeOperand(gatherDMAOp.getOutput(), outputType.getShape(), takeOpLoc(origOp, "reshape_output")); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/shave_kernel_info.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/shave_kernel_info.cpp index 852cfa9625..4f138bf74d 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/shave_kernel_info.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/shave_kernel_info.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU40XX/dialect/VPU/impl/shave_kernel_info.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/sparsity_constraint.cpp 
b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/sparsity_constraint.cpp index 6302965091..6685c4d63d 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/sparsity_constraint.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/impl/sparsity_constraint.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU40XX/dialect/VPU/impl/sparsity_constraint.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" using namespace vpux::VPU::arch40xx; diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp index c8791d7928..2797202e8d 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp @@ -3,7 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp" +#include "vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_viewlike_interfaces.hpp" void vpux::VPU::arch40xx::registerSCFTilingOpsInterfaces(mlir::DialectRegistry& registry) { registry.addExtension(+[](mlir::MLIRContext* ctx, VPU::VPUDialect*) { @@ -12,5 +14,7 @@ void vpux::VPU::arch40xx::registerSCFTilingOpsInterfaces(mlir::DialectRegistry& VPU::NCEMaxPoolOp::attachInterface>(*ctx); VPU::NCEConvolutionOp::attachInterface(*ctx); VPU::NCEDepthConvolutionOp::attachInterface(*ctx); + + VPU::LayoutCastOp::attachInterface(*ctx); }); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp index a82f68f4af..f4ae7fdee8 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/passes/correct_nce_workloads.cpp @@ -14,6 +14,7 
@@ #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU::arch40xx { #define GEN_PASS_DECL_CORRECTNCEWORKLOADS @@ -156,7 +157,7 @@ void CorrectNCEWorkloadsPass::safeRunOnFunc() { auto func = getOperation(); WorkloadSplitter40XX splitter(func, _log); - const auto arch = getArch(func); + const auto arch = config::getArch(func); auto sparsityConstraint = VPU::getSparsityConstraint(arch); splitter.correctInvalidWorkload(sparsityConstraint); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/pipelines.cpp index 1ce45ff763..176184c200 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/transforms/pipelines.cpp @@ -30,12 +30,14 @@ void vpux::VPU::arch40xx::buildIncrementalPipeline(mlir::OpPassManager& pm, cons pm.addPass(VPU::createTileLSTMSequencePass(log)); pm.addPass(VPU::createEnsureNCEOpsSizeRequirementsPass(true, log)); - pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, log)); + pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, + /*disablePassOnEntryFunctionForHostCompile=*/false, log)); VPU::buildTilingPipeline(pm, VPU::TilingOptions(options), log); if (options.enableScfComputeOpsOutlining) { pm.addPass(VPU::createScfComputeOpsOutliningPass(log)); + pm.addPass(VPU::createConvertDynamicToStaticKernelsPass(log)); } pm.addPass(VPU::createBoundedTensorsToDynamicDimsMaskPass(log)); @@ -56,6 +58,40 @@ void vpux::VPU::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, const VPU::arch40xx::DefaultHWOptions& options, Logger log) { const auto grc = getDefaultGreedyRewriteConfig(); + /* + Memory reservation for CMX has to happen as early in VPU as 
possible. It is required because memory reservation + decreases usable CMX size which can result in different tiling decisions. If different passes see different + effective CMX size different failures which can be hard to diagnose can happen. Examples of such failures + include: + - Fail during compilation if additional memory was reserved after tiling but before scheduling since tiles + selected by tiling pipeline won't fit CMX anymore + - Memory corruption if additional memory is reserved after scheduler since additional memory will overlap + addresses allocated by the scheduler Currently there is no validation if memory is not reserved before the first + call to getTotalCMXSize. + */ + if (options.enableCompressActivationSpill) { + pm.addPass(VPU::createCompressDmaReserveMemPass(log)); + } + + // Unconditional on NPU40xx due to DMA HWP scratch range requirement + pm.addPass(VPU::createDMATaskProfilingReserveMemPass( + options.enableProfiling ? options.enableDMAProfiling.getValue() : "false", log)); + + /* + Call this pass after all other memory reservation has already been done. This pass checks if there is 1KiB + of reserved memory at the end of CMX and extends it if some is missing. So to not waste CMX memory make sure + as much as possible is allocated in that 1KiB region. Exception to this rule is memory reserved for SW kernel IO + for such memory make sure to reserve it after this pass to allow data prefetching. + */ + pm.addPass(VPU::createSWKernelDataPrefetchReserveMemPass(log)); + + // Make sure to run this after SWKernelDataPrefetchReserveMem which ensures we have enough + // memory at the end of CMX to allow SW kernel data prefetch. + // LNL Shave Kernel prefetch with profiling fails compiling. 
Track Number: E#169656 + if (options.enableSWKernelInstructionPrefetch && !(options.enableProfiling && options.enableSWProfiling)) { + pm.addPass(VPU::createSWKernelInstructionPrefetchReserveMemForDummyKernelsPass(log)); + } + // TODO: E#140041 enable profiling with outlining if (options.enableConcatRepeatingBlockOutlining && !options.enableProfiling) { pm.addPass(VPU::createConcatRepeatingBlocksOutliningPass(options.concatRepeatingBlockOutliningSeqLength, log)); @@ -84,8 +120,8 @@ void vpux::VPU::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPU::createFuseClampPass(log)); pm.addPass(VPU::createEnsureNCEOpsSizeRequirementsPass(options.enableOutputEnsurance, log)); - pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, log)); - + pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, + /*disablePassOnEntryFunctionForHostCompile=*/false, log)); if (options.enableWeightsSparsity) { VPU::buildWeightsSparsityPipeline(pm, VPU::WeightsSparsityOptions(options), log); } @@ -113,7 +149,8 @@ void vpux::VPU::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPU::createAdjustMemorySpacePass(log)); pm.addPass(VPU::createOptimizeSharedInputCopyForConcatPass(log)); - pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, log)); + pm.addPass(VPU::createOptimizeConcatPass(/*optimizeOnlyOuterConcat*/ false, + options.disablePassOnEntryFunctionForHostCompile, log)); pm.addPass(mlir::createCanonicalizerPass(grc)); pm.addPass(VPU::createCMXConcatPass(log)); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp index 0e114cb603..967eac4212 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/interfaces/aligned_workload_channels_ops.cpp @@ -8,6 +8,7 
@@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/interfaces/workload_splitter_base.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -21,7 +22,7 @@ class AlignedWorkloadChannelsOpModel final : SmallVector getSupportedWorkLoadChannels(mlir::Operation* nceOp) const { auto func = nceOp->getParentOfType(); auto log = Logger::global(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); auto sparsityConstraint = VPU::getSparsityConstraint(arch); VPU::WorkloadSplitter40XX splitter(func, log); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_placeholder_fetch_dmas.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_placeholder_fetch_dmas.cpp new file mode 100644 index 0000000000..d8d3ab4c47 --- /dev/null +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_placeholder_fetch_dmas.cpp @@ -0,0 +1,331 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/core/barrier_info.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/wlm_legalization_utils.hpp" + +namespace vpux::VPUIP::arch40xx { +#define GEN_PASS_DECL_ADDPLACEHOLDERFETCHDMAS +#define GEN_PASS_DEF_ADDPLACEHOLDERFETCHDMAS +#include "vpux/compiler/NPU40XX/dialect/VPUIP/passes.hpp.inc" +} // namespace vpux::VPUIP::arch40xx + +using namespace vpux; +namespace { + +// +// AddPlaceholderFetchDMAsPass +// + +using BlockRange = SmallVector>; + +struct FetchDMAData { + size_t insertionPoint; + SmallVector consumes; + SmallVector producesIn; + VPUIP::FetchDMAAttr fetchDmaAttr; +}; + +class AddPlaceholderFetchDMAsPass final : + public VPUIP::arch40xx::impl::AddPlaceholderFetchDMAsBase { +public: + explicit AddPlaceholderFetchDMAsPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; + void planFetchDMAAndBarriersInsertionPerQueue(BlockRange& blockRange, ExecutionGroupList& executionGroup, + BarrierInfo& barrierInfo); + void realizePlannedInsertions(mlir::OpBuilder& builder, BarrierInfo& barrierInfo); + VPUIP::FetchDMAAttr getFetchDMAAttr(int64_t groupIdx, BarrierInfo& barrierInfo, size_t taskIndex); + +private: + // Will be initialized in safeRunOnFunc() or relevant function, this is done to suppress the UNINIT_CTOR + // warning + size_t _numAllTaskOps = 0; + size_t _newBarrierIndex = 0; + mlir::Operation* _bufferInsertionPoint = nullptr; + mlir::Operation* _barrierInsertionPoint = nullptr; + VPURT::TaskOp _firstTaskOp; + + SmallVector 
_fetchDMAsToInsert; + llvm::DenseMap, SmallVector>> _barrierAddConsumerProducerMap; + + SmallVector _dummyBarriers; + SmallVector _fetchDMAs; +}; + +// Function to get tile index for DPU/SHV Op +size_t getTileIndexForDpuOrShv(BarrierInfo& barrierInfo, size_t taskIdx) { + auto taskOp = barrierInfo.getTaskOpAtIndex(taskIdx); + if (auto dmaOp = taskOp.getInnerTaskOpOfType()) { + VPUX_THROW("getTileIndexForDpuOrShv called for DMAOp {0}", taskOp); + } + + if (auto swOp = taskOp.getInnerTaskOpOfType()) { + return swOp.getTileIndex().value_or(0); + } + + auto taskQueueType = barrierInfo.getTaskQueueType(taskIdx); + return taskQueueType.id; +} + +// Function to get list index for DPU/SHV Op +size_t getListIndexForDpuOrShv(BarrierInfo& barrierInfo, size_t taskIdx) { + auto taskOp = barrierInfo.getTaskOpAtIndex(taskIdx); + if (auto dmaOp = taskOp.getInnerTaskOpOfType()) { + VPUX_THROW("getListIndexForDpuOrShv called for DMAOp {0}", taskOp); + } + + if (auto swOp = taskOp.getInnerTaskOpOfType()) { + return swOp.getListIndex().value_or(0); + } + + // All DPU tasks are expected to be on list 0 + return 0; +} + +VPURT::TaskOp createFetchDma(mlir::OpBuilder& builder, mlir::Value inputBuf, mlir::Value outputBuf, + BarrierInfo& barrierInfo, VPUIP::FetchDMAAttr fetchDMAData) { + auto newDMA = createFetchDMA(builder, inputBuf, outputBuf, 0, {}, {}, fetchDMAData); + barrierInfo.addNewTaskOp(newDMA); + return newDMA; +} + +void finalizeBarrierInfo(BarrierInfo& barrierInfo, mlir::func::FuncOp netFunc, Logger& log) { + // Update IR, verify schedules + VPURT::orderExecutionTasksAndBarriers(netFunc, barrierInfo, log); + VPUX_THROW_UNLESS(barrierInfo.verifyControlGraphSplit(), "Encountered split of control graph is incorrect"); + barrierInfo.clearAttributes(); + VPURT::postProcessBarrierOps(netFunc); +} + +// Once we know the insertion point of DMAs this function creates actual DMAs in IR while also keeps a map of [index, +// DMAOp This map is used later to refer to real DMAOp and get real 
task-index from barrierInfo +void AddPlaceholderFetchDMAsPass::realizePlannedInsertions(mlir::OpBuilder& builder, BarrierInfo& barrierInfo) { + auto inBuffer = VPUIP::createDummyBuffer(builder, _bufferInsertionPoint); + auto outBuffer = VPUIP::createDummyBuffer(builder, _bufferInsertionPoint); + + // Create as many dummy barriers as were indexed during scheduling + for ([[maybe_unused]] size_t unused = 0; unused < _newBarrierIndex; ++unused) { + auto newBarrierOp = createNewBarrier(builder, barrierInfo, _barrierInsertionPoint, nullptr, nullptr); + _dummyBarriers.push_back(newBarrierOp); + } + + for (const auto& [dummyDmaIndex, value] : _fetchDMAsToInsert | indexed) { + auto insertionPointOp = barrierInfo.getTaskOpAtIndex(value.insertionPoint); + // Ensure fetch DMAs for first 2 groups are always first in the list + if (value.fetchDmaAttr.getExecGroupIdx().getValue().getSExtValue() < 2) { + builder.setInsertionPoint(insertionPointOp); + } else { + builder.setInsertionPointAfter(insertionPointOp); + } + auto dummyDMA = createFetchDma(builder, inBuffer, outBuffer, barrierInfo, value.fetchDmaAttr); + _fetchDMAs.push_back(dummyDMA); + } + + // We have created the new DMAs and barriers, adjust dependencies + for (const auto& [dummyDmaIndex, value] : _fetchDMAsToInsert | indexed) { + SmallVector realProducesIn; + SmallVector realConsumes; + for (auto produce : value.producesIn) { + auto realBarrierIdx = getIndexOfBarrier(produce, _dummyBarriers, barrierInfo); + realProducesIn.push_back(realBarrierIdx); + } + for (auto consume : value.consumes) { + auto realBarrierIdx = getIndexOfBarrier(consume, _dummyBarriers, barrierInfo); + realConsumes.push_back(realBarrierIdx); + } + updateBarriersForDma(realConsumes, realProducesIn, _fetchDMAs[dummyDmaIndex], barrierInfo); + } + + for (const auto& [indexType, value] : _barrierAddConsumerProducerMap) { + auto realBarrierIdx = getIndexOfBarrier(indexType, _dummyBarriers, barrierInfo); + for (auto consumer : value.first) { + auto 
realTaskIdx = getIndexOfTask(consumer, _fetchDMAs, barrierInfo); + barrierInfo.addConsumer(realBarrierIdx, realTaskIdx); + } + for (auto producer : value.second) { + auto realTaskIdx = getIndexOfTask(producer, _fetchDMAs, barrierInfo); + barrierInfo.addProducer(realBarrierIdx, realTaskIdx); + } + } +} + +VPUIP::FetchDMAAttr AddPlaceholderFetchDMAsPass::getFetchDMAAttr(int64_t groupIdx, BarrierInfo& barrierInfo, + size_t taskIndex) { + auto ctx = &(getContext()); + auto taskQueueType = barrierInfo.getTaskQueueType(taskIndex); + auto executorKindAttr = VPU::ExecutorKindAttr::get(ctx, taskQueueType.type); + auto tileIdxAttr = mlir::IntegerAttr::get(getInt64Type(ctx), getTileIndexForDpuOrShv(barrierInfo, taskIndex)); + auto listIdxAttr = mlir::IntegerAttr::get(getInt64Type(ctx), getListIndexForDpuOrShv(barrierInfo, taskIndex)); + auto groupIdxAttr = mlir::IntegerAttr::get(getInt64Type(ctx), groupIdx); + return VPUIP::FetchDMAAttr::get(ctx, executorKindAttr, tileIdxAttr, listIdxAttr, groupIdxAttr); +} + +// Insert barrier and FetchDMA for each group +// LastOfGrandParent --> NewBarrier1 --> FetchDMA --> NewBarrier2 --> LastOfParent/SyncTask +void AddPlaceholderFetchDMAsPass::planFetchDMAAndBarriersInsertionPerQueue(BlockRange& blockRange, + ExecutionGroupList& executionGroup, + BarrierInfo& barrierInfo) { + // Always populate for whatever is available for first 2 groups + for (size_t groupIdx = 0; groupIdx < std::min(2, executionGroup.size()); ++groupIdx) { + FetchDMAData fetchDMAData; + auto insertionIndex = barrierInfo.getIndex(_firstTaskOp); + fetchDMAData.insertionPoint = insertionIndex; + fetchDMAData.fetchDmaAttr = getFetchDMAAttr(groupIdx, barrierInfo, executionGroup[groupIdx].front()); + _fetchDMAsToInsert.push_back(fetchDMAData); + } + + // If less than 3 groups, skip the rest of the logic that depends on both being present + if (executionGroup.size() < 3) { + return; + } + + size_t groupIdx = 2; + auto grandParentGroup = executionGroup.front(); + auto 
parentGroup = executionGroup[1]; + auto travelingGroup = executionGroup[groupIdx]; + while (groupIdx < executionGroup.size()) { + auto lastTaskGrandParentGroup = grandParentGroup[grandParentGroup.size() - 1]; + auto lastTaskParentGroup = parentGroup.back(); + auto dummyBarrierTwoConsumer = lastTaskParentGroup; + + FetchDMAData fetchDMAData; + // If both tasks are in different blocks, we may need a sync task as consumer for PlaceholderFetchDMA + if (!inSameTaskBlock(lastTaskParentGroup, lastTaskGrandParentGroup, blockRange)) { + auto syncPoint = barrierInfo.getControlGraphSyncPoint(lastTaskGrandParentGroup); + // lastTaskGrandParentGroup is NOT the sync task — safe to use sync as consumer + if (lastTaskGrandParentGroup != syncPoint.value()) { + dummyBarrierTwoConsumer = syncPoint.value(); + } else { + // lastTaskGrandParentGroup IS the sync task + auto blockInd1 = barrierInfo.getControlGraphBlockIndex(lastTaskGrandParentGroup); + auto blockInd2 = barrierInfo.getControlGraphBlockIndex(lastTaskParentGroup); + + if (blockInd1 + 1 == blockInd2) { + // Tasks are in consecutive blocks — use parent directly as consumer + dummyBarrierTwoConsumer = lastTaskParentGroup; + } else { + // Need next block's sync point as consumer + auto nextSync = barrierInfo.getNextBlockSyncPoint(lastTaskGrandParentGroup); + VPUX_THROW_UNLESS(nextSync.has_value(), "No next block sync point found for FetchDMA consumer"); + dummyBarrierTwoConsumer = nextSync.value(); + } + } + } + + // 1. LastOfGrandParent → B1 + size_t barOneDummyIdx = _newBarrierIndex++; + auto& producersToAdd = _barrierAddConsumerProducerMap[{barOneDummyIdx, Type::Dummy}].second; + producersToAdd.push_back({lastTaskGrandParentGroup, Type::Real}); + + // 2. B1 → FetchDMA → B2 + fetchDMAData.insertionPoint = lastTaskGrandParentGroup; + fetchDMAData.consumes = {{barOneDummyIdx, Type::Dummy}}; + + size_t barTwoDummyIdx = _newBarrierIndex++; + fetchDMAData.producesIn = {{barTwoDummyIdx, Type::Dummy}}; + + // 3. 
B2 → LastOfParent (or syncTask) + auto& consumersToAdd = _barrierAddConsumerProducerMap[{barTwoDummyIdx, Type::Dummy}].first; + consumersToAdd.push_back({dummyBarrierTwoConsumer, Type::Real}); + + fetchDMAData.fetchDmaAttr = getFetchDMAAttr(groupIdx, barrierInfo, travelingGroup.front()); + _fetchDMAsToInsert.push_back(fetchDMAData); + + grandParentGroup = parentGroup; + parentGroup = travelingGroup; + + ++groupIdx; + if (groupIdx < executionGroup.size()) { + travelingGroup = executionGroup[groupIdx]; + } + } +} + +void AddPlaceholderFetchDMAsPass::safeRunOnFunc() { + auto netFunc = getOperation(); + mlir::OpBuilder builder(netFunc); + + // Identify existing position of DeclareBufferOp, will be used as insertion point + // for new tasks that will be inserted in IR + auto bufferOps = netFunc.getOps(); + _bufferInsertionPoint = !bufferOps.empty() ? *bufferOps.begin() : &netFunc.getBody().front().front(); + + auto barrierOps = netFunc.getOps(); + _barrierInsertionPoint = !barrierOps.empty() ? *barrierOps.begin() : &netFunc.getBody().front().front(); + + auto& barrierInfo = getAnalysis(); + _numAllTaskOps = barrierInfo.getNumOfTasks(); + + // Get the blockRanges to check we don't add deps between blocks + BlockRange blockRange; + for (size_t blockIdx = 0; blockIdx < barrierInfo.getControlGraphBlockCount(); ++blockIdx) { + auto [blockStartInd, blockEndInd] = barrierInfo.getControlGraphBlockTaskRange( + blockIdx, /* blockStartSyncPoint */ false, /* blockEndSyncPoint */ true); + blockRange.push_back({blockStartInd, blockEndInd}); + } + + // Build task queue type map for all queues in order to test paths between tasks on different FIFOs. 
+ barrierInfo.initializeTaskQueueTypeMap( + {VPU::ExecutorKind::DMA_NN, VPU::ExecutorKind::DPU, VPU::ExecutorKind::SHAVE_ACT}); + barrierInfo.buildTaskQueueTypeMap(); + + // Will have a map for each cluster along with task index of the task + auto taskQueues = VPURT::getTaskOpQueues(netFunc, barrierInfo); + + VPURT::TaskQueueType fetchDmaQueueType; + fetchDmaQueueType.type = VPU::ExecutorKind::DMA_NN; + fetchDmaQueueType.id = getDMAQueueIdEncoding(/*port*/ 0, VPUIP::DmaChannelType::DDR); + // _firstTaskOp is used as insertion point for FetchDMAs for initial 2 execution groups + // If we have any DMAs on supported port and channel then FetchDMAs must be placed before them + // If we don't have any DMAs on suported port and channel we can just place FetchDMA before first TaskOp + auto taskOps = netFunc.getOps(); + VPUX_THROW_WHEN(taskOps.empty(), "Can not find TaskOp"); + + _firstTaskOp = *taskOps.begin(); + if (!taskQueues[fetchDmaQueueType].empty()) { + _firstTaskOp = barrierInfo.getTaskOpAtIndex(taskQueues[fetchDmaQueueType].front()); + } + + auto& execGroupAnalysis = getAnalysis(); + execGroupAnalysis.logExecutionGroupTasks(_log); + auto dpuGroups = execGroupAnalysis.getDPUExecutionGroups(); + auto swGroups = execGroupAnalysis.getActShvExecutionGroups(); + + for (auto& [_, executionGroups] : dpuGroups) { + planFetchDMAAndBarriersInsertionPerQueue(blockRange, executionGroups, barrierInfo); + } + for (auto& [_, executionGroups] : swGroups) { + planFetchDMAAndBarriersInsertionPerQueue(blockRange, executionGroups, barrierInfo); + } + + realizePlannedInsertions(builder, barrierInfo); + finalizeBarrierInfo(barrierInfo, netFunc, _log); + + // Log the number of inserted FetchDMAs + if (!_fetchDMAsToInsert.empty()) { + _log.info("Inserted '{0}' FetchDMAs", _fetchDMAsToInsert.size()); + } +} + +} // namespace + +// +// createAddPlaceholderFetchDMAsPass +// + +std::unique_ptr vpux::VPUIP::arch40xx::createAddPlaceholderFetchDMAsPass(Logger log) { + return 
std::make_unique(log); +} diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_start_barrier.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_start_barrier.cpp index e80340417d..addb674cd7 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_start_barrier.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/add_start_barrier.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/dma.hpp" #include "vpux/compiler/utils/logging.hpp" @@ -22,8 +23,20 @@ using namespace vpux; namespace { +std::optional getFirstNonFetchDMAIdx(ArrayRef dmaTaskIndices, const BarrierInfo& barrierInfo) { + for (auto dmaIdx : dmaTaskIndices) { + auto taskOp = barrierInfo.getTaskOpAtIndex(dmaIdx); + auto innerOp = taskOp.getInnerTaskOp(); + + if (!mlir::isa(innerOp)) { + return dmaIdx; + } + } + return std::nullopt; +} + std::pair getFirstDmaAndStartBarrierCandidate( - BarrierInfo& barrierInfo, VPURT::TaskOpQueues& taskQueueTypeMap, bool compilerBarrierProgramming, Logger log) { + BarrierInfo& barrierInfo, VPURT::TaskOpQueues& taskQueueTypeMap, Logger log) { VPURT::TaskOp firstDmaOp; VPURT::DeclareVirtualBarrierOp startBarrierCandidateOp; @@ -51,11 +64,8 @@ std::pair getFirstDmaAndStartBarr // Find also first DMA on that FIFO that updates a barrier. // This DMA is the candidate to produce a start barrier - auto firstP0ChDdrDmaIdxIt = std::begin(taskQueueTypeMap[dmaP0ChDddrQueueType]); - - std::optional firstP0ChDdrDmaIdx = (firstP0ChDdrDmaIdxIt != taskQueueTypeMap[dmaP0ChDddrQueueType].end() - ? 
std::make_optional(*firstP0ChDdrDmaIdxIt) - : std::nullopt); + const auto& dmaTaskIndices = taskQueueTypeMap[dmaP0ChDddrQueueType]; + auto firstP0ChDdrDmaIdx = getFirstNonFetchDMAIdx(dmaTaskIndices, barrierInfo); // If no DMA was found then it needs to be created, do early return if (!firstP0ChDdrDmaIdx.has_value()) { @@ -164,28 +174,26 @@ std::pair getFirstDmaAndStartBarr startBarrierCandidatesVec.end()); } - if (compilerBarrierProgramming) { - // 4. Remove candidates which are produced by any other tasks than firstDMA. Special case which needed only for - // compiler barrier programming for avoid race condition between DMA tasks - startBarrierCandidatesVec.erase( - llvm::remove_if(startBarrierCandidatesVec, - [&](size_t barrierIdx) { - auto barrierUpdatedNonFirstDmaTask = false; - for (auto barrierProducerIdx : barrierInfo.getBarrierProducers(barrierIdx)) { - auto barrierProducerOp = barrierInfo.getTaskOpAtIndex(barrierProducerIdx); - if (barrierProducerOp != firstDmaOp) { - barrierUpdatedNonFirstDmaTask = true; - break; - } + // 4. Remove candidates which are produced by any other tasks than firstDMA. 
Special case which needed only for + // compiler barrier programming for avoid race condition between DMA tasks + startBarrierCandidatesVec.erase( + llvm::remove_if(startBarrierCandidatesVec, + [&](size_t barrierIdx) { + auto barrierUpdatedNonFirstDmaTask = false; + for (auto barrierProducerIdx : barrierInfo.getBarrierProducers(barrierIdx)) { + auto barrierProducerOp = barrierInfo.getTaskOpAtIndex(barrierProducerIdx); + if (barrierProducerOp != firstDmaOp) { + barrierUpdatedNonFirstDmaTask = true; + break; } - return barrierUpdatedNonFirstDmaTask; - }), - startBarrierCandidatesVec.end()); + } + return barrierUpdatedNonFirstDmaTask; + }), + startBarrierCandidatesVec.end()); - if (startBarrierCandidatesVec.empty()) { - log.trace("No start barrier candidates left"); - return std::make_pair(firstDmaOp, nullptr); - } + if (startBarrierCandidatesVec.empty()) { + log.trace("No start barrier candidates left"); + return std::make_pair(firstDmaOp, nullptr); } // No candidates left, return @@ -227,7 +235,7 @@ void addExplicitDependencyBetweenDmaListsAndStartBarrier(mlir::func::FuncOp func VPURT::DeclareVirtualBarrierOp startBarrierOp, Logger log) { const auto module = func->getParentOfType(); const auto dmaPortNum = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); - auto dmaChannels = getDMAChannelsWithIndependentLinkAgents(VPU::getArch(module)); + auto dmaChannels = getDMAChannelsWithIndependentLinkAgents(config::getArch(module)); for (auto dmaPortIdx : irange(dmaPortNum)) { for (auto dmaChannel : dmaChannels) { // We skip queue P0 CH:DDR because is queue where we have DMA that's responsible for handling start barrier. 
@@ -256,36 +264,21 @@ void addExplicitDependencyBetweenDmaListsAndStartBarrier(mlir::func::FuncOp func class AddStartBarrierPass final : public VPUIP::arch40xx::impl::AddStartBarrierBase { public: - explicit AddStartBarrierPass(bool compilerBarrierProgramming, Logger log) - : _compilerBarrierProgramming(compilerBarrierProgramming) { + explicit AddStartBarrierPass(Logger log) { Base::initLogger(log, Base::getArgumentName()); } - mlir::LogicalResult initialize(mlir::MLIRContext* ctx) final; - private: - bool _compilerBarrierProgramming; void safeRunOnFunc() final; }; -mlir::LogicalResult AddStartBarrierPass::initialize(mlir::MLIRContext* ctx) { - if (mlir::failed(Base::initialize(ctx))) { - return mlir::failure(); - } - if (!enableCompilerBarrierProgramming.hasValue()) { - return mlir::success(); - } - _compilerBarrierProgramming = enableCompilerBarrierProgramming.getValue(); - return mlir::success(); -} - void AddStartBarrierPass::safeRunOnFunc() { auto func = getOperation(); auto& barrierInfo = getAnalysis(); barrierInfo.buildTaskQueueTypeMap(); auto taskQueueTypeMap = VPURT::getTaskOpQueues(func, barrierInfo); auto [firstDmaOp, startBarrierCandidateOp] = - getFirstDmaAndStartBarrierCandidate(barrierInfo, taskQueueTypeMap, _compilerBarrierProgramming, _log); + getFirstDmaAndStartBarrierCandidate(barrierInfo, taskQueueTypeMap, _log); if (startBarrierCandidateOp == nullptr) { auto insertPoint = &func.getBody().front().front(); @@ -321,21 +314,14 @@ void AddStartBarrierPass::safeRunOnFunc() { auto loc = mlir::NameLoc::get(mlir::StringAttr::get(&getContext(), "start_barrier")); startBarrierCandidateOp->setLoc(loc); startBarrierCandidateOp.setIsStartBarrier(true); - if (_compilerBarrierProgramming) { - addExplicitDependencyBetweenDmaListsAndStartBarrier(func, barrierInfo, taskQueueTypeMap, - startBarrierCandidateOp, _log); - } + addExplicitDependencyBetweenDmaListsAndStartBarrier(func, barrierInfo, taskQueueTypeMap, startBarrierCandidateOp, + _log); 
barrierInfo.clearAttributes(); VPURT::verifyBarrierSlots(func, _log); } } // namespace -// -// createAddStartBarrierPass -// - -std::unique_ptr vpux::VPUIP::arch40xx::createAddStartBarrierPass(bool compilerBarrierProgramming, - Logger log) { - return std::make_unique(compilerBarrierProgramming, log); +std::unique_ptr vpux::VPUIP::arch40xx::createAddStartBarrierPass(Logger log) { + return std::make_unique(log); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compress_spill_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compress_spill_dma.cpp index e3a9034435..41a57ed9de 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compress_spill_dma.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compress_spill_dma.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compute_halo_region_for_dpu_task_op.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compute_halo_region_for_dpu_task_op.cpp index 119d9ce1e6..77630aa09b 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compute_halo_region_for_dpu_task_op.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/compute_halo_region_for_dpu_task_op.cpp @@ -8,8 +8,10 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" 
+#include "vpux/utils/core/dense_map.hpp" namespace vpux::VPUIP::arch40xx { #define GEN_PASS_DECL_COMPUTEHALOREGIONFORDPUTASKOP diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/constant_dpu_prof_hwp_base.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/constant_dpu_prof_hwp_base.cpp index be4d0ae231..020b4a3216 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/constant_dpu_prof_hwp_base.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/constant_dpu_prof_hwp_base.cpp @@ -66,10 +66,10 @@ void ConstantDpuProfHwpBasePass::safeRunOnFunc() { } profDeclBuff.setByteOffsetAttr(vpux::getIntAttr(ctx, 0)); - // Workaround: The dummy DPU task injected by add-dummy-dpu-task-for-sprlut will get the same workload_id - // as the next DPU task in the same NCEClusterTask. This is intentional and makes the dummy task write its - // profiling data in the buffer that's been allocated for the next task, with the next (non-dummy) DPU task - // overwriting the dummy task's profiling data. + // Workaround: The dummy DPU task injected by add-dummy-dpu-task-for-metadata-prefetch will get the same + // workload_id as the next DPU task in the same NCEClusterTask. This is intentional and makes the dummy task + // write its profiling data in the buffer that's been allocated for the next task, with the next (non-dummy) DPU + // task overwriting the dummy task's profiling data. 
// TODO: E#160727 nceClusterTaskOp.walk([&](VPUIP::DPUTaskOp dpuTaskOp) { VPUX_THROW_UNLESS(dpuTaskOp.getWorkloadId().has_value(), diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/detect_dma_split_candidate.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/detect_dma_split_candidate.cpp index afc374a25f..0a22ab6a0b 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/detect_dma_split_candidate.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/detect_dma_split_candidate.cpp @@ -7,12 +7,11 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/inference_execution_simulator.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/core/error.hpp" @@ -186,7 +185,7 @@ void sortTasksAndCreateTaskMap(VPURT::TaskConfigVec& allTasks, QueueIDToTaskConf } } -size_t calculateSplitDMACost(VPUIP::NNDMAOp dmaOp, VPU::ArchKind arch, +size_t calculateSplitDMACost(VPUIP::NNDMAOp dmaOp, config::ArchKind arch, const std::shared_ptr& costModel) { size_t static constexpr COST_MAX = std::numeric_limits::max(); @@ -214,7 +213,7 @@ size_t calculateSplitDMACost(VPUIP::NNDMAOp dmaOp, VPU::ArchKind arch, const auto nnTensor = VPUNN::VPUTensor({checked_cast(Shape(splitShape).totalSize()), 1, 1, 1}, nnType.value()); - return costModel->DMA(getVPUDeviceType(arch), {nnTensor}, {nnTensor}, getMemoryLocation(inputType), + return costModel->DMA(VPU::getVPUDeviceType(arch), 
{nnTensor}, {nnTensor}, getMemoryLocation(inputType), getMemoryLocation(outputType)); } @@ -260,7 +259,7 @@ class DetectDMASplitCandidate final : void DetectDMASplitCandidate::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeCostModelAnalysis = getCachedParentAnalysis(module); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_out_of_order_optimization.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_out_of_order_optimization.cpp index 7e056b6c20..f73204a6e5 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_out_of_order_optimization.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_out_of_order_optimization.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_profiling_hw_ddr.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_profiling_hw_ddr.cpp index 9accd32b1f..b5fb0ce47b 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_profiling_hw_ddr.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/dma_profiling_hw_ddr.cpp @@ -7,7 +7,10 @@ #include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include 
"vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/dma.hpp" #include "vpux/compiler/utils/logging.hpp" @@ -78,8 +81,8 @@ class LastDMAQueueTracker : public DMAQueueTracker { class DMATaskProfilingHwDdrPass final : public VPUIP::arch40xx::impl::DMATaskProfilingHwDdrBase { public: - explicit DMATaskProfilingHwDdrPass(DMAProfilingMode dmaProfilingMode, Logger log) - : _dmaProfilingMode(dmaProfilingMode) { + explicit DMATaskProfilingHwDdrPass(const std::string& enableDMAProfiling, Logger log) + : DMATaskProfilingHwDdrBase({enableDMAProfiling}) { Base::initLogger(log, Base::getArgumentName()); } @@ -87,7 +90,6 @@ class DMATaskProfilingHwDdrPass final : void safeRunOnModule() final; private: - DMAProfilingMode _dmaProfilingMode; FirstDMAQueueTracker firstDMATracker; LastDMAQueueTracker lastDMATracker; @@ -102,17 +104,16 @@ class DMATaskProfilingHwDdrPass final : void DMATaskProfilingHwDdrPass::safeRunOnModule() { auto moduleOp = getOperation(); auto* ctx = moduleOp->getContext(); - auto arch = VPU::getArch(moduleOp); + auto arch = config::getArch(moduleOp); - if (enableDMAProfiling.hasValue()) { - _dmaProfilingMode = getDMAProfilingMode(arch, enableDMAProfiling.getValue()); - } + VPUX_THROW_UNLESS(enableDMAProfiling.hasValue(), "No option"); + auto dmaProfilingMode = getDMAProfilingMode(arch, enableDMAProfiling); net::NetworkInfoOp netInfo; mlir::func::FuncOp funcOp; net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); - switch (_dmaProfilingMode) { + switch (dmaProfilingMode) { case DMAProfilingMode::STATIC_HWP: { setupStaticProfiling(ctx, netInfo, funcOp); break; @@ -348,7 +349,7 @@ VPURT::TaskOp DMATaskProfilingHwDdrPass::generateBufferCopyAfter(mlir::OpBuilder // createDMATaskProfilingHwDdrPass // -std::unique_ptr 
vpux::VPUIP::arch40xx::createDMATaskProfilingHwDdrPass(DMAProfilingMode dmaProfilingMode, - Logger log) { - return std::make_unique(dmaProfilingMode, log); +std::unique_ptr vpux::VPUIP::arch40xx::createDMATaskProfilingHwDdrPass( + const std::string& enableDMAProfiling, Logger log) { + return std::make_unique(enableDMAProfiling, log); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/fuse_segmented_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/fuse_segmented_dma.cpp index 44b487e572..acc3ff703f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/fuse_segmented_dma.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/fuse_segmented_dma.cpp @@ -4,13 +4,9 @@ // #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" - -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/dma_fusion_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/utils/core/error.hpp" namespace vpux::VPUIP::arch40xx { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_wlm_fetch_dmas.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_partial_wlm_fetch_dmas.cpp similarity index 93% rename from src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_wlm_fetch_dmas.cpp rename to src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_partial_wlm_fetch_dmas.cpp index a4da6fe27f..37c15d3c2e 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_wlm_fetch_dmas.cpp +++ 
b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/legalize_schedule_for_partial_wlm_fetch_dmas.cpp @@ -6,16 +6,18 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/core/barrier_info.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/utils.hpp" +#include "vpux/compiler/utils/options.hpp" #include "vpux/compiler/utils/wlm_legalization_utils.hpp" namespace vpux::VPUIP::arch40xx { -#define GEN_PASS_DECL_LEGALIZESCHEDULEFORWLMFETCHDMAS -#define GEN_PASS_DEF_LEGALIZESCHEDULEFORWLMFETCHDMAS +#define GEN_PASS_DECL_LEGALIZESCHEDULEFORPARTIALWLMFETCHDMAS +#define GEN_PASS_DEF_LEGALIZESCHEDULEFORPARTIALWLMFETCHDMAS #include "vpux/compiler/NPU40XX/dialect/VPUIP/passes.hpp.inc" } // namespace vpux::VPUIP::arch40xx @@ -23,24 +25,20 @@ using namespace vpux; namespace { // -// LegalizeScheduleForWlmFetchDmasPass +// LegalizeScheduleForPartialWlmFetchDmasPass // -enum Type { Dummy = 0, Real = 1 }; -// IndexType is used to represent an entry in _barrierRemoveConsumerProducerMap and _barrierAddConsumerProducerMap -// size_t represents index of barrier/DMA and Type represents the type of index i.e. 
Dummy or Real -// We need to store both as we need to know the true index in barrierInfo to be able to add/remove dependencies -using IndexType = std::pair; struct DummyDMAData { size_t insertionPoint; SmallVector consumes; SmallVector producesIn; }; -class LegalizeScheduleForWlmFetchDmasPass final : - public VPUIP::arch40xx::impl::LegalizeScheduleForWlmFetchDmasBase { +class LegalizeScheduleForPartialWlmFetchDmasPass final : + public VPUIP::arch40xx::impl::LegalizeScheduleForPartialWlmFetchDmasBase< + LegalizeScheduleForPartialWlmFetchDmasPass> { public: - explicit LegalizeScheduleForWlmFetchDmasPass(const int virtualBarrierThreshold, Logger log) + explicit LegalizeScheduleForPartialWlmFetchDmasPass(const int virtualBarrierThreshold, Logger log) : _virtualBarrierThreshold(virtualBarrierThreshold) { Base::initLogger(log, Base::getArgumentName()); } @@ -85,23 +83,6 @@ class LegalizeScheduleForWlmFetchDmasPass final : SmallVector _dummyDMAs; }; -// Function returns true index of a task -size_t getIndexOfTask(IndexType indexType, SmallVector& dummyDMAs, BarrierInfo& barrierInfo) { - if (indexType.second == Type::Dummy) { - return barrierInfo.getIndex(dummyDMAs[indexType.first]); - } - return indexType.first; -} - -// Function returns true index of a barrier -size_t getIndexOfBarrier(IndexType indexType, SmallVector& dummyBarriers, - BarrierInfo& barrierInfo) { - if (indexType.second == Type::Dummy) { - return barrierInfo.getIndex(dummyBarriers[indexType.first]); - } - return indexType.first; -} - // Returns a DMA which copies 0 len data from DDR to DDR VPURT::TaskOp createDummyDma(mlir::OpBuilder& builder, mlir::Value inputBuf, mlir::Value outputBuf, BarrierInfo& barrierInfo, SmallVector& dummyDmas) { @@ -145,9 +126,10 @@ void finalizeBarrierInfo(BarrierInfo& barrierInfo, SmallVector& d // Once we know the insertion point of DMAs this function creates actual DMAs in IR while also keeps a map of [index, // DMAOp This map is used later to refer to real DMAOp and get 
real task-index from barrierInfo -void LegalizeScheduleForWlmFetchDmasPass::realizePlannedInsertions(mlir::OpBuilder& builder, BarrierInfo& barrierInfo, - mlir::Operation* bufferInsertionPoint, - SmallVector& dummyDmas) { +void LegalizeScheduleForPartialWlmFetchDmasPass::realizePlannedInsertions(mlir::OpBuilder& builder, + BarrierInfo& barrierInfo, + mlir::Operation* bufferInsertionPoint, + SmallVector& dummyDmas) { auto inBuffer = VPUIP::createDummyBuffer(builder, bufferInsertionPoint); auto outBuffer = VPUIP::createDummyBuffer(builder, bufferInsertionPoint); @@ -204,15 +186,16 @@ void LegalizeScheduleForWlmFetchDmasPass::realizePlannedInsertions(mlir::OpBuild } } -bool LegalizeScheduleForWlmFetchDmasPass::isValidDMA(BarrierInfo& barrierInfo, size_t dmaIdx) { +bool LegalizeScheduleForPartialWlmFetchDmasPass::isValidDMA(BarrierInfo& barrierInfo, size_t dmaIdx) { auto taskOp = barrierInfo.getTaskOpAtIndex(dmaIdx); return taskOp.getExecutorKind() == VPU::ExecutorKind::DMA_NN && isDMAOnSupportedPortAndChannel(taskOp) && dmaIdx < _numAllTaskOps; } -VPURT::TaskOp LegalizeScheduleForWlmFetchDmasPass::findDMAsThroughBarriersBFS(size_t startBarrier, - BarrierInfo& barrierInfo, - MinMaxOption option, bool bfsDirUp) { +VPURT::TaskOp LegalizeScheduleForPartialWlmFetchDmasPass::findDMAsThroughBarriersBFS(size_t startBarrier, + BarrierInfo& barrierInfo, + MinMaxOption option, + bool bfsDirUp) { std::queue barriersToExplore; barriersToExplore.push(startBarrier); std::unordered_set visitedBarriers; @@ -272,8 +255,8 @@ looks for a DMA which is on tile 0 list 0 and waits for 9277 If there is no DMA which waits for 9277 using BFS check if the user's barrier (barrier->task->barrier) has a DMA user and use it as first DMA */ -VPURT::TaskOp LegalizeScheduleForWlmFetchDmasPass::findFirstDmaAfterExecGroup(BarrierInfo& barrierInfo, - ExecutionGroup& executionGroup) { +VPURT::TaskOp LegalizeScheduleForPartialWlmFetchDmasPass::findFirstDmaAfterExecGroup(BarrierInfo& barrierInfo, + 
ExecutionGroup& executionGroup) { SmallVector updateBarriers; for (const auto& taskIndex : executionGroup) { auto upBarriers = barrierInfo.getUpdateBarriers(taskIndex); @@ -322,8 +305,8 @@ DMA which is on tile 0 list 0 and updates 9184 If there is no DMA which updates 9184 using BFS check if the user's barrier (barrier<-task<-barrier) has a DMA user and use it as last DMA */ -VPURT::TaskOp LegalizeScheduleForWlmFetchDmasPass::findLastDmaBeforeExecGroup(BarrierInfo& barrierInfo, - ExecutionGroup& executionGroup) { +VPURT::TaskOp LegalizeScheduleForPartialWlmFetchDmasPass::findLastDmaBeforeExecGroup(BarrierInfo& barrierInfo, + ExecutionGroup& executionGroup) { SmallVector waitBarriers; SmallVector possibleUpdatingDMAs; @@ -373,8 +356,8 @@ VPURT::TaskOp LegalizeScheduleForWlmFetchDmasPass::findLastDmaBeforeExecGroup(Ba return maxDmaOp; } -SmallVector LegalizeScheduleForWlmFetchDmasPass::getDmasUpdatingBarriers(llvm::DenseSet& barriers, - BarrierInfo& barrierInfo) { +SmallVector LegalizeScheduleForPartialWlmFetchDmasPass::getDmasUpdatingBarriers( + llvm::DenseSet& barriers, BarrierInfo& barrierInfo) { llvm::DenseSet allTaskUpdatingBarriers; for (auto barrIdx : barriers) { auto allProducers = barrierInfo.getBarrierProducers(barrIdx); @@ -508,7 +491,7 @@ B:9184 B:9277 B:9279 B:9353 +-----+ +-----+ */ -void LegalizeScheduleForWlmFetchDmasPass::planDummyDMAAndBarriersInsertion( +void LegalizeScheduleForPartialWlmFetchDmasPass::planDummyDMAAndBarriersInsertion( DenseMap& executionGroups, BarrierInfo& barrierInfo, SmallVector>& blockRange) { for (auto& [queueType, executionGroup] : executionGroups) { @@ -516,7 +499,7 @@ void LegalizeScheduleForWlmFetchDmasPass::planDummyDMAAndBarriersInsertion( } } -void LegalizeScheduleForWlmFetchDmasPass::planDummyDMAAndBarriersInsertionPerQueue( +void LegalizeScheduleForPartialWlmFetchDmasPass::planDummyDMAAndBarriersInsertionPerQueue( ExecutionGroupList& executionGroup, BarrierInfo& barrierInfo, SmallVector>& blockRange, 
VPURT::TaskQueueType queueType) { if (executionGroup.size() < 3) { @@ -807,14 +790,14 @@ void LegalizeScheduleForWlmFetchDmasPass::planDummyDMAAndBarriersInsertionPerQue } } -void LegalizeScheduleForWlmFetchDmasPass::safeRunOnFunc() { +void LegalizeScheduleForPartialWlmFetchDmasPass::safeRunOnFunc() { auto netFunc = getOperation(); auto module = netFunc->getParentOfType(); auto barriersOps = netFunc.getOps(); auto numVirtualBarriers = static_cast(std::distance(barriersOps.begin(), barriersOps.end())); if (numVirtualBarriers > _virtualBarrierThreshold) { _log.info("Skip schedule legalization due to high number of barriers: {0}", numVirtualBarriers); - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); return; } @@ -891,10 +874,10 @@ void LegalizeScheduleForWlmFetchDmasPass::safeRunOnFunc() { } // namespace // -// createLegalizeScheduleForWlmFetchDmasPass +// createLegalizeScheduleForPartialWlmFetchDmasPass // -std::unique_ptr vpux::VPUIP::arch40xx::createLegalizeScheduleForWlmFetchDmasPass( +std::unique_ptr vpux::VPUIP::arch40xx::createLegalizeScheduleForPartialWlmFetchDmasPass( const int virtualBarrierThreshold, Logger log) { - return std::make_unique(virtualBarrierThreshold, log); + return std::make_unique(virtualBarrierThreshold, log); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/optimize_convert_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/optimize_convert_dma.cpp index 10650efb19..82ce570e16 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/optimize_convert_dma.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/optimize_convert_dma.cpp @@ -4,11 +4,10 @@ // #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" - -#include 
"vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/split_dma_to_balance_load.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/split_dma_to_balance_load.cpp index 756063c147..e2c6734377 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/split_dma_to_balance_load.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/split_dma_to_balance_load.cpp @@ -4,15 +4,13 @@ // #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" - -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" #include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp index d98b66020f..1c8dac8632 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_depth_to_space_dma.cpp @@ -188,7 +188,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De 
const auto getDistModeAttr = [&](VPUIP::DistributedBufferType distType) { const auto distAttr = distType.getDistribution(); - VPUX_THROW_WHEN(distAttr == nullptr, "Failed to extract distributon tensor from distributed type"); + VPUX_THROW_WHEN(distAttr == nullptr, "Failed to extract distribution tensor from distributed type"); return distAttr.getMode(); }; @@ -223,7 +223,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De mlir::SmallVector outputBuffers; if (distributedInputType != nullptr && distributedOutputType != nullptr) { - _log.nest().trace("Got multi-cluster to multi-clutser case"); + _log.nest().trace("Got multi-cluster to multi-cluster case"); const auto inputPerClusterShapes = distributedInputType.getPerClusterMemoryShapes(); const auto outputPerClusterShapes = distributedOutputType.getPerClusterMemoryShapes(); @@ -241,7 +241,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De } if (distributedInputType != nullptr && distributedOutputType == nullptr) { - _log.nest().trace("Got multi-cluster to single-clutser case"); + _log.nest().trace("Got multi-cluster to single-cluster case"); const auto outputShapes = SmallVector( llvm::map_range(distributedInputType.getPerClusterMemoryShapes(), inferOutputShape)); const auto outputShapeOffsets = SmallVector( @@ -255,7 +255,7 @@ mlir::LogicalResult DepthToSpaceDMARewriter::matchAndRewriteClusterDMA(VPUIP::De } if (distributedInputType == nullptr && distributedOutputType != nullptr) { - _log.nest().trace("Got single-cluster to multi-clutser case"); + _log.nest().trace("Got single-cluster to multi-cluster case"); const auto inputShapes = SmallVector( llvm::map_range(distributedOutputType.getPerClusterMemoryShapes(), [&](ShapeRef outShape) { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp index 
e6d241754d..a6ea6dd5a0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp @@ -6,12 +6,9 @@ #include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" #include "vpux/compiler/NPU37XX/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" - -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/dma_fusion_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp new file mode 100644 index 0000000000..01aa152e33 --- /dev/null +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_permute_dma.cpp @@ -0,0 +1,91 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/NPU40XX/dialect/VPUIP/utils/permute_dma.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include +#include +#include + +namespace vpux::VPUIP::arch40xx { +#define GEN_PASS_DECL_UNROLLPERMUTEDMA +#define GEN_PASS_DEF_UNROLLPERMUTEDMA +#include "vpux/compiler/NPU40XX/dialect/VPUIP/passes.hpp.inc" +} // namespace vpux::VPUIP::arch40xx + +using namespace vpux; + +namespace { + +// +// PermuteDMARewriter +// + +class PermuteDMARewriter final : public mlir::OpRewritePattern { +public: + PermuteDMARewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log) + : mlir::OpRewritePattern(ctx), _dmaPortCount(dmaPortCount), _log(log) { + setDebugName("PermuteDMARewriter"); + } + + mlir::LogicalResult matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter) const final; + +private: + int64_t _dmaPortCount; + Logger _log; +}; + +mlir::LogicalResult PermuteDMARewriter::matchAndRewrite(VPUIP::PermuteDMAOp permuteOp, + mlir::PatternRewriter& rewriter) const { + return arch40xx::rewritePermuteDMA(permuteOp, rewriter, _dmaPortCount, _log); +} + +// +// UnrollPermuteDMAPass +// + +class UnrollPermuteDMAPass final : public VPUIP::arch40xx::impl::UnrollPermuteDMABase { +public: + explicit UnrollPermuteDMAPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void UnrollPermuteDMAPass::safeRunOnFunc() { + auto& ctx = getContext(); + auto func = getOperation(); + markAnalysesPreserved(); + auto analysis = getAnalysis(); + if (!analysis.passNeeded(VPUIP::UnrollDMAAnalysisNeeded::UnrollPermuteDMAPass)) { + return; + } + auto module = func->getParentOfType(); + auto dmaOp = IE::getAvailableExecutor(module, 
VPU::ExecutorKind::DMA_NN); + auto dmaPortCount = dmaOp.getCount(); + + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, dmaPortCount, _log.nest()); + if (mlir::failed( + mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), vpux::getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createUnrollPermuteDMAPass +// + +std::unique_ptr vpux::VPUIP::arch40xx::createUnrollPermuteDMAPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp index fba457201e..6cc95321c5 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/passes/unroll_space_to_depth_dma.cpp @@ -292,7 +292,7 @@ mlir::LogicalResult SpaceToDepthDMARewriter::matchAndRewriteClusterDMA(VPUIP::Sp outputType); const auto distributionAttr = distributedType.getDistribution(); - VPUX_THROW_WHEN(distributionAttr == nullptr, "Failed to extract distributon attribute from distributed type."); + VPUX_THROW_WHEN(distributionAttr == nullptr, "Failed to extract distribution attribute from distributed type."); const auto modeAttr = distributionAttr.getMode(); VPUX_THROW_WHEN(modeAttr == nullptr, "Failed to extract mode from distribution attribute."); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/pipelines.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/pipelines.cpp index 8968464627..791940d1b2 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/transforms/pipelines.cpp @@ -7,19 +7,15 @@ #include "vpux/compiler/NPU37XX/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include 
"vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/dialect/const/passes.hpp" #include "vpux/compiler/dialect/core/transforms/passes.hpp" - -#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" - #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/profiling/common.hpp" - #include #include @@ -142,17 +138,6 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPUIP::createAsyncRegionsOutliningPass(options.asyncRegionOutliningMinOpsInBlock, log)); } - if (options.enableCompressActivationSpill) { - pm.addPass(VPUIP::createCompressDmaReserveMemPass(log)); - } - - auto profilingMode = getDMAProfilingMode(VPU::ArchKind::NPU40XX, options.enableDMAProfiling.getValue()); - pm.addPass(VPUIP::createDMATaskProfilingReserveMemPass(profilingMode, log)); - - if (options.enableSWKernelPrefetchingReserveMem) { - pm.addPass(VPUIP::createSWKernelPrefetchingReserveMemPass(log)); - } - pm.addPass(VPUIP::createCalculateAsyncRegionCycleCostPass(log)); VPUIP::arch40xx::buildMemoryAllocationPipeline(pm, VPUIP::arch40xx::MemoryAllocationOptions(options), log); @@ -227,8 +212,7 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, } if (options.enableProfiling) { - auto dmaProfilingMode = getDMAProfilingMode(VPU::ArchKind::NPU40XX, options.enableDMAProfiling.getValue()); - pm.addPass(VPUIP::arch40xx::createDMATaskProfilingHwDdrPass(dmaProfilingMode, log)); + pm.addPass(VPUIP::arch40xx::createDMATaskProfilingHwDdrPass(options.enableDMAProfiling, log)); } if (options.enableControlGraphSplit) { @@ -250,17 +234,21 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& 
pm, } // LNL Shave Kernel prefetch with profiling fails compiling. Track Number: E#169656 - if (options.enableSwKernelsCachePrefetch && !(options.enableProfiling && options.enableSWProfiling)) { + if (options.enableSWKernelInstructionPrefetch && !(options.enableProfiling && options.enableSWProfiling)) { pm.addPass(vpux::VPUIP::createAddSwKernelInstructionPrefetchPass(log)); } // Ensures legal schedule in the case of a WLM rollback pm.addPass(VPURT::createInsertBarrierToMarkTheEndOfDescriptorGroupPass( - options.workloadManagementBarrierCountThreshold, log)); + options.workloadManagementBarrierCountThreshold, options.workloadManagementMode, log)); if (options.workloadManagementEnable) { - pm.addPass(VPUIP::arch40xx::createLegalizeScheduleForWlmFetchDmasPass( - options.workloadManagementBarrierCountThreshold, log)); + if (options.workloadManagementMode != WorkloadManagementMode::FWLM_V1_PAGES) { + pm.addPass(VPUIP::arch40xx::createLegalizeScheduleForPartialWlmFetchDmasPass( + options.workloadManagementBarrierCountThreshold, log)); + } else { + pm.addPass(VPUIP::arch40xx::createAddPlaceholderFetchDMAsPass(log)); + } } if (!isOutliningEnabled || !options.enableBarrierSchedWithFunctionOutlining) { @@ -283,10 +271,7 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); } - pm.addPass(VPUIP::arch40xx::createAddStartBarrierPass( - options.workloadManagementBarrierProgrammingMode >= - WorkloadManagementBarrierProgrammingMode::INITIAL_BARRIER_DMAS_SCHEDULED, - log)); + pm.addPass(VPUIP::arch40xx::createAddStartBarrierPass(log)); if (options.workloadManagementEnable && options.workloadManagementMode >= WorkloadManagementMode::PWLM_V2_PAGES) { pm.addPass(VPURT::arch40xx::createWlmSplitGraphToPagesPass(log)); @@ -299,6 +284,10 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, if (options.workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { 
pm.addPass(VPURT::arch40xx::createWlmInsertDummyDmasInPagesPass(log)); } + if (options.workloadManagementBarrierProgrammingMode == + WorkloadManagementBarrierProgrammingMode::ALL_BARRIER_DMAS_SCHEDULED) { + pm.addPass(VPURT::arch40xx::createWlmInsertDummyBarriersInPagesPass(log)); + } } if (options.enableCompressActivationSpill) { @@ -326,6 +315,10 @@ void vpux::VPUIP::arch40xx::buildDefaultHWPipeline(mlir::OpPassManager& pm, pm.addPass(VPURT::arch40xx::createFindWlmEnqueueDmasBarrierPass(log)); } + if (options.workloadManagementEnable && options.workloadManagementMode >= WorkloadManagementMode::PWLM_V2_PAGES) { + pm.addPass(VPURT::arch40xx::createOptimizeBarriersSlotsUsagePass(log)); + } + pm.addPass(VPURT::createBarrierSimulationPass(log)); pm.addPass(VPUIP::createUpdateSwKernelParamsPass(log)); pm.addPass(mlir::createCanonicalizerPass(grc)); @@ -354,14 +347,62 @@ void vpux::VPUIP::arch40xx::buildDMAUnrollingPipeline(mlir::OpPassManager& pm, L pm.addPass(VPUIP::createUnrollDMAAnalysisPass(log)); pm.addPass(VPUIP::arch40xx::createUnrollDepthToSpaceDMAPass(log)); pm.addPass(VPUIP::arch40xx::createUnrollSpaceToDepthDMAPass(log)); - pm.addPass(VPUIP::createUnrollPermuteToNNDMAPass(log)); + pm.addPass(VPUIP::arch40xx::createUnrollPermuteDMAPass(log)); pm.addPass(VPUIP::createUnrollUpsamplingDMAPass(log)); pm.addPass(VPUIP::createUnrollExpandDMAPass(log)); pm.addPass(VPUIP::createUnrollPerAxisTileDMAPass(log)); + pm.addPass(VPUIP::createUnrollGatherDMAPass(log)); pm.addPass(VPUIP::createInvalidateUnrollDMAAnalysisPass(log)); } +void vpux::VPUIP::arch40xx::buildReferenceSWPipeline(mlir::OpPassManager& pm, + const VPUIP::arch40xx::DefaultHWOptions& options, Logger log) { + const auto grc = getDefaultGreedyRewriteConfig(); + pm.addPass(VPUIP::createSetMemorySpacePass(VPU::getMemKind, log)); + + pm.addPass(VPUIP::createAddCopyBetweenSWKernelsAndNetworkIOPass(log)); + + pm.addPass(VPUIP::createCopyOpTilingPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + + if 
(options.enableProfiling && options.enableSWProfiling) { + pm.addPass(VPUIP::createActShaveProfilingPass(VPU::getMemKind, log)); + } + + pm.addPass(VPUIP::createUngroupBoundedBuffersPass(log)); + + pm.addPass(VPUIP::createConvertTransferOpsToDMAsPass(log)); + + VPUIP::buildAsyncSchedulingPipeline(pm, log); + + pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); + pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); + pm.addPass(VPUIP::createLinearizationPass(log)); + pm.addPass(VPUIP::createOptimizeAsyncDepsPass(log)); + + pm.addPass(VPUIP::arch37xx::createAddSwKernelCacheHandlingOpsPass(log)); + + VPUIP::buildHardwareAdaptationPipeline(pm, log); + + pm.addPass(VPUIP::arch40xx::createAddStartBarrierPass(log)); + pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); + + // Level 1 : VPU RunTime + + if (options.enableProfiling) { + pm.addPass(VPUIP::createCaptureWorkpointPass(log)); + pm.addPass(VPUIP::createGroupProfilingBuffersPass(log)); + pm.addPass(Core::createMoveDeclarationsToTopPass(log)); + } + + pm.addPass(VPURT::createAssignPhysicalBarriersPass(options.enableColorBinPhysicalBarrierAssignment, std::nullopt, + std::nullopt, log)); + pm.addPass(VPURT::createBarrierSimulationPass(log)); + pm.addPass(VPUIP::createUpdateSwKernelParamsPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); +} + void vpux::VPUIP::arch40xx::registerVPUIPPipelines() { mlir::PassPipelineRegistration( "optimize-copies-pipeline", "Optimize Copies Pipeline", @@ -384,4 +425,10 @@ void vpux::VPUIP::arch40xx::registerVPUIPPipelines() { [](mlir::OpPassManager& pm, const VPUIP::arch40xx::DefaultHWOptions& options) { VPUIP::arch40xx::buildDefaultHWPipeline(pm, options); }); + + mlir::PassPipelineRegistration( + "reference-sw-mode-vpuip", "VPUIP dialect part of reference SW pipeline", + [](mlir::OpPassManager& pm, const VPUIP::arch40xx::DefaultHWOptions& options) { + VPUIP::arch40xx::buildReferenceSWPipeline(pm, options); + }); } diff --git 
a/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/utils/permute_dma.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/utils/permute_dma.cpp new file mode 100644 index 0000000000..cb6e535ca0 --- /dev/null +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIP/utils/permute_dma.cpp @@ -0,0 +1,258 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU37XX/dialect/VPUIP/utils/permute_dma.hpp" +#include "vpux/compiler/NPU40XX/dialect/VPUIP/utils/permute_dma.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/core/error.hpp" +#include "vpux/utils/core/mem_size.hpp" + +namespace vpux::arch40xx { + +mlir::AffineMap getLogicalTransposeFromMemPermute(NDTypeInterface inType, NDTypeInterface outType, + mlir::AffineMap memPermute) { + VPUX_THROW_WHEN(inType.getRank() != outType.getRank(), "Rank mismatch between input type and output type"); + + auto ctx = inType.getContext(); + const auto mappingInMemToLogical = VPUIP::getSmallVectorFromAffineMap(inType.getDimsOrder().toAffineMap(ctx)); + const auto mappingOutMemToLogical = VPUIP::getSmallVectorFromAffineMap(outType.getDimsOrder().toAffineMap(ctx)); + const auto mappingOutToInMem = VPUIP::getSmallVectorFromAffineMap(memPermute); + + auto mappingOutToInLogical = mlir::SmallVector(inType.getRank()); + + for (auto index : irange(inType.getRank())) { + mappingOutToInLogical[mappingOutMemToLogical[index]] = mappingInMemToLogical[mappingOutToInMem[index]]; + } + + return mlir::AffineMap::getPermutationMap(mappingOutToInLogical, ctx); +} + +mlir::LogicalResult UnrollSingleClusterPermuteDMA::unroll(VPUIP::PermuteDMAOp permuteOp, + mlir::PatternRewriter& rewriter, int64_t portCount, Logger) { + 
VPUX_THROW_WHEN(permuteOp.getInternalDataFlowAttr(), "Already unrolled"); + VPUX_THROW_WHEN(portCount < 1, "Invalid number of ports (expected at least 1, but got {0})", portCount); + + auto ctx = permuteOp->getContext(); + + auto origTaskOp = permuteOp->getParentOfType(); + VPUX_THROW_UNLESS(origTaskOp != nullptr, "Can't get VPURT task operation"); + rewriter.setInsertionPointAfter(origTaskOp); + + auto origInType = mlir::cast(permuteOp.getInput().getType()); + auto origInBuffer = permuteOp.getInput().getDefiningOp(); + auto origInBufferOffset = origInBuffer.getByteOffset(); + + auto origOutType = mlir::cast(permuteOp.getOutput().getType()); + auto origOutBuffer = permuteOp.getOutputBuff().getDefiningOp(); + auto origOutBufferOffset = origOutBuffer.getByteOffset(); + + // Super paranoid check + VPUX_THROW_WHEN(origInType.getRank() != origOutType.getRank(), "Rank mismatch between input type and output type"); + + // mem_perm attribute maps out memory dims to in memory dims. + // + // The modes of operation of PermuteDMA could be summarized as follows: + // - logical transpose -> logical shape changes, but the memory layout remains unchanged + // 1x2x3x4 NHWC -> 1x3x2x4 NHWC + // - memory layout change -> logical shape is unchanged, but the memory layout is different + // 1x2x3x4 NHWC to 1x2x3x4 NCHW + // - logical transpose + memory layout change -> both logical shape and memory layout are different + // 1x2x3x4 NHWC -> 1x3x2x4 NCHW + // + // To make things more explicit: + // - memory layout change is represented through the memory layout of the input and output types + // - logical transpose is represented through mappingOrder (AffineMap) which maps output logical dims to input + // logical dims + auto mappingOrder = + getLogicalTransposeFromMemPermute(origInType, origOutType, permuteOp.getMemPermAttr().getAffineMap()); + auto mappingOutToInLogical = VPUIP::getSmallVectorFromAffineMap(mappingOrder); + auto mappingInToOutLogical = 
VPUIP::getSmallVectorFromAffineMap(mlir::inversePermutation(mappingOrder)); + + auto workingInShape = Shape(origInType.getShape().raw()); + auto origInStrides = origInType.getStrides(); + + auto workingOutShape = Shape(origOutType.getShape().raw()); + auto origOutStrides = origOutType.getStrides(); + + // Temporary solution to treat case where Expand is fused with Permute, which results in output size > input size + // (see E#173193). Use input dim sizes and keep output strides. + for (auto index : irange(origOutType.getRank())) { + workingOutShape[Dim(index)] = workingInShape[Dim(mappingOutToInLogical[index])]; + } + + // Initialize properly here indexes and task count in case no splittable dim is found + int64_t inSplitDimIndex = 0; + int64_t outSplitDimIndex = mappingInToOutLogical[inSplitDimIndex]; + int64_t newTaskCount = 1; + + auto hasPortAssigned = permuteOp.getPort().has_value(); + + // Initialize new port + // All cluster tasks will have port assigned + // In case of single cluster task, if splitting to all ports is not possible, always use port 0 + int64_t newPort = 0; + + // Split only byte-aligned element types + if (origInType.getElemTypeSize() % Byte(1).to() == 0) { + // Find a split candidate (search in mem order to find largest continuous chunk) + for (auto index : VPUIP::getSmallVectorFromAffineMap(origInType.getDimsOrder().toAffineMap(ctx))) { + // Find the first non-trivial dim (i.e. 
dim size > 1) that is evenly divided by number of ports + // This is needed to ensure load balancing, particularly when reading from DDR + if (workingInShape[Dim(index)] > 1 && workingInShape[Dim(index)] % portCount == 0) { + inSplitDimIndex = index; + newTaskCount = portCount; + outSplitDimIndex = mappingInToOutLogical[inSplitDimIndex]; + break; + } + } + } + + auto origSplitDimSize = workingInShape[Dim(inSplitDimIndex)]; + auto initialSplitDimSize = workingInShape[Dim(inSplitDimIndex)] / newTaskCount; + auto currentSplitDimSize = initialSplitDimSize; + + // Offsets needed for per-axis quant type updates + auto workingInShapeOffsets = Shape(workingInShape.size()); + auto workingOutShapeOffsets = Shape(workingOutShape.size()); + + const auto getNewElementType = [](NDTypeInterface origType, ShapeRef newShape, ShapeRef newOffset) { + auto elemType = origType.getElementType(); + if (auto qType = mlir::dyn_cast(elemType)) { + // tileScalesAndZP will update type only when we actually tiled over quantized axis + elemType = tileScalesAndZP(qType, newShape, newOffset); + } + + return elemType; + }; + + const auto createNewBuffer = [](mlir::PatternRewriter& rewriter, VPURT::TaskOp taskOp, + VPURT::DeclareBufferOp existingBuffer, NDTypeInterface newType, int64_t newOffset) { + if (newType.getMemSpace().getIndex().has_value()) { + return VPURT::createOp(rewriter, existingBuffer, taskOp.getLoc(), newType, + existingBuffer.getSection(), + newType.getMemSpace().getIndex().value(), newOffset); + } else { + if (existingBuffer.getSectionIndex().has_value()) { + return VPURT::createOp( + rewriter, existingBuffer, taskOp.getLoc(), newType, existingBuffer.getSection(), + parseIntArrayAttr(existingBuffer.getSectionIndex().value()), newOffset); + } + + return VPURT::createOp(rewriter, existingBuffer, taskOp.getLoc(), newType, + existingBuffer.getSection(), newOffset); + } + }; + + const auto getNewInType = [&getNewElementType](NDTypeInterface origType, ShapeRef newShape, ShapeRef 
newOffsets, + StridesRef newStrides) -> NDTypeInterface { + auto newElementType = getNewElementType(origType, newShape, newOffsets); + + VPUX_THROW_UNLESS(mlir::isa_and_nonnull(origType), "Unexpected input type"); + + return getMemRefType(newShape, newElementType, origType.getDimsOrder(), origType.getMemSpace(), newStrides); + }; + + const auto getNewOutType = [&getNewElementType](mlir::MLIRContext* ctx, NDTypeInterface origType, ShapeRef newShape, + ShapeRef newOffsets, StridesRef newStrides) -> NDTypeInterface { + auto newElementType = getNewElementType(origType, newShape, newOffsets); + + if (auto dstDistributedType = mlir::dyn_cast(origType)) { + auto distributionAttr = dstDistributedType.getDistribution(); + VPUX_THROW_WHEN( + distributionAttr.getMode().getValue() != VPU::DistributionMode::DUPLICATED, + "Issues with unrolling PermuteNNDMA; Buffer has distributed type != DUPLICATED after unroll"); + + if (VPU::isDistributedAttrWithExplicitShapesAndOffsets(distributionAttr)) { + distributionAttr = VPU::getNonOverlappedDistributedAttr( + newShape, distributionAttr.getMode(), nullptr, distributionAttr.getNumClusters(), nullptr, + distributionAttr.getUniformDistributedSegments(), ctx); + } + + // We now convert to identity logical to memory mapping + const auto layout = mlir::AffineMapAttr::get(origType.getDimsOrder().toAffineMap(ctx)); + return VPUIP::DistributedBufferType::get(ctx, newShape, newElementType, layout, origType.getMemSpace(), + distributionAttr); + } + + return getMemRefType(newShape, newElementType, origType.getDimsOrder(), origType.getMemSpace(), newStrides); + }; + + // Initialize variables here to allow loop to handle single iteration cases + // We update the buffers and types since we are switching to identity logical to memory mapping + auto newInType = getNewInType(origInType, workingInShape, workingInShapeOffsets, origInStrides); + auto newInBuffer = createNewBuffer(rewriter, origTaskOp, origInBuffer, newInType, origInBufferOffset); + + auto 
newOutType = getNewOutType(ctx, origOutType, workingOutShape, workingOutShapeOffsets, origOutStrides); + auto newOutBuffer = createNewBuffer(rewriter, origTaskOp, origOutBuffer, newOutType, origOutBufferOffset); + + for (auto index : irange(newTaskCount)) { + if (newTaskCount > 1) { + if (index == newTaskCount - 1) { + // For last iter use remaining size for cases where dim size is not divisible nicely + currentSplitDimSize = origSplitDimSize - initialSplitDimSize * index; + } + newPort = index; + + // Compute new shapes + workingInShape[Dim(inSplitDimIndex)] = currentSplitDimSize; + workingOutShape[Dim(outSplitDimIndex)] = currentSplitDimSize; + + // Compute new offsets. + // For simplicity, the splitting interleaves accesses to the original shapes. + // Pretty heavy assumption here that strides will turn out to be byte aligned. + // Jump only over elements in the dimension we split. + // For a compact shape, if we split over the highest order dim, the access will be continuous. + auto newInBufferOffset = + initialSplitDimSize * origInStrides[Dim(inSplitDimIndex)].to().count() * index + + origInBufferOffset; + auto newOutBufferOffset = + initialSplitDimSize * origOutStrides[Dim(outSplitDimIndex)].to().count() * index + + origOutBufferOffset; + + workingInShapeOffsets[Dim(inSplitDimIndex)] = initialSplitDimSize * index; + newInType = getNewInType(origInType, workingInShape, workingInShapeOffsets, origInStrides); + newInBuffer = createNewBuffer(rewriter, origTaskOp, newInBuffer, newInType, newInBufferOffset); + + workingOutShapeOffsets[Dim(outSplitDimIndex)] = initialSplitDimSize * index; + newOutType = getNewOutType(ctx, origOutType, workingOutShape, workingOutShapeOffsets, origOutStrides); + newOutBuffer = createNewBuffer(rewriter, origTaskOp, newOutBuffer, newOutType, newOutBufferOffset); + } + + auto loopOrder = + mlir::AffineMapAttr::get(mlir::AffineMap::getPermutationMap(VPUIP::getLinearMemOrder(newInType), ctx)); + + auto internalDataFlowAttr = 
VPUIP::InternalDataFlowAttr::get(ctx, newInType, newOutType, + mlir::AffineMapAttr::get(mappingOrder), loopOrder); + + const auto newLoc = appendLoc(origTaskOp->getLoc(), "_unrolled_permuteDMA"); + + // Override port if no splitting can be done and port was already assigned by cluster unrolling + if (hasPortAssigned && newTaskCount == 1) { + newPort = permuteOp.getPort().value(); + } + + VPURT::wrapIntoTaskOp( + rewriter, origTaskOp.getWaitBarriers(), origTaskOp.getUpdateBarriers(), newLoc, newInBuffer, + newOutBuffer, getIntAttr(rewriter, newPort), permuteOp.getIsOutOfOrderAttr(), + permuteOp.getIsCriticalAttr(), + /*mem_perm*/ nullptr, /* dma_descriptor */ nullptr, permuteOp.getDmaHwpIdAttr(), + permuteOp.getProfilingMetadataAttr(), internalDataFlowAttr); + } + + rewriter.eraseOp(origTaskOp); + return mlir::success(); +} + +mlir::LogicalResult rewritePermuteDMA(VPUIP::PermuteDMAOp permuteOp, mlir::PatternRewriter& rewriter, int64_t portCount, + Logger logger) { + return arch37xx::unrollPermuteDMA( + permuteOp, rewriter, portCount, logger); +} + +} // namespace vpux::arch40xx diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.cpp index 827b2c7afb..51fd67f007 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.cpp @@ -4,28 +4,18 @@ // #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_idu.hpp" -#include #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant.hpp" #include 
"vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" + +#include namespace vpux::VPUIPDPU::arch40xx::IDU { mlir::LogicalResult verifyInQuantConfig(const Logger& log, mlir::Type inType) { - SmallVector inQuantZero; - - if (const auto uniformQuantType = mlir::dyn_cast(inType)) { - inQuantZero.push_back(checked_cast(uniformQuantType.getZeroPoint())); - } else if (const auto uniformQuantPerAxisType = mlir::dyn_cast(inType)) { - auto zp = uniformQuantPerAxisType.getZeroPoints(); - inQuantZero.resize(zp.size()); - std::transform(zp.begin(), zp.end(), inQuantZero.begin(), [](int64_t a) { - return checked_cast(a); - }); - } else { - inQuantZero.push_back(0); - } + SmallVector inQuantZero = getZeroPoints(inType); if (inQuantZero.size() != 1) { log.error("Mismatch of size between input quant ZP and quant shift vector: {0} != 1", inQuantZero.size()); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_mpe.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_mpe.cpp index a1bc828e6e..e3b47eedc2 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_mpe.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_mpe.cpp @@ -14,14 +14,14 @@ mlir::LogicalResult vpux::VPUIPDPU::arch40xx::buildDPUInvariantMPE( if (auto inAct = getInvBlockArg(BlockArg::ACT_IN, invBlock, invBlockArgsPos)) { auto inActType = getBaseType(mlir::cast(inAct.getType()).getElementType()); if (inActType.isInteger(CHAR_BIT)) { - builder.create(origInvOp.getLoc(), VPUIPDPU::getZeroPoint(inAct.getType())); + builder.create(origInvOp.getLoc(), VPUIPDPU::getZeroPoint(inAct.getType())); } } if (auto weights = getInvBlockArg(BlockArg::WEIGHTS, invBlock, 
invBlockArgsPos)) { auto wtType = getBaseType(mlir::cast(weights.getType()).getElementType()); if (wtType.isUnsignedInteger(CHAR_BIT)) { - builder.create(origInvOp.getLoc(), VPUIPDPU::getZeroPoint(weights.getType())); + builder.create(origInvOp.getLoc(), VPUIPDPU::getZeroPoint(weights.getType())); } } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.cpp index 401ee26e22..f983b4d84f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_odu.cpp @@ -157,10 +157,10 @@ mlir::LogicalResult configureOutActivations(const Logger& log, ODUConfig::OutAct } mlir::LogicalResult configureSparsity(const Logger&, ODUConfig::Sparsity& config, bool outSparsityEnabled, - NDTypeInterface outActType) { + int64_t sparseValue) { if (outSparsityEnabled) { config.compressionEnabled = true; - config.sparseValue = VPUIPDPU::getZeroPoint(outActType); + config.sparseValue = sparseValue; } return mlir::success(); } @@ -195,7 +195,9 @@ mlir::LogicalResult configureODU(const Logger& log, ODUConfig& config, const NDT if (configureDataReuse(log, config.dataReuse, mpeFrequentMode, dpuTaskType).failed()) { return mlir::failure(); } - if (VPUIPDPU::arch40xx::ODU::configureSparsity(log, config.sparsity, outSparsityEnabled, outActType).failed()) { + if (VPUIPDPU::arch40xx::ODU::configureSparsity(log, config.sparsity, outSparsityEnabled, + VPUIPDPU::getZeroPoint(outActType)) + .failed()) { return mlir::failure(); } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.cpp 
b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.cpp index 2c2587f0ee..025b74e1e9 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.cpp @@ -4,10 +4,10 @@ // #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant_ppe.hpp" -#include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_invariant.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" namespace { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_idu.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_idu.cpp index 29076da5f4..75e7270835 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_idu.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_idu.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" namespace vpux::VPUIPDPU::arch40xx::IDU { @@ -175,9 +176,12 @@ mlir::LogicalResult vpux::VPUIPDPU::arch40xx::buildDPUVariantIDU(VPUASM::DPUVari ELF::SymbolReferenceMap& symRefMap) { auto origInvOp = mlir::cast(symRefMap.lookupSymbol(origVarOp.getInvariant())); - auto inAct = 
symRefMap.lookupSymbol(origInvOp.getInput()); - auto inActType = getBufferType(inAct); - auto inSwizzlingKey = getSwizzlingKey(inAct); + mlir::Operation* inAct = nullptr; + if (origInvOp.getInput()) { + inAct = symRefMap.lookupSymbol(origInvOp.getInput().value()); + } + + std::optional inSwizzlingKey; mlir::MemRefType outActType; if (!origInvOp.getIsContinued() && origInvOp.getOutput()) { @@ -207,15 +211,19 @@ mlir::LogicalResult vpux::VPUIPDPU::arch40xx::buildDPUVariantIDU(VPUASM::DPUVari } // IDUWeightSet - auto inStartZ = parseIntArrayAttr(origVarOp.getInStart())[2]; - auto inEndZ = parseIntArrayAttr(origVarOp.getInEnd())[2]; - auto outStartZ = parseIntArrayAttr(origVarOp.getStart())[2]; - auto outEndZ = parseIntArrayAttr(origVarOp.getEnd())[2]; - if (buildIDUWeightSet(builder, origVarOp.getLoc(), log, inStartZ, inEndZ, outStartZ, outEndZ, - origInvOp.getOutChannelOffset(), origInvOp.getNceTaskType(), inActType, outActType, - weightsType, origInvOp.getKernelSize()) - .failed()) { - return mlir::failure(); + if (inAct) { + inSwizzlingKey = getSwizzlingKey(inAct); + auto inStartZ = parseIntArrayAttr(origVarOp.getInStart())[2]; + auto inEndZ = parseIntArrayAttr(origVarOp.getInEnd())[2]; + auto outStartZ = parseIntArrayAttr(origVarOp.getStart())[2]; + auto outEndZ = parseIntArrayAttr(origVarOp.getEnd())[2]; + auto inActType = getBufferType(inAct); + if (buildIDUWeightSet(builder, origVarOp.getLoc(), log, inStartZ, inEndZ, outStartZ, outEndZ, + origInvOp.getOutChannelOffset(), origInvOp.getNceTaskType(), inActType, outActType, + weightsType, origInvOp.getKernelSize()) + .failed()) { + return mlir::failure(); + } } // IDUPadding diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_odu.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_odu.cpp index 78b206279b..c4a1cec076 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_odu.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant_odu.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIPDPU/transforms/passes/expand_dpu_config/expand_dpu_config_variant.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/utils/attributes.hpp" namespace { diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.cpp index 0351ec5444..fb8b58e55f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.cpp @@ -171,49 +171,182 @@ void vpux::VPURT::BarrierPagesSplitHandler::initializeTaskToPageAssignment() { } } +// Update boundary task data for provided task +void vpux::VPURT::BarrierPagesSplitHandler::updateBoundaryTasksDataForTask(size_t taskInd) { + auto taskPage = _taskPageAssignment[taskInd]; + auto queueType = _barrierInfo.getTaskQueueType(taskInd); + auto updateBars = _barrierInfo.getUpdateBarriers(taskInd); + auto waitBars = _barrierInfo.getWaitBarriers(taskInd); + + if (waitBars.empty() && updateBars.empty()) { + // If there are no wait or update barriers, then no need to consider this task as + // boundary task as such task has no tight affiliation to any page + return; + } + + // Check if there was already a boundary task of this type for this page identified + if (_firstAndLastBoundaryTaskForEachPagePerFifo[taskPage].find(queueType) == + _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage].end()) { + // If not check if all update barriers belong to the same page as the task. 
+ // If all update barriers are within taskPage, then this is NOT a boundary task + if (llvm::all_of(updateBars, [&](size_t barInd) { + return getBarrierPage(barInd) == taskPage; + })) { + return; + } + + // If there is an update barrier from next page then this is a first boundary + // task on this HW FIFO + _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage][queueType] = std::make_pair(taskInd, taskInd); + } + + if (taskInd < _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage][queueType].second) { + return; + } + + // If there is already a boundary task of this type for this page, then update the data + // so that after traversing all tasks there is also information about the last boundary task + _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage][queueType].second = taskInd; + + _log.nest().trace("Task {0} is boundary task for page {1}, queue type {2}:{3}", taskInd, taskPage, + stringifyEnum(queueType.type).data(), queueType.id); +} + +// Make sure last boundary task on each HW FIFO updates barrier from next page. +// Tasks which are on the HW FIFO after a boundary task but they themselve do not have any +// update barrier from next page should have this legalized as rest of code expects this +// based on agreed definition of boundary task +void vpux::VPURT::BarrierPagesSplitHandler::enforceBoundaryTaskHasUpdateBarrier(size_t pageInd) { + for (auto boundaryTask : getLastBoundaryTasksForPage(pageInd)) { + auto taskUpdateBarsVec = to_small_vector(_barrierInfo.getUpdateBarriers(boundaryTask)); + + // For analysis do not account for barriers from current page. 
Boundary task on PageN + // needs to have update barriers from next page + taskUpdateBarsVec.erase(llvm::remove_if(taskUpdateBarsVec, + [&](auto barInd) { + return getBarrierPage(barInd) == pageInd; + }), + taskUpdateBarsVec.end()); + if (!taskUpdateBarsVec.empty()) { + // If task updates barriers from next page then no need to add any additional update barrier + continue; + } + + // If there are no update barriers of this page then need to search for some other update + // barrier that can be used and which boundary task can update + _log.nest().trace("Boundary task {0} from page {1} does not have update barrier on next pages", boundaryTask, + pageInd); + + // Check for update barrier of some previous task on the same FIFO + auto currentTaskOpt = _barrierInfo.getPrevTaskOnSameQueue(boundaryTask); + while (taskUpdateBarsVec.empty() && currentTaskOpt.has_value() && + _taskPageAssignment[currentTaskOpt.value()] == pageInd) { + taskUpdateBarsVec = to_small_vector(_barrierInfo.getUpdateBarriers(currentTaskOpt.value())); + + taskUpdateBarsVec.erase(llvm::remove_if(taskUpdateBarsVec, + [&](auto barInd) { + return getBarrierPage(barInd) == pageInd; + }), + taskUpdateBarsVec.end()); + + currentTaskOpt = _barrierInfo.getPrevTaskOnSameQueue(currentTaskOpt.value()); + } + + VPUX_THROW_UNLESS(!taskUpdateBarsVec.empty(), + "Boundary task {0} from page {1} has no update barriers on next pages", boundaryTask, + pageInd); + + // If task updates multiple barriers, pick only one with smallest index and + // update barrier dependencies so that boundary also updates this barrier + // No need to use more barriers as 1 barrier is enough to know that boundary task has finished + auto newUpdateBar = *std::min_element(taskUpdateBarsVec.begin(), taskUpdateBarsVec.end()); + + _barrierInfo.addProducer(newUpdateBar, boundaryTask); + _log.nest().trace("Add producer {0} to barrier {1}", boundaryTask, newUpdateBar); + } +} + // Check all tasks and identify boundary tasks. 
// A boundary task is one where at least one update barrier belongs to a page // that is greater than taskPage (indicating cross-page dependency). // Boundary tasks are later used for legalization purposes. void vpux::VPURT::BarrierPagesSplitHandler::initializeBoundaryTasksData() { _firstAndLastBoundaryTaskForEachPagePerFifo.resize(_pageCount); + _firstAndLastTaskPerPage.resize(_pageCount); _log.trace("Initializing boundary tasks data"); for (size_t taskInd = 0; taskInd < _barrierInfo.getNumOfTasks(); taskInd++) { - auto updateBars = _barrierInfo.getUpdateBarriers(taskInd); - if (updateBars.empty()) { - continue; - } - auto taskPage = _taskPageAssignment[taskInd]; - // Check if all update barriers belong to the same page as the task. - // If all update barriers are within taskPage, then this is NOT a boundary task - if (llvm::all_of(updateBars, [&](size_t barInd) { - return getBarrierPage(barInd) == taskPage; - })) { - continue; - } - - // Get task queue and update boundary tasks per page data - // Since for each HW FIFO there can be a sequence of boundary tasks - // store index of first and last one - auto queueType = _barrierInfo.getTaskQueueType(taskInd); - if (_firstAndLastBoundaryTaskForEachPagePerFifo[taskPage].find(queueType) == - _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage].end()) { - _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage][queueType] = std::make_pair(taskInd, taskInd); + if (!_firstAndLastTaskPerPage[taskPage].has_value()) { + _firstAndLastTaskPerPage[taskPage] = std::make_pair(taskInd, taskInd); } else { - _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage][queueType].second = taskInd; + _firstAndLastTaskPerPage[taskPage].value().second = taskInd; } + + updateBoundaryTasksDataForTask(taskInd); } for (size_t pageInd = 0; pageInd < _pageCount - 1; pageInd++) { - VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), - "No boundary tasks set for page {0}, pageCount {1}", pageInd, _pageCount); - 
_log.nest().trace("Page {0} boundary tasks: {1}", pageInd, getFirstAndLastBoundaryTasksForPage(pageInd)); + auto pageBoundaryTasks = getFirstAndLastBoundaryTasksForPage(pageInd); + if (pageBoundaryTasks.empty()) { + _log.trace("No boundary tasks for page {0}", pageInd); + // If in rare case there is no boundary task for page - task that starts at PageN and updates + // barrier from PageN+1 then need to create one artificially. Pick latest task in PageN and find + // some wait barrier from PageN+1 and create dependency. + auto lastTaskOnPage = _firstAndLastTaskPerPage[pageInd].value().second; + std::optional nextPageBarrier; + auto nextTask = lastTaskOnPage + 1; + while (nextTask < _barrierInfo.getNumOfTasks() && _taskPageAssignment[nextTask] <= pageInd + 1 && + !nextPageBarrier.has_value()) { + auto nextTaskWaitBars = _barrierInfo.getWaitBarriers(nextTask); + if (!nextTaskWaitBars.empty()) { + nextPageBarrier = *std::min_element(nextTaskWaitBars.begin(), nextTaskWaitBars.end()); + } + + nextTask++; + } + VPUX_THROW_UNLESS(nextPageBarrier.has_value(), "No next page barrier found for page {0} last task {1}", + pageInd, lastTaskOnPage); + + _barrierInfo.addProducer(nextPageBarrier.value(), lastTaskOnPage); + _log.nest(2).trace("Add producer {0}(page {1}) to barrier {2}(page {3})", lastTaskOnPage, pageInd, + nextPageBarrier.value(), getBarrierPage(nextPageBarrier.value())); + + _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd][_barrierInfo.getTaskQueueType(lastTaskOnPage)] = + std::make_pair(lastTaskOnPage, lastTaskOnPage); + pageBoundaryTasks.push_back(lastTaskOnPage); + } + + _log.trace("Page {0} boundary tasks: {1}", pageInd, pageBoundaryTasks); + enforceBoundaryTaskHasUpdateBarrier(pageInd); } } +// Update boundary task data for provided page +void vpux::VPURT::BarrierPagesSplitHandler::updateBoundaryTasksDataForPage(size_t pageInd) { + _log.trace("Update boundary tasks data for page {0}", pageInd); + VPUX_THROW_WHEN(pageInd >= _pageCount, "Page index {0} out of 
range {1}", pageInd, _pageCount); + + // No need to update data for last page + if (pageInd == _pageCount - 1) { + return; + } + + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), "No boundary tasks set for page {0}", + pageInd); + VPUX_THROW_WHEN(!_firstAndLastTaskPerPage[pageInd].has_value(), "No first and last task set for page {0}", pageInd); + + _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].clear(); + for (size_t taskInd = _firstAndLastTaskPerPage[pageInd].value().first; + taskInd <= _firstAndLastTaskPerPage[pageInd].value().second; taskInd++) { + updateBoundaryTasksDataForTask(taskInd); + } + + _log.trace("Page {0} boundary tasks: {1}", pageInd, getFirstAndLastBoundaryTasksForPage(pageInd)); + enforceBoundaryTaskHasUpdateBarrier(pageInd); +} + // For given page get first and last boundary tasks for each HW FIFO SmallVector vpux::VPURT::BarrierPagesSplitHandler::getFirstAndLastBoundaryTasksForPage(size_t pageInd) { auto boundaryTasks = getFirstBoundaryTasksForPage(pageInd); @@ -461,7 +594,9 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeWaitBarrierDependency(size_t // // TODO: E#160461 Analyze if this could be improved by picking different boundary task - auto pageBoundaryTask = getLastBoundaryTasksForPage(barProdTaskPage + 1)[0]; + auto pageBoundaryTasks = getLastBoundaryTasksForPage(barProdTaskPage + 1); + VPUX_THROW_WHEN(pageBoundaryTasks.empty(), "No boundary tasks set for page {0}", barProdTaskPage + 1); + auto pageBoundaryTask = pageBoundaryTasks[0]; _log.nest(2).trace("Page boundary task: {0}(page {1})", pageBoundaryTask, _taskPageAssignment[pageBoundaryTask]); auto pageBoundaryTasksOnSameFifoIt = @@ -542,7 +677,10 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeUpdateBarrierDependency(size // TODO: E#160461 Some improvement may be done by having some heuristic on which boundary // task should be used. Maybe take into account timing information or queue type? 
- auto pageBoundaryTask = getLastBoundaryTasksForPage(taskPage + 1)[0]; + auto pageBoundaryTasks = getLastBoundaryTasksForPage(taskPage + 1); + VPUX_THROW_WHEN(pageBoundaryTasks.empty(), "No boundary tasks set for page {0}", taskPage + 1); + auto pageBoundaryTask = pageBoundaryTasks[0]; + auto pageBoundaryTasksOnSameFifoIt = _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage + 1].find(taskQueueType); if (pageBoundaryTasksOnSameFifoIt != _firstAndLastBoundaryTaskForEachPagePerFifo[taskPage + 1].end()) { // There is a boundary task on same FIFO. No need to insert dependency from barProdTask @@ -680,6 +818,10 @@ bool vpux::VPURT::BarrierPagesSplitHandler::areBoundaryTasksFromNeighborPagesDep // Check deps from PageN to PageN+1 boundary tasks for (size_t pageInd = 0; pageInd < _pageCount - 2; pageInd++) { + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), + "No boundary tasks set for page {0}", pageInd); + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd + 1].empty(), + "No boundary tasks set for page {0}", pageInd + 1); auto pageBoundaryTaskPerFifo = _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd]; auto nextPageBoundaryTaskPerFifo = _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd + 1]; @@ -712,6 +854,10 @@ vpux::VPURT::BarrierPagesSplitHandler::getBoundaryTaskPairsMissingDepInBetween() // Check deps from PageN to PageN+1 boundary tasks for (size_t pageInd = 0; pageInd < _pageCount - 2; pageInd++) { + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), + "No boundary tasks set for page {0}", pageInd); + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd + 1].empty(), + "No boundary tasks set for page {0}", pageInd + 1); auto pageBoundaryTaskPerFifo = _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd]; auto nextPageBoundaryTaskPerFifo = _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd + 1]; @@ -884,6 +1030,8 @@ void 
vpux::VPURT::BarrierPagesSplitHandler::adjustPageStartAndEndPointsIfOnBlock barProgDmaBlockInd, pageEndBarBlockInd, syncTaskOpt.value()); auto syncTaskUpdateBars = _barrierInfo.getUpdateBarriers(syncTaskOpt.value()); + VPUX_THROW_UNLESS(!syncTaskUpdateBars.empty(), "Control block sync task {0} has no update barriers", + syncTaskOpt.value()); auto newPageStartBar = *std::min_element(syncTaskUpdateBars.begin(), syncTaskUpdateBars.end()); pageStartBars.clear(); @@ -895,6 +1043,8 @@ void vpux::VPURT::BarrierPagesSplitHandler::adjustPageStartAndEndPointsIfOnBlock _log.trace("Control block boundary: {0} and {1}. Use sync task {2} as endpoint for legalization", barProgDmaBlockInd, pageEndBarBlockInd, syncTaskOpt.value()); + VPUX_THROW_UNLESS(!syncTaskWaitBars.empty(), "Control block sync task {0} has no wait barriers", + syncTaskOpt.value()); auto newPageEndBar = *std::max_element(syncTaskWaitBars.begin(), syncTaskWaitBars.end()); pageEndBars.clear(); pageEndAllBars.clear(); @@ -931,6 +1081,7 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizePageStartBarsDependingOnPage _log.trace("Legalize page start bars depending on page end bars"); // Pick some other pageStartBar that will be used for legalization + VPUX_THROW_UNLESS(!pageStartBars.empty(), "No page start bars to use for legalization"); auto startBarToUseForLegalization = *std::max_element(pageStartBars.begin(), pageStartBars.end()); for (auto pageStartBarToLegalize : pageStartBarsToLegalize) { @@ -963,6 +1114,8 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizePageStartBarsDependingOnPage void vpux::VPURT::BarrierPagesSplitHandler::getPageStartTasksAndBars(size_t pageInd, BarrierInfo::TaskSet& pageStartTasks, BarrierInfo::TaskSet& pageStartBars) { + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd - 1].empty(), + "No boundary tasks set for page {0}", pageInd - 1); for (auto& [taskQueueType, firstLastTaskInd] : _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd - 1]) { _log.trace("Get 
page start tasks and bars for queue {0}:{1}", stringifyEnum(taskQueueType.type).data(), taskQueueType.id); @@ -976,12 +1129,15 @@ void vpux::VPURT::BarrierPagesSplitHandler::getPageStartTasksAndBars(size_t page }), taskUpdateBarsVec.end()); + VPUX_THROW_UNLESS(!taskUpdateBarsVec.empty(), "Page start task {0} has no update barriers on page {1}", + pageStartTask, pageInd); + // If task updates multiple barriers, pick only one with smallest index // No need to use more barriers as 1 barrier is enough to know that boundary task have finished auto startBar = *std::min_element(taskUpdateBarsVec.begin(), taskUpdateBarsVec.end()); pageStartBars.insert(startBar); pageStartTasks.insert(pageStartTask); - _log.nest().trace("Page start task {0}, start bart {1}", pageStartTask, startBar); + _log.nest().trace("Page start task {0}, start bar {1}", pageStartTask, startBar); } } @@ -992,6 +1148,8 @@ void vpux::VPURT::BarrierPagesSplitHandler::getPageStartTasksAndBars(size_t page void vpux::VPURT::BarrierPagesSplitHandler::getPageEndTasksAndBars(size_t pageInd, BarrierInfo::TaskSet& pageEndTasks, BarrierInfo::TaskSet& pageEndBars, BarrierInfo::TaskSet& pageEndAllBars) { + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), "No boundary tasks set for page {0}", + pageInd); for (auto& [taskQueueType, firstLastTaskInd] : _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd]) { _log.trace("Get page end tasks and bars for queue {0}:{1}", stringifyEnum(taskQueueType.type).data(), taskQueueType.id); @@ -1117,6 +1275,20 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeForDmaProgrammingBarriers() llvm::set_subtract(pageStartOnlyBars, pageStartBarsToLegalize); + // During legalization dependencies are modified and boundary tasks data may change + // If tasks and barriers are from different pages, store this information so that + // boundary data for this page can be updated and barrier programming DMA legalization + // on next page can have up to date info about 
boundary tasks + mlir::DenseSet pagesWithPossibleBoundaryTasksChanged; + auto barrierDepsChangedHandler = [&](size_t barInd, size_t taskInd) { + auto barPage = getBarrierPage(barInd); + auto taskPage = _taskPageAssignment[taskInd]; + if (barPage != taskPage) { + pagesWithPossibleBoundaryTasksChanged.insert(taskPage); + pagesWithPossibleBoundaryTasksChanged.insert(barPage); + } + }; + if (!pageStartOnlyBars.empty()) { legalizePageStartBarsDependingOnPageEndBars(pageStartTasks, pageStartOnlyBars, pageStartBarsToLegalize); @@ -1162,10 +1334,15 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeForDmaProgrammingBarriers() for (auto commonStartEndBarBoundaryTaskProducer : commonStartEndBarBoundaryTaskProducers) { _barrierInfo.addProducer(startBarForLegalization, commonStartEndBarBoundaryTaskProducer); + barrierDepsChangedHandler(startBarForLegalization, commonStartEndBarBoundaryTaskProducer); _log.trace("Add producer {0} to barrier {1}", commonStartEndBarBoundaryTaskProducer, startBarForLegalization); } + for (auto pageWithPossibleBoundaryTasksChanged : pagesWithPossibleBoundaryTasksChanged) { + updateBoundaryTasksDataForPage(pageWithPossibleBoundaryTasksChanged); + } + _log.trace("Legalization completed"); setBarProgDmaData(pageInd, pageStartOnlyBars, pageEndBars); _log = _log.unnest(); @@ -1222,8 +1399,10 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeForDmaProgrammingBarriers() } _barrierInfo.removeConsumer(newStartBar, startBarConsumer); + barrierDepsChangedHandler(newStartBar, startBarConsumer); _log.trace("Remove consumer {0} from barrier {1}", startBarConsumer, newStartBar); _barrierInfo.addConsumer(endBar, startBarConsumer); + barrierDepsChangedHandler(endBar, startBarConsumer); _log.trace("Add consumer {0} to barrier {1}", startBarConsumer, endBar); } pageEndBars.insert(endBar); @@ -1255,8 +1434,10 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeForDmaProgrammingBarriers() for (auto updBarToLegalize : updBarsToLegalizeVec) { 
_barrierInfo.addProducer(updBarToLegalize, endBarConsumer); + barrierDepsChangedHandler(updBarToLegalize, endBarConsumer); _log.trace("Add producer {0} to barrier {1}", endBarConsumer, updBarToLegalize); _barrierInfo.removeProducer(updBarToLegalize, startBarConsumer); + barrierDepsChangedHandler(updBarToLegalize, startBarConsumer); _log.trace("Remove producer {0} from barrier {1}", startBarConsumer, updBarToLegalize); } // Such task is now no longer a boundary task @@ -1268,6 +1449,11 @@ void vpux::VPURT::BarrierPagesSplitHandler::legalizeForDmaProgrammingBarriers() for (auto pageStartTask : pageStartTasks) { _log.trace("Add producer {0} to barrier {1}", pageStartTask, newStartBar); _barrierInfo.addProducer(newStartBar, pageStartTask); + barrierDepsChangedHandler(newStartBar, pageStartTask); + } + + for (auto pageWithPossibleBoundaryTasksChanged : pagesWithPossibleBoundaryTasksChanged) { + updateBoundaryTasksDataForPage(pageWithPossibleBoundaryTasksChanged); } _log.trace("Legalization completed"); @@ -1325,6 +1511,8 @@ void vpux::VPURT::BarrierPagesSplitHandler::updateTaskPageAssignmentForQueue(siz // boundary tasks to guarantee that all barriers from previous page are fully consumed bool VPURT::BarrierPagesSplitHandler::isDummyDmaNeeded(size_t pageInd, VPURT::TaskQueueType dmaQueueType, std::optional lastDmaTaskOnSameQueueInPageOpt) { + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].empty(), "No boundary tasks set for page {0}", + pageInd); if (_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].find(dmaQueueType) != _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd].end()) { // There is boundary task of this type on this page. 
No need to insert dummy DMA @@ -1337,6 +1525,8 @@ bool VPURT::BarrierPagesSplitHandler::isDummyDmaNeeded(size_t pageInd, VPURT::Ta if (lastDmaTaskOnSameQueueInPageOpt.has_value()) { bool isDepFromAllBoundaryTasksToDma = true; auto lastDmaTaskOnSameQueueInPage = lastDmaTaskOnSameQueueInPageOpt.value(); + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[pageInd - 1].empty(), + "No boundary tasks set for page {0}", pageInd - 1); for (auto& [_, firstLastTaskInd] : _firstAndLastBoundaryTaskForEachPagePerFifo[pageInd - 1]) { auto lastTask = firstLastTaskInd.second; if (!isDepFromTaskAToTaskB(lastTask, lastDmaTaskOnSameQueueInPage)) { @@ -1361,7 +1551,9 @@ bool VPURT::BarrierPagesSplitHandler::isDummyDmaNeeded(size_t pageInd, VPURT::Ta vpux::BarrierInfo::TaskSet VPURT::BarrierPagesSplitHandler::getDummyDmaWaitBars(size_t pageInd) { // As wait barrier use wait barrier of earliest boundary task on this page // WLM page split guarantees that this wait barrier is updated by all tasks from previous page - auto boundaryTask = getFirstBoundaryTasksForPage(pageInd).front(); + auto boundaryTasks = getFirstBoundaryTasksForPage(pageInd); + VPUX_THROW_WHEN(boundaryTasks.empty(), "No boundary tasks set for page {0}", pageInd); + auto boundaryTask = boundaryTasks.front(); auto dummyDmaProposedWaitBars = _barrierInfo.getWaitBarriers(boundaryTask); @@ -1404,6 +1596,8 @@ vpux::BarrierInfo::TaskSet VPURT::BarrierPagesSplitHandler::getDummyDmaWaitBars( // As wait barriers use update barriers of boundary tasks from previous page. 
After that dummy DMA // may have more than 1 wait barrier but since all DMA tasks are expected to be enqueued at bootstrap // this is not a problem for enqueue algorithm + VPUX_THROW_WHEN(_firstAndLastBoundaryTaskForEachPagePerFifo[prevPageInd].empty(), + "No boundary tasks set for page {0}", prevPageInd); for (auto& [_, firstLastTaskIndPair] : _firstAndLastBoundaryTaskForEachPagePerFifo[prevPageInd]) { auto prevPageBoundaryTask = firstLastTaskIndPair.second; auto prevPageBoundaryTaskUpdBars = _barrierInfo.getUpdateBarriers(prevPageBoundaryTask); @@ -1513,6 +1707,8 @@ vpux::BarrierInfo::TaskSet VPURT::BarrierPagesSplitHandler::getDummyDmaUpdateBar VPUX_THROW_UNLESS(syncTaskOpt.has_value(), "No control block sync task for task {0}", insertionPoint); auto syncTaskWaitBars = _barrierInfo.getWaitBarriers(syncTaskOpt.value()); + VPUX_THROW_UNLESS(!syncTaskWaitBars.empty(), "No wait barriers for control graph sync task {0} found", + syncTaskOpt.value()); auto newBarInd = *std::max_element(syncTaskWaitBars.begin(), syncTaskWaitBars.end()); _log.nest(2).trace("Change update barrier {0} to {1} due to crossing control graph block " "boundary set by task {2}", @@ -1654,6 +1850,59 @@ VPURT::BarrierPagesSplitHandler::getAndLegalizeDummyDmaInsertionData() { return dummyDmaInsertionDataVec; } +// Prepare data for inserting dummy barriers. They are needed to be placed +// in pages which use less than half of available physical barriers. To make barrier +// programming DMAs simple and able to always refill 4 entries for all physical barriers +// as a single transaction, each page except last two needs to always use exactly half of +// physical barriers. Dummy barriers will be placed in parallel to existing barriers +// which will not have any impact on performance of schedule. 
+SmallVector +VPURT::BarrierPagesSplitHandler::getDummyBarriersInsertionData() { + _log.trace("Getting dummy barriers data"); + SmallVector dummyBarrierDataVec; + + if (_pageCount <= 2) { + _log.trace("No need to insert dummy barriers if model has {0} <= 2 pages", _pageCount); + return dummyBarrierDataVec; + } + + SmallVector numberOfBarriersPerPage(_pageCount, 0); + SmallVector lastBarrierIndexPerPage(_pageCount, 0); + // Iterate each barrier and count number of barriers per page and store information about + // last barrier index + for (size_t barInd = 0; barInd < _barrierInfo.getNumOfBarrierOps(); barInd++) { + auto pageInd = getBarrierPage(barInd); + numberOfBarriersPerPage[pageInd]++; + lastBarrierIndexPerPage[pageInd] = barInd; + } + + // Iterate each page and check the number of barriers + // Skip last two pages, which are last pages for two halves of physical barrier set. + // There is no need to insert dummy barriers there as those entries can be filled by dummy values + // when programming barrier FIFOs using barrier programming DMA + _log = _log.nest(); + for (size_t pageInd = 0; pageInd < _pageCount - 2; pageInd++) { + _log.trace("Page {0} barrier count: {1}", pageInd, numberOfBarriersPerPage[pageInd]); + for (size_t barrierIndexToAdd = numberOfBarriersPerPage[pageInd]; barrierIndexToAdd < _pageSize; + barrierIndexToAdd++) { + _log.nest().trace("Missing barrier. 
Prepare new one {0}", barrierIndexToAdd); + + DummyBarrierData dummyBarrierData; + dummyBarrierData.pageInd = pageInd; + dummyBarrierData.insertAfter = lastBarrierIndexPerPage[pageInd]; + dummyBarrierData.consumer = *(_barrierInfo.getBarrierConsumers(lastBarrierIndexPerPage[pageInd]).begin()); + dummyBarrierData.producer = *(_barrierInfo.getBarrierProducers(lastBarrierIndexPerPage[pageInd]).begin()); + _log.nest().trace("New barrier data: insert after bar {0}, producer {1}, consumer {2}", + dummyBarrierData.insertAfter, dummyBarrierData.producer, dummyBarrierData.consumer); + + dummyBarrierDataVec.push_back(dummyBarrierData); + } + } + _log = _log.unnest(); + + return dummyBarrierDataVec; +} + void vpux::VPURT::BarrierPagesSplitHandler::initPrevPhysBarrierData(SmallVector& barrierToPidVec) { size_t numOfBarriers = _barrierInfo.getNumOfBarrierOps(); diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_barrier.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_barrier.cpp index 3b8d7d22a8..b297f4315f 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_barrier.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_barrier.cpp @@ -5,9 +5,11 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/enqueue_barrier.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_FINDWLMENQUEUEBARRIER @@ -37,7 +39,7 @@ void FindWlmEnqueueBarrierPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - if 
(vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } @@ -45,7 +47,7 @@ void FindWlmEnqueueBarrierPass::safeRunOnFunc() { if (_workloadManagementMode != WorkloadManagementMode::FWLM_V1_PAGES && !VPURT::verifyOneWaitBarrierPerTask(func, _log)) { _log.warning("WLM cannot be enabled as not all tasks have 1 wait barrier"); - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } @@ -70,7 +72,7 @@ void FindWlmEnqueueBarrierPass::safeRunOnFunc() { const auto res = enqueueBarrier.calculateEnqueueBarriers(executorEnqAtBootstrap); if (mlir::failed(res)) { _log.warning("Enqueue algorithm failed. Need to switch to nonWLM"); - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_dmas_barrier.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_dmas_barrier.cpp index b6651fd6f1..4840410a5c 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_dmas_barrier.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/find_wlm_enqueue_dmas_barrier.cpp @@ -5,10 +5,12 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" #include 
"vpux/compiler/utils/dma.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_FINDWLMENQUEUEDMASBARRIER @@ -35,7 +37,7 @@ void FindWlmEnqueueDmasBarrierPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/order_barriers_for_wlm.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/order_barriers_for_wlm.cpp index 1bf131f07a..67fdd3dd28 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/order_barriers_for_wlm.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/order_barriers_for_wlm.cpp @@ -5,9 +5,11 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/core/barrier_info.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_ORDERBARRIERSFORWLM @@ -33,7 +35,7 @@ void OrderBarriersForWlmPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_barriers_in_pages.cpp 
b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_barriers_in_pages.cpp new file mode 100644 index 0000000000..6c79ecd6be --- /dev/null +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_barriers_in_pages.cpp @@ -0,0 +1,93 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" +#include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/dma.hpp" + +namespace vpux::VPURT::arch40xx { +#define GEN_PASS_DECL_WLMINSERTDUMMYBARRIERSINPAGES +#define GEN_PASS_DEF_WLMINSERTDUMMYBARRIERSINPAGES +#include "vpux/compiler/NPU40XX/dialect/VPURT/passes.hpp.inc" +} // namespace vpux::VPURT::arch40xx + +using namespace vpux; + +namespace { + +class WlmInsertDummyBarriersInPagesPass final : + public VPURT::arch40xx::impl::WlmInsertDummyBarriersInPagesBase { +public: + explicit WlmInsertDummyBarriersInPagesPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void WlmInsertDummyBarriersInPagesPass::safeRunOnFunc() { + auto func = getOperation(); + auto module = func->getParentOfType(); + + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { + // WLM is not supported, no need to run this pass + return; + } + + const auto numBarriers = + numBarriersOpt.hasValue() ? 
numBarriersOpt.getValue() : VPUIP::getNumAvailableBarriers(func); + + auto& barrierInfo = getAnalysis(); + VPURT::BarrierPagesSplitHandler barrierPagesSplitHandler(barrierInfo, numBarriers, _log); + barrierPagesSplitHandler.initializeForLegalization(); + auto dummyBarriersInsertionDataVec = barrierPagesSplitHandler.getDummyBarriersInsertionData(); + + if (dummyBarriersInsertionDataVec.empty()) { + return; + } + + _log.trace("Insert {0} dummy barriers in pages", dummyBarriersInsertionDataVec.size()); + + mlir::OpBuilder builder(func); + + for (const auto& dummyBarrierInsertionData : dummyBarriersInsertionDataVec) { + auto pageInd = dummyBarrierInsertionData.pageInd; + auto insertAfter = dummyBarrierInsertionData.insertAfter; + auto producer = dummyBarrierInsertionData.producer; + auto consumer = dummyBarrierInsertionData.consumer; + + auto insertionPointOp = barrierInfo.getBarrierOpAtIndex(insertAfter); + + builder.setInsertionPointAfter(insertionPointOp); + auto newBarrierOp = builder.create(insertionPointOp->getLoc()); + barrierInfo.addNewBarrier(newBarrierOp); + auto newBarrierIdx = barrierInfo.getIndex(newBarrierOp); + barrierInfo.addProducer(newBarrierIdx, producer); + barrierInfo.addConsumer(newBarrierIdx, consumer); + + newBarrierOp.setWlmPage(pageInd); + + _log.trace("In page {0} after barrier {1} insert new barrier {2} with producer {3} and consumer {4}", pageInd, + insertAfter, newBarrierIdx, producer, consumer); + } + barrierInfo.updateIR(); + barrierInfo.clearAttributes(); +} +} // namespace + +// +// createWlmInsertDummyBarriersInPagesPass +// + +std::unique_ptr vpux::VPURT::arch40xx::createWlmInsertDummyBarriersInPagesPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_dmas_in_pages.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_dmas_in_pages.cpp index 6c83745d92..ef9e5b2b7a 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_dmas_in_pages.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_insert_dummy_dmas_in_pages.cpp @@ -5,10 +5,13 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/dma.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_WLMINSERTDUMMYDMASINPAGES @@ -34,9 +37,9 @@ class WlmInsertDummyDmasInPagesPass final : void WlmInsertDummyDmasInPagesPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - auto arch = VPU::getArch(func); + auto arch = config::getArch(func); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } @@ -105,8 +108,6 @@ void WlmInsertDummyDmasInPagesPass::safeRunOnFunc() { VPUX_THROW_UNLESS(barrierInfo.verifyControlGraphSplit(), "Encountered split of control graph is incorrect"); barrierInfo.clearAttributes(); - - VPUX_THROW_UNLESS(VPURT::verifyBarrierSlots(func, _log), "Barrier slot count check failed"); } } // namespace diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_pages_for_barrier_dmas.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_pages_for_barrier_dmas.cpp index 2ce48e043b..67c9f44cc0 100644 --- 
a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_pages_for_barrier_dmas.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_pages_for_barrier_dmas.cpp @@ -5,10 +5,12 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" #include "vpux/compiler/utils/dma.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_WLMLEGALIZEPAGESFORBARRIERDMAS @@ -55,7 +57,7 @@ void WlmLegalizePagesForBarrierDmasPass::safeRunOnFunc() { auto module = func->getParentOfType(); auto nPhysBarrs = VPUIP::getNumAvailableBarriers(func); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } @@ -150,8 +152,6 @@ void WlmLegalizePagesForBarrierDmasPass::safeRunOnFunc() { barrierInfo.clearAttributes(); VPURT::postProcessBarrierOps(func); - - VPUX_THROW_UNLESS(VPURT::verifyBarrierSlots(func, _log), "Barrier slot count check failed"); } } // namespace diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_split_graph_to_pages.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_split_graph_to_pages.cpp index b96e49b505..079d7b1ec0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_split_graph_to_pages.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_legalize_split_graph_to_pages.cpp @@ -5,9 +5,11 @@ #include 
"vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_WLMLEGALIZESPLITGRAPHTOPAGES @@ -37,7 +39,7 @@ void WlmLegalizeSplitGraphToPagesPass::safeRunOnFunc() { const auto numBarriers = numBarriersOpt.hasValue() ? numBarriersOpt.getValue() : VPUIP::getNumAvailableBarriers(func); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } @@ -65,8 +67,6 @@ void WlmLegalizeSplitGraphToPagesPass::safeRunOnFunc() { barrierInfo.clearAttributes(); VPURT::postProcessBarrierOps(func); - - VPUX_THROW_UNLESS(VPURT::verifyBarrierSlots(func, _log), "Barrier slot count check failed"); } } // namespace diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_split_graph_to_pages.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_split_graph_to_pages.cpp index e82356af04..a2d0d06d46 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_split_graph_to_pages.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect/VPURT/transforms/passes/wlm_split_graph_to_pages.cpp @@ -5,9 +5,11 @@ #include "vpux/compiler/NPU40XX/dialect/VPURT/interfaces/barrier_pages_split.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" 
#include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT::arch40xx { #define GEN_PASS_DECL_WLMSPLITGRAPHTOPAGES @@ -37,14 +39,14 @@ void WlmSplitGraphToPagesPass::safeRunOnFunc() { const auto numBarriers = numBarriersOpt.hasValue() ? numBarriersOpt.getValue() : VPUIP::getNumAvailableBarriers(func); - if (vpux::VPUIP::getWlmStatus(module) != vpux::VPUIP::WlmStatus::ENABLED) { + if (VPU::getWorkloadManagementStatus(module) != VPU::WorkloadManagementStatus::ENABLED) { // WLM is not supported, no need to run this pass return; } if (!VPURT::verifyOneWaitBarrierPerTask(func, _log)) { _log.warning("WLM cannot be enabled as not all tasks have 1 wait barrier"); - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); return; } diff --git a/src/vpux_compiler/src/NPU40XX/dialect_pipeline_strategy.cpp b/src/vpux_compiler/src/NPU40XX/dialect_pipeline_strategy.cpp index 9d7e5e820c..b061dc05f0 100644 --- a/src/vpux_compiler/src/NPU40XX/dialect_pipeline_strategy.cpp +++ b/src/vpux_compiler/src/NPU40XX/dialect_pipeline_strategy.cpp @@ -21,6 +21,7 @@ #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/pipelines/options_setup.hpp" +#include "vpux/compiler/utils/rewriter.hpp" using namespace vpux; @@ -62,10 +63,33 @@ class ShaveCodeGenSetup40XX : public OptionsSetupBase { +class ReferenceSWSetup40XX : public OptionsSetupBase { public: - using Base = OptionsSetupBase; + using Base = OptionsSetupBase; using Base::Base; + + static void setupOptionsImpl(DefaultHWOptions40XX& options, const intel_npu::Config& config) { + Base::setupOptionsImpl(options, config); + setupOptionsCommon(options); + } + + static void setupOptionsCommon(DefaultHWOptions40XX& options) { + // ReferenceSW specific values + overwriteIfUnset(options.enableForceZMajorConcat, false); + 
overwriteIfUnset(options.enableSwapTransposeWithFQ, false); + overwriteIfUnset(options.enableAlignScales, false); + overwriteIfUnset(options.fuseMvn6ScaleBias, false); + overwriteIfUnset(options.enableConvertFCToConv, false); + overwriteIfUnset(options.enableAdjustNonZeroFakeQuant, false); + overwriteIfUnset(options.enableAdaptiveStripping, false); + overwriteIfUnset(options.enableExtraStaticShapeOps, false); + overwriteIfUnset(options.enableOptimizeReorders, false); + overwriteIfUnset(options.enableVPUNNPreSplit, false); + overwriteIfUnset(options.enableRuntimeDequant, false); + + overwriteIfUnset(options.enableConvertFFTToConv, false); + overwriteIfUnset(options.enableDecomposeGRUSequence, false); + } }; class HostCompileSetup40XX : public OptionsSetupBase { @@ -100,6 +124,7 @@ class HostCompileSetup40XX : public OptionsSetupBase { @@ -194,22 +219,17 @@ class DialectPipelineStrategy40XX final : public IDialectPipelineStrategy { }; // -// DialectPipelineStrategy40XX: [ReferenseSW] +// DialectPipelineStrategy40XX: [ReferenceSW] // This implementation will be chosen if OptionsContainerType contains ReferenceSWOptions // -template -using Has40XXSWOption = typename std::enable_if_t>; - -template -class DialectPipelineStrategy40XX> final : - public IDialectPipelineStrategy { +class DialectPipelineStrategyReferenceSW40XX final : public IDialectPipelineStrategy { public: - explicit DialectPipelineStrategy40XX(const intel_npu::Config& config) - : _optionsContainer(std::make_unique(config)) { + explicit DialectPipelineStrategyReferenceSW40XX(const intel_npu::Config& config) + : _optionsContainer(std::make_unique(config)) { } - explicit DialectPipelineStrategy40XX(std::unique_ptr optionsContainer) + explicit DialectPipelineStrategyReferenceSW40XX(std::unique_ptr optionsContainer) : _optionsContainer(std::move(optionsContainer)) { } @@ -217,135 +237,30 @@ class DialectPipelineStrategy40XXgetInitCompilerOptions(), log.nest()); } - void 
buildReferenceSWPipeline(mlir::OpPassManager& pm, Logger log) override { - auto& options = _optionsContainer->getPipelineOptions(); - const auto grc = getDefaultGreedyRewriteConfig(); - - // No passes should be run before this pipeline, with very few exceptions. - IE::buildPostImportPipeline(pm, log); - - // Level 3 : Topology - - IE::arch37xx::buildInitialLowPrecisionTransformationsPipeline(pm, IE::LowPrecisionTransformOptions(options), - log); - IE::arch37xx::buildInitialTransformationsPipeline(pm, IE::TransformOptions(options), log); - IE::buildAdjustPrecisionPipeline(pm, IE::AdjustPrecisionOptions(options), log); - - // Resolve group quant MatMul pattern - pm.addPass(IE::createUniquifyOpsPass(log)); - pm.addPass(IE::createMergeParallelFullyConnectedPass(log)); - pm.addPass(IE::createUnrollGroupQuantizePass(log)); - pm.addPass(IE::createUnrollFullyConnectedPass(log)); - pm.addPass(IE::createMergeFullyConnectedPass(log)); - if (options.fuseScalesToAccumulate) { - pm.addPass(IE::createFuseScalesToAccumulatePass(log)); - } - pm.addPass(IE::createConvertMatMulToConvPass(log)); - if (options.enableConvertFCToConv) { - pm.addPass(IE::createConvertFCToConvPass(log)); - } - - pm.addPass(IE::createResolveStridedSlicePass(log)); - pm.addPass(IE::createConvertStridedSlice2ConvPass(log)); - pm.addPass(IE::createConvertNceOpsTo4DPass(log)); - pm.addPass(IE::createConvertShapeTo4DPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - pm.addPass(IE::createConvertToSpatialOpPass(false, isOptionEnabled(options.enableSEPtrsOperations), log)); - pm.addPass(IE::createConvertGRNToNormalizeL2Pass(log)); - pm.addPass(IE::createResolveScatterUpdateByTransposePass(log)); - IE::buildAdjustForVPUPipeline(pm, IE::AdjustForVPUOptions(options), log); - - pm.addPass(IE::createSplitFakeQuantPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - pm.addPass(IE::createDequantizeConstPass(options.runtimeDequantizationLimit, - isOptionEnabled(options.enableRuntimeDequant), log)); 
- if (options.enableMergeFakeQuant) { - pm.addPass(IE::createMergeFakeQuantPass(log)); - } - pm.addPass(mlir::createCanonicalizerPass(grc)); - - IE::arch37xx::buildAdjustLayoutPipeline(pm, IE::AdjustLayoutOptions(options), log); - pm.addPass(IE::createConvertAssignReadValueToReturnsAndInputs(log)); - - pm.addPass(IE::createConvertToMemPermutePass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - - // Lowering to VPU - pm.addPass(createConvertLayers2VPUPass(log)); - pm.addPass(VPU::createDetectionOutputDecompositionPass(log)); - pm.addPass(VPU::arch37xx::createSplitRealDFTOpsPass(log)); - pm.addPass(VPU::createAddSwOpAuxiliaryBufferPass(log)); - pm.addPass(VPU::createSplitGRUSequencePass(log)); - pm.addPass(VPU::arch37xx::createDecomposeMVNPass(log)); - - pm.addPass(VPU::createTilingStrategyAssignmentPass( - /*enablePrefetchTiling=*/false, /*enableVPUNNCostForTiling*/ false, - /*enableShaveDDRAccessOptimization*/ "true", log)); - pm.addPass(VPU::arch37xx::createApplyTilingMVN1SumPass(/*enablePrefetchTiling=*/false, log)); - pm.addPass(VPU::createApplyTilingPass(/*enableSCFTiling=*/false, log)); - - pm.addPass(VPU::createComputeInterpolateCoordinatesPass(/*enableExplicitDistributionInfoAttr=*/true, log)); - - pm.addPass(VPU::createBoundedTensorsToDynamicDimsMaskPass(log)); - - // Lowering to VPUIP - vpux::arch37xx::buildLowerVPU2VPUIPPipeline(pm, options.enableInPlaceBufferization, - options.useMemrefForHostFunctionBufferization, log); - - // Level 2 : Abstract RunTime - - pm.addPass(VPUIP::createSetMemorySpacePass(VPU::getMemKind, log)); - - pm.addPass(VPUIP::createAddCopyBetweenSWKernelsAndNetworkIOPass(log)); - - pm.addPass(VPUIP::createCopyOpTilingPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); - - if (options.enableProfiling && options.enableSWProfiling) { - pm.addPass(VPUIP::createActShaveProfilingPass(VPU::getMemKind, log)); - } - - pm.addPass(VPUIP::createUngroupBoundedBuffersPass(log)); - - 
pm.addPass(VPUIP::createConvertTransferOpsToDMAsPass(log)); - - VPUIP::buildAsyncSchedulingPipeline(pm, log); - - pm.addPass(VPUIP::createDMATaskProfilingReserveMemPass(DMAProfilingMode::SCRATCH, log)); - - if (options.enableSWKernelPrefetchingReserveMem) { - pm.addPass(VPUIP::createSWKernelPrefetchingReserveMemPass(log)); - } - - pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); - pm.addPass(VPUIP::createStaticAllocationPass(VPU::getMemKind, log)); - pm.addPass(VPUIP::createLinearizationPass(log)); - pm.addPass(VPUIP::createOptimizeAsyncDepsPass(log)); - - pm.addPass(VPUIP::arch37xx::createAddSwKernelCacheHandlingOpsPass(log)); - - VPUIP::buildHardwareAdaptationPipeline(pm, log); + void buildIEPipeline(mlir::OpPassManager& pm, Logger log) override { + IE::arch40xx::buildReferenceSWPipeline(pm, _optionsContainer->getPipelineOptions(), log); + } - pm.addPass(VPUIP::arch40xx::createAddStartBarrierPass(/*compilerBarrierProgramming=*/false, log)); - pm.addPass(VPURT::arch37xx::createAddFinalBarrierPass(log)); + void buildLowerIE2VPUPipeline(mlir::OpPassManager& pm, Logger log) override { + vpux::arch37xx::buildLowerIE2VPUPipeline(pm, log); + } - // Level 1 : VPU RunTime + void buildVPUPipeline(mlir::OpPassManager& pm, Logger log) override { + VPU::arch37xx::buildReferenceSWPipeline(pm, log); + } - if (options.enableProfiling) { - pm.addPass(VPUIP::createCaptureWorkpointPass(log)); - pm.addPass(VPUIP::createGroupProfilingBuffersPass(log)); - pm.addPass(Core::createMoveDeclarationsToTopPass(log)); - } + void buildLowerVPU2VPUIPPipeline(mlir::OpPassManager& pm, Logger log) override { + vpux::arch37xx::buildLowerVPU2VPUIPPipeline(pm, + _optionsContainer->getPipelineOptions().enableInPlaceBufferization, + /*useMemrefForHostFunctionBufferization*/ false, log); + } - pm.addPass(VPURT::createAssignPhysicalBarriersPass(options.enableColorBinPhysicalBarrierAssignment, - std::nullopt, std::nullopt, log)); - pm.addPass(VPURT::createBarrierSimulationPass(log)); - 
pm.addPass(VPUIP::createUpdateSwKernelParamsPass(log)); - pm.addPass(mlir::createCanonicalizerPass(grc)); + void buildVPUIPPipeline(mlir::OpPassManager& pm, Logger log) override { + VPUIP::arch40xx::buildReferenceSWPipeline(pm, _optionsContainer->getPipelineOptions(), log); } private: - std::unique_ptr _optionsContainer; + std::unique_ptr _optionsContainer; }; } // namespace @@ -364,7 +279,8 @@ std::unique_ptr vpux::createDialectPipelineStrategy40X return std::make_unique>(config); } case config::CompilationMode::ReferenceSW: { - return std::make_unique>(config); + // return std::make_unique>(config); + return std::make_unique(config); } case config::CompilationMode::HostCompile: { return std::make_unique>(config); @@ -392,10 +308,10 @@ std::unique_ptr vpux::createDialectPipelineStrategy40X } template <> -std::unique_ptr vpux::createDialectPipelineStrategy40XX( - const VPU::InitCompilerOptions* initCompilerOptions, const ReferenceSWOptions40XX* options) { +std::unique_ptr vpux::createDialectPipelineStrategy40XXReferenceSW( + const VPU::InitCompilerOptions* initCompilerOptions, const DefaultHWOptions40XX* options) { auto wrapper = std::make_unique(initCompilerOptions, options); - return std::make_unique>(std::move(wrapper)); + return std::make_unique(std::move(wrapper)); } /// The reason this method is separate from the default and reference compilation modes is that it has to *copy* the diff --git a/src/vpux_compiler/src/NPU40XX/interfaces_registry.cpp b/src/vpux_compiler/src/NPU40XX/interfaces_registry.cpp index ebfb617e9c..a248a69494 100644 --- a/src/vpux_compiler/src/NPU40XX/interfaces_registry.cpp +++ b/src/vpux_compiler/src/NPU40XX/interfaces_registry.cpp @@ -45,6 +45,8 @@ void InterfacesRegistry40XX::registerInterfaces(mlir::DialectRegistry& registry) VPUIPDPU::arch40xx::registerDPUExpandOpInterfaces(registry); // NB: arch40xx::VerifiersOpModel uses its own logic VPUIPDPU::arch40xx::registerVerifiersOpInterfaces(registry); + // NB: 
arch37xx::ICostModelUtilsInterface can be re-used for 40XX + VPU::arch37xx::registerICostModelUtilsInterface(registry); } } // namespace vpux diff --git a/src/vpux_compiler/src/NPU40XX/pipeline_options.cpp b/src/vpux_compiler/src/NPU40XX/pipeline_options.cpp index 8f46ee2b64..9de9532526 100644 --- a/src/vpux_compiler/src/NPU40XX/pipeline_options.cpp +++ b/src/vpux_compiler/src/NPU40XX/pipeline_options.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/NPU40XX/pipeline_options.hpp" +#include "vpux/compiler/pipelines/options_setup.hpp" namespace vpux { @@ -20,7 +21,7 @@ void setupParamsAccordingToOptimizationLevel(int optimizationLevel, DefaultHWOpt break; } case 3: { - compilationOptions.enableReduceNumTilesForSmallModelsPass = true; + overwriteIfUnset(compilationOptions.enableReduceNumTilesForSmallModelsPass, true); break; } default: diff --git a/src/vpux_compiler/src/NPU40XX/pipelines_register.cpp b/src/vpux_compiler/src/NPU40XX/pipelines_register.cpp index d3e428e708..657039c966 100644 --- a/src/vpux_compiler/src/NPU40XX/pipelines_register.cpp +++ b/src/vpux_compiler/src/NPU40XX/pipelines_register.cpp @@ -28,7 +28,7 @@ void PipelineRegistry40XX::registerPipelines() { mlir::PassPipelineRegistration( "ShaveCodeGen", "Compile both from IE to VPUIP for VPU40XX", [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, config::CompilationMode::ShaveCodeGen, options}; auto createPipelineStartegy = [&](config::CompilationMode) { return createDialectPipelineStrategy40XX(&initCompilerOptions, &options); @@ -37,13 +37,14 @@ void PipelineRegistry40XX::registerPipelines() { factory.buildPipeline(pm); }); - mlir::PassPipelineRegistration( + mlir::PassPipelineRegistration( "reference-sw-mode", "Compile IE Network in Reference Software mode (SW only execution) for VPU40XX", - [](mlir::OpPassManager& pm, const 
ReferenceSWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, + [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, config::CompilationMode::ReferenceSW, options}; auto createPipelineStartegy = [&](config::CompilationMode) { - return createDialectPipelineStrategy40XX(&initCompilerOptions, &options); + return createDialectPipelineStrategy40XXReferenceSW(&initCompilerOptions, + &options); }; ReferenceSWStrategy factory(createPipelineStartegy, Logger::global()); factory.buildPipeline(pm); @@ -52,8 +53,8 @@ void PipelineRegistry40XX::registerPipelines() { mlir::PassPipelineRegistration( "default-hw-mode", "Compile IE Network in Default Hardware mode (HW and SW execution) for VPU40XX", [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, config::CompilationMode::DefaultHW, - options}; + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, + config::CompilationMode::DefaultHW, options}; auto createPipelineStartegy = [&](config::CompilationMode) { return createDialectPipelineStrategy40XX(&initCompilerOptions, &options); }; @@ -64,7 +65,7 @@ void PipelineRegistry40XX::registerPipelines() { mlir::PassPipelineRegistration( "ws-monolithic", "Compile IE Network in Weights separation Monolithic mode for NPU40XX", [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, config::CompilationMode::WSMonolithic, options}; auto createPipelineStartegy = [&](config::CompilationMode compilationMode) { return createDialectPipelineStrategy40XXWS(compilationMode, @@ -77,7 +78,7 @@ void PipelineRegistry40XX::registerPipelines() { mlir::PassPipelineRegistration( "ws-monolithic-partial", "Compile IE Network 
in Weights separation Monolithic mode for NPU40XX", [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, config::CompilationMode::WSMonolithic, options}; auto createPipelineStartegy = [&](config::CompilationMode compilationMode) { return createDialectPipelineStrategy40XXWS(compilationMode, @@ -90,7 +91,7 @@ void PipelineRegistry40XX::registerPipelines() { mlir::PassPipelineRegistration( "host-compile", "Compile IE Network in Host mode (host and HW execution) for NPU40XX", [](mlir::OpPassManager& pm, const DefaultHWOptions40XX& options) { - VPU::InitCompilerOptions initCompilerOptions{VPU::ArchKind::NPU40XX, + VPU::InitCompilerOptions initCompilerOptions{config::ArchKind::NPU40XX, config::CompilationMode::HostCompile, options}; auto createPipelineStrategy = [&](config::CompilationMode) { return createDialectPipelineStrategy40XX(&initCompilerOptions, &options); diff --git a/src/vpux_compiler/src/ShaveCodeGen/passes/early_codegen_capsule_fusion.cpp b/src/vpux_compiler/src/ShaveCodeGen/passes/early_codegen_capsule_fusion.cpp index 2f21fc2e03..fa22c96c3c 100644 --- a/src/vpux_compiler/src/ShaveCodeGen/passes/early_codegen_capsule_fusion.cpp +++ b/src/vpux_compiler/src/ShaveCodeGen/passes/early_codegen_capsule_fusion.cpp @@ -5,12 +5,11 @@ #include "vpux/compiler/ShaveCodeGen/analysis.hpp" #include "vpux/compiler/ShaveCodeGen/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/utils/core/error.hpp" -#include "vpux/utils/logger/logger.hpp" - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/utils/core/range.hpp" +#include "vpux/utils/logger/logger.hpp" #include #include diff --git a/src/vpux_compiler/src/ShaveCodeGen/passes/encapsulate_codegen_ops.cpp 
b/src/vpux_compiler/src/ShaveCodeGen/passes/encapsulate_codegen_ops.cpp index ad5c9ae575..dda7cc8948 100644 --- a/src/vpux_compiler/src/ShaveCodeGen/passes/encapsulate_codegen_ops.cpp +++ b/src/vpux_compiler/src/ShaveCodeGen/passes/encapsulate_codegen_ops.cpp @@ -5,10 +5,9 @@ #include "vpux/compiler/ShaveCodeGen/analysis.hpp" #include "vpux/compiler/ShaveCodeGen/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/utils/logger/logger.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include #include #include diff --git a/src/vpux_compiler/src/ShaveCodeGen/passes/outline_codegen_capsules.cpp b/src/vpux_compiler/src/ShaveCodeGen/passes/outline_codegen_capsules.cpp index 46936ebc03..18594eb580 100644 --- a/src/vpux_compiler/src/ShaveCodeGen/passes/outline_codegen_capsules.cpp +++ b/src/vpux_compiler/src/ShaveCodeGen/passes/outline_codegen_capsules.cpp @@ -4,13 +4,7 @@ // #include "vpux/compiler/ShaveCodeGen/passes.hpp" - -#include "vpux/compiler/conversion.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/small_string.hpp" -#include "vpux/utils/logger/logger.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" @@ -23,6 +17,7 @@ #include #include #include +#include #include namespace vpux::ShaveCodeGen { @@ -197,7 +192,7 @@ void OutlineCodeGenCapsulesPass::safeRunOnModule() { mlir::RewritePatternSet patterns(&ctx); patterns.insert(&ctx, swModule, counter, swModule.getSymNameAttr()); mlir::tensor::BitcastOp::getCanonicalizationPatterns(patterns, &ctx); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) { + if (failed(mlir::applyPatternsAndFoldGreedily(func, std::move(patterns)))) { return signalPassFailure(); } } diff --git 
a/src/vpux_compiler/src/act-kernels/shave_binary_resources.cpp b/src/vpux_compiler/src/act-kernels/shave_binary_resources.cpp index bcf642c4c7..d4f5717270 100644 --- a/src/vpux_compiler/src/act-kernels/shave_binary_resources.cpp +++ b/src/vpux_compiler/src/act-kernels/shave_binary_resources.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/act_kernels/shave_binary_resources.h" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -21,11 +22,11 @@ ShaveBinaryResources& ShaveBinaryResources::getInstance() { return instance; } -vpux::SmallString ShaveBinaryResources::getSwKernelArchString(VPU::ArchKind archKind) { +vpux::SmallString ShaveBinaryResources::getSwKernelArchString(config::ArchKind archKind) { switch (archKind) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return vpux::SmallString("3720xx"); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return vpux::SmallString("4000xx"); default: VPUX_THROW("unsupported archKind {0}", archKind); @@ -95,10 +96,10 @@ void ShaveBinaryResources::loadElfData(mlir::ModuleOp module) { std::string funcName; std::getline(ifileList, funcName); - VPU::ArchKind archKind = VPU::getArch(module.getOperation()); + config::ArchKind archKind = config::getArch(module.getOperation()); auto kernelArch = getSwKernelArchString(archKind); - sbr.addCompiledElf(funcName, binary, kernelArch); + sbr.addCompiledElf(funcName, binary, kernelArch, true); } ifileList.close(); diff --git a/src/vpux_compiler/src/bitc/include/commons.hpp b/src/vpux_compiler/src/bitc/include/commons.hpp index 5fc8524df0..baf401ff77 100644 --- a/src/vpux_compiler/src/bitc/include/commons.hpp +++ b/src/vpux_compiler/src/bitc/include/commons.hpp @@ -2,7 +2,6 @@ // Copyright (C) 2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // - #include #include "BitStream.hpp" diff --git a/src/vpux_compiler/src/compiler.cpp b/src/vpux_compiler/src/compiler.cpp index da9eede313..709b013869 100644 --- a/src/vpux_compiler/src/compiler.cpp +++ b/src/vpux_compiler/src/compiler.cpp @@ -14,13 +14,18 @@ #include "vpux/compiler/NPU40XX/dialect/ELF/export.hpp" #include "vpux/compiler/NPU40XX/dialect_pipeline_strategy.hpp" #include "vpux/compiler/compilation_options.hpp" +#include "vpux/compiler/conversion.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/export.hpp" +#include "vpux/compiler/dialect/HostExec/IR/dialect.hpp" +#include "vpux/compiler/dialect/HostExec/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/network_description.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/constant_folding_in_background.hpp" #include "vpux/compiler/frontend/IE.hpp" #include "vpux/compiler/frontend/ov_batch_detection.hpp" @@ -29,6 +34,7 @@ #include "vpux/compiler/pipelines/developer_config.hpp" #include "vpux/compiler/pipelines/options_mapper.hpp" #include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/options.hpp" #include "vpux/compiler/utils/pipeline_strategies.hpp" #include "vpux/utils/IE/itt.hpp" @@ -88,11 +94,11 @@ constexpr uint32_t SUPPORTED_OPSET = 11; StrategyFactoryFn createDialectPipelineStrategyFn(const intel_npu::Config& config) { auto arch = getArchKind(config); switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return [&](config::CompilationMode compilationMode) { return 
createDialectPipelineStrategy37XX(compilationMode, config); }; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return [&](config::CompilationMode compilationMode) { return createDialectPipelineStrategy40XX(compilationMode, config); }; @@ -113,15 +119,22 @@ void buildPipeline(const intel_npu::Config& config, mlir::PassManager& pm, Logge pipelineFactory->buildPipeline(pm); } +void buildCompileHostExecPipeline(mlir::PassManager& pm, const intel_npu::Config& config, Logger log) { + const auto compilationMode = getCompilationMode(config); + if (compilationMode == config::CompilationMode::HostCompile) { + vpux::HostExec::buildHostExecPipeline(pm, log); + } +} + // // createBackendPipelineStrategy // -std::unique_ptr createBackendPipelineStrategy(VPU::ArchKind arch) { +std::unique_ptr createBackendPipelineStrategy(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: VPUX_THROW("Unsupported arch kind: {0}", arch); @@ -176,15 +189,18 @@ namespace { auto importNetwork(mlir::MLIRContext* ctx, const std::shared_ptr& model, const std::vector>& originalParameters, const std::vector>& originalResults, const intel_npu::Config& config, - const DeveloperConfig& devConf, mlir::TimingScope& rootTiming, Logger log) { + const DeveloperConfig& devConf, mlir::TimingScope& rootTiming, Logger log, + bool enableWeightsSeparationPath = false) { auto importTiming = rootTiming.nest("Import network"); - const auto dynamicShapeToStatic = config.get(); - const auto dummyOpReplacement = getDummyOpReplacement(config).value_or(DummyOpMode::DISABLED); + IE::ImportNetworkConfig importCfg; + importCfg.sharedConstants = devConf.useSharedConstants(); + importCfg.enableProfiling = config.get(); + importCfg.stubLayers = getDummyOpReplacement(config).value_or(DummyOpMode::DISABLED); + 
importCfg.dynamicShapeToStatic = config.get(); + importCfg.enableWeightsSeparationPath = enableWeightsSeparationPath; - return IE::importNetwork(ctx, model, originalParameters, originalResults, devConf.useSharedConstants(), - importTiming, config.get(), dummyOpReplacement, - dynamicShapeToStatic, log.nest()); + return IE::importNetwork(ctx, model, originalParameters, originalResults, importTiming, importCfg, log.nest()); } mlir::LogicalResult compileNetwork(mlir::ModuleOp module, mlir::PassManager& pm, mlir::TimingScope& nestTiming) { @@ -215,11 +231,12 @@ void backendCompilation(mlir::OwningOpRef& vpuipModule, const De devConf.setup(elfPm, config); mlir::LogicalResult compileResult = mlir::failure(); - auto wlmStatus = vpux::VPUIP::getWlmStatus(vpuipModule.get()); - auto wlmStillEnabled = wlmStatus == vpux::VPUIP::WlmStatus::ENABLED; + auto wlmStatus = VPU::getWorkloadManagementStatus(vpuipModule.get()); + auto wlmStillEnabled = wlmStatus == VPU::WorkloadManagementStatus::ENABLED; auto backendPipelineStrategy = createBackendPipelineStrategy(getArchKind(config)); backendPipelineStrategy->buildELFPipeline(!hostCompilationMode ? elfPm : elfPm.nest(), config, elfTiming, log, wlmStillEnabled); + buildCompileHostExecPipeline(elfPm, config, log); if (getWlmRollback(config).value_or(false)) { auto backupModule = mlir::OwningOpRef(vpuipModule.get().clone()); // We moved away from the exception-based fallback mechanism because the MLIRContext remained in an invalid @@ -227,8 +244,8 @@ void backendCompilation(mlir::OwningOpRef& vpuipModule, const De // compile time stats. Now we rely on the PassManager::run result and WLM status attribute to decide if we need // to rollback. This allows MLIR to run the pass instrumentation and set the context to the correct state. 
compileResult = compileNetwork(vpuipModule.get(), elfPm, elfTiming); - wlmStatus = vpux::VPUIP::getWlmStatus(vpuipModule.get()); - if (mlir::failed(compileResult) && wlmStatus == vpux::VPUIP::WlmStatus::FAILED) { + wlmStatus = VPU::getWorkloadManagementStatus(vpuipModule.get()); + if (mlir::failed(compileResult) && wlmStatus == VPU::WorkloadManagementStatus::FAILED) { log.warning("Failed to export to ELF with current config, reverting to simple ELF pipeline"); vpuipModule = std::move(backupModule); mlir::PassManager simpleElfPm(vpuipModule.get()->getName(), mlir::OpPassManager::Nesting::Implicit); @@ -236,7 +253,8 @@ void backendCompilation(mlir::OwningOpRef& vpuipModule, const De backendPipelineStrategy->buildELFPipeline( !hostCompilationMode ? simpleElfPm : simpleElfPm.nest(), config, elfTiming, log, /*useWlm=*/false); - vpux::VPUIP::setWlmStatus(vpuipModule.get(), vpux::VPUIP::WlmStatus::DISABLED); + VPU::setWorkloadManagementStatus(vpuipModule.get(), VPU::WorkloadManagementStatus::DISABLED); + buildCompileHostExecPipeline(simpleElfPm, config, log); VPUX_THROW_UNLESS(mlir::succeeded(compileNetwork(vpuipModule.get(), simpleElfPm, elfTiming)), "Compilation failed"); } else { @@ -249,9 +267,9 @@ void backendCompilation(mlir::OwningOpRef& vpuipModule, const De } auto exportToELF(mlir::ModuleOp module, Logger log) { - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return vpux::ELFNPU37XX::exportToELF(module, log); default: return vpux::ELF::exportToELF(module, log); @@ -259,9 +277,9 @@ auto exportToELF(mlir::ModuleOp module, Logger log) { } auto exportToELF(mlir::ModuleOp module, Logger log, BlobAllocator& allocator) { - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return vpux::ELFNPU37XX::exportToELF(module, allocator, log); 
default: return vpux::ELF::exportToELF(module, allocator, log); @@ -431,11 +449,11 @@ bool isTypeSupportedNPU40xx(ov::element::Type_t elemType) { } } -bool isTypeSupported(VPU::ArchKind arch, ov::element::Type_t elemType) { +bool isTypeSupported(config::ArchKind arch, ov::element::Type_t elemType) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return isTypeSupportedNPU37xx(elemType); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return isTypeSupportedNPU40xx(elemType); default: VPUX_THROW("Unsupported arch kind: {0}", arch); @@ -466,8 +484,11 @@ mlir::OwningOpRef compileModel(mlir::MLIRContext& ctx, const std checkDataTypes(model, config); - mlir::OwningOpRef module = - importNetwork(&ctx, model, originalParameters, originalResults, config, devConf, rootTiming, log); + // This will allow preserving as many original constants in the model as possible after nGraph passes, making the + // pipeline close to WS "Init" mode. + const auto isWSMonolithic = getCompilationMode(config) == config::CompilationMode::WSMonolithic; + mlir::OwningOpRef module = importNetwork(&ctx, model, originalParameters, originalResults, config, + devConf, rootTiming, log, isWSMonolithic); OV_ITT_TASK_NEXT(COMPILER_IMPLEMENTATION, "PassManager"); @@ -484,12 +505,6 @@ mlir::OwningOpRef compileModel(mlir::MLIRContext& ctx, const std OV_ITT_TASK_NEXT(COMPILER_IMPLEMENTATION, "compileNetwork"); - // Load VPUIP dialect before first compilation to set initial WlmStatus based on config - ctx.loadDialect(); - auto wlmEnabled = getWlmEnabled(config).value_or(false); - vpux::VPUIP::setWlmStatus(module.get(), - wlmEnabled ? 
vpux::VPUIP::WlmStatus::ENABLED : vpux::VPUIP::WlmStatus::DISABLED); - // applies each pass in the pipeline auto compileNetworkTiming = rootTiming.nest("Compile network"); diff --git a/src/vpux_compiler/src/conversion/factories/convert_dynamic_quant_to_vpu_nce.cpp b/src/vpux_compiler/src/conversion/factories/convert_dynamic_quant_to_vpu_nce.cpp index f6c8ad8319..f218b51a01 100644 --- a/src/vpux_compiler/src/conversion/factories/convert_dynamic_quant_to_vpu_nce.cpp +++ b/src/vpux_compiler/src/conversion/factories/convert_dynamic_quant_to_vpu_nce.cpp @@ -3,18 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "vpux/compiler/conversion.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/mpe_engine_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/ppe_version_config.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" -#include "vpux/utils/core/error.hpp" + +#include namespace vpux { #define GEN_PASS_DECL_CONVERTDYNAMICQUANTTOVPUNCE diff --git a/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_IE_to_VPU_M2I.cpp b/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_IE_to_VPU_M2I.cpp index c94e0493fc..e21bd449bc 100644 --- a/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_IE_to_VPU_M2I.cpp +++ b/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_IE_to_VPU_M2I.cpp @@ -6,12 +6,13 @@ #include "vpux/compiler/conversion.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include 
"vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" -#include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include @@ -154,8 +155,8 @@ void ConvertIEToVPUM2IPass::safeRunOnFunc() { auto& ctx = getContext(); auto module = getOperation(); - const auto arch = VPU::getArch(module); - if (arch < VPU::ArchKind::NPU40XX) { + const auto arch = config::getArch(module); + if (arch < config::ArchKind::NPU40XX) { _log.trace("Convert to VPU-M2I Pass enabled only for NPU40XX+ devices. Got: {0}", arch); return; } diff --git a/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_layers_to_VPU.cpp b/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_layers_to_VPU.cpp index 81d6ed6ee4..5629efa81d 100644 --- a/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_layers_to_VPU.cpp +++ b/src/vpux_compiler/src/conversion/passes/IE2VPU/convert_layers_to_VPU.cpp @@ -6,12 +6,17 @@ #include "vpux/compiler/conversion/passes/IE2VPU/convert_layers_to_VPU.hpp" #include "vpux/compiler/conversion.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/dialect/core/IR/attributes.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" // 
Generated diff --git a/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_Affine_to_LLVM.cpp b/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_Affine_to_LLVM.cpp index 06f471348f..82bf78c6bd 100644 --- a/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_Affine_to_LLVM.cpp +++ b/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_Affine_to_LLVM.cpp @@ -5,37 +5,35 @@ #include "vpux/compiler/conversion.hpp" #include "vpux/compiler/core/aliases_info.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/utils/ShaveCodeGen/utils.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/small_string.hpp" #include "vpux/utils/logger/logger.hpp" #include #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include +#include +#include +#include #include +#include #include -#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" -#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" -#include "mlir/Dialect/Index/IR/IndexDialect.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/Math/Transforms/Approximation.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/Pass/AnalysisManager.h" +#include +#include +#include +#include // TODO: E66812, it should be sufficient to have warnings disabled for 3-rd parties // in CMake but it does not work for early versions of MSVC 2019 diff --git a/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math.cpp 
b/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math.cpp index 628e1608dd..c1ea6d2249 100644 --- a/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math.cpp +++ b/src/vpux_compiler/src/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math.cpp @@ -4,23 +4,25 @@ // #include "vpux/compiler/conversion.hpp" -#include "vpux/compiler/utils/ShaveCodeGen/linalg_type_conversion.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/small_string.hpp" -#include "vpux/utils/logger/logger.hpp" - #include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/ShaveCodeGen/linalg_type_conversion.hpp" +#include "vpux/utils/logger/logger.hpp" +#include #include #include #include #include - -#include +#include // Generated namespace ConvertEltwiseLayersToMathPatterns { @@ -584,6 +586,21 @@ mlir::Value emitLinalgRegion(IE::SinhOp op, mlir::ValueRange args, l return rewriter.create(loc, diffExp, halfConst); } +// Negative layer +template <> +mlir::Value emitLinalgRegion(IE::NegativeOp op, mlir::ValueRange args, + llvm::ArrayRef resultTypes, mlir::PatternRewriter& rewriter) { + auto loc = op.getLoc(); + + if (mlir::isa(args[0].getType())) { + auto zero = rewriter.create(loc, rewriter.getFloatAttr(args[0].getType(), 0.)); + 
return rewriter.create(loc, resultTypes, mlir::ValueRange{zero, args[0]}); + } + + auto zero = rewriter.create(loc, rewriter.getIntegerAttr(args[0].getType(), 0)); + return rewriter.create(loc, resultTypes, mlir::ValueRange{zero, args[0]}); +} + // Callback type for emitting the linalg body for an operation. using EmitBodyCallback = std::function, mlir::PatternRewriter&)>; @@ -782,6 +799,107 @@ mlir::Value emitLinalgRegion(IE::ConvertOp op, mlir::ValueRange a VPUX_THROW("Unsupported convert type combination"); } +// Abs layer +template <> +mlir::Value emitLinalgRegion(IE::AbsOp op, mlir::ValueRange args, llvm::ArrayRef resultTypes, + mlir::PatternRewriter& rewriter) { + auto loc = op.getLoc(); + + return rewriter.create(loc, resultTypes, args); +} + +// Sign layer +template <> +mlir::Value emitLinalgRegion(IE::SignOp op, mlir::ValueRange args, llvm::ArrayRef resultTypes, + mlir::PatternRewriter& rewriter) { + // This algorithm computes the sign of a floating-point value by examining its sign bit. + // To handle the case where there is -0, we check if the value is exactly zero + // (including both +0 and -0) and return 0 in that case. 
+ VPUX_UNUSED(resultTypes); + auto loc = op.getLoc(); + auto val = args[0]; + + auto zeroAttr = rewriter.getFloatAttr(val.getType(), 0.0); + mlir::Value zero = rewriter.create(val.getLoc(), zeroAttr); + auto negAttr = rewriter.getFloatAttr(val.getType(), -1.0); + mlir::Value negOne = rewriter.create(val.getLoc(), negAttr); + auto posAttr = rewriter.getFloatAttr(val.getType(), 1.0); + mlir::Value posOne = rewriter.create(val.getLoc(), posAttr); + + auto bitWidth = val.getType().getIntOrFloatBitWidth(); + auto intTy = mlir::IntegerType::get(val.getContext(), bitWidth); + auto casted = rewriter.create(loc, intTy, val); + llvm::APInt msbMaskVal = llvm::APInt::getSignMask(bitWidth); + auto msbMask = rewriter.create(loc, rewriter.getIntegerAttr(intTy, msbMaskVal)); + auto signBit = rewriter.create(loc, casted, msbMask); + + auto shift = rewriter.create(loc, casted, + rewriter.create(loc, 1, intTy)); + auto isZero = rewriter.create(loc, mlir::arith::CmpIPredicate::eq, shift, + rewriter.create(loc, 0, intTy)); + + auto signHandled = rewriter.create( + loc, + rewriter.create(loc, mlir::arith::CmpIPredicate::ne, signBit, + rewriter.create(loc, 0, intTy)), + negOne, posOne); + + return rewriter.create(loc, isZero, zero, signHandled); +} + +// HSwish layer +template <> +mlir::Value emitLinalgRegion(IE::HSwishOp op, mlir::ValueRange args, + llvm::ArrayRef resultTypes, mlir::PatternRewriter& rewriter) { + // Compute this as x*((min(max(x+3,0),6))/6) + VPUX_UNUSED(resultTypes); + auto loc = op.getLoc(); + auto val = args[0]; + auto zeroAttr = rewriter.getFloatAttr(val.getType(), 0.0); + mlir::Value zero = rewriter.create(val.getLoc(), zeroAttr); + auto threeAttr = rewriter.getFloatAttr(val.getType(), 3.0); + mlir::Value three = rewriter.create(val.getLoc(), threeAttr); + auto sixAttr = rewriter.getFloatAttr(val.getType(), 6.0); + mlir::Value six = rewriter.create(val.getLoc(), sixAttr); + auto divSixAttr = rewriter.getFloatAttr(val.getType(), 1.0 / 6.0); + mlir::Value divSix = 
rewriter.create(val.getLoc(), divSixAttr); + auto fmFlags = mlir::arith::FastMathFlagsAttr::get( + rewriter.getContext(), mlir::arith::FastMathFlags::nnan | mlir::arith::FastMathFlags::nsz); + + auto add = rewriter.create(loc, val, three); + auto max = rewriter.create(loc, add, zero, fmFlags); + auto min = rewriter.create(loc, max, six, fmFlags); + auto mul = rewriter.create(loc, min, divSix); + + return rewriter.create(loc, val, mul); +} + +// HSigmoid Layer +template <> +mlir::Value emitLinalgRegion(IE::HSigmoidOp op, mlir::ValueRange args, + llvm::ArrayRef resultTypes, mlir::PatternRewriter& rewriter) { + // Compute this as (min(max(x+3,0),6))/6 + VPUX_UNUSED(resultTypes); + auto loc = op.getLoc(); + auto val = args[0]; + auto zeroAttr = rewriter.getFloatAttr(val.getType(), 0.0); + mlir::Value zero = rewriter.create(val.getLoc(), zeroAttr); + auto threeAttr = rewriter.getFloatAttr(val.getType(), 3.0); + mlir::Value three = rewriter.create(val.getLoc(), threeAttr); + auto sixAttr = rewriter.getFloatAttr(val.getType(), 6.0); + mlir::Value six = rewriter.create(val.getLoc(), sixAttr); + auto divSixAttr = rewriter.getFloatAttr(val.getType(), 1.0 / 6.0); + mlir::Value divSix = rewriter.create(val.getLoc(), divSixAttr); + auto fmFlags = mlir::arith::FastMathFlagsAttr::get( + rewriter.getContext(), mlir::arith::FastMathFlags::nnan | mlir::arith::FastMathFlags::nsz); + + auto add = rewriter.create(loc, val, three); + auto max = rewriter.create(loc, add, zero, fmFlags); + auto min = rewriter.create(loc, max, six, fmFlags); + + return rewriter.create(loc, min, divSix); +} + void ConvertEltwiseLayers2MathPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); @@ -811,6 +929,11 @@ void ConvertEltwiseLayers2MathPass::safeRunOnFunc() { target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); auto populatePatterns = 
[&](mlir::RewritePatternSet& patternSet) { ConvertEltwiseLayersToMathPatterns::populateWithGenerated(patternSet); @@ -835,6 +958,8 @@ void ConvertEltwiseLayers2MathPass::safeRunOnFunc() { patternSet.add>(&ctx); patternSet.add, IEEltwiseToLinalg, IEEltwiseToLinalg, IEEltwiseToLinalg>(&ctx); + patternSet.add, IEEltwiseToLinalg, IEEltwiseToLinalg, + IEEltwiseToLinalg, IEEltwiseToLinalg>(&ctx); }; // E#172607 [ShaveCodeGen] Make Linalg lowering pass run on CodeGenCapsuleOps diff --git a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.cpp b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.cpp index 066f51f655..4cd6c93539 100644 --- a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_sw_ops_interface.cpp @@ -210,7 +210,7 @@ mlir::LogicalResult vpux::bufferizeSWLayerOp(mlir::RewriterBase& rewriter, mlir: } } - VPUIP::createRuntimeKernelDefinition(module, log.nest(), VPU::getArch(op)); + VPUIP::createRuntimeKernelDefinition(module, log.nest(), config::getArch(op)); // TODO : tile 0 const int64_t tileIndex = 0; @@ -223,8 +223,8 @@ mlir::LogicalResult vpux::bufferizeSWLayerOp(mlir::RewriterBase& rewriter, mlir: auto swKernelOp = rewriter.create(op->getLoc(), swKernelOperands, swKernelResults, builtInFunction, getIntAttr(ctx, tileIndex)); - vpux::VPUIP::initSwKernel(swKernelOp, swKernelOperands, swKernelResults, swLayerOp.getKernelInfo().args, - log.nest()); + vpux::VPUIP::initSwKernel(swKernelOp, swKernelOperands, swKernelResults, swLayerOp.getKernelInfo().args, log.nest(), + /*swKernelRunOp=*/nullptr); const auto moveSwOpToCMX = [&]() { // Go through all inputs and outputs that were mapped to DDR and map them to NNCMX @@ -280,7 +280,8 @@ mlir::LogicalResult vpux::bufferizeSWLayerOp(mlir::RewriterBase& rewriter, mlir: swKernelOp = rewriter.create(op->getLoc(), cmxOperands, cmxResults, builtInFunction, getIntAttr(ctx, 
tileIndex)); - vpux::VPUIP::initSwKernel(swKernelOp, cmxOperands, cmxResults, swLayerOp.getKernelInfo().args, log.nest()); + vpux::VPUIP::initSwKernel(swKernelOp, cmxOperands, cmxResults, swLayerOp.getKernelInfo().args, log.nest(), + /*swKernelRunOp=*/nullptr); }; if (isDMAConvertibleSwOp(mlir::dyn_cast(op)) && @@ -319,7 +320,7 @@ mlir::LogicalResult vpux::bufferizeDistributedSWLayerOp(mlir::RewriterBase& rewr auto layerOp = mlir::cast(op); auto swLayerOp = mlir::cast(op); - VPUIP::createRuntimeKernelDefinition(module, log.nest(), VPU::getArch(op)); + VPUIP::createRuntimeKernelDefinition(module, log.nest(), config::getArch(op)); auto outputBuffers = allocateBuffers(log, op->getLoc(), rewriter, op->getResults(), /*individualBuffers=*/true); @@ -332,7 +333,8 @@ mlir::LogicalResult vpux::bufferizeDistributedSWLayerOp(mlir::RewriterBase& rewr auto swKernelOp = rewriter.create(op->getLoc(), newOperands, outputBuffers, builtInFunction, getIntAttr(op->getContext(), tileIndex)); - vpux::VPUIP::initSwKernel(swKernelOp, newOperands, outputBuffers, swLayerOp.getKernelInfo().args, log.nest()); + vpux::VPUIP::initSwKernel(swKernelOp, newOperands, outputBuffers, swLayerOp.getKernelInfo().args, log.nest(), + /*swKernelRunOp=*/nullptr); mlir::bufferization::replaceOpWithBufferizedValues(rewriter, op, swKernelOp.getResults()); return mlir::success(); } @@ -409,7 +411,7 @@ bool isLegalStridedSliceOp(VPU::StridedSliceOp stridedSliceOp) { } } - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(stridedSliceOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(stridedSliceOp)); return stridingLevel <= dmaEngineLimits.getMaxStrideCount(); } diff --git a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_nce_ops_interface.cpp b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_nce_ops_interface.cpp index 3077cac8ce..a54e201a73 100644 --- 
a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_nce_ops_interface.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_nce_ops_interface.cpp @@ -400,7 +400,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext* ctx, VPU::NCEAveragePoo } mlir::UnitAttr isSmallKernelOptimizationAttr = nullptr; - if (VPU::NCEInvariant::isSmallKernelOptimizationSupported(VPU::getArch(origOp), origOp)) { + if (VPU::NCEInvariant::isSmallKernelOptimizationSupported(config::getArch(origOp), origOp)) { isSmallKernelOptimizationAttr = mlir::UnitAttr::get(ctx); } @@ -462,7 +462,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext* ctx, VPU::NCEDepthConvo isSuperdenseAttr = mlir::UnitAttr::get(ctx); } - auto arch = VPU::getArch(origOp); + auto arch = config::getArch(origOp); mlir::UnitAttr isSmallKernelOptimizationAttr = nullptr; if (VPU::NCEInvariant::isSmallKernelOptimizationSupported(arch, origOp)) { isSmallKernelOptimizationAttr = mlir::UnitAttr::get(ctx); @@ -623,7 +623,6 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext* ctx, VPU::NCEReduceOp o isSuperdenseAttr = mlir::UnitAttr::get(ctx); } - auto ppeAttr = VPU::PpeVersionConfig::retrievePPEAttribute(origOp); const auto mpeEngineAttr = VPU::MPEEngineConfig::retrieveMPEEngineAttribute(origOp); auto nceOpInterface = mlir::dyn_cast(origOp.getOperation()); auto nceOp = @@ -632,7 +631,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext* ctx, VPU::NCEReduceOp o /*weight_table_bias=*/nullptr, outputBuffers, nceTaskType, getIntArrayAttr(ctx, nceOpInterface.getKernelSizeVal()), getIntArrayAttr(ctx, nceOpInterface.getStridesVal()), nceOpInterface.getPad(), - origOp.getWorkloads(), isSuperdenseAttr, ppeAttr, dpuCostAttr, + origOp.getWorkloads(), isSuperdenseAttr, origOp.getPpeAttr(), dpuCostAttr, /*isInplace=*/nullptr, /*isPermuteQuantize=*/nullptr, /*cmSpPattern=*/nullptr, /*inputChannelsCompression=*/nullptr, /*isNCEPermute=*/false, diff --git 
a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_ops_interface.cpp b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_ops_interface.cpp index cb4526ba74..b0767e6888 100644 --- a/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_ops_interface.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPU2VPUIP/bufferize_vpu_ops_interface.cpp @@ -4,12 +4,12 @@ // #include "vpux/compiler/NPU40XX/utils.hpp" -#include "vpux/compiler/conversion.hpp" #include "vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" @@ -17,7 +17,6 @@ #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" @@ -228,7 +227,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::SliceOp origOp, V log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); auto newOutType = vpux::getBufferType(origOp.getType()); - auto subView = createSubviewOp(newOutType, newArgs.getSource(), origOp->getLoc(), rewriter, + auto subView = createSubviewOp(newOutType, newArgs.getInput(), origOp->getLoc(), rewriter, origOp.getStaticOffsetsAttr(), origOp.getStaticSizesAttr()); auto outputBuffers = allocateBuffers(log, origOp->getLoc(), rewriter, origOp->getOpResults(), /*individualBuffers =*/false); @@ -583,7 +582,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::ShapeCastOp origO auto log = 
Logger::global().nest("one-shot-bufferize-VPUShapeCastOp", 0); log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); - auto newOp = rewriter.create(origOp->getLoc(), newArgs.getSource(), newArgs.getShape()); + auto newOp = rewriter.create(origOp->getLoc(), newArgs.getInput(), newArgs.getShape()); mlir::bufferization::replaceOpWithBufferizedValues(rewriter, origOp, newOp->getResults()); return mlir::success(); @@ -618,9 +617,17 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::GatherDMAOp origO auto log = Logger::global().nest("one-shot-bufferize-VPUGatherDMAOp", 0); log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); + if (mlir::isa(origOp.getOutput().getType())) { + auto outputCMXBuffers = allocateBuffers(log, origOp.getLoc(), rewriter, origOp.getOutput(), true); + auto newOp = rewriter.create(origOp.getLoc(), newArgs.getInput(), newArgs.getIndices(), + outputCMXBuffers[0], 0, 0, 0); + newOp.setChannelType(VPUIP::DmaChannelType::DDR); + mlir::bufferization::replaceOpWithBufferizedValues(rewriter, origOp, newOp.getResult()); + return mlir::success(); + } + auto ctx = origOp->getContext(); const auto memSpaceCMX = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN), 0); - // Hardware Limitation: In Gather addressing mode, indices must reside in CMX // Currently, this implementation only handles GatherDMAOp where the input is in DDR and the output is in CMX auto indices = newArgs.getIndices(); @@ -631,7 +638,6 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::GatherDMAOp origO auto newOp = rewriter.create(origOp.getLoc(), newArgs.getInput(), indicesCMXCopy.getOutput(), outputCMXBuffers, 0, 0, 0); newOp.setChannelType(VPUIP::DmaChannelType::DDR); - auto outputDDRBuffers = allocateBuffers(log, origOp.getLoc(), rewriter, origOp->getOpResults(), /*individualBuffers =*/false); auto newResult = @@ -641,21 +647,6 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, 
VPU::GatherDMAOp origO return mlir::success(); } -// -// bufferize VPU::WorkloadCastOp -// - -mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::WorkloadCastOp origOp, - VPU::WorkloadCastOp::Adaptor newArgs, mlir::RewriterBase& rewriter) { - auto log = Logger::global().nest("one-shot-bufferize-VPUWorkloadCastOp", 0); - log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); - - const auto newOutType = vpux::getBufferType(origOp.getType()); - auto newOp = rewriter.create(origOp->getLoc(), newOutType, newArgs.getInput()); - mlir::bufferization::replaceOpWithBufferizedValues(rewriter, origOp, newOp->getResults()); - return mlir::success(); -} - // // bufferize VPU::UpsamplingOp // @@ -702,7 +693,7 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::ShapeOfOp origOp, auto op = origOp.getOperation(); auto module = getModuleOp(op); - VPUIP::createRuntimeKernelDefinition(module, log.nest(), VPU::getArch(op)); + VPUIP::createRuntimeKernelDefinition(module, log.nest(), config::getArch(op)); auto layerOp = mlir::cast(op); auto swLayerOp = mlir::cast(op); @@ -714,8 +705,8 @@ mlir::LogicalResult vpux::bufferizeOp(mlir::MLIRContext*, VPU::ShapeOfOp origOp, auto swKernelOp = rewriter.create(origOp.getLoc(), swKernelOperands, swKernelResults, builtInFunction, getIntAttr(ctx, tileIndex)); - vpux::VPUIP::initSwKernel(swKernelOp, swKernelOperands, swKernelResults, swLayerOp.getKernelInfo().args, - log.nest()); + vpux::VPUIP::initSwKernel(swKernelOp, swKernelOperands, swKernelResults, swLayerOp.getKernelInfo().args, log.nest(), + /*swKernelRunOp=*/nullptr); log.trace("Added kernel operation: {0}", swKernelOp); @@ -755,7 +746,6 @@ void vpux::registerVPUBufferizableOpInterfaces(mlir::DialectRegistry& registry) VPU::StorageElementTableOp::attachInterface>(*ctx); VPU::ShapeCastOp::attachInterface>(*ctx); VPU::LayoutCastOp::attachInterface>(*ctx); - VPU::WorkloadCastOp::attachInterface>(*ctx); VPU::UpsamplingOp::attachInterface>(*ctx); 
VPU::ShapeOfOp::attachInterface>(*ctx); }); diff --git a/src/vpux_compiler/src/conversion/passes/VPUASM2NPUReg40XX/VPUASM2NPUReg40XX.cpp b/src/vpux_compiler/src/conversion/passes/VPUASM2NPUReg40XX/VPUASM2NPUReg40XX.cpp index 13b399abdd..38905f20fe 100644 --- a/src/vpux_compiler/src/conversion/passes/VPUASM2NPUReg40XX/VPUASM2NPUReg40XX.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPUASM2NPUReg40XX/VPUASM2NPUReg40XX.cpp @@ -15,6 +15,7 @@ #include "vpux/compiler/conversion/rewriters/VPUASM2NPUReg40XX/mi_version_rewriter.hpp" #include "vpux/compiler/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.hpp" #include "vpux/compiler/conversion/rewriters/VPUASM2NPUReg40XX/work_item_rewriter.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/conversion.hpp" @@ -98,7 +99,7 @@ void ConvertVPUASM2NPUReg40XXPass::safeRunOnModule() { target.addIllegalOp(); target.addDynamicallyLegalOp([&](VPUASM::PlatformInfoOp op) { - return VPU::getArch(op.getOperation()) != VPU::ArchKind::UNKNOWN; + return config::getArch(op.getOperation()) != config::ArchKind::UNKNOWN; }); if (mlir::failed(mlir::applyPartialConversion(netFunc, target, std::move(patterns)))) { diff --git a/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI37XX/convert_VPUIP_to_VPUMI37XX.cpp b/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI37XX/convert_VPUIP_to_VPUMI37XX.cpp index 6e320766a4..6a1fe7e537 100644 --- a/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI37XX/convert_VPUIP_to_VPUMI37XX.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI37XX/convert_VPUIP_to_VPUMI37XX.cpp @@ -3,28 +3,27 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" - #include "vpux/compiler/conversion.hpp" #include "vpux/compiler/core/bounded_buffer.hpp" #include "vpux/compiler/core/profiling_metadata.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include 
"vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/kernel_params_utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" - #include "vpux/compiler/utils/dma_limits.hpp" #include "vpux/compiler/utils/llvm_to_binary.hpp" +#include #include #include #include -#include - #include namespace vpux { @@ -147,7 +146,7 @@ class ConvertVPUIP2VPUMI37XXPass final : public impl::ConvertVPUIP2VPUMI37XXBase _log.trace("VPUIP_VPUMI37XX pass: replaceVPURTTaskOpWithNNDMAOp()"); const auto dmaExecCount = IE::getAvailableExecutor(moduleOp, VPU::ExecutorKind::DMA_NN).getCount(); - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(moduleOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(moduleOp)); const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes(); llvm::SmallVector> previousDMA(dmaExecCount); diff --git a/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI40XX/convert_VPUIP_to_VPUMI40XX.cpp b/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI40XX/convert_VPUIP_to_VPUMI40XX.cpp index bb042daa1f..b7dc2fabe2 100644 --- a/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI40XX/convert_VPUIP_to_VPUMI40XX.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPUIP2VPUMI40XX/convert_VPUIP_to_VPUMI40XX.cpp @@ -579,6 +579,8 @@ class ConvertVPUIP2VPUMI40XXPass final : public impl::ConvertVPUIP2VPUMI40XXBase tasksConverters.add(&ctx, _enableMemorySideCacheOption); tasksConverters.add(&ctx, _enableMemorySideCacheOption); tasksConverters.add(&ctx, _enableMemorySideCacheOption); + tasksConverters.add(&ctx, _enableMemorySideCacheOption); + tasksConverters.add(&ctx, 
_enableMemorySideCacheOption); tasksConverters.add(&ctx, _enableMemorySideCacheOption); tasksConverters.add(&ctx); tasksConverters.add(&ctx); diff --git a/src/vpux_compiler/src/conversion/passes/VPUIPDPU2NPUReg40XX/VPUIPDPU2NPUReg40XX.cpp b/src/vpux_compiler/src/conversion/passes/VPUIPDPU2NPUReg40XX/VPUIPDPU2NPUReg40XX.cpp index 9bc9600a48..933a0337f6 100644 --- a/src/vpux_compiler/src/conversion/passes/VPUIPDPU2NPUReg40XX/VPUIPDPU2NPUReg40XX.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPUIPDPU2NPUReg40XX/VPUIPDPU2NPUReg40XX.cpp @@ -73,8 +73,8 @@ mlir::LogicalResult ConvertVPUIPDPU2NPUReg40XXPass::initialize(mlir::MLIRContext void ConvertVPUIPDPU2NPUReg40XXPass::safeRunOnModule() { auto moduleOp = getOperation(); - auto arch = VPU::getArch(moduleOp); - if (arch != VPU::ArchKind::NPU40XX) { + auto arch = config::getArch(moduleOp); + if (arch != config::ArchKind::NPU40XX) { _log.error("Unsupported architecture for ConvertVPUIPDPU2NPUReg40XXPass: {0}! Required architecture: NPU40XX.", arch); signalPassFailure(); diff --git a/src/vpux_compiler/src/conversion/passes/VPUMI40XX2VPUASM/VPUMI40XX2VPUASM.cpp b/src/vpux_compiler/src/conversion/passes/VPUMI40XX2VPUASM/VPUMI40XX2VPUASM.cpp index d44b8f0a7b..3337f5f194 100644 --- a/src/vpux_compiler/src/conversion/passes/VPUMI40XX2VPUASM/VPUMI40XX2VPUASM.cpp +++ b/src/vpux_compiler/src/conversion/passes/VPUMI40XX2VPUASM/VPUMI40XX2VPUASM.cpp @@ -53,16 +53,20 @@ namespace { class ConvertVPUMI40XX2VPUASMPass final : public impl::ConvertVPUMI40XX2VPUASMBase { public: - explicit ConvertVPUMI40XX2VPUASMPass(Logger log, bool enablePWLM, bool disableDmaSwFifo) - : _enablePWLM(enablePWLM), _disableDmaSwFifo(disableDmaSwFifo) { + ConvertVPUMI40XX2VPUASMPass(Logger log, bool disableDmaSwFifo): _disableDmaSwFifo(disableDmaSwFifo) { Base::initLogger(log, Base::getArgumentName()); } + ConvertVPUMI40XX2VPUASMPass(Logger log, bool enablePWLM, bool disableDmaSwFifo) + : _disableDmaSwFifo(disableDmaSwFifo) { + Base::initLogger(log, 
Base::getArgumentName()); + enablePWLMOpt = enablePWLM; + } + mlir::LogicalResult initialize(mlir::MLIRContext* ctx) override; private: void safeRunOnModule() final; - bool _enablePWLM; bool _disableDmaSwFifo; }; @@ -71,10 +75,6 @@ mlir::LogicalResult ConvertVPUMI40XX2VPUASMPass::initialize(mlir::MLIRContext* c return mlir::failure(); } - if (enablePWLMOpt.hasValue()) { - _enablePWLM = enablePWLMOpt.getValue(); - } - return mlir::success(); } @@ -145,7 +145,7 @@ void ConvertVPUMI40XX2VPUASMPass::safeRunOnModule() { patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log); patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log); patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log); - patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log, _enablePWLM); + patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log, enablePWLMOpt); patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log, _disableDmaSwFifo); patterns.add(netFunc, typeConverter, symbolNameMappings, sectionMap, &ctx, _log); @@ -166,7 +166,11 @@ void ConvertVPUMI40XX2VPUASMPass::safeRunOnModule() { // createConvertVPUMI40XX2VPUASMPass // -std::unique_ptr vpux::createConvertVPUMI40XX2VPUASMPass(Logger log, bool enablePWLM, +std::unique_ptr vpux::createConvertVPUMI40XX2VPUASMPass(Logger log, bool disableDmaSwFifo) { + return std::make_unique(log, disableDmaSwFifo); +} + +std::unique_ptr vpux::createConvertVPUMI40XX2VPUASMPass(bool enablePWLM, Logger log, bool disableDmaSwFifo) { return std::make_unique(log, enablePWLM, disableDmaSwFifo); } diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/act_kernel_range_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/act_kernel_range_rewriter.cpp index f5fa3d55d8..f22c828df1 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/act_kernel_range_rewriter.cpp 
+++ b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/act_kernel_range_rewriter.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace NPUReg40XX; using namespace NPUReg40XX::Descriptors; @@ -34,7 +35,7 @@ mlir::LogicalResult ActKernelRangeRewriter::matchAndRewrite(VPUASM::ActKernelRan origOp.getTaskLocationAttr(), origOp.getKernelTextAttr(), origOp.getKernelEntryAttr()); - _log.trace("[{0}] Got kernel '{1}' and cpu '{2}'", getDebugName(), kernelPath, VPU::getArch(origOp)); + _log.trace("[{0}] Got kernel '{1}' and cpu '{2}'", getDebugName(), kernelPath, config::getArch(origOp)); rewriter.eraseOp(origOp); diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/mapped_inference_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/mapped_inference_rewriter.cpp index 86b6c8ccdc..8f09aed6b5 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/mapped_inference_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/mapped_inference_rewriter.cpp @@ -4,11 +4,11 @@ // #include "vpux/compiler/conversion/rewriters/VPUASM2NPUReg40XX/mapped_inference_rewriter.hpp" - #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp" #include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" #include diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.cpp index 2926fd534e..a9ef90ba53 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.cpp +++ 
b/src/vpux_compiler/src/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.cpp @@ -4,10 +4,10 @@ // #include "vpux/compiler/conversion/rewriters/VPUASM2NPUReg40XX/nnrt_rewriter.hpp" - #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/utils.hpp" #include "vpux/compiler/core/profiling.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.cpp index 19629d8b51..b7cb355654 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.cpp @@ -6,12 +6,10 @@ #include "vpux/compiler/conversion/rewriters/VPUIP2VPUMI40XX/dma_rewriter.hpp" #include "vpux/compiler/conversion/passes/VPUIP2VPUMI40XX/buffer_conversion.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" -#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/dma_limits.hpp" using namespace vpux; @@ -88,14 +86,10 @@ mlir::LogicalResult PermuteDMARewriter::matchAndRewrite(VPUIP::PermuteDMAOp perm const auto tileIdx = adaptor.getPort().value(); auto indexType = VPURegMapped::IndexType::get(ctx, tileIdx, getListIndex(inputType.getMemoryKind()), 0); - const auto dmaDescriptor = adaptor.getDmaDescriptor().value(); - - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(permuteDMAOp)); - const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes(); - - const 
auto numPlanes = checked_cast(dmaDescriptor.getNumPlanes().getInt()); - VPUX_THROW_UNLESS(numPlanes <= dmaMaxNumPlanes, "NUM PLANES should be less than or equal to {0}, but got {1}.", - dmaMaxNumPlanes, numPlanes); + auto internalDataFlow = adaptor.getInternalDataFlow(); + auto dmaTransaction = VPUMI40XX::PermuteDMATransactionAttr::get( + ctx, internalDataFlow->getInputType(), internalDataFlow->getOutputType(), + internalDataFlow->getMappingOrder(), internalDataFlow->getLoopOrder()); auto dmaResults = convertOrUnrollBuffer(rewriter, adaptor.getOutputBuff()); auto origOp = permuteDMAOp->getParentOfType(); @@ -111,10 +105,11 @@ mlir::LogicalResult PermuteDMARewriter::matchAndRewrite(VPUIP::PermuteDMAOp perm adaptor.getIsOutOfOrder(), adaptor.getIsCritical(), _isMemorySideCacheEnabled && enableMemorySideCache(inputType, outputType), tileIdx, VPUIP::DMAAccMode::DISABLE, - nullptr, // actCompressionSizeEntry - nullptr, // actCompressionSparsityMap - nullptr, // dmaTransaction - dmaDescriptor, adaptor.getDmaHwpIdAttr(), adaptor.getProfilingMetadataAttr(), + nullptr, // actCompressionSizeEntry + nullptr, // actCompressionSparsityMap + dmaTransaction, // dmaTransaction + nullptr, // dmaDescriptor + adaptor.getDmaHwpIdAttr(), adaptor.getProfilingMetadataAttr(), true, // allowDifferentInOutShapes nullptr, // indices nullptr, // enqueueBarrier @@ -596,11 +591,117 @@ mlir::LogicalResult BarrierProgDMARewriter::matchAndRewrite(VPUIP::BarProgDMAOp nullptr, // actCompressionSparsityMap nullptr, // dmaTransaction dmaDescriptorAttr, adaptor.getDmaHwpIdAttr(), adaptor.getProfilingMetadataAttr(), - false, // allowDifferentInOutShapes - nullptr, // indices - nullptr, // enqueueBarrier - origOp.getWlmPageAttr(), // wlmPageAttr - barProgDmaOp.getPhysicalBarrierRangeAttr() // physicalBarrierRangeAttr + false, // allowDifferentInOutShapes + nullptr, // indices + nullptr, // enqueueBarrier + origOp.getWlmPageAttr(), // wlmPageAttr + barProgDmaOp.getPhysicalBarrierRangeAttr(), // 
physicalBarrierRangeAttr + nullptr, // enqueueDMAAttr + nullptr // fetchDMAAttr + ); + + return mlir::success(); +} + +mlir::LogicalResult FetchDMARewriter::matchAndRewrite(VPUIP::FetchDMAOp fetchDMAOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const { + auto ctx = fetchDMAOp.getContext(); + + auto inputType = mlir::cast(adaptor.getInput().getType()); + auto outputType = mlir::cast(adaptor.getOutputBuff().getType()); + + const auto tileIdx = adaptor.getPort().value(); + auto indexType = VPURegMapped::IndexType::get(ctx, tileIdx, getListIndex(inputType.getMemoryKind()), 0); + + auto zeroAttr = getIntAttr(ctx, 0); + auto dmaDescriptorAttr = VPUIP::DMADescriptorAttr::get(ctx, + zeroAttr, // numPlane + zeroAttr, // len + zeroAttr, // srcWidth + zeroAttr, // srcStride + zeroAttr, // srcPlaneStride + zeroAttr, // dstWidth + zeroAttr, // dstStride + zeroAttr // dstPlaneStride + ); + + auto dmaResults = convertOrUnrollBuffer(rewriter, adaptor.getOutputBuff()); + auto origOp = fetchDMAOp->getParentOfType(); + rewriter.replaceOpWithNewOp( + fetchDMAOp, indexType, + nullptr, // taskLocation + adaptor.getInput(), dmaResults, + nullptr, // previousTask + mlir::ValueRange(), // waitBarriers + mlir::ValueRange(), // updateBarriers + 0, // startAfter + 0, // cleanAfter + adaptor.getIsOutOfOrder(), adaptor.getIsCritical(), + _isMemorySideCacheEnabled && enableMemorySideCache(inputType, outputType), tileIdx, + VPUIP::DMAAccMode::DISABLE, + nullptr, // actCompressionSizeEntry + nullptr, // actCompressionSparsityMap + nullptr, // dmaTransaction + dmaDescriptorAttr, adaptor.getDmaHwpIdAttr(), adaptor.getProfilingMetadataAttr(), + false, // allowDifferentInOutShapes + nullptr, // indices + nullptr, // enqueueBarrier + origOp.getWlmPageAttr(), // wlmPageAttr + nullptr, // physicalBarrierRangeAttr + nullptr, // enqueueDMAAttr + fetchDMAOp.getFetchDmaAttr() // fetchDmaAttr + ); + + return mlir::success(); +} + +mlir::LogicalResult 
EnqueueDMARewriter::matchAndRewrite(VPUIP::EnqueueDMAOp enqueueDMAOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const { + auto ctx = enqueueDMAOp.getContext(); + + auto inputType = mlir::cast(adaptor.getInput().getType()); + auto outputType = mlir::cast(adaptor.getOutputBuff().getType()); + + const auto tileIdx = adaptor.getPort().value(); + auto indexType = VPURegMapped::IndexType::get(ctx, tileIdx, getListIndex(inputType.getMemoryKind()), 0); + + auto zeroAttr = getIntAttr(ctx, 0); + auto dmaDescriptorAttr = VPUIP::DMADescriptorAttr::get(ctx, + zeroAttr, // numPlane + zeroAttr, // len + zeroAttr, // srcWidth + zeroAttr, // srcStride + zeroAttr, // srcPlaneStride + zeroAttr, // dstWidth + zeroAttr, // dstStride + zeroAttr // dstPlaneStride + ); + + auto dmaResults = convertOrUnrollBuffer(rewriter, adaptor.getOutputBuff()); + auto origOp = enqueueDMAOp->getParentOfType(); + rewriter.replaceOpWithNewOp( + enqueueDMAOp, indexType, + nullptr, // taskLocation + adaptor.getInput(), dmaResults, + nullptr, // previousTask + mlir::ValueRange(), // waitBarriers + mlir::ValueRange(), // updateBarriers + 0, // startAfter + 0, // cleanAfter + adaptor.getIsOutOfOrder(), adaptor.getIsCritical(), + _isMemorySideCacheEnabled && enableMemorySideCache(inputType, outputType), tileIdx, + VPUIP::DMAAccMode::DISABLE, + nullptr, // actCompressionSizeEntry + nullptr, // actCompressionSparsityMap + nullptr, // dmaTransaction + dmaDescriptorAttr, adaptor.getDmaHwpIdAttr(), adaptor.getProfilingMetadataAttr(), + false, // allowDifferentInOutShapes + nullptr, // indices + nullptr, // enqueueBarrier + origOp.getWlmPageAttr(), // wlmPageAttr + nullptr, // physicalBarrierRangeAttr + enqueueDMAOp.getEnqueueDmaAttr(), // enqueueDMAAttr + nullptr // fetchDmaAttr ); return mlir::success(); diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/nce_cluster_task_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/nce_cluster_task_rewriter.cpp 
index c22cc3c80f..24fb09d0b0 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/nce_cluster_task_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUIP2VPUMI40XX/nce_cluster_task_rewriter.cpp @@ -144,7 +144,7 @@ mlir::LogicalResult NCEClusterTaskRewriter::matchAndRewrite(VPUIP::NCEClusterTas auto dpuTasksIt = dpuTasks.begin(); if (sprLookupTable || palletLookupTable) { - // Processing dummy DPU task (see more info in AddDummyDPUTaskForSprLUT pass) + // Processing dummy DPU task (see more info in AddDummyDPUTaskForMetadataPrefetch pass) createVPUMI40XXVariant(*(dpuTasksIt++)); // For the first variant that goes after the dummy one, two additional registers are set: diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_buffer_rewriter.cpp index 601bf2aee2..553a47555b 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_buffer_rewriter.cpp @@ -48,12 +48,7 @@ mlir::FailureOr DeclareBufferRewriter::symbolize(VPURT::Dec llvm::SmallVector DeclareBufferRewriter::getSymbolicNames(VPURT::DeclareBufferOp op, size_t counter) { - auto fullName = VPURT::DeclareBufferOp::getOperationName(); - auto opName = fullName.drop_front(VPURT::VPURTDialect::getDialectNamespace().size() + 1); - - auto index = std::to_string(counter); - auto symName = mlir::StringAttr::get(op.getContext(), opName + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return this->createSymbolicName(op, /* taskTypeString */ std::nullopt, counter); } } // namespace vpumi37xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_const_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_const_buffer_rewriter.cpp index 7d032977d2..9916236ec3 100644 --- 
a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_const_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_const_buffer_rewriter.cpp @@ -12,12 +12,7 @@ namespace vpumi37xx2vpuasm { llvm::SmallVector DeclareConstBufferRewriter::getSymbolicNames(Const::DeclareOp op, size_t counter) { - auto fullName = Const::DeclareOp::getOperationName(); - auto opName = fullName.drop_front(Const::ConstDialect::getDialectNamespace().size() + 1); - - auto index = std::to_string(counter); - auto symName = mlir::StringAttr::get(op.getContext(), opName + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return this->createSymbolicName(op, /* taskTypeString */ std::nullopt, counter); } mlir::FailureOr DeclareConstBufferRewriter::symbolize( diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_task_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_task_buffer_rewriter.cpp index 68b9cfdf56..be432dd83a 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_task_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/declare_task_buffer_rewriter.cpp @@ -22,17 +22,7 @@ mlir::FailureOr DeclareTaskBufferRewriter::symbolize( llvm::SmallVector DeclareTaskBufferRewriter::getSymbolicNames( VPUMI37XX::DeclareTaskBufferOp op, size_t) { - auto opName = op->getName().stripDialect(); - auto taskTypeString = VPURegMapped::stringifyTaskType(op.getTaskType()); - - auto tileIdx = std::to_string(op.getType().getTileIdx()); - auto srcTypeIdx = std::to_string(op.getType().getListIdx()); - auto opIdx = std::to_string(op.getType().getValue()); - - auto symName = mlir::StringAttr::get( - op.getContext(), opName + "_" + taskTypeString + "_" + tileIdx + "_" + srcTypeIdx + "_" + opIdx); - - return {mlir::FlatSymbolRefAttr::get(symName)}; + return createSymbolicName(op, VPURegMapped::stringifyTaskType(op.getTaskType()).str(), 
/* counter */ std::nullopt); } } // namespace vpumi37xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/dma_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/dma_rewriter.cpp index 179c39f6f3..8713187c9d 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/dma_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI37XX2VPUASM/dma_rewriter.cpp @@ -10,16 +10,7 @@ namespace vpux { namespace vpumi37xx2vpuasm { llvm::SmallVector NNDMARewriter::getSymbolicNames(VPUMI37XX::NNDMAOp op, size_t) { - auto fullName = VPUMI37XX::NNDMAOp::getOperationName(); - auto opName = fullName.drop_front(VPUMI37XX::VPUMI37XXDialect::getDialectNamespace().size() + 1); - - auto tileIdx = std::to_string(op.getType().getTileIdx()); - auto srcTypeIdx = std::to_string(op.getType().getListIdx()); - auto opIdx = std::to_string(op.getType().getValue()); - - auto symName = mlir::StringAttr::get(op.getContext(), opName + "_" + tileIdx + "_" + srcTypeIdx + "_" + opIdx); - - return {mlir::FlatSymbolRefAttr::get(symName)}; + return createSymbolicName(op, std::nullopt, std::nullopt); } llvm::SmallVector> NNDMARewriter::reduce_dims_for_dma(mlir::Value val) { diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_buffer_rewriter.cpp index a914901575..4166401f27 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_buffer_rewriter.cpp @@ -58,12 +58,7 @@ llvm::SmallVector DeclareBufferRewriter::getSymbolicNam return {mlir::FlatSymbolRefAttr()}; } - auto fullName = VPURT::DeclareBufferOp::getOperationName(); - auto opName = fullName.drop_front(VPURT::VPURTDialect::getDialectNamespace().size() + 1); - - auto index = std::to_string(counter); - auto symName = 
mlir::StringAttr::get(op.getContext(), opName + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return createSymbolicName(op, /* taskTypeString */ std::nullopt, counter); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_const_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_const_buffer_rewriter.cpp index aa3e6144ae..d43fc5bb8a 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_const_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_const_buffer_rewriter.cpp @@ -12,12 +12,7 @@ namespace vpumi40xx2vpuasm { llvm::SmallVector DeclareConstBufferRewriter::getSymbolicNames(Const::DeclareOp op, size_t counter) { - auto fullName = Const::DeclareOp::getOperationName(); - auto opName = fullName.drop_front(Const::ConstDialect::getDialectNamespace().size() + 1); - - auto index = std::to_string(counter); - auto symName = mlir::StringAttr::get(op.getContext(), opName + index); - return {mlir::FlatSymbolRefAttr::get(symName)}; + return createSymbolicName(op, /* taskTypeString */ std::nullopt, counter); } mlir::FailureOr DeclareConstBufferRewriter::symbolize( diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_task_buffer_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_task_buffer_rewriter.cpp index b3a3cc20e2..4295e5b826 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_task_buffer_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/declare_task_buffer_rewriter.cpp @@ -23,17 +23,7 @@ mlir::FailureOr DeclareTaskBufferRewriter::symbolize( llvm::SmallVector DeclareTaskBufferRewriter::getSymbolicNames( VPUMI40XX::DeclareTaskBufferOp op, size_t) { - auto opName = op->getName().stripDialect(); - auto taskTypeString = VPURegMapped::stringifyTaskType(op.getTaskType()); - - auto 
tileIdx = std::to_string(op.getType().getTileIdx()); - auto srcTypeIdx = std::to_string(op.getType().getListIdx()); - auto opIdx = std::to_string(op.getType().getValue()); - - auto symName = mlir::StringAttr::get( - op.getContext(), opName + "_" + taskTypeString + "_" + tileIdx + "_" + srcTypeIdx + "_" + opIdx); - - return {mlir::FlatSymbolRefAttr::get(symName)}; + return createSymbolicName(op, VPURegMapped::stringifyTaskType(op.getTaskType()).str(), /* counter */ std::nullopt); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/dma_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/dma_rewriter.cpp index 25432c70c6..7838817204 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/dma_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/dma_rewriter.cpp @@ -11,7 +11,7 @@ namespace vpux { namespace vpumi40xx2vpuasm { llvm::SmallVector NNDMARewriter::getSymbolicNames(VPUMI40XX::NNDMAOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } VPUIP::DMADescriptorAttr NNDMARewriter::getDmaDescriptorAttr(VPUMI40XX::NNDMAOp op, mlir::MLIRContext* ctx) const { diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_data_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_data_rewriter.cpp index c04ea9fbd1..62f7f23aa5 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_data_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_data_rewriter.cpp @@ -19,7 +19,7 @@ mlir::FailureOr KernelDataRewriter::symbolize(VPUMI40XX::De llvm::SmallVector KernelDataRewriter::getSymbolicNames(VPUMI40XX::DeclareKernelArgsOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git 
a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_entry_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_entry_rewriter.cpp index d89f401e16..9c804aba0b 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_entry_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_entry_rewriter.cpp @@ -19,7 +19,7 @@ mlir::FailureOr KernelEntryRewriter::symbolize(VPUMI40XX::D llvm::SmallVector KernelEntryRewriter::getSymbolicNames(VPUMI40XX::DeclareKernelEntryOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_invocation_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_invocation_rewriter.cpp index ce9ca3a592..5a3f018abd 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_invocation_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_invocation_rewriter.cpp @@ -60,7 +60,7 @@ mlir::FailureOr KernelInvocationRewriter::symbolize( llvm::SmallVector KernelInvocationRewriter::getSymbolicNames( VPUMI40XX::ActKernelInvocationOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_params_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_params_rewriter.cpp index 1cab0cc8db..3193c4e901 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_params_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_params_rewriter.cpp @@ -48,7 +48,7 @@ mlir::FailureOr KernelParamsRewriter::symbolize(VPUMI40XX:: llvm::SmallVector KernelParamsRewriter::getSymbolicNames(VPUMI40XX::KernelParamsOp op, size_t) { - 
return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_range_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_range_rewriter.cpp index 9fec1ed9dd..d157ed5268 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_range_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_range_rewriter.cpp @@ -32,7 +32,7 @@ mlir::FailureOr KernelRangeRewriter::symbolize(VPUMI40XX::A llvm::SmallVector KernelRangeRewriter::getSymbolicNames(VPUMI40XX::ActKernelRangeOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_text_rewriter.cpp b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_text_rewriter.cpp index e50ae503c0..ea328b8175 100644 --- a/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_text_rewriter.cpp +++ b/src/vpux_compiler/src/conversion/rewriters/VPUMI40XX2VPUASM/kernel_text_rewriter.cpp @@ -19,7 +19,7 @@ mlir::FailureOr KernelTextRewriter::symbolize(VPUMI40XX::De llvm::SmallVector KernelTextRewriter::getSymbolicNames(VPUMI40XX::DeclareKernelTextOp op, size_t) { - return getSymbolicNamesByTileListValue(op); + return createSymbolicName(op); } } // namespace vpumi40xx2vpuasm diff --git a/src/vpux_compiler/src/core/attributes/dims_order.cpp b/src/vpux_compiler/src/core/attributes/dims_order.cpp index b3e2b70e08..f0a39214ae 100644 --- a/src/vpux_compiler/src/core/attributes/dims_order.cpp +++ b/src/vpux_compiler/src/core/attributes/dims_order.cpp @@ -48,6 +48,8 @@ const DimsOrder vpux::DimsOrder::HCNW = DimsOrder(0x3214); const DimsOrder vpux::DimsOrder::HNWC = DimsOrder(0x3142); const DimsOrder vpux::DimsOrder::CWNH = DimsOrder(0x2413); const DimsOrder 
vpux::DimsOrder::CNHW = DimsOrder(0x2134); +const DimsOrder vpux::DimsOrder::CHWN = DimsOrder(0x2341); +const DimsOrder vpux::DimsOrder::HCWN = DimsOrder(0x3241); const DimsOrder vpux::DimsOrder::NCDHW = DimsOrder(0x12345); const DimsOrder vpux::DimsOrder::NDHWC = DimsOrder(0x13452); diff --git a/src/vpux_compiler/src/core/barrier_info.cpp b/src/vpux_compiler/src/core/barrier_info.cpp index 9f4ab1b679..1f56303c01 100644 --- a/src/vpux_compiler/src/core/barrier_info.cpp +++ b/src/vpux_compiler/src/core/barrier_info.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dma.hpp" #include "vpux/compiler/utils/shave.hpp" @@ -1638,6 +1639,14 @@ std::optional vpux::BarrierInfo::getPreviousBlockSyncPoint(size_t taskIn return _syncTasksIds[blockInd - 1]; } +std::optional vpux::BarrierInfo::getNextBlockSyncPoint(size_t taskInd) const { + auto blockInd = getControlGraphBlockIndex(taskInd); + if (blockInd + 1 >= _syncTasksIds.size()) { + return std::nullopt; + } + return _syncTasksIds[blockInd + 1]; +} + void vpux::BarrierInfo::splitBarriersWithExceedingVariantCount(size_t availableSlots, size_t maxSlotsSum, size_t maxAvailableSlots) { bool maxSlotsSumLimitEnabled = (maxSlotsSum < maxAvailableSlots); @@ -2755,7 +2764,7 @@ void vpux::BarrierInfo::initializeTaskQueueTypeMap(const mlir::DenseSet tensorShapes, VPUNN: } // This function convert Arch kind to VPUNN VPUDevice directly and faithfully -VPUNN::VPUDevice getVPUNNDevice(VPU::ArchKind archKind) { +VPUNN::VPUDevice getVPUNNDevice(config::ArchKind archKind) { switch (archKind) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUNN::VPUDevice::VPU_2_7; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return 
VPUNN::VPUDevice::VPU_4_0; default: VPUX_THROW("Unsupported VPU arch type: '{0}'", archKind); @@ -64,6 +66,13 @@ VPUNN::DataType getElementType(mlir::Type type, [[maybe_unused]] VPUNN::VPUDevic } else if (type.isUnsignedInteger(CHAR_BIT * sizeof(int8_t))) { return VPUNN::DataType::UINT8; } else if (auto qType = mlir::dyn_cast(type)) { + auto storageType = qType.getStorageType(); + if (storageType.isFloat8E5M2()) { + return VPUNN::DataType::BF8; + } else if (storageType.isFloat8E4M3FN()) { + return VPUNN::DataType::HF8; + } + if (qType.getStorageTypeIntegralWidth() == 8) { return qType.isSigned() ? VPUNN::DataType::INT8 : VPUNN::DataType::UINT8; } @@ -158,16 +167,18 @@ VPUNN::ISIStrategy getVPUNNISIStrategyForNPU40XXAndBelow(VPUIP::DPUTaskOp dpuTas bool isConstDeclareOpFilledAllOne(Const::DeclareOp op) { const auto content = op.getContent(); - const auto values = content.getValues(); - if (values.size() == 0) { - return false; - } - for (const auto& value : values) { - if (value != 1) { + return content.read([](auto values) { + if (values.size() == 0) { return false; } - } - return true; + + for (const auto& value : values) { + if (checked_cast(value) != 1) { + return false; + } + } + return true; + }); } } // namespace @@ -181,7 +192,7 @@ VPUNN::SEPModeInfo vpux::getSEPModeInfo(VPUIP::SEPInfo sepInfo) { return VPUNN::SEPModeInfo{true, getWHCBShape(sepInfo.sepTableShape), getWHCBShape(sepInfo.sepActShape)}; } -VPUNN::DPUWorkload vpux::getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, VPU::ArchKind arch) { +VPUNN::DPUWorkload vpux::getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, config::ArchKind arch) { auto nceClusterOp = dpuTaskOp->getParentOfType(); VPUX_THROW_WHEN(nceClusterOp == nullptr, "The parent of dpuTaskOp {0} must be a NCEClusterTaskOp but not", dpuTaskOp->getLoc()); @@ -322,7 +333,7 @@ VPUNN::DPUWorkload vpux::getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, VPU::ArchKin if (inputTwoElemType != nullptr) { vpunnDPUWorkload.weight_type = getElementType(inputTwoElemType, 
getVPUNNDevice(arch)); } - vpunnDPUWorkload.device = getVPUDeviceType(arch); + vpunnDPUWorkload.device = VPU::getVPUDeviceType(arch); vpunnDPUWorkload.op = opType; vpunnDPUWorkload.inputs = {inputTensor}; vpunnDPUWorkload.outputs = {outputTensor}; @@ -360,12 +371,12 @@ VPUNN::DPUWorkload vpux::getDPUWorkload(VPUIP::DPUTaskOp dpuTaskOp, VPU::ArchKin } size_t calculateMultiClusterDMACost(mlir::Value innerOperand, VPUNN::DataType inElemType, VPUNN::DataType outElemType, - VPU::ArchKind archKind, const std::shared_ptr& costModel, + config::ArchKind archKind, const std::shared_ptr& costModel, [[maybe_unused]] int64_t numDMAPorts) { auto operandType = innerOperand.getType(); auto distributedType = mlir::dyn_cast(operandType); VPUX_THROW_UNLESS(distributedType != nullptr, "Unsupported operand type {0}", operandType); - auto vpuDevice = getVPUDeviceType(archKind); + auto vpuDevice = VPU::getVPUDeviceType(archKind); // TODO: E#66557 // Currently, if DMA source is OVERLAPPED we're moving the overlap twice. 
Once that is optimized, @@ -386,7 +397,7 @@ bool extraDMAsRequired(mlir::Value innerOperand) { return false; } -size_t vpux::getDMACost(mlir::Value input, mlir::Value output, VPU::ArchKind archKind, +size_t vpux::getDMACost(mlir::Value input, mlir::Value output, config::ArchKind archKind, const std::shared_ptr& costModel, int64_t numDMAPorts) { auto inputType = input.getType(); auto outputType = output.getType(); @@ -408,7 +419,7 @@ size_t vpux::getDMACost(mlir::Value input, mlir::Value output, VPU::ArchKind arc auto outputShape = getShape(output); // TODO: add layout info to VPUNN tensors - auto cost = costModel->DMA(getVPUDeviceType(archKind), {getVPUNNTensor(inputShape, inElemType)}, + auto cost = costModel->DMA(VPU::getVPUDeviceType(archKind), {getVPUNNTensor(inputShape, inElemType)}, {getVPUNNTensor(outputShape, outElemType)}, getMemoryLocation(inputType), getMemoryLocation(outputType)); @@ -596,7 +607,7 @@ size_t vpux::getAsyncExecuteCycleEnd(mlir::async::ExecuteOp op) { return checked_cast(mlir::cast(op->getAttr(cycleEnd)).getValue().getSExtValue()); } -size_t vpux::calculateCopyCycles(mlir::Operation* innerOp, VPU::ArchKind archKind, +size_t vpux::calculateCopyCycles(mlir::Operation* innerOp, config::ArchKind archKind, const std::shared_ptr& costModel) { if (auto copyOp = mlir::dyn_cast(innerOp)) { return checked_cast(getDMACost(copyOp.getInput(), copyOp.getOutput(), archKind, costModel)); @@ -802,13 +813,13 @@ std::unique_ptr queryKernelMap(const std::string& swKernelNa return queryKernelMap(swKernelName, vpuDev, inputTypes, mlir::cast(output.getType())); } -size_t getShaveActCycleForSwKernelFunc(const std::string& swKernelName, VPU::ArchKind arch, +size_t getShaveActCycleForSwKernelFunc(const std::string& swKernelName, config::ArchKind arch, ArrayRef inputs, ArrayRef outputs, const std::shared_ptr& costModel) { VPUX_THROW_WHEN(inputs.empty(), "No inputs identified for op {0}", swKernelName); VPUX_THROW_WHEN(outputs.empty(), "No outputs identified for op 
{0}", swKernelName); - auto vpuDev = getVPUDeviceType(arch); + auto vpuDev = vpux::VPU::getVPUDeviceType(arch); std::unique_ptr vpunnLayer = queryKernelMap(swKernelName, vpuDev, inputs, outputs[0]); @@ -821,7 +832,7 @@ std::unique_ptr vpux::getVPUNNSWKernelOp(VPUIP::SwKernelOp s return nullptr; } const auto swKernelName = getSwKernelOperationName(swKernelOp); - auto vpuDev = getVPUDeviceType(VPU::getArch(swKernelOp.getOperation())); + auto vpuDev = VPU::getVPUDeviceType(config::getArch(swKernelOp.getOperation())); auto inputs = to_small_vector(swKernelOp->getOperands()); auto output = swKernelOp->getResult(0); @@ -832,7 +843,7 @@ std::unique_ptr vpux::getVPUNNSWKernelOp(VPUIP::SwKernelOp s } std::unique_ptr vpux::getVPUNNSWKernelOp(VPU::SWOpInterface operation) { - auto vpuDev = VPU::getVPUDeviceType(VPU::getArch(operation)); + auto vpuDev = VPU::getVPUDeviceType(config::getArch(operation)); const auto operName = operation->getName().stripDialect().str(); auto inputs = to_small_vector(operation->getOperands()); @@ -846,7 +857,7 @@ std::unique_ptr vpux::getVPUNNSWKernelOp(VPU::SWOpInterface std::unique_ptr vpux::getVPUNNSWKernelOp(VPU::SWOpInterface operation, vpux::NDTypeInterface outputNDType, ArrayRef types) { - auto vpuDev = VPU::getVPUDeviceType(VPU::getArch(operation)); + auto vpuDev = VPU::getVPUDeviceType(config::getArch(operation)); const auto operName = operation->getName().stripDialect().str(); std::unique_ptr vpunnLayer = queryKernelMap(operName, vpuDev, types, outputNDType); @@ -855,7 +866,7 @@ std::unique_ptr vpux::getVPUNNSWKernelOp(VPU::SWOpInterface } size_t vpux::calculateShaveActCycles(VPUIP::SwKernelOp swKernelOp, - const std::shared_ptr& costModel, VPU::ArchKind arch) { + const std::shared_ptr& costModel, config::ArchKind arch) { if (swKernelOp.getInputs().empty() || swKernelOp.getOutputBuffs().empty()) { return 1; } @@ -903,7 +914,7 @@ size_t vpux::calculateShaveActCycles(VPUIP::SwKernelOp swKernelOp, } size_t 
vpux::getDPUTaskOpCost(VPUIP::DPUTaskOp dpuTaskOp, const std::shared_ptr& costModel, - VPU::ArchKind arch, vpux::Logger log) { + config::ArchKind arch, vpux::Logger log) { auto nceOp = dpuTaskOp->getParentOfType(); VPUX_THROW_WHEN(nceOp == nullptr, "The parent of dpuTaskOp {0} must be a NCEClusterTaskOp but not", dpuTaskOp->getLoc()); @@ -938,7 +949,7 @@ size_t vpux::getDPUTaskOpCost(VPUIP::DPUTaskOp dpuTaskOp, const std::shared_ptr< } std::vector> vpux::calculateNceVariantCycles( - VPUIP::NCEClusterTaskOp nceOp, const std::shared_ptr& costModel, VPU::ArchKind arch, + VPUIP::NCEClusterTaskOp nceOp, const std::shared_ptr& costModel, config::ArchKind arch, vpux::Logger log) { std::vector> nceVariantCyclePerCluster; for (auto dpuTaskOp : nceOp.getVariants().getOps()) { @@ -949,7 +960,7 @@ std::vector> vpux::calculateNceVariantCycles( } size_t vpux::calculateNceCycles(VPUIP::NCEClusterTaskOp nceOp, const std::shared_ptr& costModel, - VPU::ArchKind arch, vpux::Logger log, int64_t numDPU) { + config::ArchKind arch, vpux::Logger log, int64_t numDPU) { auto variantCostVec = calculateNceVariantCycles(nceOp, costModel, arch, log); // Group costs by cluster ID and find the maximum cost for each cluster diff --git a/src/vpux_compiler/src/core/cycle_cost_info.cpp b/src/vpux_compiler/src/core/cycle_cost_info.cpp index e438611b6b..12a0eaaa0b 100644 --- a/src/vpux_compiler/src/core/cycle_cost_info.cpp +++ b/src/vpux_compiler/src/core/cycle_cost_info.cpp @@ -5,21 +5,19 @@ #include "vpux/compiler/core/cycle_cost_info.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" -#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; CycleCostInfo::CycleCostInfo(mlir::func::FuncOp func) - : CycleCostInfo(VPU::CostModelConfig::createCostModel(VPU::getArch(func)), func) { + : 
CycleCostInfo(VPU::CostModelConfig::createCostModel(config::getArch(func)), func) { } CycleCostInfo::CycleCostInfo(std::shared_ptr costModel, mlir::func::FuncOp func) : _log(Logger::global().nest("cycle-cost-info", 0)) { auto module = func->getParentOfType(); - _archKind = VPU::getArch(module); + _archKind = config::getArch(module); _costModel = std::move(costModel); _log.trace("Analyze cycle cost for Function '@{0}'", func.getName()); diff --git a/src/vpux_compiler/src/core/execution_group_analysis.cpp b/src/vpux_compiler/src/core/execution_group_analysis.cpp index 4fa85eac4e..4d61c03b7b 100644 --- a/src/vpux_compiler/src/core/execution_group_analysis.cpp +++ b/src/vpux_compiler/src/core/execution_group_analysis.cpp @@ -5,10 +5,12 @@ // #include "vpux/compiler/core/execution_group_analysis.hpp" -#include #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp" -#include "vpux/compiler/utils/wlm_legalization_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" + +#include + namespace vpux { ExecutionGroupAnalysis::ExecutionGroupAnalysis(mlir::func::FuncOp func) diff --git a/src/vpux_compiler/src/core/feasible_memory_scheduler.cpp b/src/vpux_compiler/src/core/feasible_memory_scheduler.cpp index 67049cf430..ee26ec1ca2 100644 --- a/src/vpux_compiler/src/core/feasible_memory_scheduler.cpp +++ b/src/vpux_compiler/src/core/feasible_memory_scheduler.cpp @@ -4,19 +4,14 @@ // #include "vpux/compiler/core/feasible_memory_scheduler.hpp" - +#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/utils/async_dialect_utils.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dma.hpp" #include 
"vpux/compiler/utils/stl_extras.hpp" -#include "vpux/compiler/utils/strings.hpp" - -#include "vpux/utils/core/range.hpp" using namespace vpux; using operationIdxType = FeasibleMemoryScheduler::operationIdxType; @@ -39,8 +34,8 @@ using operationIdxType = FeasibleMemoryScheduler::operationIdxType; FeasibleMemoryScheduler::FeasibleMemoryScheduler(VPU::MemoryKind memKind, VPU::MemoryKind secondLvlMemKind, MemLiveRangeInfo& liveRangeInfo, AsyncDepsInfo& depsInfo, Logger log, - LinearScan& scan, VPU::ArchKind arch, - std::shared_ptr costModel, + LinearScan& scan, + config::ArchKind arch, std::shared_ptr costModel, int64_t nceClusterCount, int64_t dmaCount, bool enableScheduleStatistics, bool optimizeFragmentation) : _log(log), diff --git a/src/vpux_compiler/src/core/feasible_memory_scheduler_control_edges.cpp b/src/vpux_compiler/src/core/feasible_memory_scheduler_control_edges.cpp index cdd933708f..1655d7f3dd 100644 --- a/src/vpux_compiler/src/core/feasible_memory_scheduler_control_edges.cpp +++ b/src/vpux_compiler/src/core/feasible_memory_scheduler_control_edges.cpp @@ -4,14 +4,12 @@ // #include "vpux/compiler/core/feasible_memory_scheduler_control_edges.hpp" - -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/rewriter.hpp" - +#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/feasible_scheduler_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" - -#include "vpux/utils/core/range.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" +#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/attributes.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/core/profiling.cpp b/src/vpux_compiler/src/core/profiling.cpp index d95e9eee89..20608c5f07 100644 --- a/src/vpux_compiler/src/core/profiling.cpp +++ b/src/vpux_compiler/src/core/profiling.cpp @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include 
-// - #include "vpux/compiler/core/profiling.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" +#include + using namespace vpux; namespace { @@ -52,12 +53,12 @@ VPUIP::M2IProfilingMetadataAttr vpux::getM2IProfilingMetaAttr(mlir::MLIRContext* return VPUIP::M2IProfilingMetadataAttr::get(ctx, getIntAttr(ctx, bufferId), getIntAttr(ctx, bufferOffset)); } -DMAProfilingMode vpux::getDMAProfilingMode(VPU::ArchKind arch, const std::string& optionValue) { +DMAProfilingMode vpux::getDMAProfilingMode(config::ArchKind arch, const std::string& optionValue) { if (optionValue == "false") { - return arch == VPU::ArchKind::NPU40XX ? DMAProfilingMode::SCRATCH : DMAProfilingMode::DISABLED; + return arch == config::ArchKind::NPU40XX ? DMAProfilingMode::SCRATCH : DMAProfilingMode::DISABLED; } switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: if (optionValue == "true") { return DMAProfilingMode::SW; } else { @@ -136,7 +137,7 @@ bool vpux::isDmaHwpUsedInVPURT(mlir::func::FuncOp& func) { } bool vpux::isDmaHwpUsedInVPURT(mlir::ModuleOp& module) { - if (vpux::VPU::getArch(module) < vpux::VPU::ArchKind::NPU40XX) { + if (vpux::config::getArch(module) < vpux::config::ArchKind::NPU40XX) { return false; } net::NetworkInfoOp netInfo; diff --git a/src/vpux_compiler/src/core/profiling_metadata.cpp b/src/vpux_compiler/src/core/profiling_metadata.cpp index 7dae13ca44..85b6147b35 100644 --- a/src/vpux_compiler/src/core/profiling_metadata.cpp +++ b/src/vpux_compiler/src/core/profiling_metadata.cpp @@ -4,16 +4,14 @@ // #include "vpux/compiler/core/profiling_metadata.hpp" -#include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/device.hpp" #include 
"vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/strings.hpp" -#include "vpux/utils/core/optional.hpp" #include "vpux/utils/profiling/common.hpp" #include "vpux/utils/profiling/metadata.hpp" @@ -25,11 +23,11 @@ using namespace vpux; namespace { -VPUIP::TargetDevice mapTargetDevice(VPU::ArchKind kind) { +VPUIP::TargetDevice mapTargetDevice(config::ArchKind kind) { switch (kind) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUIP::TargetDevice::TargetDevice_VPUX37XX; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPUIP::TargetDevice::TargetDevice_VPUX40XX; default: VPUX_THROW("Unsupported architecture '{0}'", kind); @@ -420,7 +418,7 @@ flatbuffers::Offset createProfilingBufferOffset(Pr return ProfilingFB::CreateProfilingBuffer(builder, sectionsOffset, sectionTotalSizeBytes); } -flatbuffers::Offset createPlatformOffset(VPU::ArchKind arch, +flatbuffers::Offset createPlatformOffset(config::ArchKind arch, flatbuffers::FlatBufferBuilder& builder) { auto targetDevice = mapTargetDevice(arch); return ProfilingFB::CreatePlatform(builder, (int8_t)targetDevice); @@ -436,7 +434,7 @@ flatbuffers::DetachedBuffer buildProfilingMetaGeneric(net::NetworkInfoOp netInfo const auto barriers = getBarriers(funcOp); ProfilingConfiguration profilingCfg(netInfo); - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); auto dmaOffset = getDmaTasksOffset( profilingCfg, builder, DialectProvider::template extractOp(funcOp), barriers); @@ -467,9 +465,9 @@ flatbuffers::DetachedBuffer buildProfilingMetaVPURTGeneral(net::NetworkInfoOp ne } flatbuffers::DetachedBuffer buildProfilingMeta(net::NetworkInfoOp netInfo, 
mlir::func::FuncOp funcOp, Logger log) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return ::buildProfilingMetaVPURTGeneral(netInfo, funcOp, log); default: return ::buildProfilingMetaVPURTGeneral(netInfo, funcOp, log); diff --git a/src/vpux_compiler/src/core/reserved_memory_info.cpp b/src/vpux_compiler/src/core/reserved_memory_info.cpp index 1c31cbed60..d1fea77e1a 100644 --- a/src/vpux_compiler/src/core/reserved_memory_info.cpp +++ b/src/vpux_compiler/src/core/reserved_memory_info.cpp @@ -16,6 +16,11 @@ using namespace vpux; ReservedMemInfo::ReservedMemInfo(mlir::ModuleOp moduleOp, mlir::AnalysisManager& am) { // TODO:#108991 -- for now only "main" function with inner functions is supported, // but it is possible support multiple nested calls using a loop through call/function ops + + if (moduleOp.getOps().empty()) { + return; + } + mlir::func::FuncOp netFunc; net::NetworkInfoOp netInfo; net::NetworkInfoOp::getFromModule(moduleOp, netInfo, netFunc); diff --git a/src/vpux_compiler/src/core/schedule_analysis_utils.cpp b/src/vpux_compiler/src/core/schedule_analysis_utils.cpp index fc5ab7cde3..41225d8a5f 100644 --- a/src/vpux_compiler/src/core/schedule_analysis_utils.cpp +++ b/src/vpux_compiler/src/core/schedule_analysis_utils.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/core/schedule_analysis_utils.hpp" +#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/utils/profiling/reports/ted.hpp" diff --git a/src/vpux_compiler/src/core/tiling.cpp b/src/vpux_compiler/src/core/tiling.cpp index ea96c2b837..0564deeead 100644 --- a/src/vpux_compiler/src/core/tiling.cpp +++ b/src/vpux_compiler/src/core/tiling.cpp @@ -3,36 +3,37 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include -#include -#include - -#include "vpux/compiler/core/layers.hpp" #include 
"vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/utils/core/numeric.hpp" - +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/roll_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/interfaces/workload_splitter_base.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" +#include "vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dilated_utils.hpp" +#include "vpux/utils/core/numeric.hpp" + +#include +#include +#include + +#include +#include +#include using namespace vpux; @@ -709,30 +710,54 @@ namespace { struct PlaneTile final { DimRange width; DimRange height; + DimRange depth; + bool is5D = false; + + PlaneTile() = default; + + // 
4D constructor + PlaneTile(DimRange w, DimRange h): width(w), height(h), depth(0, 0), is5D(false) { + } + + // 5D constructor + PlaneTile(DimRange w, DimRange h, DimRange d): width(w), height(h), depth(d), is5D(true) { + } int64_t area() const { - return width.length() * height.length(); + return is5D ? width.length() * height.length() * depth.length() : width.length() * height.length(); } - // Checks if rhs located completely in this. bool contains(const PlaneTile& other) const { - return width.contains(other.width) && height.contains(other.height); + VPUX_THROW_UNLESS(is5D == other.is5D, "Cannot compare 4D and 5D tiles"); + bool base = width.contains(other.width) && height.contains(other.height); + return is5D ? base && depth.contains(other.depth) : base; } // Returns new `PlaneTile` which represents `other` as ROI of this. PlaneTile asROI(const PlaneTile& other) const { - return {width.asROI(other.width), height.asROI(other.height)}; + VPUX_THROW_UNLESS(is5D == other.is5D, "Cannot compute ROI between 4D and 5D tiles"); + return is5D ? PlaneTile(width.asROI(other.width), height.asROI(other.height), depth.asROI(other.depth)) + : PlaneTile(width.asROI(other.width), height.asROI(other.height)); } bool operator==(const PlaneTile& other) const { - return width == other.width && height == other.height; + if (is5D != other.is5D) { + return false; + } + bool base = width == other.width && height == other.height; + return is5D ? 
base && depth == other.depth : base; } + bool operator!=(const PlaneTile& other) const { return !(*this == other); } void printFormat(llvm::raw_ostream& stream) const { - printTo(stream, "PlaneTile [width tile = {0}, height tile = {1}]", width, height); + if (is5D) { + printTo(stream, "PlaneTile [width tile = {0}, height tile = {1}, depth tile = {2}]", width, height, depth); + } else { + printTo(stream, "PlaneTile [width tile = {0}, height tile = {1}]", width, height); + } } }; @@ -757,24 +782,55 @@ struct PlaneTileSolution final { template std::tuple inputForOutputTile(const PlaneTile& output, int64_t kernelX, int64_t kernelY, int64_t strideX, int64_t strideY, ShapeRef initialInputDims, - const PadInfo& initialPad) { - PlaneTile inputTile = {{0, 0}, {0, 0}}; - PadInfo pad = {0, 0, 0, 0}; - std::tie(inputTile.height, pad.top, pad.bottom) = inputForOutputDim( - output.height, kernelY, strideY, {0, initialInputDims[Dims::Act::H]}, initialPad.top, initialPad.bottom); + const PadInfo& initialPad, + std::optional kernelD = std::nullopt, + std::optional strideD = std::nullopt) { + PlaneTile inputTile; + PadInfo pad; + + if (output.is5D) { + inputTile = PlaneTile({0, 0}, {0, 0}, {0, 0}); + pad = {0, 0, 0, 0, 0, 0}; + + std::tie(inputTile.height, pad.top, pad.bottom) = + inputForOutputDim(output.height, kernelY, strideY, {0, initialInputDims[Dims5D::Act::H]}, + initialPad.top, initialPad.bottom); + + std::tie(inputTile.width, pad.left, pad.right) = + inputForOutputDim(output.width, kernelX, strideX, {0, initialInputDims[Dims5D::Act::W]}, + initialPad.left, initialPad.right); + + if (kernelD.has_value() && strideD.has_value()) { + std::tie(inputTile.depth, pad.front, pad.back) = + inputForOutputDim(output.depth, kernelD.value(), strideD.value(), + {0, initialInputDims[Dims5D::Act::D]}, initialPad.front, initialPad.back); + } else { + VPUX_THROW("Missing kernelD/strideD for 5D"); + } + } else { + // 4D case + inputTile = PlaneTile({0, 0}, {0, 0}); + pad = {0, 0, 0, 0}; - 
std::tie(inputTile.width, pad.left, pad.right) = inputForOutputDim( - output.width, kernelX, strideX, {0, initialInputDims[Dims::Act::W]}, initialPad.left, initialPad.right); + std::tie(inputTile.height, pad.top, pad.bottom) = + inputForOutputDim(output.height, kernelY, strideY, {0, initialInputDims[Dims::Act::H]}, initialPad.top, + initialPad.bottom); + + std::tie(inputTile.width, pad.left, pad.right) = inputForOutputDim( + output.width, kernelX, strideX, {0, initialInputDims[Dims::Act::W]}, initialPad.left, initialPad.right); + } return std::make_tuple(inputTile, pad); } template PlaneTileSolution solutionForOutputTile(const PlaneTile& output, int64_t kernelX, int64_t kernelY, int64_t strideX, - int64_t strideY, ShapeRef initialInputDims, const PadInfo& initialPad) { + int64_t strideY, ShapeRef initialInputDims, const PadInfo& initialPad, + std::optional kernelD = std::nullopt, + std::optional strideD = std::nullopt) { PlaneTileSolution solution; - std::tie(solution.inputTile, solution.inputPad) = - inputForOutputTile(output, kernelX, kernelY, strideX, strideY, initialInputDims, initialPad); + std::tie(solution.inputTile, solution.inputPad) = inputForOutputTile( + output, kernelX, kernelY, strideX, strideY, initialInputDims, initialPad, kernelD, strideD); return solution; } @@ -787,7 +843,8 @@ PlaneTileSolution solutionForOutputTile(const PlaneTile& output, int64_t kernelX // a redundant data slice from input. Here is to restore original input planar shape to avoid extra copies. 
template void restorePlanarShapeForInputTile(TileInfo& inputTile, ShapeRef origInputShape, vpux::Dim planarDim) { - if (planarDim != Dims::Act::H && planarDim != Dims::Act::W) { + if (planarDim != Dims::Act::H && planarDim != Dims::Act::W && + !(std::is_same_v && planarDim == Dims5D::Act::D)) { VPUX_THROW("Invalid planar dim {0}", planarDim); } if (inputTile.shape[planarDim] > origInputShape[planarDim]) { @@ -940,12 +997,72 @@ InputTiling vpux::backInferMatMulTile(const TileInfo& outputTile, ShapeRef origI return TilingInfo{{std::move(inputTile), std::move(filterTile)}, solution.inputPad}; } +// +// 5D Pooling tiling +// + +InputTiling vpux::backInfer5DPoolTile(const TileInfo& outputTile, ShapeRef origInputShape, mlir::ArrayAttr kernel_size, + mlir::ArrayAttr strides, const PadInfo& origPadding) { + PlaneTile output; + output.is5D = true; + output.height.begin = outputTile.offsets[Dims5D::Act::H]; + output.height.end = outputTile.offsets[Dims5D::Act::H] + outputTile.shape[Dims5D::Act::H]; + output.width.begin = outputTile.offsets[Dims5D::Act::W]; + output.width.end = outputTile.offsets[Dims5D::Act::W] + outputTile.shape[Dims5D::Act::W]; + output.depth.begin = outputTile.offsets[Dims5D::Act::D]; + output.depth.end = outputTile.offsets[Dims5D::Act::D] + outputTile.shape[Dims5D::Act::D]; + + const auto kernelY = mlir::cast(kernel_size[Dims5D::Kernel::Y.ind()]).getValue().getSExtValue(); + const auto kernelX = mlir::cast(kernel_size[Dims5D::Kernel::X.ind()]).getValue().getSExtValue(); + const auto kernelD = mlir::cast(kernel_size[Dims5D::Kernel::Z.ind()]).getValue().getSExtValue(); + + const auto strideY = mlir::cast(strides[Dims5D::Strides::Y.ind()]).getValue().getSExtValue(); + const auto strideX = mlir::cast(strides[Dims5D::Strides::X.ind()]).getValue().getSExtValue(); + const auto strideD = mlir::cast(strides[Dims5D::Strides::Z.ind()]).getValue().getSExtValue(); + + const auto solution = solutionForOutputTile(output, kernelX, kernelY, strideX, strideY, 
origInputShape, + origPadding, kernelD, strideD); + TileInfo inputTile(origInputShape); + + inputTile.shape[Dims5D::Act::N] = outputTile.shape[Dims5D::Act::N]; + inputTile.offsets[Dims5D::Act::N] = outputTile.offsets[Dims5D::Act::N]; + + inputTile.shape[Dims5D::Act::C] = outputTile.shape[Dims5D::Act::C]; + inputTile.offsets[Dims5D::Act::C] = outputTile.offsets[Dims5D::Act::C]; + + inputTile.offsets[Dims5D::Act::H] = solution.inputTile.height.begin; + inputTile.shape[Dims5D::Act::H] = solution.inputTile.height.length(); + + inputTile.offsets[Dims5D::Act::W] = solution.inputTile.width.begin; + inputTile.shape[Dims5D::Act::W] = solution.inputTile.width.length(); + + inputTile.offsets[Dims5D::Act::D] = solution.inputTile.depth.begin; + inputTile.shape[Dims5D::Act::D] = solution.inputTile.depth.length(); + + if (outputTile.isCompletedTile && outputTile.axis[Dims5D::Act::H] == 1) { + restorePlanarShapeForInputTile(inputTile, origInputShape, Dims5D::Act::H); + } + if (outputTile.isCompletedTile && outputTile.axis[Dims5D::Act::W] == 1) { + restorePlanarShapeForInputTile(inputTile, origInputShape, Dims5D::Act::W); + } + if (outputTile.isCompletedTile && outputTile.axis[Dims5D::Act::D] == 1) { + restorePlanarShapeForInputTile(inputTile, origInputShape, Dims5D::Act::D); + } + + return TilingInfo{{std::move(inputTile)}, solution.inputPad}; +} + // // Pooling tiling // InputTiling vpux::backInferPoolTile(const TileInfo& outputTile, ShapeRef origInputShape, mlir::ArrayAttr kernel_size, mlir::ArrayAttr strides, const PadInfo& origPadding) { + const auto inputRank = origInputShape.size(); + if (inputRank == 5) { + return backInfer5DPoolTile(outputTile, origInputShape, kernel_size, strides, origPadding); + } + PlaneTile output; output.height.begin = outputTile.offsets[Dims4D::Act::H]; output.height.end = outputTile.offsets[Dims4D::Act::H] + outputTile.shape[Dims4D::Act::H]; @@ -1776,22 +1893,20 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger // 
First tile at C // else tile at H + log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); auto tileDimOrder = llvm::TypeSwitch(op) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); - // This can be removed when VPUNN is upgraded to support INT4 data type, tracked in E#113316. - if (VPU::isNCEWithInt4Weights(op)) { + auto costModelUtils = VPU::getICostModelUtilsInterface(op->getContext()); + if (VPU::isNCEWithInt4Weights(op) && !costModelUtils->isNCEWithInt4WeightsSupported()) { return getTileDimOrderByShape(op, Dims4D::Filter::OC, Dims4D::Act::H); } return getTileDimOrderByShape(op, Dims4D::Filter::IC, Dims4D::Act::C); }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); return getTileDimOrderByShape(op, Dims4D::Filter::OC, Dims4D::Act::H); }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); const auto outputShape = getShape(op->getResult(0)); const auto isChannelValid = VPU::doesNCEOpChannelSatisfyWorkload(op, TileInfo(outputShape)); if (isChannelValid) { @@ -1801,7 +1916,6 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger } }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); const auto outputShape = getShape(op->getResult(0)); const auto isChannelValid = VPU::doesNCEOpChannelSatisfyWorkload(op, TileInfo(outputShape)); if (isChannelValid) { @@ -1813,7 +1927,6 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger } }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); auto mvn1 = mlir::dyn_cast(op); auto dims = mvn1.getNonNormDims(); VPUX_THROW_UNLESS(dims.size(), "Could not find non-norm axes"); @@ -1831,7 +1944,8 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger auto retDims = 
getTileDimOrderND(outType.getMemShape(), order); if (order.toMemDim(Dims4D::Act::C).ind() == (outType.getRank() - 1)) { - // Avoid C-tiling in C-minor layout as may lead to Shave suboptimal configs (e.g. C=21) + // Avoid C-tiling in C-minor layout as may lead to Shave + // suboptimal configs (e.g. C=21) auto dimIt = std::find(retDims.begin(), retDims.end(), Dims4D::Act::C); if (dimIt != retDims.end()) { retDims.erase(dimIt); @@ -1840,8 +1954,8 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger return retDims; }) .Case([&](mlir::Operation*) { - // Not splitting over C, to keep aligned with number of Scales in qType - // and so avoid 'validateQuantElemType' fail + // Not splitting over C, to keep aligned with number of Scales in + // qType and so avoid 'validateQuantElemType' fail return DimArr{Dims4D::Act::H, Dims4D::Act::W}; }) .Case([&](mlir::Operation*) { @@ -1876,7 +1990,6 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger return tileDimOrder; }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); auto preluOp = mlir::dyn_cast(op); auto inputShape = getShape(preluOp.getInput()); auto slopeShape = getShape(preluOp.getNegativeSlope()); @@ -1898,7 +2011,8 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger const auto outputType = mlir::cast(op->getResult(0).getType()); VPUX_THROW_UNLESS(outputType.getDimsOrder() == DimsOrder::NCHW || outputType.getDimsOrder() == DimsOrder::NHWC, - "DepthToSpace Op only support NCHW and NHWC layout, but got '{0}'", + "DepthToSpace Op only support NCHW and NHWC " + "layout, but got '{0}'", outputType.getDimsOrder()); // It is better to tile DepthToSpace Op at the highest dimension @@ -1909,16 +2023,17 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger : SmallVector{Dims4D::Act::C, Dims4D::Act::H, Dims4D::Act::W}; } - // It is illegal to tile DepthToSpace 
Op at channel when it is the BLOCKS_FIRST mode - // If that, the output will be a discontinuous memory buffer and will cause accuracy issue + // It is illegal to tile DepthToSpace Op at channel when it is the + // BLOCKS_FIRST mode If that, the output will be a discontinuous + // memory buffer and will cause accuracy issue if (origOp.getMode() == IE::DepthToSpaceMode::BLOCKS_FIRST) { return SmallVector{Dims4D::Act::H, Dims4D::Act::W}; } - VPUX_THROW("Unknown DepthToSpaceMode. BLOCKS_FIRST and DEPTH_FIRST methods are supported only"); + VPUX_THROW("Unknown DepthToSpaceMode. BLOCKS_FIRST and " + "DEPTH_FIRST methods are supported only"); }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); const auto inType = mlir::cast(op->getOperand(0).getType()); const auto outType = mlir::cast(op->getResult(0).getType()); const auto inOrder = inType.getDimsOrder(); @@ -1929,8 +2044,7 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger return getTileDimOrderND(outType.getMemShape(), outType.getDimsOrder()); } }) - .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); + .Case([&](mlir::Operation*) { return DimArr{Dims4D::Act::H, Dims4D::Act::W, Dims4D::Act::C}; }) .Case([&](VPU::NormalizeL2Op op) { @@ -1956,7 +2070,6 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger return tileDimOrder; }) .Case([&](VPU::DynamicDequantizeOp) { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); const auto outputType = mlir::cast(op->getResult(0).getType()); auto tileDimOrder = getTileDimOrderND(outputType.getMemShape(), outputType.getDimsOrder()); // Ensure tile less the W dim to avoid using slow C algo @@ -1997,11 +2110,9 @@ DimArr vpux::getTileDimOrder(mlir::Operation* op, TilingMode tilingMode, Logger return tileDimOrder; }) .Case([&](mlir::Operation* op) { - log.nest(2).trace("Check tile Dim order for Op 
at {0}", op->getLoc()); return getOuterDimPrioritizedTileDimOrderND(op, tilingMode, log); }) .Default([&](mlir::Operation* op) -> DimArr { - log.nest(2).trace("Check tile Dim order for Op at {0}", op->getLoc()); const auto outputType = mlir::cast(op->getResult(0).getType()); return getTileDimOrderND(outputType.getMemShape(), outputType.getDimsOrder()); @@ -2057,7 +2168,8 @@ bool vpux::isMultiClusterCompatibleForTiling(mlir::Operation* op, const OutputTi // 2. (Height) DPUs are fully utilized - at least one line per DPU. // 3. checkMinimalWidthAndHeight ensures each DPU processes at least 4 lines for efficiency. // 4. (Channel) No extra channel alignment - the output channel for each cluster should be larger than minChannelSize. -SmallVector vpux::getMaxNumTiles(mlir::Operation* op, bool checkMinimalWidthAndHeight) { +SmallVector vpux::getMaxNumTiles(mlir::Operation* op, bool checkMinimalWidthAndHeight, + bool checkWorkloadEfficiency) { const auto outputShape = getShape(op->getResult(0)); // #E152765 - generic support for GNCHW const auto dimH = requiresDimsGroups5D(op) ? DimsGroups5D::Act::H : Dims4D::Act::H; @@ -2115,7 +2227,14 @@ SmallVector vpux::getMaxNumTiles(mlir::Operation* op, bool checkMinimal if (mlir::isa(op) && checkMinimalWidthAndHeight) { // Stencils are using 4x4x16 tile configuration // NCE is more efficient when height and width are larger than 4 lines - minHeightSize = std::max({4, minHeightSize}); + // + // If the height is between 5 and 7 lines, the workload efficiency is suboptimal. + // Therefore, we increase the minimum height to 8 lines to improve efficiency. + // Currently, this adjustment is only applied to the multi-dimension pipeline tiling strategy. + // This is because layers requiring multi-dimension tiling are typically compute-bound, + // necessitating a greater focus on optimizing workload efficiency. + minHeightSize = + checkWorkloadEfficiency ? 
std::max({8, minHeightSize}) : std::max({4, minHeightSize}); minWidthSize = std::max({4, minWidthSize}); } // NCEPermute operation requires alignment only for width @@ -2129,6 +2248,14 @@ SmallVector vpux::getMaxNumTiles(mlir::Operation* op, bool checkMinimal VPUX_THROW_UNLESS(outputShape.size() == 4 || outputShape.size() == DimsGroups5D::Act::numDims, "Unsupported shape rank: {0}", outputShape.size()); minChannelSize = std::max({minChannelSize, channelsInfo.getOutputChannelAlignment()}); + // When the output channel size is 16, the workload efficiency is suboptimal. + // To improve efficiency, we increase the minimum channel size to 64 (16*4). + // Currently, this adjustment is only applied to the multi-dimension pipeline tiling strategy. + // This is because layers requiring multi-dimension tiling are typically compute-bound, + // necessitating a greater focus on optimizing workload efficiency. + if (checkWorkloadEfficiency) { + minChannelSize = minChannelSize * 4; + } } // Consider supported channels for DW ops @@ -2174,9 +2301,12 @@ SmallVector vpux::getMaxNumTiles(mlir::Operation* op, bool checkMinimal // each cluster should compute at least minChannelSize(=16) output channels. // For SOK, we can use less than the specified number of clusters, to avoid the requirement to align output int64_t minNumClustersForSOK = tileCount; - while (minNumClustersForSOK > 0 && outputShape[dimC] % (minChannelSize * minNumClustersForSOK) != 0) { - --minNumClustersForSOK; + if (!checkWorkloadEfficiency) { + while (minNumClustersForSOK > 0 && outputShape[dimC] % (minChannelSize * minNumClustersForSOK) != 0) { + --minNumClustersForSOK; + } } + if (minNumClustersForSOK <= 1) { minNumClustersForSOK = tileCount; } @@ -2431,7 +2561,7 @@ bool vpux::isSupportedTileSizeForLargeActivation(mlir::Operation* origOp, ShapeR // with fragments considered. Otherwise, it returns false, indicating that the number of tiles should be increased. 
bool vpux::isSupportedTileSizeForLargeActivation(mlir::Operation* origOp, ShapeRef nTilesOnDim, double fragmentRatio, Logger log) { - if (VPU::getArch(origOp) <= VPU::ArchKind::NPU40XX) { + if (config::getArch(origOp) <= config::ArchKind::NPU40XX) { return true; } @@ -2536,6 +2666,33 @@ void ensureNTilesIsCompatibleWithMultiCluster(mlir::Operation* op, Shape& nTiles } } +std::pair determineInnerAndOuterDims(mlir::Operation* op, SmallVector& dimsToTile, + ShapeRef nTilesOnDim) { + auto unrollSpatialFirst = isSpatialFirstNestedTiling(op, nTilesOnDim); + + SmallVector dimSpatials; + std::copy_if(dimsToTile.begin(), dimsToTile.end(), std::back_inserter(dimSpatials), [](const Dim& dim) { + return dim == Dims4D::Act::H || dim == Dims4D::Act::W || dim == DimsGroups5D::Act::H || + dim == DimsGroups5D::Act::W; + }); + + SmallVector dimChannels; + std::copy_if(dimsToTile.begin(), dimsToTile.end(), std::back_inserter(dimChannels), [](const Dim& dim) { + return dim == Dims4D::Act::C || dim == DimsGroups5D::Act::C; + }); + + VPUX_THROW_WHEN(dimChannels.empty() && (dimSpatials.size() < 2), + "Operation '{0}' at '{1}' has no channel or spatial dimensions to tile", op->getName(), + op->getLoc()); + + auto innerDim = + dimChannels.empty() ? dimSpatials.back() : (unrollSpatialFirst ? dimChannels.front() : dimSpatials.front()); + auto outerDim = dimChannels.empty() ? dimSpatials.front() + : (unrollSpatialFirst ? 
dimSpatials.front() : dimChannels.front()); + + return {innerDim, outerDim}; +} + } // namespace // HWLayer @@ -2690,13 +2847,8 @@ mlir::FailureOr vpux::getHWLayerTilingStrategyWithTileDimOrderForP VPUX_THROW_WHEN(isolatedTiles.empty(), "Empty tiles for op '{0}'", op->getLoc()); auto nTilesOnDim = isolatedTiles.front().axis; auto dimsToTile = getNonOneDim(nTilesOnDim); - if (dimsToTile.size() > 1) { - log.nest(1).trace("Fallback to isolated strategy due to nested tiling: {0}", nTilesOnDim); - return mlir::failure(); - } - auto tilingBuilder = mlir::cast(op); - const auto& maxNumTiles = tilingBuilder.getMaxNumTiles(); + const auto& maxNumTiles = (dimsToTile.size() > 1) ? getMaxNumTiles(op, true, true) : tilingBuilder.getMaxNumTiles(); const auto dimAlignInfo = getAlignDimAndSize(op); auto dimToAlign = dimAlignInfo.first; auto dimAlignment = dimAlignInfo.second; @@ -2707,31 +2859,94 @@ mlir::FailureOr vpux::getHWLayerTilingStrategyWithTileDimOrderForP auto dimActC = requiresDimsGroups5D(op) ? DimsGroups5D::Act::C : Dims4D::Act::C; const auto targetDim = dimsToTile.size() == 0 ? 
dimActC : dimsToTile[0]; Shape prefetchableTilesOnDim = nTilesOnDim; - auto increaseDimForAlign = [&](Shape& tilesOnDim) -> bool { + auto increaseDimForAlign = [&](Shape& tilesOnDim, vpux::Dim curDim) -> bool { do { - ++tilesOnDim[targetDim]; - if (!isDimLeftToTile(tilesOnDim, maxNumTiles, targetDim)) { + ++tilesOnDim[curDim]; + if (!isDimLeftToTile(tilesOnDim, maxNumTiles, curDim)) { + return false; + } + } while (!isSupportedAlignedDivision(outputShape[curDim], tilesOnDim[curDim], dimAlignment)); + return true; + }; + + auto updatePrefetchableTilesOnDim = [&](vpux::Dim dim) -> bool { + if (prefetchableTilesOnDim[dim] >= MAX_PREFETCH_TILING_TIME * nTilesOnDim[dim] || + !isDimLeftToTile(prefetchableTilesOnDim, maxNumTiles, dim)) { + log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); + return false; + } + if (dim == dimToAlign && dimAlignment != 1) { + if (!increaseDimForAlign(prefetchableTilesOnDim, dim)) { + log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); + return false; + } + } else { + ++prefetchableTilesOnDim[dim]; + } + + return true; + }; + + auto generatePipelineTilingForTargetDim = [&](vpux::Dim dim) -> bool { + while (mlir::failed(isSupportedTileSize(op, prefetchableTilesOnDim, TilingMode::PIPELINING, log))) { + if (!updatePrefetchableTilesOnDim(dim)) { return false; } - } while (!isSupportedAlignedDivision(outputShape[targetDim], tilesOnDim[targetDim], dimAlignment)); + } + return true; }; log.trace("Attempting to generate tiling strategy for pipelining based on {0}", nTilesOnDim); - while (mlir::failed(isSupportedTileSize(op, prefetchableTilesOnDim, TilingMode::PIPELINING, log))) { - if (prefetchableTilesOnDim[targetDim] >= MAX_PREFETCH_TILING_TIME * nTilesOnDim[targetDim] || - !isDimLeftToTile(prefetchableTilesOnDim, maxNumTiles, targetDim)) { + if (dimsToTile.size() > 1) { + auto costModelUtils = VPU::getICostModelUtilsInterface(op->getContext()); + if (!costModelUtils->isMultiDimPipelineTilingSupported()) { + 
return mlir::failure(); + } + + log.nest(3).trace("prefetchableTilesOnDim is : {0}", prefetchableTilesOnDim); + log.nest(3).trace("maxNumTiles is : {0}", maxNumTiles); + for (auto dim : dimsToTile) { + if (prefetchableTilesOnDim[dim] > maxNumTiles[dim.ind()]) { + prefetchableTilesOnDim[dim] = maxNumTiles[dim.ind()]; + } + } + + // For multi-dim tiling, we need to determine the inner and outer dimensions + // and generate the pipeline tiling strategy for the inner dimension. + // If the inner dimension is not supported, we will try to increase the outer dimension + // until we find a supported tiling strategy. + // + // If the number of tiles exceeds the maximum limit, we will fallback to isolated strategy. + // Otherwise, this may lead to a timeout error during the compilation phase. + constexpr int64_t MAX_NUM_TILES = 1000; + if (nTilesOnDim.totalSize() > MAX_NUM_TILES) { log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); return mlir::failure(); } - if (targetDim == dimToAlign && dimAlignment != 1) { - if (!increaseDimForAlign(prefetchableTilesOnDim)) { - log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); + + auto innerAndOuterDims = determineInnerAndOuterDims(op, dimsToTile, nTilesOnDim); + auto innerDim = innerAndOuterDims.first; + auto outerDim = innerAndOuterDims.second; + + while (!generatePipelineTilingForTargetDim(innerDim)) { + if (!updatePrefetchableTilesOnDim(outerDim)) { return mlir::failure(); } - } else { - ++prefetchableTilesOnDim[targetDim]; } + + auto prefetchableTiles = fillDividedTiles(op, prefetchableTilesOnDim, outputShape); + if (mlir::failed(prefetchableTiles)) { + log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); + return mlir::failure(); + } + log.trace("Pipelining strategy for multi-dim tiling: {0}", prefetchableTilesOnDim); + return prefetchableTiles.value(); + } + + if (!generatePipelineTilingForTargetDim(targetDim)) { + log.nest(3).trace("Fallback to isolated strategy: {0}", 
nTilesOnDim); + return mlir::failure(); } // Step3. Continue to increase number of tiling for large data pipelining @@ -2753,7 +2968,7 @@ mlir::FailureOr vpux::getHWLayerTilingStrategyWithTileDimOrderForP } if (targetDim == dimToAlign && dimAlignment != 1) { - if (!increaseDimForAlign(largeDataPipeliningTilesOnDim)) { + if (!increaseDimForAlign(largeDataPipeliningTilesOnDim, targetDim)) { if (mlir::failed(prefetchableTiles)) { log.nest(3).trace("Fallback to isolated strategy: {0}", nTilesOnDim); return mlir::failure(); @@ -2847,6 +3062,12 @@ bool vpux::isDimLeftToTile(ShapeRef curNumTiles, ArrayRef maxNumTiles, mlir::FailureOr vpux::isSupportedTileSize(mlir::Operation* op, ShapeRef nTilesOnDim, TilingMode tilingMode, Logger log) { + if (llvm::any_of(nTilesOnDim, [](int64_t tile) { + return tile < 1; + })) { + return mlir::failure(); + } + const auto outputShape = getShape(op->getResult(0)); const auto tiles = fillDividedTiles(op, nTilesOnDim, outputShape); if (mlir::failed(tiles)) { @@ -2867,11 +3088,34 @@ mlir::FailureOr vpux::isSupportedTileSize(mlir::Operation* op, Sha auto tilesToCheck = tilingMode == TilingMode::ISOLATED ? 
VPU::getUniqueShapeTilingCandidates(op, tiles.value(), log) : tiles.value(); - if (isMultiClusterCompatibleForTiling(op, tilesToCheck, log) && - tilingInfo.isSupportedTiling(tilesToCheck, tilingMode, log)) { + if (!isMultiClusterCompatibleForTiling(op, tilesToCheck, log)) { + return mlir::failure(); + } + + auto dimsToTile = getNonOneDim(nTilesOnDim); + if ((dimsToTile.size() > 1) && (tilingMode == TilingMode::PIPELINING)) { + auto innerAndOuterDims = determineInnerAndOuterDims(op, dimsToTile, nTilesOnDim); + auto innerDim = innerAndOuterDims.first; + auto innerDimSize = nTilesOnDim[innerDim]; + auto totalTileSize = nTilesOnDim.totalSize(); + + log.trace("check pipelining tiling for inner loop {0} with size {1}", innerDim, innerDimSize); + for (int64_t offset = 0; offset < totalTileSize; offset += innerDimSize) { + auto tilesToCheckForInnerLoop = + OutputTiling(tilesToCheck.begin() + offset, tilesToCheck.begin() + offset + innerDimSize); + log.trace("isSupportedTiling from {0} to {1}", offset, offset + innerDimSize); + + if (!tilingInfo.isSupportedTiling(tilesToCheckForInnerLoop, tilingMode, log)) { + return mlir::failure(); + } + } + return tiles; } + if (tilingInfo.isSupportedTiling(tilesToCheck, tilingMode, log)) { + return tiles; + } return mlir::failure(); } @@ -2932,6 +3176,18 @@ bool vpux::isSupportedAlignedDivision(int64_t dimSize, int64_t tiles, int64_t al return remainder > 0; } +SmallVector vpux::getTilingOrderedDims(mlir::Operation* operation, ShapeRef tiling) { + auto dimOrder = getTileDimOrder(operation, TilingMode::ISOLATED, Logger::global()); + SmallVector nonOneDims; + nonOneDims.reserve(dimOrder.size()); + + llvm::copy_if(dimOrder, std::back_inserter(nonOneDims), [&](auto dim) { + return tiling[dim] > 1; + }); + + return nonOneDims; +} + SmallVector vpux::getNonOneDim(ShapeRef inputShape) { SmallVector nonOneDims; for (auto index : irange(inputShape.size())) { @@ -3074,6 +3330,7 @@ SmallVector getValidNonOneDim(ShapeRef inputShape, DimArrRef 
tileDimOrder) SmallVector vpux::getAllHWLayerTilingStrategies(mlir::Operation* op, TilingMode tilingMode, DimArrRef tileDimOrder, Logger log) { + log.trace("Get all feasible strategies for layer {0}", op->getLoc()); SmallVector feasibleStrategies; const auto outputShape = getShape(op->getResult(0)); auto dimensions = getValidNonOneDim(outputShape, tileDimOrder); diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/export.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/export.cpp index b5f401268d..43310cf7bf 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/export.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/export.cpp @@ -7,6 +7,8 @@ #include #include #include "vpux/compiler/dialect/ELFNPU37XX/metadata.hpp" +#include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" +#include "vpux/compiler/dialect/ELFNPU37XX/ops_interfaces.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" namespace vpux::ELFNPU37XX { diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/import.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/import.cpp index 30eea10db2..7133e1d10b 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/import.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/import.cpp @@ -5,8 +5,6 @@ #include "vpux/compiler/dialect/ELFNPU37XX/import.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/elf_importer.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/logging.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/metadata.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/metadata.cpp index 966b4af4dd..e4906aa561 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/metadata.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/metadata.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/ELFNPU37XX/metadata.hpp" #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include 
"vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/utils/core/error.hpp" @@ -371,7 +372,7 @@ std::unique_ptr ELFNPU37XX::constructMetadata(mlir::Module // Copy arch_name and throw if it doesn't fit into the buffer. // arch_name must not be truncated to ensure proper operation of the ELF loader. - copy_str(metadata.mIdentification.arch_name, VPU::stringifyArchKind(VPU::getArch(module)).str(), true); + copy_str(metadata.mIdentification.arch_name, config::stringifyArchKind(config::getArch(module)).str(), true); // Copy blob_name and throw if it doesn't fit into the buffer. // blob_name must not be truncated to ensure proper operation of the driver. copy_str(metadata.mIdentification.blob_name, module.getName().value_or("network").str(), true); @@ -384,8 +385,8 @@ std::unique_ptr ELFNPU37XX::constructMetadata(mlir::Module metadata.mProfilingOutputs.resize(profilingOutputsInfo.size()); - const auto architecture = VPU::getArch(module); - if (architecture >= VPU::ArchKind::NPU40XX) { + const auto architecture = config::getArch(module); + if (architecture >= config::ArchKind::NPU40XX) { auto ioBindings = VPUASM::IOBindingsOp::getFromModule(module); auto inputDeclarations = to_small_vector(ioBindings.getInputDeclarations().front().getOps()); diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/create_profiling_section.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/create_profiling_section.cpp index b0fdcb8d08..032c8a5e2f 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/create_profiling_section.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/create_profiling_section.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/ELFNPU37XX/attributes.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" +#include "vpux/compiler/utils/error.hpp" void vpux::ELFNPU37XX::CreateProfilingSectionOp::serialize(elf::Writer& writer, vpux::ELFNPU37XX::SectionMapType& 
sectionMap, diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/symbol.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/symbol.cpp index 0076c32169..ad9cd797e8 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/symbol.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/ops/symbol.cpp @@ -6,6 +6,8 @@ #include #include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" +#include + namespace { mlir::Operation* getParentSectionOp(mlir::Value val) { // If one of the users of the value is a PutOpInSection op, then we are interested in its encapsulating section. diff --git a/src/vpux_compiler/src/dialect/ELFNPU37XX/utils.cpp b/src/vpux_compiler/src/dialect/ELFNPU37XX/utils.cpp index ebab34996b..09309c2b2b 100644 --- a/src/vpux_compiler/src/dialect/ELFNPU37XX/utils.cpp +++ b/src/vpux_compiler/src/dialect/ELFNPU37XX/utils.cpp @@ -94,8 +94,9 @@ size_t vpux::ELFNPU37XX::getOffsetOfOpInSection(mlir::Value& op) { return tile * vpux::ELFNPU37XX::CMX_SLICE_SIZE + declareBufferOp.getByteOffset(); } -llvm::SmallString<128> vpux::ELFNPU37XX::getSwKernelArchString(VPU::ArchKind archKind) { - VPUX_THROW_UNLESS(archKind == VPU::ArchKind::NPU37XX, "The only supported architecture for sw kernels is 3720xx"); +llvm::SmallString<128> vpux::ELFNPU37XX::getSwKernelArchString(config::ArchKind archKind) { + VPUX_THROW_UNLESS(archKind == config::ArchKind::NPU37XX, + "The only supported architecture for sw kernels is 3720xx"); return llvm::SmallString<128>("3720xx"); } @@ -265,13 +266,13 @@ size_t vpux::ELFNPU37XX::math::lcm(size_t a, size_t b) { // namespace { -const std::unordered_map vpuToElfArchEnumMap = { - {VPU::ArchKind::UNKNOWN, elf::platform::ArchKind::UNKNOWN}, - {VPU::ArchKind::NPU37XX, elf::platform::ArchKind::VPUX37XX}, - {VPU::ArchKind::NPU40XX, elf::platform::ArchKind::VPUX40XX}, +const std::unordered_map vpuToElfArchEnumMap = { + {config::ArchKind::UNKNOWN, elf::platform::ArchKind::UNKNOWN}, + {config::ArchKind::NPU37XX, elf::platform::ArchKind::VPUX37XX}, + 
{config::ArchKind::NPU40XX, elf::platform::ArchKind::VPUX40XX}, }; } // namespace -elf::platform::ArchKind vpux::ELFNPU37XX::mapVpuArchKindToElfArchKind(const VPU::ArchKind& archKind) { +elf::platform::ArchKind vpux::ELFNPU37XX::mapVpuArchKindToElfArchKind(const config::ArchKind& archKind) { return vpuToElfArchEnumMap.at(archKind); } diff --git a/src/vpux_compiler/src/dialect/HostExec/transforms/passes.cpp b/src/vpux_compiler/src/dialect/HostExec/transforms/passes.cpp index a99c7a067b..f74dab41c7 100644 --- a/src/vpux_compiler/src/dialect/HostExec/transforms/passes.cpp +++ b/src/vpux_compiler/src/dialect/HostExec/transforms/passes.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/HostExec/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" namespace vpux::HostExec { diff --git a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/convert_to_llvm_umd.cpp b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/convert_to_llvm_umd.cpp index afcc23d553..78a567c1d4 100644 --- a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/convert_to_llvm_umd.cpp +++ b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/convert_to_llvm_umd.cpp @@ -18,8 +18,11 @@ #include #include #include "vpux/compiler/dialect/HostExec/IR/dialect.hpp" +#include "vpux/compiler/dialect/HostExec/IR/ops.hpp" #include "vpux/compiler/dialect/HostExec/params.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/core/IR/ops.hpp" +#include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/analysis.hpp" namespace vpux::HostExec { @@ -112,7 +115,7 @@ mlir::LLVM::CallOp createLLVMFuncCallOp(mlir::OpBuilder& builder, mlir::ModuleOp class LvlZeroAllocLowering final : public mlir::ConvertOpToLLVMPattern { public: LvlZeroAllocLowering(const mlir::LLVMTypeConverter& typeConverter) - : mlir::ConvertOpToLLVMPattern(typeConverter) { + : mlir::ConvertOpToLLVMPattern(typeConverter, vpux::benefitHigh) { } 
mlir::LogicalResult matchAndRewrite(mlir::memref::AllocOp origOp, OpAdaptor adaptor, mlir::ConversionPatternRewriter& rewriter) const final; @@ -133,10 +136,12 @@ mlir::LogicalResult LvlZeroAllocLowering::matchAndRewrite(mlir::memref::AllocOp auto loc = origOp.getLoc(); getMemRefDescriptorSizes(loc, memrefType, adaptor.getDynamicSizes(), rewriter, shape, strides, sizeBytes); - auto moduleOp = vpux::getModuleOp(origOp); mlir::MLIRContext* ctx = rewriter.getContext(); auto returnType = mlir::Type(mlir::LLVM::LLVMPointerType::get(ctx)); - auto funcOp = origOp->getParentOfType(); + mlir::func::FuncOp funcOp; + auto moduleOp = vpux::getModuleOp(origOp); + vpux::net::NetworkInfoOp netInfo; + vpux::net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); auto numArgs = funcOp.getNumArguments(); auto context = funcOp.getArgument(GET_ARG_INDEX_CONTEXT(numArgs)); auto allocatedPtr = @@ -155,7 +160,7 @@ mlir::LogicalResult LvlZeroAllocLowering::matchAndRewrite(mlir::memref::AllocOp class LvlZeroMemoryCopyLowering final : public mlir::ConvertOpToLLVMPattern { public: LvlZeroMemoryCopyLowering(const mlir::LLVMTypeConverter& typeConverter) - : mlir::ConvertOpToLLVMPattern(typeConverter) { + : mlir::ConvertOpToLLVMPattern(typeConverter, vpux::benefitHigh) { } mlir::LogicalResult matchAndRewrite(mlir::memref::CopyOp origOp, OpAdaptor adaptor, mlir::ConversionPatternRewriter& rewriter) const final; @@ -201,7 +206,10 @@ mlir::LogicalResult LvlZeroMemoryCopyLowering ::matchAndRewrite(mlir::memref::Co auto module = vpux::getModuleOp(origOp); mlir::MLIRContext* ctx = rewriter.getContext(); auto returnType = mlir::Type(mlir::LLVM::LLVMVoidType::get(ctx)); - auto funcOp = origOp->getParentOfType(); + mlir::func::FuncOp funcOp; + auto moduleOp = vpux::getModuleOp(origOp); + vpux::net::NetworkInfoOp netInfo; + vpux::net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); auto numArgs = funcOp.getNumArguments(); auto cmdlist = 
funcOp.getArgument(GET_ARG_INDEX_COMMAND_LIST(numArgs)); @@ -218,12 +226,15 @@ mlir::LogicalResult LvlZeroMemoryCopyLowering ::matchAndRewrite(mlir::memref::Co template class AsyncOpRewriter final : public mlir::OpRewritePattern { public: - AsyncOpRewriter(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(std::move(log)) { + AsyncOpRewriter(mlir::MLIRContext* ctx, const mlir::LLVMTypeConverter& typeConverter, mlir::PatternBenefit benefit, + Logger log) + : mlir::OpRewritePattern(ctx, benefit), _typeConverter(typeConverter), _log(std::move(log)) { this->setDebugName("AsyncOpRewriter"); } private: mlir::LogicalResult matchAndRewrite(AsyncOp origOp, mlir::PatternRewriter& rewriter) const final; + const mlir::LLVMTypeConverter& _typeConverter; Logger _log; }; @@ -231,7 +242,10 @@ template mlir::LogicalResult AsyncOpRewriter::matchAndRewrite(AsyncOp origOp, mlir::PatternRewriter& rewriter) const { auto submitCommandList = [&](mlir::Operation* origOp) { mlir::MLIRContext* ctx = rewriter.getContext(); - auto funcOp = origOp->getParentOfType(); + mlir::func::FuncOp funcOp; + auto moduleOp = vpux::getModuleOp(origOp); + vpux::net::NetworkInfoOp netInfo; + vpux::net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); auto numArgs = funcOp.getNumArguments(); auto cmdlist = funcOp.getArgument(GET_ARG_INDEX_COMMAND_LIST(numArgs)); auto cmdQueue = funcOp.getArgument(GET_ARG_INDEX_COMMAND_QUEUE(numArgs)); @@ -250,6 +264,84 @@ mlir::LogicalResult AsyncOpRewriter::matchAndRewrite(AsyncOp origOp, ml return mlir::success(); } + mlir::async::AwaitOp awaitOp = mlir::cast(*origOp); + auto users = awaitOp->getUsers(); + if (users.empty()) { + return submitCommandList(origOp); + } + + // Async.AwaitOp + // Replace operand of uses with operand of async.awaitop + // as AwaitOp will be removed + mlir::Value awaitOpOperand = awaitOp.getOperand(); + auto op = awaitOpOperand.getDefiningOp(); + if (auto executeOp = mlir::cast(op)) { + const auto results = 
executeOp.getResults(); + int index = -1; + for (size_t i = 0; i < results.size(); ++i) { + if (awaitOp.getOperand() == results[i]) { + if (i == 0) { + _log.error("Invalid index {0} as the first result is token", i); + return mlir::failure(); + } + + // decrease index as the first result of ExecuteOp is token + index = static_cast(i) - 1; + break; + } + } + + if (index == -1) { + _log.error("Invalid async.AwaitOp"); + return mlir::failure(); + } + + auto moduleOp = vpux::getModuleOp(executeOp); + for (auto& op : executeOp.getBody()->getOperations()) { + if (auto callOp = mlir::dyn_cast(op)) { + auto callee = callOp->getAttrOfType("callee"); + auto root = callee.getRootReference(); + auto fnModule = moduleOp.lookupSymbol(root); + if (fnModule == nullptr) { + _log.error("Could not find binary op for subgraph: {0}", root.str()); + return mlir::failure(); + } + auto funcOp = fnModule.lookupSymbol(callee.getLeafReference().str()); + if (funcOp == nullptr) { + _log.error("Could not find function declaration: {0}", callee.getLeafReference().str()); + return mlir::failure(); + } + + auto resultCount = funcOp.getNumResults(); + auto inputCount = funcOp.getNumArguments(); + auto operand = *(callOp.getOperands().begin() + (inputCount - resultCount) + + static_cast(index)); + for (auto u : users) { + if (auto viewOp = mlir::dyn_cast(u)) { + viewOp.setOperand(0, operand); + } else if (auto copyOp = mlir::dyn_cast(u)) { + copyOp.setOperand(0, operand); + } else if (mlir::isa(u)) { + continue; + } else { + _log.error("Not supported user type: {0}", u->getName().getStringRef().str()); + return mlir::failure(); + } + } + } + } + } + + mlir::Operation* nextOp = origOp->getNextNode(); + + // If there are multiple AwaitOp for one operation with multiple outputs + // there will be multiple AwaitOp for synchronization. + // The last AwaitOp will be replaced with submitCommandList. 
+ if (mlir::isa(nextOp)) { + rewriter.eraseOp(origOp); + return mlir::success(); + } return submitCommandList(origOp); } else if (mlir::isa(origOp)) { return submitCommandList(origOp); @@ -269,7 +361,10 @@ mlir::LogicalResult AsyncOpRewriter::matchAndRewrite( template <> mlir::LogicalResult AsyncOpRewriter::matchAndRewrite( mlir::async::CreateGroupOp origOp, mlir::PatternRewriter& rewriter) const { - auto funcOp = origOp->getParentOfType(); + mlir::func::FuncOp funcOp; + auto moduleOp = vpux::getModuleOp(origOp); + vpux::net::NetworkInfoOp netInfo; + vpux::net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); auto numArgs = funcOp.getNumArguments(); auto cmdlist = funcOp.getArgument(GET_ARG_INDEX_COMMAND_LIST(numArgs)); @@ -280,6 +375,136 @@ mlir::LogicalResult AsyncOpRewriter::matchAndRewrite return mlir::success(); } +template <> +mlir::LogicalResult AsyncOpRewriter::matchAndRewrite(mlir::async::ExecuteOp origOp, + mlir::PatternRewriter& rewriter) const { + auto ctx = origOp.getContext(); + auto loc = origOp.getLoc(); + mlir::func::FuncOp funcOp; + auto moduleOp = vpux::getModuleOp(origOp); + vpux::net::NetworkInfoOp netInfo; + vpux::net::NetworkInfoOp::getFromModule(moduleOp, netInfo, funcOp); + + auto numArgs = funcOp.getNumArguments(); + auto umdContext = funcOp.getArgument(GET_ARG_INDEX_CONTEXT(numArgs)); + auto device = funcOp.getArgument(GET_ARG_INDEX_DEVICE(numArgs)); + auto ddiTable = funcOp.getArgument(GET_ARG_INDEX_DDI_TABLE(numArgs)); + auto cmdList = funcOp.getArgument(GET_ARG_INDEX_COMMAND_LIST(numArgs)); + auto cmdQueue = funcOp.getArgument(GET_ARG_INDEX_COMMAND_QUEUE(numArgs)); + // needs to calculate the size of the kernel function after serialization of the core.NestedModule + + auto voidPtrTy = mlir::LLVM::LLVMPointerType::get(ctx); + std::map> kernels; + for (auto& op : origOp.getBody()->getOperations()) { + if (auto callOp = mlir::dyn_cast(op)) { + mlir::SmallVector kernelInputs, kernelOutputs; + auto callee = 
callOp->getAttrOfType("callee"); + auto root = callee.getRootReference(); + auto calleeNameAttr = callee.getLeafReference(); + auto kernelBinary = moduleOp.lookupSymbol(root); + auto rootStr = root.str(); + if (!kernelBinary) { + _log.error("BinaryOp not found for {0}", root.str()); + return mlir::failure(); + } + auto binaryDataOp = + kernelBinary.lookupSymbol("serialized_" + callee.getLeafReference().str()); + if (!binaryDataOp) { + _log.error("BinaryDataOp not found for {0}", callee.getLeafReference().str()); + return mlir::failure(); + } + + auto object = binaryDataOp.getObject(); + if (!object) { + _log.error("Object not found in BinaryDataOp for {0}", + callOp.getOperation()->getName().getStringRef().str()); + return mlir::failure(); + } + + llvm::StringRef rawBytes = object.getObject().getValue(); + size_t dataSize = rawBytes.size(); + auto kernelSize = rewriter.create(loc, rewriter.getIntegerType(64), dataSize); + + auto resultCount = callOp.getNumResults(); + auto inputCount = callOp.getNumOperands() - resultCount; + + mlir::Value kernelGlobal; + auto iter = kernels.find(calleeNameAttr.str()); + if (iter != kernels.end()) { + kernelGlobal = iter->second.second; + } else { + auto name = callee.getLeafReference().getValue(); + auto nameAttr = mlir::StringAttr::get(origOp.getContext(), std::string(name) + "_kernel"); + kernelGlobal = mlir::LLVM::createGlobalString(loc, rewriter, nameAttr.getValue(), object.getObject(), + mlir::LLVM::Linkage::Internal); + kernels[calleeNameAttr.str()] = std::make_pair(kernelBinary, kernelGlobal); + } + + kernelInputs.insert(kernelInputs.begin(), callOp.getArgOperands().begin(), + callOp.getArgOperands().begin() + inputCount); + kernelOutputs.insert(kernelOutputs.begin(), callOp.getArgOperands().begin() + inputCount, + callOp.getArgOperands().end()); + + auto numInputs = + rewriter.create(loc, rewriter.getIntegerType(32), kernelInputs.size()); + auto numOutputs = + rewriter.create(loc, rewriter.getIntegerType(32), 
kernelOutputs.size()); + + auto inputs = rewriter.create(loc, voidPtrTy, voidPtrTy, numInputs); + auto outputs = rewriter.create(loc, voidPtrTy, voidPtrTy, numOutputs); + + // Store each output pointer as void* in the input array + for (size_t i = 0; i < kernelInputs.size(); ++i) { + auto idx = rewriter.create(loc, rewriter.getIntegerType(64), i); + auto gep = rewriter.create(loc, voidPtrTy, voidPtrTy, inputs, mlir::ValueRange{idx}); + auto llvmInput = kernelInputs[i]; + if (!mlir::LLVM::isCompatibleType(llvmInput.getType())) { + if (auto converted = _typeConverter.materializeTargetConversion( + rewriter, loc, _typeConverter.convertType(llvmInput.getType()), + mlir::ValueRange{llvmInput})) { + llvmInput = converted; + } else { + _log.error("Could not convert input type: {0}", llvmInput.getType()); + return mlir::failure(); + } + } + + auto desc = mlir::MemRefDescriptorView(mlir::ValueRange{llvmInput}); + rewriter.create(loc, desc.allocatedPtr(), gep); + } + + // Store each output pointer as void* in the output array + for (size_t i = 0; i < kernelOutputs.size(); ++i) { + auto idx = rewriter.create(loc, rewriter.getIntegerType(64), i); + auto gep = + rewriter.create(loc, voidPtrTy, voidPtrTy, outputs, mlir::ValueRange{idx}); + auto llvmOutput = kernelOutputs[i]; + if (!mlir::LLVM::isCompatibleType(llvmOutput.getType())) { + if (auto converted = _typeConverter.materializeTargetConversion( + rewriter, loc, _typeConverter.convertType(llvmOutput.getType()), + mlir::ValueRange{llvmOutput})) { + llvmOutput = converted; + } else { + _log.error("Could not convert output type: {0}", llvmOutput.getType()); + return mlir::failure(); + } + } + auto desc = mlir::MemRefDescriptorView(mlir::ValueRange{llvmOutput}); + rewriter.create(loc, desc.allocatedPtr(), gep); + } + + auto returnType = mlir::Type(mlir::LLVM::LLVMVoidType::get(ctx)); + createLLVMFuncCallOp(rewriter, getModuleOp(origOp), "npu_level_zero_execute_graph", + {inputs, numInputs, outputs, numOutputs, kernelGlobal, 
kernelSize, umdContext, device, + ddiTable, cmdList, cmdQueue}, + returnType); + } + } + + rewriter.eraseOp(origOp); + return mlir::success(); +} + // // ConvertToLLVMUMDCallsPass // @@ -316,20 +541,36 @@ void ConvertToLLVMUMDCallsPass::safeRunOnModule() { target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); - + target.addIllegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); target.addLegalDialect(); - target.addLegalOp(); + target.addLegalOp(); patterns.add(typeConverter); patterns.add(typeConverter); - patterns.add>(ctx, _log); - patterns.add>(ctx, _log); - patterns.add>(ctx, _log); - patterns.add>(ctx, _log); + patterns.add>(ctx, typeConverter, vpux::benefitHigh, _log); + patterns.add>(ctx, typeConverter, vpux::benefitHigh, _log); + patterns.add>(ctx, typeConverter, vpux::benefitHigh, _log); + patterns.add>(ctx, typeConverter, vpux::benefitHigh, _log); + // Note: ExecuteOp is a special case, a few conditions apply which is why it is the last pattern, + // 1 npu_level_zero_execute_graph that all inputs and outputs are converted to LLVM types. + // 2.It will have successor and predecessor dependencies on the other async and memref operations therefore those + // operations should be removed or converted before this pattern is applied. 
+ + patterns.add>(ctx, typeConverter, vpux::benefitLow, _log); if (mlir::failed(mlir::applyPartialConversion(module, target, std::move(patterns)))) { signalPassFailure(); } + + // Remove all BinaryOps as global variables for the ops were defined + auto binaryOps = to_small_vector(module.getOps()); + for (auto binaryOp : binaryOps) { + binaryOp.getOperation()->erase(); + } } } // namespace diff --git a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/optimize_memref_copies.cpp b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/optimize_memref_copies.cpp index 8005c08cf0..d933eb5c6d 100644 --- a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/optimize_memref_copies.cpp +++ b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/optimize_memref_copies.cpp @@ -108,8 +108,22 @@ void OptimizeMemRefCopiesPass::safeRunOnFunc() { return; } - allocOp.getResult().replaceAllUsesWith(dst); + SmallVector subviewOps; + for (auto user : allocOp->getUsers()) { + if (auto subviewOp = mlir::dyn_cast(user)) { + subviewOps.push_back(subviewOp); + } + } + for (auto subviewOp : subviewOps) { + builder.setInsertionPointAfter(subviewOp); + auto newSubview = + builder.create(subviewOp.getLoc(), dst, subviewOp.getMixedOffsets(), + subviewOp.getMixedSizes(), subviewOp.getMixedStrides()); + subviewOp.getResult().replaceAllUsesWith(newSubview.getResult()); + subviewOp.erase(); + } + allocOp.getResult().replaceAllUsesWith(dst); copyOp.erase(); if (allocOp->use_empty()) { allocOp.erase(); diff --git a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/serialize_elf_to_binary.cpp b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/serialize_elf_to_binary.cpp index 5e165db141..c83aea4cfb 100644 --- a/src/vpux_compiler/src/dialect/HostExec/transforms/passes/serialize_elf_to_binary.cpp +++ b/src/vpux_compiler/src/dialect/HostExec/transforms/passes/serialize_elf_to_binary.cpp @@ -11,6 +11,7 @@ #include "vpux/compiler/dialect/HostExec/IR/dialect.hpp" #include 
"vpux/compiler/dialect/HostExec/IR/ops.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/passes.hpp" @@ -35,7 +36,7 @@ class SerializeELFToBinaryPass : public HostExec::impl::SerializeELFToBinaryBase private: void safeRunOnFunc() final; - mlir::func::FuncOp serialize(vpux::Core::NestedCallOp callOp, mlir::func::FuncOp funcOp, VPU::ArchKind& arch); + mlir::func::FuncOp serialize(vpux::Core::NestedCallOp callOp, mlir::func::FuncOp funcOp, config::ArchKind& arch); Logger _log; }; @@ -65,8 +66,8 @@ mlir::FunctionType constructFunctionType(mlir::ModuleOp moduleOp, net::NetworkIn return mlir::FunctionType::get(moduleOp.getContext(), funcArgs, outArgs); } -void getBinaryBuffer(mlir::ModuleOp moduleOp, VPU::ArchKind& arch, std::vector& binaryBuffer) { - if (arch == VPU::ArchKind::NPU37XX) { +void getBinaryBuffer(mlir::ModuleOp moduleOp, config::ArchKind& arch, std::vector& binaryBuffer) { + if (arch == config::ArchKind::NPU37XX) { binaryBuffer = vpux::ELFNPU37XX::exportToELF(moduleOp); } else { binaryBuffer = vpux::ELF::exportToELF(moduleOp); @@ -74,7 +75,7 @@ void getBinaryBuffer(mlir::ModuleOp moduleOp, VPU::ArchKind& arch, std::vectorgetParentOfType(); if (moduleOp == nullptr) { _log.error("Expected the func op: '{0}' nested in a module operation", funcOp.getName()); @@ -135,7 +136,7 @@ void SerializeELFToBinaryPass::safeRunOnFunc() { } mlir::OpBuilder builder(parentModuleOp); - auto arch = VPU::getArch(func); + auto arch = config::getArch(func); llvm::DenseSet serializedOps; func.walk([&](vpux::Core::NestedCallOp callOp) { diff --git a/src/vpux_compiler/src/dialect/HostExec/transforms/pipelines.cpp b/src/vpux_compiler/src/dialect/HostExec/transforms/pipelines.cpp index 47491aacf3..a4e44e8d2e 100644 --- a/src/vpux_compiler/src/dialect/HostExec/transforms/pipelines.cpp +++ 
b/src/vpux_compiler/src/dialect/HostExec/transforms/pipelines.cpp @@ -5,7 +5,13 @@ #include #include +#include #include +#include +#include +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "vpux/compiler/dialect/HostExec/transforms/passes.hpp" #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -30,12 +36,24 @@ void HostExec::registerHostExecPipelines() { void HostExec::buildHostExecPipeline(mlir::OpPassManager& pm, Logger /*log*/) { const auto grc = getDefaultGreedyRewriteConfig(); + pm.addPass(mlir::createArithToLLVMConversionPass()); + pm.addPass(HostExec::createSerializeELFToBinaryPass()); + pm.addPass(HostExec::createConvertToLLVMUMDCallsPass()); + + // This should be placed after ConvertToLLVMUMDCalls + // as additional arguments (e.g., L0 command list, command queue, and so on) + // are added in ConvertToLLVMUMDCalls + pm.addPass(mlir::LLVM::createRequestCWrappersPass()); + + // Lowering to LLVM passes pm.addPass(mlir::createConvertSCFToCFPass()); - pm.addPass(mlir::createConvertFuncToLLVMPass()); pm.addPass(mlir::createConvertControlFlowToLLVMPass()); + pm.addPass(mlir::memref::createExpandStridedMetadataPass()); + pm.addPass(mlir::createLowerAffinePass()); + pm.addPass(mlir::createFinalizeMemRefToLLVMConversionPass()); + pm.addPass(mlir::createConvertFuncToLLVMPass()); - pm.addPass(HostExec::createSerializeELFToBinaryPass()); - pm.addPass(HostExec::createConvertToLLVMUMDCallsPass()); + pm.addPass(mlir::createReconcileUnrealizedCastsPass()); pm.addPass(mlir::createCanonicalizerPass(grc)); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/dialect.cpp b/src/vpux_compiler/src/dialect/IE/IR/dialect.cpp index f3a68575ad..87174d51a8 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/dialect.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/dialect.cpp @@ -6,13 +6,30 @@ #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" -#include "vpux/compiler/dialect/net/IR/dialect.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/net/IR/dialect.hpp" #include "vpux/compiler/utils/error.hpp" +#include #include #include #include @@ -69,7 +86,75 @@ IEAsmHooks::AliasResult IEAsmHooks::getAlias(mlir::Type type, llvm::raw_ostream& void vpux::IE::IEDialect::initialize() { addOperations< #define GET_OP_LIST -#include +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< 
+#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include + >(); + addOperations< +#define GET_OP_LIST +#include >(); addInterfaces(); diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops.cpp index b4d2fe8448..c22e677e6f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops.cpp @@ -3,28 +3,69 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" - -#include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/asm.hpp" -#include -#include #include #include #include using namespace vpux; -bool IE::isActShaveKernel(mlir::Operation* operation) { - return VPU::NCEInvariant::isSupported(operation, Logger::global()).failed(); -} - // // Generated // #define GET_OP_CLASSES -#include +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include +#define GET_OP_CLASSES +#include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/abs.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/abs.cpp index 49a1eea160..df46229e1a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/abs.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/abs.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/accumulate.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/accumulate.cpp index e797f43f57..9d9f0c229e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/accumulate.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/accumulate.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/acos.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/acos.cpp index acb1b24a11..207ddd1442 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/acos.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/acos.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/acosh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/acosh.cpp index 834100b78b..5263a9177c 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/acosh.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/acosh.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_avg_pool.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_avg_pool.cpp index 3ed3fbff1d..bebc09ce0d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_avg_pool.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_avg_pool.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_max_pool.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_max_pool.cpp index 1b9db5c800..b3b4bb22c3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_max_pool.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/adaptive_max_pool.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/add.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/add.cpp index 3c236dcf5b..7ae24e8539 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/add.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/add.cpp @@ -3,15 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" - -#include "vpux/compiler/dialect/core/types.hpp" - #include "vpux/utils/core/numeric.hpp" -#include "vpux/utils/core/range.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/affine_reshape.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/affine_reshape.cpp index ef08f7ca54..0618800af8 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/affine_reshape.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/affine_reshape.cpp @@ -3,17 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/layout_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" -#include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include 
"vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/types.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/and.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/and.cpp index 850eac227f..7a98ad1708 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/and.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/and.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/asin.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/asin.cpp index 9bc48b0f88..af4d80d739 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/asin.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/asin.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/asinh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/asinh.cpp index 40118fc753..75efa879d7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/asinh.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/asinh.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/assign.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/assign.cpp index 65191a6eb8..23cd52eba3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/assign.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/assign.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include 
"vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/atan.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/atan.cpp index 255b9983ad..f21142b09a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/atan.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/atan.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/atanh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/atanh.cpp index 2bd7f788dd..6a88ff21d5 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/atanh.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/atanh.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/avgpool.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/avgpool.cpp index f0f98f585a..ec9ff71c3d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/avgpool.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/avgpool.cpp @@ -3,18 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" - -#include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/range.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" +#include + using namespace vpux; mlir::LogicalResult vpux::IE::AvgPoolOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/batch_norm.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/batch_norm.cpp index 
e60a892f8c..2cdfe58894 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/batch_norm.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/batch_norm.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/batch_to_space.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/batch_to_space.cpp index a0322bed03..3d788c924f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/batch_to_space.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/batch_to_space.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/small_vector.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_and.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_and.cpp index 3fe2d21125..6fcfadeec6 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_and.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_and.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::BitwiseAndOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_not.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_not.cpp index fa73c37258..98cfcf80a5 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_not.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_not.cpp @@ -3,10 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" - -#include "vpux/utils/core/checked_cast.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_or.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_or.cpp index 7b170696c5..7a52f0bfad 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_or.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_or.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::BitwiseOrOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_xor.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_xor.cpp index cc9b8fa3dc..141ac5587c 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_xor.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/bitwise_xor.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::BitwiseXorOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/broadcast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/broadcast.cpp index bdca0e0548..9876b45965 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/broadcast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/broadcast.cpp @@ -3,8 +3,7 @@ // 
SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/bucketize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/bucketize.cpp index b1957de0ff..35ea0f0857 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/bucketize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/bucketize.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/ceiling.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/ceiling.cpp index 5b369d56ee..5995c801ec 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/ceiling.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/ceiling.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/clamp.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/clamp.cpp index 488d509823..107da27b63 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/clamp.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/clamp.cpp @@ -3,7 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/custom_float.hpp" @@ -45,8 +47,9 @@ mlir::LogicalResult 
vpux::IE::ClampOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(clamp.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(clamp.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/codegen_capsule.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/codegen_capsule.cpp index 12930d8ebe..89dd134717 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/codegen_capsule.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/codegen_capsule.cpp @@ -3,6 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/utils/asm.hpp" + #include #include #include @@ -11,11 +15,6 @@ #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/asm.hpp" -#include "vpux/utils/core/range.hpp" - using namespace vpux; // diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/concat.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/concat.cpp index 512e013bf1..7caa92206f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/concat.cpp @@ -3,20 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include 
"vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/loop.hpp" #include "vpux/compiler/utils/quantization.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/convert.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/convert.cpp index b981184da0..21cb070c5f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/convert.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/convert.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/convertlike.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/convertlike.cpp index d5433858bf..73e010f493 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/convertlike.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/convertlike.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/convolution.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/convolution.cpp index b21becc894..3b58bacf4e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/convolution.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/convolution.cpp @@ -3,9 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" 
+#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -255,14 +259,13 @@ mlir::LogicalResult vpux::IE::ConvolutionOp::reifyResultShapes(mlir::OpBuilder& return errorAt(getLoc(), "Dilation is not supported for reifyResultShapes"); } - const auto kernelShape = mlir::cast(getFilter().getType()).getShape(); - SmallVector kernelSize{kernelShape[Dims4D::Filter::KY], kernelShape[Dims4D::Filter::KX]}; - const auto strides = parseIntArrayAttr(getStridesAttr()); const auto padBegin = parseIntArrayAttr(getPadsBeginAttr()); const auto padEnd = parseIntArrayAttr(getPadsEndAttr()); - auto outShape = - reifyConvPoolTensors(builder, getInput(), getOutput(), kernelSize, strides, padBegin, padEnd, getLoc()); + auto kernelShape = mlir::cast(getFilter().getType()).getShape(); + SmallVector kernelSize{kernelShape[Dims4D::Filter::KY], kernelShape[Dims4D::Filter::KX]}; + auto outShape = reifyConvPoolTensors(builder, getInput(), getOutput(), getFilter(), kernelSize, strides, padBegin, + padEnd, getLoc()); if (mlir::failed(outShape)) { return outShape; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/convolution_backprop_data.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/convolution_backprop_data.cpp index 1af0ee264c..34e40dca63 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/convolution_backprop_data.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/convolution_backprop_data.cpp @@ -32,16 +32,14 @@ // then check to make sure that the incoming delta has the same shape as the forward output. 
// -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" #include -#include "openvino/op/group_conv.hpp" +#include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/copy.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/copy.cpp deleted file mode 100644 index d4fde8249a..0000000000 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/copy.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/compiler/utils/error.hpp" - -#include - -using namespace vpux; - -// -// InferTypeOpInterface -// - -mlir::LogicalResult vpux::IE::CopyOp::inferReturnTypeComponents( - mlir::MLIRContext* ctx, std::optional optLoc, mlir::ValueShapeRange operands, - mlir::DictionaryAttr attrs, mlir::OpaqueProperties prop, mlir::RegionRange, - SmallVectorImpl& inferredReturnShapes) { - const auto loc = optLoc.value_or(mlir::UnknownLoc::get(ctx)); - - IE::CopyOpAdaptor copyOp(operands, attrs, prop); - if (mlir::failed(copyOp.verify(loc))) { - return mlir::failure(); - } - - const auto ndInType = mlir::dyn_cast(copyOp.getInput().getType()); - if (ndInType == nullptr) { - return errorAt(loc, "IE::CopyOp operand must have vpux::NDTypeInterface type"); - } - - IndexedSymbolAttr outMemSpace = nullptr; - if (copyOp.getOutMemSpace().has_value()) { - outMemSpace = copyOp.getOutMemSpace().value(); - } - const auto outType = mlir::cast(ndInType.changeMemSpace(outMemSpace)); - - inferredReturnShapes.emplace_back(outType.getShape(), outType.getElementType(), outType.getEncoding()); - return mlir::success(); -} - -// -// fold -// - -mlir::OpFoldResult vpux::IE::CopyOp::fold(FoldAdaptor) { - if (getInput().getType() 
== getOutput().getType()) { - return getInput(); - } - - return nullptr; -} - -// -// FuseCopies -// - -namespace { - -class FuseCopies final : public mlir::OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - mlir::LogicalResult matchAndRewrite(IE::CopyOp origOp, mlir::PatternRewriter& rewriter) const final; -}; - -mlir::LogicalResult FuseCopies::matchAndRewrite(IE::CopyOp origOp, mlir::PatternRewriter& rewriter) const { - auto producerCopyOp = origOp.getInput().getDefiningOp(); - if (producerCopyOp == nullptr) { - return mlir::failure(); - } - - rewriter.replaceOpWithNewOp(origOp, producerCopyOp.getInput(), origOp.getOutMemSpaceAttr()); - return mlir::success(); -} - -} // namespace - -// -// getCanonicalizationPatterns -// - -void vpux::IE::CopyOp::getCanonicalizationPatterns(mlir::RewritePatternSet& results, mlir::MLIRContext* ctx) { - results.add(ctx); -} diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/cos.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/cos.cpp index f51f43d797..7b0f26cdcc 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/cos.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/cos.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/cosh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/cosh.cpp index 6d4336bafb..35134e32ec 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/cosh.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/cosh.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder.cpp index cca0473439..db326b9af0 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder_seq_len.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder_seq_len.cpp index 62e4ca39e5..7a103c2af3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder_seq_len.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/ctc_greedy_decoder_seq_len.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/cum_sum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/cum_sum.cpp index 9ad5c19ddc..7283c37271 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/cum_sum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/cum_sum.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_convolution.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_convolution.cpp index 8f29f0f8b8..a928c6c9ea 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_convolution.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_convolution.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include 
"vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_psroipooling.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_psroipooling.cpp index 22453a0cb3..f9cd17d6f3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_psroipooling.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/deformable_psroipooling.cpp @@ -2,8 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/depth_to_space.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/depth_to_space.cpp index 79c0f7d1f0..e63d48f5cb 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/depth_to_space.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/depth_to_space.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/checked_cast.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dequantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dequantize.cpp index 00f702b412..dd2d4f3957 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dequantize.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/detection_output.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/detection_output.cpp index 1c71c11cdf..49272ee48c 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/detection_output.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/detection_output.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dft.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dft.cpp index 83ea129701..0a2ae70ab1 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dft.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dft.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/divide.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/divide.cpp index 39b750d1c6..683fde1c36 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/divide.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/divide.cpp @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_broadcast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_broadcast.cpp index c49e73acef..973790fa0f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_broadcast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_broadcast.cpp @@ -3,13 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - -#include 
"vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_data_mask.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_data_mask.cpp index e068cb26f5..1739c2edf6 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_data_mask.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_data_mask.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_dequantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_dequantize.cpp index dc807b7117..b12239be02 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_dequantize.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_expand.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_expand.cpp index 6f26e42ccb..f5ab32df46 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_expand.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_expand.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_fake_quantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_fake_quantize.cpp index 69fcf9781d..3b1e946fe2 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_fake_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_fake_quantize.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_quantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_quantize.cpp index 8cfbd06d18..0f226ce075 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_quantize.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_reshape.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_reshape.cpp index d94bba02bf..a17fe24361 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_reshape.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_reshape.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include 
"vpux/compiler/dialect/core/IR/attributes.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_tile.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_tile.cpp index c1a8227c38..158a90936e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_tile.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/dynamic_tile.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/elu.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/elu.cpp index 5a0aca80a5..735bb3814b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/elu.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/elu.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_offsets_sum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_offsets_sum.cpp index 0741b47dd7..27dbd3e3cc 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_offsets_sum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_offsets_sum.cpp @@ -3,13 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/checked_cast.hpp" #include -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" using namespace vpux; diff 
--git a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_packed_sum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_packed_sum.cpp index d9a0d85bcd..8a8b87d653 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_packed_sum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_bag_packed_sum.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_segments_sum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_segments_sum.cpp index 4dcae4f589..3a77d3ee5b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_segments_sum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/embedding_segments_sum.cpp @@ -3,15 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/equal.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/equal.cpp index 81a6ad64f8..9abbdec3ab 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/equal.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/equal.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/erf.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/erf.cpp index 2758f1dadb..bb3e4a3bd1 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/erf.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/erf.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/exp.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/exp.cpp index 34abd10ad5..8a889bc20e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/exp.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/exp.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; @@ -18,8 +19,9 @@ mlir::LogicalResult vpux::IE::ExpOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(exp.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(exp.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/expand.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/expand.cpp index 8dc995d158..9ce50d0efe 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/expand.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/expand.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - -#include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include 
"vpux/compiler/utils/error.hpp" - #include "vpux/utils/core/checked_cast.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/expand_dilated.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/expand_dilated.cpp index 2835f8ebf5..687afd6a14 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/expand_dilated.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/expand_dilated.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" - #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dilated_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/experimental_detectron_roi_feature_extractor.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/experimental_detectron_roi_feature_extractor.cpp index 13354f73d8..7f2e2275d3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/experimental_detectron_roi_feature_extractor.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/experimental_detectron_roi_feature_extractor.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/extract_image_patches.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/extract_image_patches.cpp index 5023031aba..0180ce15b6 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/extract_image_patches.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/extract_image_patches.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff 
--git a/src/vpux_compiler/src/dialect/IE/IR/ops/eye.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/eye.cpp index 8904d2fa69..ca5ff338b7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/eye.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/eye.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/fake_convert.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/fake_convert.cpp index 43fecdb9aa..6f86302b90 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/fake_convert.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/fake_convert.cpp @@ -3,13 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/range.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/fake_quantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/fake_quantize.cpp index 4de3a729e4..bf1d330a2a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/fake_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/fake_quantize.cpp @@ -3,13 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/types/quantile_float/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include 
"vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/floor.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/floor.cpp index 00943e6651..784434b24c 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/floor.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/floor.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/floor_mod.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/floor_mod.cpp index 39d0ce3efc..1744f58c6b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/floor_mod.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/floor_mod.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/fully_connected.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/fully_connected.cpp index bc2b4ebdcf..a8b39d1c6e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/fully_connected.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/fully_connected.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gather.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gather.cpp index 096524120c..56336bab84 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/gather.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gather.cpp @@ -3,18 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include -#include - using namespace vpux; namespace { diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gatherND.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gatherND.cpp index d25bf6323f..9a7d469f87 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gatherND.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gatherND.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gather_elements.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gather_elements.cpp index c1a074e35d..f0ab620e63 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gather_elements.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gather_elements.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gather_tree.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gather_tree.cpp index 530dd11258..57275d942f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gather_tree.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gather_tree.cpp @@ -5,7 +5,7 @@ // 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gelu.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gelu.cpp index d096d614f8..0e5c551754 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gelu.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gelu.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; @@ -18,8 +19,9 @@ mlir::LogicalResult vpux::IE::GeluOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(gelu.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(gelu.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/greater.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/greater.cpp index d1417dfcf6..b4d38d37c4 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/greater.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/greater.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/greater_equal.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/greater_equal.cpp index 5f512b9337..f1e56bfd61 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/greater_equal.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/greater_equal.cpp @@ -3,7 +3,7 @@ // 
SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/grid_sample.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/grid_sample.cpp index 7dda122cb5..d96e389f05 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/grid_sample.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/grid_sample.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/grn.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/grn.cpp index 882d78d954..56b303e3ac 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/grn.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/grn.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/group_convolution_backprop_data.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/group_convolution_backprop_data.cpp index 12c75988d2..33acbb2562 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/group_convolution_backprop_data.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/group_convolution_backprop_data.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/group_normalization.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/group_normalization.cpp index 3128d8940c..07deb5b538 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/group_normalization.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/group_normalization.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/group_transposed_convolution.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/group_transposed_convolution.cpp index 829686af05..aafca0a59a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/group_transposed_convolution.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/group_transposed_convolution.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_cell.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_cell.cpp index 13567f29c5..fb39254072 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_cell.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_cell.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_gates.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_gates.cpp index c75076e4d3..5591c88a39 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_gates.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_gates.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/compiler/utils/error.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence.cpp index 525b883413..382fbdd16d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence_last_part.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence_last_part.cpp index dd1106ef41..17aa109691 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence_last_part.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/gru_sequence_last_part.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/hard_sigmoid.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/hard_sigmoid.cpp index 2f2ec971d0..b4567ebe01 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/hard_sigmoid.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/hard_sigmoid.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/hsigmoid.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/hsigmoid.cpp index 3d9b3682a9..9f661b1001 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/hsigmoid.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/hsigmoid.cpp @@ -4,7 +4,7 @@ // // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/activation.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/hswish.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/hswish.cpp index 0a3105d186..d1786e11d4 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/hswish.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/hswish.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/idft.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/idft.cpp index fe87087ca7..7de7716eae 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/idft.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/idft.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/if.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/if.cpp index 150ce79a9e..df21ef6c66 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/if.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/if.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/incremental_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/incremental_sdpa.cpp index 2cbf42765b..32ee749594 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/incremental_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/incremental_sdpa.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include 
"vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/interpolate.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/interpolate.cpp index 02673aa8c5..12edb8e1c8 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/interpolate.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/interpolate.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include @@ -118,10 +120,10 @@ class ConvertInputToFP16 final : public mlir::OpRewritePattern(op.getInput().getType()).getElementType(); - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); // VPU4000-M2I does not support C-minor FP16 - if (arch >= VPU::ArchKind::NPU40XX && (config::getCompilationMode(op) != config::CompilationMode::ReferenceSW)) { + if (arch >= config::ArchKind::NPU40XX && (config::getCompilationMode(op) != config::CompilationMode::ReferenceSW)) { return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/inverse.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/inverse.cpp index 9a0aeb0f23..db03140be7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/inverse.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/inverse.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/irdft.cpp 
b/src/vpux_compiler/src/dialect/IE/IR/ops/irdft.cpp index 843ae7c207..0dc3186a06 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/irdft.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/irdft.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/layout_cast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/layout_cast.cpp index aa83555ca2..17a8cf9a45 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/layout_cast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/layout_cast.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/less.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/less.cpp index aeaf06fa8f..7dc44ef1e0 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/less.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/less.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/less_equal.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/less_equal.cpp index 494dbe3abe..eb67dd1a56 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/less_equal.cpp 
+++ b/src/vpux_compiler/src/dialect/IE/IR/ops/less_equal.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" + using namespace vpux; mlir::LogicalResult vpux::IE::LessEqualOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/log.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/log.cpp index 52e90fb0b7..4c4cd3cc9b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/log.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/log.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/log_softmax.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/log_softmax.cpp index efd6f383f9..aa13119af9 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/log_softmax.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/log_softmax.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" - #include "vpux/utils/core/checked_cast.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_not.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_not.cpp index a6d76d650e..107a572383 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_not.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_not.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" using namespace vpux; diff --git 
a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_or.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_or.cpp index 5e46aaa56f..efa79cd03d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_or.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_or.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_xor.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_xor.cpp index 7f951a0775..ed255782da 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/logical_xor.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/logical_xor.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/loop.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/loop.cpp index 8d5090f890..fc98969ee9 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/loop.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/loop.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/logging.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/loop_select.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/loop_select.cpp index b869787821..a96ae8b777 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/loop_select.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/IR/ops/loop_select.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/loop_terminator.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/loop_terminator.cpp index a2d0b0a583..710d257129 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/loop_terminator.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/loop_terminator.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/lrn.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/lrn.cpp index 4483b3ac24..06553f9d14 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/lrn.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/lrn.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/lrn_ie.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/lrn_ie.cpp index a718a8b3d8..548af6f971 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/lrn_ie.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/lrn_ie.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_cell.cpp 
b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_cell.cpp index 87b7db611c..4d384924af 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_cell.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_cell.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_gates.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_gates.cpp index 0c68a0b2e2..f06595512d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_gates.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_gates.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_sequence.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_sequence.cpp index 7569559344..3c1e92f391 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_sequence.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/lstm_sequence.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/matmul.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/matmul.cpp index c32ec7d3df..e1b37acdc7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/matmul.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/matmul.cpp @@ -37,10 +37,10 @@ // #include "vpux/compiler/dialect/IE/utils/matmul.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" 
+#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/maximum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/maximum.cpp index de0104385b..b21d3df68b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/maximum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/maximum.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool.cpp index 1b2b057607..ae66f4abcb 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool.cpp @@ -3,12 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" -#include "vpux/compiler/dialect/core/IR/attributes.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include @@ -61,8 +59,8 @@ mlir::LogicalResult vpux::IE::MaxPoolOp::reifyResultShapes(mlir::OpBuilder& buil const auto padBegin = parseIntArrayAttr(getPadsBeginAttr()); const auto padEnd = parseIntArrayAttr(getPadsEndAttr()); - auto outShape = - 
reifyConvPoolTensors(builder, getInput(), getOutput(), kernelSize, strides, padBegin, padEnd, getLoc()); + auto outShape = reifyConvPoolTensors(builder, getInput(), getOutput(), nullptr, kernelSize, strides, padBegin, + padEnd, getLoc()); if (mlir::failed(outShape)) { return outShape; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool8.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool8.cpp index 84c5f8cb98..a8b707f7db 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool8.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/maxpool8.cpp @@ -3,15 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" -#include "vpux/utils/core/checked_cast.hpp" - -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/mem_permute.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/mem_permute.cpp index a90dfcac08..0f064e96fe 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/mem_permute.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/mem_permute.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/minimum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/minimum.cpp index 
fdfca362c5..e7737b3c00 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/minimum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/minimum.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/mish.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/mish.cpp index 65104f3f9b..623ce8440d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/mish.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/mish.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/mod.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/mod.cpp index 231b1d5a53..93b7113f9d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/mod.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/mod.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/multiply.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/multiply.cpp index 505bed2522..0029942d88 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/multiply.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/multiply.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include 
"vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" - -#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/utils/core/numeric.hpp" -#include "vpux/utils/core/range.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/mvn.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/mvn.cpp index 5f3878f36d..9c15dfdad2 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/mvn.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/mvn.cpp @@ -3,10 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/hw_settings.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include @@ -79,48 +82,6 @@ mlir::LogicalResult LegalizeEpsAttr::matchAndRewrite(IE::MVNOp origOp, mlir::Pat return mlir::success(); } -// -// ReshapeBatched -// - -class ReshapeBatched final : public mlir::OpRewritePattern { -public: - using mlir::OpRewritePattern::OpRewritePattern; - -public: - mlir::LogicalResult matchAndRewrite(IE::MVNOp origOp, mlir::PatternRewriter& rewriter) const final; -}; - -mlir::LogicalResult ReshapeBatched::matchAndRewrite(IE::MVNOp origOp, mlir::PatternRewriter& rewriter) const { - const auto acrossChannels = origOp.getAcrossChannelsAttr().getValue(); - const auto inputType = mlir::cast(origOp.getInput().getType()); - auto origShape = inputType.getShape(); - if (acrossChannels == false || inputType.getRank() != 4 || origShape[Dims4D::Act::N] == 1) { - return mlir::failure(); - } - - // acrossChannel batched MVN with shape 
[N,C,H,W] can be converted into - // non-acrossChannel non-batched MVN with shape [1,N,C,H*W] - SmallVector newShape(inputType.getRank(), 1); - newShape[Dims4D::Act::C.ind()] = origShape[Dims4D::Act::N]; - newShape[Dims4D::Act::H.ind()] = origShape[Dims4D::Act::C]; - newShape[Dims4D::Act::W.ind()] = origShape[Dims4D::Act::H] * origShape[Dims4D::Act::W]; - const auto newShapeAttr = getIntArrayAttr(rewriter.getContext(), newShape); - auto inputReshape = rewriter.createOrFold(takeOpLoc(origOp, "reshape_in"), origOp.getInput(), - nullptr, false, newShapeAttr); - - auto newMvnOp = rewriter.create(origOp->getLoc(), inputReshape, - mlir::BoolAttr::get(rewriter.getContext(), false), - origOp.getNormalizeVarianceAttr(), origOp.getEpsAttr()); - - const auto origShapeAttr = getIntArrayAttr(origOp->getContext(), origShape); - auto outputReshape = rewriter.createOrFold(takeOpLoc(origOp, "reshape_out"), newMvnOp, nullptr, - false, origShapeAttr); - - rewriter.replaceOp(origOp, outputReshape); - return mlir::success(); -} - } // namespace // @@ -129,5 +90,4 @@ mlir::LogicalResult ReshapeBatched::matchAndRewrite(IE::MVNOp origOp, mlir::Patt void vpux::IE::MVNOp::getCanonicalizationPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx) { patterns.add(ctx); - patterns.add(ctx); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/mvn6.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/mvn6.cpp index 245e306b4c..0f59f68fb7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/mvn6.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/mvn6.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/error.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/negative.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/negative.cpp 
index a10bac449d..101efd6307 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/negative.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/negative.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" + #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/non_max_suppression.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/non_max_suppression.cpp index 71d19acf87..1d2d050c79 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/non_max_suppression.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/non_max_suppression.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/non_zero.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/non_zero.cpp index c86882b899..d209520d35 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/non_zero.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/non_zero.cpp @@ -3,18 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/error.hpp" #include -#include - using namespace vpux; mlir::LogicalResult vpux::IE::NonZeroOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_ie.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_ie.cpp index d39e5059be..5fa92b78a3 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_ie.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_ie.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_l2.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_l2.cpp index 4cfbf059ee..44bcb8963f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_l2.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/normalize_l2.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/not_equal.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/not_equal.cpp index 7fa5ae1c2e..f18f21ba55 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/not_equal.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/not_equal.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/onehot.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/onehot.cpp index 50b3158b75..423a74b333 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/onehot.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/onehot.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include 
"vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/online_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/online_sdpa.cpp index 238208c355..d912573e68 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/online_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/online_sdpa.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/pad.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/pad.cpp index afb4a068da..d851785fbb 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/pad.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/pad.cpp @@ -3,13 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/error.hpp" - +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/error.hpp" #include @@ -93,17 +93,19 @@ mlir::LogicalResult ConvertConstToAttr::matchAndRewrite(IE::PadOp padOp, mlir::P if (padOp.getPadValue() != nullptr) { const auto padValueType = mlir::cast(padOp.getPadValue().getType()); if (padValueType.getNumElements() != 1) { - return errorAt(padOp.getLoc(), "'pad_value' should have only 1 element, while it has {0}", - padValueType.getNumElements()); + // Cannot convert const to attr: 'pad_value' has more than 1 element + return mlir::failure(); } auto padValueConst = padOp.getPadValue().getDefiningOp(); if (padValueConst == nullptr) { - return 
errorAt(padOp.getLoc(), "Only constant input is supported for 'pad_value'"); + // Cannot convert const to attr: 'pad_value' is not const + return mlir::failure(); } if (const auto& attr = padValueConst.getContentAttr(); !attr.isSplat()) { - return errorAt(padOp.getLoc(), "Only splat input is supported for 'pad_value'"); + // Cannot convert const to attr: 'pad_value' is not splat const + return mlir::failure(); } const auto padValueContent = padValueConst.getContent(); diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/permute_cast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/permute_cast.cpp index 7fce6c454c..482aae86ab 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/permute_cast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/permute_cast.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/permute_quantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/permute_quantize.cpp index 8ac5a58388..003932d1bf 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/permute_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/permute_quantize.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include diff --git 
a/src/vpux_compiler/src/dialect/IE/IR/ops/power.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/power.cpp index 22a1936c60..0d4edc6b83 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/power.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/power.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/power_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/prelu.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/prelu.cpp index a3d5c0431c..580de54fc0 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/prelu.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/prelu.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include @@ -21,8 +23,9 @@ mlir::LogicalResult vpux::IE::PReluOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(prelu.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(prelu.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } @@ -38,8 +41,9 @@ mlir::LogicalResult vpux::IE::LeakyReluOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(leaky_relu.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto 
inType = mlir::cast(leaky_relu.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/proposal.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/proposal.cpp index f63778f7d8..702ce8eb01 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/proposal.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/proposal.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/psroipooling.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/psroipooling.cpp index 4646deb044..7d6ccaa07b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/psroipooling.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/psroipooling.cpp @@ -2,8 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/quantize.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/quantize.cpp index 5deddd3854..f2235b819a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/quantize.cpp @@ -4,10 +4,12 @@ // #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/utils/adaptive_stripping_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" 
#include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/quantize_cast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/quantize_cast.cpp index 63bf26c3a4..ba24d08751 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/quantize_cast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/quantize_cast.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/cast_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/random_uniform.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/random_uniform.cpp index 4b83d8509f..40c482c66e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/random_uniform.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/random_uniform.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/range.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/range.cpp index 1626e37647..999be7737f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/range.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/range.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/error.hpp" 
#include "vpux/compiler/utils/range_bound.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/rdft.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/rdft.cpp index 0b2118b291..66947ffb16 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/rdft.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/rdft.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/fft_ops_utils.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/read_value.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/read_value.cpp index f85cd80a0c..8a37858e97 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/read_value.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/read_value.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l1.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l1.cpp index 191fb7852f..a8fc504a8c 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l1.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l1.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l2.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l2.cpp index 3e20a46080..06dc514956 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l2.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_l2.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::ReduceL2Op::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_and.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_and.cpp index 853a44cc59..a10ddd51bb 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_and.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_and.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::ReduceLogicalAndOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_or.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_or.cpp index 4bc92383c8..76c4a7c913 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_or.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_logical_or.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_max.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_max.cpp index 2a71623626..fcf953f40a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_max.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_max.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: 
Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_mean.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_mean.cpp index 19b2cd1e5a..6e2c700ff2 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_mean.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_mean.cpp @@ -3,16 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; void IE::ReduceMeanOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState, mlir::Type outputType, diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_min.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_min.cpp index 146e83772a..f787621a42 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_min.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_min.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::ReduceMinOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_prod.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_prod.cpp index 9eebb3c9f7..224a4ba46c 100644 
--- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_prod.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_prod.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" - #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_sum.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_sum.cpp index 4a3fd0ded5..fd55a2caf7 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_sum.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reduce_sum.cpp @@ -3,16 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; void IE::ReduceSumOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState, mlir::Type outputType, diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/region_yolo.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/region_yolo.cpp index a333da108b..9aa3265c9a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/region_yolo.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/region_yolo.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/utils/core/checked_cast.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/relu.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/relu.cpp index 5b8ebac76e..280950ec68 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/relu.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/relu.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reorder.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reorder.cpp index e1298bf9be..2eecde444c 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reorder.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reorder.cpp @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reorg_yolo.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reorg_yolo.cpp index a76123a666..2b791b2dc6 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reorg_yolo.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reorg_yolo.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reshape.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reshape.cpp index a2a4ab473e..f97fb56825 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reshape.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reshape.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include 
"vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reverse.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reverse.cpp index a7010cd26d..e25842f0b9 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reverse.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reverse.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/reverse_sequence.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/reverse_sequence.cpp index dc597d3470..2f709ada7d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/reverse_sequence.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/reverse_sequence.cpp @@ -3,9 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/rms.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/rms.cpp index 05d8144c6a..aa0827e243 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/rms.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/rms.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: 
Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/checked_cast.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::RMSOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/roialign.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/roialign.cpp index a061da344f..1db464c525 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/roialign.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/roialign.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/roipooling.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/roipooling.cpp index 8aafa2dd5c..249b38eac8 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/roipooling.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/roipooling.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/roll.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/roll.cpp index bac630cecd..0f6c1f6e67 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/roll.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/roll.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/rope.cpp 
b/src/vpux_compiler/src/dialect/IE/IR/ops/rope.cpp index 973ad9c433..de1f917431 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/rope.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/rope.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/utils/core/small_vector.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/round.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/round.cpp index 92d6cfa29c..8774372683 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/round.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/round.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/scale_shift.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/scale_shift.cpp index 1a0d1722de..aa31af7962 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/scale_shift.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/scale_shift.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// add - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_elements_update.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_elements_update.cpp index 51f81b18b8..b64f35b295 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_elements_update.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_elements_update.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include 
"vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_nd_update.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_nd_update.cpp index 16662deafe..f3edd97d7f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_nd_update.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_nd_update.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_update.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_update.cpp index 106a9096d3..95543eb878 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_update.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/scatter_update.cpp @@ -1,10 +1,9 @@ -//// -//// Copyright (C) 2022-2025 Intel Corporation. -//// SPDX-License-Identifier: Apache-2.0 -//// +// +// Copyright (C) 2022-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sdpa.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sdpa.cpp index a927cbdf24..621a1dd3c4 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sdpa.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - -#include "vpux/utils/core/small_vector.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/select.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/select.cpp index 9ea13a0d6d..7d694bc3d0 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/select.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/select.cpp @@ -5,7 +5,7 @@ // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/selu.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/selu.cpp index 847cef7b00..73a12309f0 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/selu.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/selu.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/shape_cast.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/shape_cast.cpp index 14898fb828..b475ee30f4 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/shape_cast.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/shape_cast.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include @@ -24,7 +26,7 @@ mlir::LogicalResult vpux::IE::ShapeCastOp::inferReturnTypeComponents( } const auto outShape = parseIntArrayAttr(shapeCast.getShape()); - const auto inType = mlir::cast(shapeCast.getSource().getType()); + const auto inType = mlir::cast(shapeCast.getInput().getType()); VPUX_THROW_UNLESS(!mlir::isa(inType), "{0} doesn't support dynamic shapes", IE::ShapeCastOp::getOperationName()); @@ -35,10 +37,10 @@ mlir::LogicalResult vpux::IE::ShapeCastOp::inferReturnTypeComponents( mlir::OpFoldResult vpux::IE::ShapeCastOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); - auto inputType = mlir::cast(getSource().getType()); - auto outputType = mlir::cast(getResult().getType()); - if (getSource().getType() == getResult().getType()) { - return getSource(); + auto inputType = mlir::cast(getInput().getType()); + auto outputType = mlir::cast(getOutput().getType()); + if (inputType == outputType) { + return getInput(); } VPUX_THROW_UNLESS(!operands.empty(), "Wrong number of operands : {0}", operands.size()); @@ -67,7 +69,7 @@ class FuseWithShapeCastOrAffineReshape final : public mlir::OpRewritePattern(prevOp)) { return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/shape_of.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/shape_of.cpp index 28ba449fa1..ebed14135e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/shape_of.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/shape_of.cpp @@ -3,14 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include 
"vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/checked_cast.hpp" +#include + using namespace vpux; mlir::LogicalResult vpux::IE::ShapeOfOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/shuffle_channels.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/shuffle_channels.cpp index e983f1d761..85fce0fc97 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/shuffle_channels.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/shuffle_channels.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sigmoid.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sigmoid.cpp index 9d6321e79c..ed3135b798 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/sigmoid.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sigmoid.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; @@ -18,8 +19,9 @@ mlir::LogicalResult vpux::IE::SigmoidOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(sigmoid.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(sigmoid.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sign.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sign.cpp index 12a4989c9f..46d345a92d 100644 --- 
a/src/vpux_compiler/src/dialect/IE/IR/ops/sign.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sign.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sin.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sin.cpp index 05d35000f4..3406134919 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/sin.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sin.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sinh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sinh.cpp index 690e1d8628..eab78a5fda 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/sinh.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sinh.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/slice.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/slice.cpp index 62b1ba658a..eec52ed4f1 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/slice.cpp @@ -3,9 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" - #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" @@ -43,7 +42,7 @@ mlir::LogicalResult vpux::IE::SliceOp::inferReturnTypeComponents( return mlir::failure(); } - const auto origType = mlir::dyn_cast(sliceOp.getSource().getType()); + const auto origType = 
mlir::dyn_cast(sliceOp.getInput().getType()); if (origType == nullptr) { return errorAt(loc, "IE::SliceOp operand must have vpux::NDTypeInterface type"); } @@ -73,8 +72,8 @@ mlir::LogicalResult vpux::IE::SliceOp::inferReturnTypeComponents( mlir::OpFoldResult vpux::IE::SliceOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); - if (getSource().getType() == getResult().getType()) { - return getSource(); + if (getInput().getType() == getOutput().getType()) { + return getInput(); } if (const auto origContent = mlir::dyn_cast_or_null(operands[0])) { @@ -100,7 +99,7 @@ class ComposeSlice final : public mlir::OpRewritePattern { }; mlir::LogicalResult ComposeSlice::matchAndRewrite(IE::SliceOp origOp, mlir::PatternRewriter& rewriter) const { - auto producerSliceOp = origOp.getSource().getDefiningOp(); + auto producerSliceOp = origOp.getInput().getDefiningOp(); if (producerSliceOp == nullptr) { return mlir::failure(); } @@ -113,7 +112,7 @@ mlir::LogicalResult ComposeSlice::matchAndRewrite(IE::SliceOp origOp, mlir::Patt const auto finalOffsetsAttr = getIntArrayAttr(getContext(), finalOffsets); const auto finalShapeAttr = origOp.getStaticSizes(); - rewriter.replaceOpWithNewOp(origOp, producerSliceOp.getSource(), finalOffsetsAttr, finalShapeAttr); + rewriter.replaceOpWithNewOp(origOp, producerSliceOp.getInput(), finalOffsetsAttr, finalShapeAttr); return mlir::success(); } @@ -136,7 +135,7 @@ mlir::LogicalResult ProcessNegativeOffset::matchAndRewrite(IE::SliceOp origOp, for (size_t i = 0; i < offsets.size(); ++i) { if (offsets[i] < 0) { negFlag = true; - offsets[i] += getShape(origOp.getSource())[Dim(i)]; + offsets[i] += getShape(origOp.getInput())[Dim(i)]; } } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/softmax.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/softmax.cpp index 4488b64dea..7490fed434 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/softmax.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/softmax.cpp @@ -3,9 +3,11 @@ // 
SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/utils/core/checked_cast.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/softplus.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/softplus.cpp index 2c9596b969..bfa86c8b3d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/softplus.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/softplus.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_batch.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_batch.cpp index da05482e85..6a7a5aa921 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_batch.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_batch.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/small_vector.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_depth.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_depth.cpp index cf33238998..3a42b0f48b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_depth.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/IR/ops/space_to_depth.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/split.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/split.cpp index bf614e371c..1dcd4f4dbb 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/split.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/split.cpp @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/sqrt.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/sqrt.cpp index 527374c74e..7d7901b48f 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/sqrt.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/sqrt.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/squared_diff.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/squared_diff.cpp index 99722396e8..71a687fd6e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/squared_diff.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/squared_diff.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include 
"vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/squeeze.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/squeeze.cpp index 825cd034b4..1aa17c3892 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/squeeze.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/squeeze.cpp @@ -3,14 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/utils/layout_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/strided_slice.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/strided_slice.cpp index 09bee1cbb0..6a0334cc32 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/strided_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/strided_slice.cpp @@ -3,19 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/rewriter.hpp" - -#include "vpux/utils/core/checked_cast.hpp" #include 
"vpux/utils/core/range.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/subtract.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/subtract.cpp index 3c5950ebb6..9d086b4dcc 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/subtract.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/subtract.cpp @@ -3,11 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/swish.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/swish.cpp index f8091db045..0fb4e78a3d 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/swish.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/swish.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/tan.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/tan.cpp index 077ca416c9..4143fadf9e 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/tan.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/tan.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/tanh.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/tanh.cpp index 1e15cebd5f..2f96580191 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/tanh.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/IR/ops/tanh.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; @@ -18,8 +19,9 @@ mlir::LogicalResult vpux::IE::TanhOp::inferReturnTypeComponents( return mlir::failure(); } - const auto inType = mlir::cast(tanh.getInput().getType()); - inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType()); + const auto inType = mlir::cast(tanh.getInput().getType()); + const auto outDesc = vpux::getTensorAttr(inType); + inferredReturnShapes.emplace_back(inType.getShape(), inType.getElementType(), outDesc); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/tensor_iterator.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/tensor_iterator.cpp index 3f0483b0a9..04fbb963c5 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/tensor_iterator.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/tensor_iterator.cpp @@ -3,7 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/tile.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/tile.cpp index ffeea5204b..05130acb29 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/tile.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/tile.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" 
#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/topk.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/topk.cpp index ee903229d8..ffc09f69cc 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/topk.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/topk.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/transpose.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/transpose.cpp index 9578154126..bf3ca04e65 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/transpose.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/transpose.cpp @@ -3,13 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/static_shape_op_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/permute_utils.hpp" +#include +#include + using namespace vpux; namespace { diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/transposed_convolution.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/transposed_convolution.cpp index c015ad43eb..e52e193c20 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/transposed_convolution.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/transposed_convolution.cpp @@ -32,15 +32,14 @@ // then check to make sure that the incoming delta has the same shape as the 
forward output. // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" -#include "openvino/op/group_conv.hpp" +#include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/unsqueeze.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/unsqueeze.cpp index 890dbe77d6..f477d9eedd 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/unsqueeze.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/unsqueeze.cpp @@ -4,9 +4,10 @@ // #include "vpux/compiler/dialect/IE/utils/unsqueeze.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/layout_utils.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/upsampling.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/upsampling.cpp index fce7474b70..69823a7517 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/upsampling.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/upsampling.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - using namespace vpux; mlir::LogicalResult vpux::IE::UpsamplingOp::inferReturnTypeComponents( diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/variadic_split.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/variadic_split.cpp index 16b5f02614..5404320a8a 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/variadic_split.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/IR/ops/variadic_split.cpp @@ -3,8 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" + #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" namespace { // replaces the -1 value with the inferred split length if it exists diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/yield.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/yield.cpp index efe6b8841e..fbe2db421b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/yield.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/yield.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops/yuv_to_rgb.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops/yuv_to_rgb.cpp index f4695e0cb5..418dfe3524 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops/yuv_to_rgb.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops/yuv_to_rgb.cpp @@ -3,7 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/IR/ops_interfaces.cpp b/src/vpux_compiler/src/dialect/IE/IR/ops_interfaces.cpp index d7ae0d8212..c68655619b 100644 --- a/src/vpux_compiler/src/dialect/IE/IR/ops_interfaces.cpp +++ b/src/vpux_compiler/src/dialect/IE/IR/ops_interfaces.cpp @@ -4,23 +4,18 @@ // #include 
"vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/compiler/utils/rewriter.hpp" - -#include "vpux/utils/core/format.hpp" #include "vpux/utils/core/range.hpp" #include #include #include -#include -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.cpp b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.cpp index 6b8fcd6b91..45323ccd81 100644 --- a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.cpp @@ -4,6 +4,11 @@ // #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/convert_quantize_ops_to_nce_ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" + namespace vpux::IE { // diff --git a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.cpp b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.cpp index 648051d7bf..c2da262154 100644 --- a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_outstanding_quant.hpp" -#include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.cpp b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.cpp index 24dc559abd..5398d15394 100644 --- a/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.cpp @@ -4,7 +4,13 @@ // #include "vpux/compiler/dialect/IE/interfaces/common_rewriters/fuse_quantized_ops.hpp" -#include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" using namespace vpux; using namespace IE; diff --git a/src/vpux_compiler/src/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.cpp b/src/vpux_compiler/src/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.cpp index cc550700d3..e535bbe907 100644 --- a/src/vpux_compiler/src/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.cpp +++ b/src/vpux_compiler/src/dialect/IE/interfaces/d2s_to_transposed_conv_verifier.cpp @@ -19,11 +19,11 @@ mlir::LogicalResult 
D2SToTransposedConvVerifierBase::isBeneficialConversion(Logg return mlir::success(); } -std::unique_ptr createD2SToTransposedConvVerifier(VPU::ArchKind arch) { +std::unique_ptr createD2SToTransposedConvVerifier(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: { return std::make_unique(); diff --git a/src/vpux_compiler/src/dialect/IE/interfaces/fuse_convert_to_dpu_checker.cpp b/src/vpux_compiler/src/dialect/IE/interfaces/fuse_convert_to_dpu_checker.cpp index 56050b85c5..ad45e4af49 100644 --- a/src/vpux_compiler/src/dialect/IE/interfaces/fuse_convert_to_dpu_checker.cpp +++ b/src/vpux_compiler/src/dialect/IE/interfaces/fuse_convert_to_dpu_checker.cpp @@ -9,10 +9,10 @@ namespace vpux { namespace IE { -std::unique_ptr createFuseConvertToDPUChecker(VPU::ArchKind arch) { +std::unique_ptr createFuseConvertToDPUChecker(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return std::make_unique(); } default: { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.cpp index 7e732ac386..775318866c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/impl/convert_quantize_ops_to_nce_ops_strategy.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -13,10 +14,10 @@ namespace vpux::IE { std::unique_ptr createConvertQuantizeOpsToNceOpsStrategy( mlir::func::FuncOp funcOp) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.cpp index 8f8dddc1a3..0b0f71bfc1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.cpp @@ -6,14 +6,15 @@ #include "vpux/compiler/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.hpp" #include "vpux/compiler/NPU40XX/dialect/IE/impl/convert_to_palletization_lut_strategy.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::IE { std::unique_ptr createConvertToPalletizationLUTStrategy(mlir::func::FuncOp funcOp) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return std::make_unique(); } default: { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.cpp index ac8482c19c..ad33e20be1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.cpp @@ -6,14 +6,15 @@ #include "vpux/compiler/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/impl/fuse_outstanding_quant_strategy.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::IE { std::unique_ptr createFuseOutstandingQuantStrategy(mlir::func::FuncOp funcOp) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: { } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.cpp index a58113dbde..b16982c88d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.cpp @@ -6,16 +6,17 @@ #include "vpux/compiler/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.hpp" #include "vpux/compiler/NPU37XX/dialect/IE/impl/fuse_quantized_ops_strategy.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::IE { std::unique_ptr createFuseQuantizedOpsStrategy(mlir::func::FuncOp funcOp, const bool seOpsEnabled, const bool seExperimentalOpsEnabled) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return std::make_unique(seOpsEnabled, seExperimentalOpsEnabled); } default: { 
diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.cpp new file mode 100644 index 0000000000..54e9e81b57 --- /dev/null +++ b/src/vpux_compiler/src/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.cpp @@ -0,0 +1,24 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.hpp" +#include "vpux/compiler/NPU37XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp" +#include "vpux/compiler/NPU40XX/dialect/IE/impl/map_bilinear_interpolate_on_dpu_strategy.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" + +namespace vpux::IE { + +std::unique_ptr createMapBilinearInterpolateOnDPUStrategy( + mlir::func::FuncOp funcOp, const bool interpolateAsSEOpInStrategy) { + const auto arch = config::getArch(funcOp); + switch (arch) { + case config::ArchKind::NPU37XX: + return std::make_unique(interpolateAsSEOpInStrategy); + default: + return std::make_unique(interpolateAsSEOpInStrategy); + } + VPUX_THROW("Unable to get MapBilinearInterpolateOnDPUStrategy for arch {0}", arch); +} +} // namespace vpux::IE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/factories/weights_dequantize_to_fakequantize_strategy_getter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/factories/weights_dequantize_to_fakequantize_strategy_getter.cpp index d3e2827058..54d3bf00e0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/factories/weights_dequantize_to_fakequantize_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/factories/weights_dequantize_to_fakequantize_strategy_getter.cpp @@ -7,17 +7,18 @@ #include #include "vpux/compiler/NPU37XX/dialect/IE/impl/weights_dequantize_to_fakequantize_strategy.hpp" #include 
"vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/utils/logger/logger.hpp" using namespace vpux; std::unique_ptr IE::createWeightsDequantizeToFakeQuantizeStrategy(mlir::func::FuncOp funcOp) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return std::make_unique(); } default: { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adapt_shapes_for_scale_shift.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adapt_shapes_for_scale_shift.cpp index aceefb6263..c613084cc8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adapt_shapes_for_scale_shift.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adapt_shapes_for_scale_shift.cpp @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_input_shape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_input_shape.cpp index 014296eaf7..db72dc904b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_input_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_input_shape.cpp @@ -3,18 +3,17 
@@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" @@ -22,6 +21,7 @@ #include #include +#include namespace vpux::IE { #define GEN_PASS_DECL_ADJUSTCONVOLUTIONINPUTSHAPE @@ -272,7 +272,7 @@ mlir::LogicalResult ReshapeAddInput::matchAndRewrite(IE::AddOp origOp, mlir::Pat if (inputShape == getShape(origOp.getInput2()) && hasParallelAdds(origOp) && mlir::isa_and_nonnull(*origOp->getUsers().begin()) && - vpux::VPU::getArch(origOp) == vpux::VPU::ArchKind::NPU37XX) { + vpux::config::getArch(origOp) == vpux::config::ArchKind::NPU37XX) { isSameInput = false; } else { return matchFailed(_log, rewriter, origOp, "Not a valid addOp with same input"); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_shape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_shape.cpp index 7545b5f7f5..2779e32cfc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_shape.cpp @@ -3,24 +3,27 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include 
"vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" + +#include + using namespace vpux; namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_weights.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_weights.cpp index 2199518032..04c34b96c1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_weights.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_convolution_weights.cpp @@ -3,19 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/numeric.hpp" namespace vpux::IE { #define GEN_PASS_DECL_ADJUSTCONVOLUTIONWEIGHTS diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_qdq_params.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_qdq_params.cpp new file mode 100644 index 0000000000..37608f8fd9 --- /dev/null +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_qdq_params.cpp @@ -0,0 +1,1009 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +namespace vpux::IE { +#define GEN_PASS_DECL_ADJUSTFAKEQDQPARAMS +#define GEN_PASS_DEF_ADJUSTFAKEQDQPARAMS +#include "vpux/compiler/dialect/IE/passes.hpp.inc" +} // namespace vpux::IE + +using namespace vpux; + +namespace { + +const float FP16_MAXIMUM = checked_cast(std::numeric_limits::max()); +const float FP16_MINIMUM = checked_cast(std::numeric_limits::lowest()); +// For the model PSD7 the script from architecture team looks at scale in QDQ layers. +// In order to match all the nodes that script modifies to achieve accuracy, we need +// to use an extra 0.6 factor. 
+constexpr float FP16_SCALE_FACTOR = 0.6f; +const float FP16_MIN_SCALED = FP16_SCALE_FACTOR * FP16_MINIMUM; +const float FP16_MAX_SCALED = FP16_SCALE_FACTOR * FP16_MAXIMUM; + +enum class TraversalDir { + INVALID, + DOWN, + UP, +}; + +bool isInstanceNormOp(mlir::Operation* op) { + return llvm::TypeSwitch(op) + .Case([&](IE::MVN6Op) { + return true; + }) + .Case([&](IE::MVNOp) { + return true; + }) + .Default([&](mlir::Operation*) { + return false; + }); +} + +bool isStopOperation(mlir::Operation* op) { + return llvm::TypeSwitch(op) + .Case([&](IE::SoftMaxOp) { + return true; + }) + .Default([&](mlir::Operation* op) { + return isInstanceNormOp(op); + }); +} + +// STEP 6: Handle Memory Ops +bool isMemoryOp(mlir::Operation* op) { + return llvm::TypeSwitch(op) + .Case([&](IE::TransposeOp) { + return true; + }) + .Case([&](IE::ReorderOp) { + return true; + }) + .Case([&](IE::ReshapeOp) { + return true; + }) + .Default([&](mlir::Operation*) { + return false; + }); +} + +inline bool hasExceededFp16Range(float low, float high) { + const auto retval = high >= FP16_MAX_SCALED || low <= FP16_MIN_SCALED; + return retval; +} + +// Trait to detect if Op is IE::QuantizeOp or IE::DequantizeOp +template +struct is_qdq_op { + static const bool value = false; +}; + +template <> +struct is_qdq_op { + static const bool value = true; +}; + +template <> +struct is_qdq_op { + static const bool value = true; +}; + +std::tuple getFqValues(IE::FakeQuantizeOp fq) { + return std::make_tuple(IE::getConst(fq.getInputLow().getDefiningOp())[0], + IE::getConst(fq.getInputHigh().getDefiningOp())[0], + IE::getConst(fq.getOutputLow().getDefiningOp())[0], + IE::getConst(fq.getOutputHigh().getDefiningOp())[0]); +} + +template ::value>::type> +inline float get_qdq_scale(T& qdqop) { + auto outputTypeQuantize = mlir::cast(qdqop.getType()); + auto outElemType = outputTypeQuantize.getElementType(); + + auto outUniformType = mlir::dyn_cast(outElemType); + VPUX_THROW_WHEN(!outUniformType, "ERROR: Could not get 
uniform quant type to determine scale for QDQ OP"); + const auto quantizeScale = outUniformType.getScale(); + + return quantizeScale; +} + +// Computing Rescale coefficient. +inline float rescaleCoefficient(IE::QuantizeOp* op) { + const auto fxv = get_qdq_scale(*op); + float scale = fxv > 0.6 ? 0.5 * fxv : 1.0f; + return scale; +} + +inline float rescaleCoefficient(IE::DequantizeOp* op) { + const auto fxv = get_qdq_scale(*op); + float scale = fxv > 0.6 ? 0.5 * fxv : 1.0f; + return scale; +} + +inline float rescaleCoefficient(IE::FakeQuantizeOp* fakeQuantOp) { + VPUX_THROW_WHEN(nullptr == fakeQuantOp, "ERROR: The operation parameter cannot be null"); + float scale = 1.0f; + auto levels_opt = fakeQuantOp->getLevels(); + if (!levels_opt.has_value()) { + return scale; + } + auto levels = levels_opt.value(); + + // E#171515: max(inHigh-inLow, outHigh-outLow) / levels) / 0.5f) + auto [inLow, inHigh, outLow, outHigh] = getFqValues(*fakeQuantOp); + scale = (std::max(inHigh - inLow, outHigh - outLow) / levels) / 0.5f; + return scale; +} + +inline float rescaleCoefficient(mlir::Operation* op) { + return llvm::TypeSwitch(op) + .Case([&](IE::FakeQuantizeOp fqop) { + return rescaleCoefficient(&fqop); + }) + .Case([&](IE::QuantizeOp qop) { + return rescaleCoefficient(&qop); + }) + .Case([&](IE::DequantizeOp dqop) { + return rescaleCoefficient(&dqop); + }) + .Default([&](mlir::Operation*) { + VPUX_THROW("ERROR: Cannot calculate scale for an op"); + return -1.0f; + }); +} + +bool isFqRangeOutOfBounds(IE::FakeQuantizeOp fqOp, float inScale = 1.0f, float outScale = 1.0f) { + auto [inLow, inHigh, outLow, outHigh] = getFqValues(fqOp); + return (hasExceededFp16Range(inLow * inScale, inHigh * inScale) || + hasExceededFp16Range(outLow * outScale, outHigh * outScale)); +} + +// +// FakeQdqParamsRewriter +// +class FakeQdqParamsRewriter final : public mlir::OpRewritePattern { +public: + FakeQdqParamsRewriter(mlir::MLIRContext* ctx, Logger log) + : mlir::OpRewritePattern(ctx), _log(log) { + 
} + +public: + mlir::LogicalResult matchAndRewrite(IE::FakeQuantizeOp fakeQuantizeOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +/// +/// Create a multiply operation after the input "op". +/// +vpux::IE::MultiplyOp createMultiplyOp(mlir::Operation* op, mlir::PatternRewriter& rewriter, mlir::MLIRContext* ctx, + float scale) { + VPUX_THROW_WHEN(!op, "Operation pointer cannot be null while creating multiply op"); + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfter(op); + + auto tensorType = mlir::RankedTensorType::get({1}, mlir::Float32Type::get(ctx)); + const auto newScaleConst = Const::createFloatConst(rewriter, op->getLoc(), tensorType, {scale}); + + auto multiplyOp = rewriter.create(takeOpLoc(op, "as_mul"), op->getResult(0).getType(), + op->getResult(0), newScaleConst, IE::AutoBroadcastType::NUMPY, + /*post_op=*/nullptr, + /*clamp=*/nullptr, + /*output_channels=*/nullptr, + /*input_channels=*/nullptr); + return multiplyOp; +} + +/// +/// Overloaded version of Creating a Multiply op to handle the case of creating a multiply op +/// between an input argument and a user of that argument. This is called when the defining op is null. 
+/// +vpux::IE::MultiplyOp createMultiplyOp(mlir::Value value, mlir::PatternRewriter& rewriter, mlir::MLIRContext* ctx, + float scale) { + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfterValue(value); + auto tensorType = mlir::RankedTensorType::get({1}, mlir::Float32Type::get(ctx)); + const auto newScaleConst = Const::createFloatConst(rewriter, value.getLoc(), tensorType, {scale}); + + auto multiplyOp = rewriter.create(appendLoc(value.getLoc(), "as_mul"), value.getType(), value, + newScaleConst, IE::AutoBroadcastType::NUMPY, + /*post_op=*/nullptr, + /*clamp=*/nullptr, + /*output_channels=*/nullptr, + /*input_channels=*/nullptr); + return multiplyOp; +} + +struct OpParamdata { +public: + enum class ScaleMode { + SYMMETRIC, + OUTPUT_ONLY, // So far we have not found a need for INPUT_ONLY + }; + + float scale; + mlir::Operation* op; + TraversalDir tdir; + ScaleMode scaleMode; + + OpParamdata(): scale(1.0f), op(nullptr), tdir(TraversalDir::INVALID), scaleMode(ScaleMode::SYMMETRIC) { + } + + // TODO: Tech-debt, there is perhaps a better design + OpParamdata(float os, mlir::Operation* currOp, TraversalDir travDir, ScaleMode smode = ScaleMode::SYMMETRIC) + : scale(os), op(currOp), tdir(travDir), scaleMode(smode) { + } +}; +using SubgraphMetaData = llvm::DenseMap; + +class OpParamScalerBase { +public: + // The logic to update an mlir::Operation goes here. + // The expectation is the old op is replaced by a new op. + // In any case the function updatedOp is expected to return the + // latest version of the op for further processing. + virtual inline std::pair operator()(mlir::Operation*, mlir::PatternRewriter&, + mlir::MLIRContext*, TraversalDir, float, + OpParamdata::ScaleMode) { + return {mlir::failure(), false}; + } + + // Once an is updated it needs to store the updated op + // and return the latest version. 
+ virtual mlir::Operation* updatedOp() { + return nullptr; + } + virtual ~OpParamScalerBase() { + } +}; + +template +class OpParamScaler : public OpParamScalerBase { +public: + OpParamScaler(): OpParamScalerBase() { + } + virtual inline std::pair operator()(mlir::Operation*, mlir::PatternRewriter&, + mlir::MLIRContext*, TraversalDir, float, + OpParamdata::ScaleMode) override { + return {mlir::failure(), false}; + } + virtual ~OpParamScaler() { + } +}; + +template <> +class OpParamScaler : public OpParamScalerBase { + mlir::Operation* _updatedOp; + +public: + OpParamScaler(): OpParamScalerBase(), _updatedOp(nullptr) { + } + virtual inline std::pair operator()(mlir::Operation* op, mlir::PatternRewriter& rewriter, + mlir::MLIRContext* ctx, TraversalDir traversalDir, + float scale, OpParamdata::ScaleMode) override; + virtual mlir::Operation* updatedOp() override { + return _updatedOp; + } + virtual ~OpParamScaler() { + } +}; + +inline bool canPropagateToFQDQ(mlir::Operation*, float, TraversalDir) { + // Based on conversation with Alex, even if new scaling ruins a + // a previously good FQ or Q, DQ op we propagate the scaling. + return true; +#ifdef ENABLE_RESTRICTED_FQDQ_PROPAGATION + return llvm::TypeSwitch(op) + .Case([&](auto fqOp) { + auto [inLow, inHigh, outLow, outHigh] = getFqValues(fqOp); + // This is a previously good node, are we going to ruin it? + const bool scalingBreaksFP16Range = hasExceededFp16Range(inLow * scale, inHigh * scale) || + hasExceededFp16Range(outLow * scale, outHigh * scale); + // This is a strict criterion, regardless of whether the node without scaling had + // parameters within range. + // Alternatively we could do something like + // bool nodeGoodWithoutScaling = hasExceededFp16Range(inLow, inHigh) && !hasExceededFp16Range(outLow, + // outHigh); return nodeGoodWithoutScaling ? 
!scalingBreaksFP16Range : true; + return !scalingBreaksFP16Range; + }) + .Case([&](auto dqOp) { + float qscale1 = get_qdq_scale(dqOp); + const bool scalingGoesOutOfRange = qscale1 * scale > QDQ_SCALE_MAXIMUM; + return !scalingGoesOutOfRange; + }) + .Case([&](auto qOp) { + float qscale2 = get_qdq_scale(qOp); + const bool scalingGoesOutOfRange = qscale2 * scale > QDQ_SCALE_MAXIMUM; + return !scalingGoesOutOfRange; + }) + .Default([&](mlir::Operation*) { + // Not FQ or QDQ op so return true. + return true; + }); +#endif +} + +// Update FakeQuantize as outlined STEP 1 and STEP 3 of E#171489 +IE::FakeQuantizeOp updateFqParams(IE::FakeQuantizeOp origFq, float scale, OpParamdata::ScaleMode scaleMode, + mlir::PatternRewriter& rewriter) { + rewriter.setInsertionPoint(origFq); + auto [inLow, inHigh, outLow, outHigh] = getFqValues(origFq); + + float newInLow = inLow * scale; + float newInHigh = inHigh * scale; + + // Restricting to output scaling only for quantized inputs esp. PSD7 + if (OpParamdata::ScaleMode::OUTPUT_ONLY == scaleMode) { + newInLow = inLow; + newInHigh = inHigh; + } + + auto newInputLo = Const::createFloatConst(rewriter, origFq->getLoc(), origFq.getInputLow().getType(), newInLow); + auto newInputHi = Const::createFloatConst(rewriter, origFq->getLoc(), origFq.getInputHigh().getType(), newInHigh); + auto newOutputLo = + Const::createFloatConst(rewriter, origFq->getLoc(), origFq.getOutputLow().getType(), outLow * scale); + auto newOutputHi = + Const::createFloatConst(rewriter, origFq->getLoc(), origFq.getOutputHigh().getType(), outHigh * scale); + + return rewriter.replaceOpWithNewOp(origFq, origFq.getInput(), newInputLo, newInputHi, + newOutputLo, newOutputHi, origFq.getLevelsAttr(), + origFq.getLowFpTypeAttr(), origFq.getAutoBroadcastAttr()); +} + +inline std::pair OpParamScaler::operator()( + mlir::Operation* op, mlir::PatternRewriter& rewriter, mlir::MLIRContext*, TraversalDir traversalDir, + float scale, OpParamdata::ScaleMode scaleMode) { + if (auto fqOp 
= mlir::dyn_cast_or_null(op)) { + if (!canPropagateToFQDQ(op, scale, traversalDir)) { + return {mlir::failure(), true}; + } + _updatedOp = updateFqParams(fqOp, scale, scaleMode, rewriter); + return {mlir::success(), true}; + } + return {mlir::failure(), true}; +} + +// Check if an op is quantized const +inline bool isQuantizedConstOp(mlir::Operation* op) { + if (!op) { + return false; + } + auto cstOp = mlir::dyn_cast_or_null(op); + auto cstType = mlir::cast(cstOp.getContentAttr().getBaseContent().getType()); + auto elemType = cstType.getElementType(); + if (auto elemTypeInt = mlir::dyn_cast_or_null(elemType)) { + return elemTypeInt.getWidth() == 8 && elemTypeInt.isUnsigned(); + } + return false; +} + +// Metadata required to create multiply operation. +// This is created during graph traversal. +struct MulPropData { + TraversalDir traversalDir; + float scale; + bool active; + mlir::Operation* user; // create mul op between user->getOperand(user_operand_index) and user + size_t user_operand_index; + + MulPropData() + : traversalDir(TraversalDir::INVALID), scale(-1.0f), active(false), user(nullptr), user_operand_index(0) { + } + MulPropData(TraversalDir travDir, float scaleValue, bool enableProp, mlir::Operation* usr, size_t usr_operand_idx) + : traversalDir(travDir), + scale(scaleValue), + active(enableProp), + user(usr), + user_operand_index(usr_operand_idx) { + VPUX_THROW_WHEN(!user, "User parameter cannot be null"); + VPUX_THROW_WHEN(usr_operand_idx >= user->getNumOperands(), + "Operand index too large, {0}, loc: {1}, idx: {2}, nopers: {3}", user->getName(), + user->getLoc(), user_operand_index, user->getNumOperands()); + auto operand = user->getOperand(usr_operand_idx); + active = operand ? active : false; + } + + mlir::Operation* createMulOp(mlir::PatternRewriter& rewriter, mlir::MLIRContext* ctx) { + if (!active) { + // Prop data has been disabled don't create it. + return nullptr; + } + + // STEP 7: Fuse to constant op. 
+ if (auto cstOp = + mlir::dyn_cast_or_null(user->getOperand(user_operand_index).getDefiningOp())) { + auto cstType = mlir::cast(cstOp.getOutput().getType()); + auto newContentAttr = cstOp.getContentAttr().transform().rescale(scale).get(); + mlir::OpBuilder builder(cstOp); + auto newCstOp = builder.create(cstOp.getLoc(), cstType, std::move(newContentAttr)); + user->setOperand(user_operand_index, newCstOp); + return newCstOp; + } + auto opervalue = user->getOperand(user_operand_index); + auto defoper = opervalue.getDefiningOp(); + auto mulOp = defoper ? createMultiplyOp(defoper, rewriter, ctx, scale) + : createMultiplyOp(opervalue, rewriter, ctx, scale); + user->setOperand(user_operand_index, mulOp); + return mulOp; + } +}; + +std::unique_ptr nodeParamScalerFactory(mlir::Operation* op) { + // Right now there is support only for FakeQuantize, in future we could add other things + // e.g., Quantize, Dequantize etc. + if (mlir::dyn_cast_or_null(op)) { + return std::make_unique>(); + } + + return std::unique_ptr{}; +} + +// Create Multiply operations using the mulProps vector +// Update FakeQuantize nodes using opMetaData +mlir::LogicalResult createMulsAndUpdateOps(llvm::SmallVector& subgraph, + llvm::SmallVector& mulProps, mlir::PatternRewriter& rewriter, + mlir::MLIRContext* ctx, + llvm::DenseMap& opMetaData) { + for (size_t i = 0; i < mulProps.size(); ++i) { + // inactive mul ops are ignored in the member function call below, + // so no need to check again. + mulProps[i].createMulOp(rewriter, ctx); + } + + // Maintain a map of current operations + // Why not update subgraph[i]? + // Just updating subgraph[i] does not work because it can be repeated in the array. + // A node can be repeated in the subgraph if + // for e.g., a Mul propagates upward from an Add operation with distinct operands. + // Besides that, once subgraph[i] is processed the index i is not revisited. 
+ llvm::DenseMap currOpStore; + for (auto& sop : subgraph) { + currOpStore[sop] = sop; + } + for (size_t i = 0; i < subgraph.size(); ++i) { + auto& metadata = opMetaData[subgraph[i]]; + auto nps = nodeParamScalerFactory(subgraph[i]); + if (nullptr == nps) { + continue; + } + auto res = (*nps)(currOpStore[subgraph[i]], rewriter, ctx, metadata.tdir, metadata.scale, metadata.scaleMode); + if (!res.second) { + return mlir::failure(); + } + if (mlir::failed(res.first)) { + return mlir::failure(); + } + currOpStore[subgraph[i]] = nps->updatedOp(); + } + return mlir::success(); +} + +void moveMulToOutputWithFilter(const MulPropData mop, llvm::SmallVector& mulProps, + std::queue& mulQ, float newScale, + llvm::DenseSet& ignoreUserSet) { + for (size_t j = 0; j < mop.user->getNumResults(); ++j) { + llvm::DenseSet visited; + for (auto tuuser : mop.user->getResult(j).getUsers()) { + if (ignoreUserSet.contains(tuuser) || visited.contains(tuuser)) { + // Each user is visited only once. + continue; + } + visited.insert(tuuser); + for (size_t k = 0; k < tuuser->getNumOperands(); ++k) { + auto oper = tuuser->getOperand(k).getDefiningOp(); + if (oper != mop.user) { + continue; + } + mulProps.emplace_back(MulPropData(TraversalDir::DOWN, newScale, true, tuuser, k)); + mulQ.push(mulProps.size() - 1); + // A multiply is introduced on one operand of a given user only. 
+ break; + } + } + } +} + +void moveMulToOutputWithFilter(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ, + float newScale, llvm::DenseSet& ignoreUserSet) { + mulProps[mopIndex].active = false; + moveMulToOutputWithFilter(mulProps[mopIndex], mulProps, mulQ, newScale, ignoreUserSet); +} +// Wrapper function for the majority use case where there are no ignored users +void moveMulToOutput(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ, + float newScale) { + llvm::DenseSet ignoreUserSet; + moveMulToOutputWithFilter(mopIndex, mulProps, mulQ, newScale, ignoreUserSet); +} + +void moveMulToInputIndex(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ, + float newScale, size_t idx) { + mulProps[mopIndex].active = false; + // look at operands of toperand since we are traversing up. + auto toperand = mulProps[mopIndex].user->getOperand(mulProps[mopIndex].user_operand_index).getDefiningOp(); + mulProps.emplace_back(MulPropData(TraversalDir::UP, newScale, true, toperand, idx)); + mulQ.push(mulProps.size() - 1); +} + +void moveMulToAllInputs(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ, + float newScale) { + mulProps[mopIndex].active = false; + // look at operands of toperand since we are traversing up. + auto toperand = mulProps[mopIndex].user->getOperand(mulProps[mopIndex].user_operand_index).getDefiningOp(); + for (size_t i = 0; i < toperand->getNumOperands(); ++i) { + mulProps.emplace_back(MulPropData(TraversalDir::UP, newScale, true, toperand, i)); + mulQ.push(mulProps.size() - 1); + } +} + +// Handles FQ update and considers the quantized weights case as well which is required for the const case of STEP 5. 
+bool setupFQUpdate(IE::FakeQuantizeOp& fqOp, float scale, TraversalDir tdir, SubgraphMetaData& opMetaData, + llvm::SmallVector& subgraph) { + const size_t input_index = 0; + auto cstOp = mlir::dyn_cast_or_null(fqOp.getOperand(input_index).getDefiningOp()); + auto opd = OpParamdata(scale, fqOp, tdir); + // If cstOp is not null and we have quantized weights we don't want to change those. + bool quantConstOp = nullptr != cstOp && isQuantizedConstOp(cstOp); + if (quantConstOp) { + // In addition we only want to scale the output low and output high for the FQs + // and leave the input range untouched. + opd.scaleMode = OpParamdata::ScaleMode::OUTPUT_ONLY; + } + if (opMetaData.contains(fqOp)) { + opd.scale *= opMetaData[fqOp].scale; + opMetaData[fqOp] = opd; + } else { + opMetaData.try_emplace(fqOp, opd); + subgraph.push_back(fqOp); + } + return quantConstOp; +} + +// STEP3 : Handle FQ of E#171489 +void handleFQOpDown(size_t mopIndex, SubgraphMetaData& opMetaData, llvm::SmallVector& subgraph, + llvm::SmallVector& mulProps, std::queue& mulQ) { + auto fqOp = mlir::dyn_cast_or_null(mulProps[mopIndex].user); + if (!fqOp) { + return; + } + + mulProps[mopIndex].active = false; + + if (!setupFQUpdate(fqOp, 1.0f / mulProps[mopIndex].scale, mulProps[mopIndex].traversalDir, opMetaData, subgraph)) { + // Propagate to output of FQ. + moveMulToOutput(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); + } +} + +// STEP 3: Handle FQ of E#171489 +void handleFQOpUp(size_t mopIndex, SubgraphMetaData& opMetaData, llvm::SmallVector& subgraph, + llvm::SmallVector& mulProps, std::queue& mulQ) { + auto fqOp = mlir::dyn_cast_or_null( + mulProps[mopIndex].user->getOperand(mulProps[mopIndex].user_operand_index).getDefiningOp()); + if (!fqOp) { + return; + } + + // During the up propagation the muls are propagated to all parameters of FQ + // and they reach all the way to the top of the graph and scale the constants. 
+ // So updating them below + // will cause duplicate updates if we go with Option A moveMulToAllInputs + // OPTION A: Move mul + // moveMulToAllInputs(mop, mulProps, mulQ, mop.scale); + + // OPTION B: Move mul only to index 0 input and rescale the FQ op. + mulProps[mopIndex].active = false; + + // If cstOp is not null and we have quantized weights we don't want to change those. + if (!setupFQUpdate(fqOp, mulProps[mopIndex].scale, mulProps[mopIndex].traversalDir, opMetaData, subgraph)) { + const size_t input_index = 0; + moveMulToInputIndex(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale, input_index); + } + + // We cannot simply call moveMulToOutput because we want to prevent multiply from being applied to mop.user. + // So mop.user is added to ignoredUsers so mul does not go back down. + const float downScale = 1.0f / mulProps[mopIndex].scale; + const size_t fakeOpIndex = 0; // This is ignored in moveMulToOutput. + MulPropData tempMop(TraversalDir::DOWN, downScale, true, fqOp, fakeOpIndex); + llvm::DenseSet ignoredUsers; + ignoredUsers.insert(mulProps[mopIndex].user); + moveMulToOutputWithFilter(tempMop, mulProps, mulQ, downScale, ignoredUsers); + tempMop.active = false; +} + +// STEP 4: Handle Add of E#171489 +void handleAddSubDown(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + mulProps[mopIndex].active = false; + // Need to propagate mul down and up. + for (size_t i = 0; i < mulProps[mopIndex].user->getNumOperands(); ++i) { + if (i == mulProps[mopIndex].user_operand_index) { + continue; + } + if (mulProps[mopIndex].user->getOperand(i).getDefiningOp() == + mulProps[mopIndex].user->getOperand(mulProps[mopIndex].user_operand_index).getDefiningOp()) { + // This is an identical input so don't propagate here. 
+ continue; + } + mulProps.emplace_back( + MulPropData(TraversalDir::UP, 1.0f / mulProps[mopIndex].scale, true, mulProps[mopIndex].user, i)); + mulQ.push(mulProps.size() - 1); + } + + moveMulToOutput(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); +} + +// STEP 4: Handle Add of E#171489 +void handleAddSubUp(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + moveMulToAllInputs(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); +} + +bool hasConstWeightsOpAtIndex(mlir::Operation* oper, size_t index) { + return nullptr != mlir::dyn_cast_or_null(oper->getOperand(index).getDefiningOp()); +} + +bool fuseConstScale(mlir::Operation* oper, size_t operand_index, float scale) { + if (auto cstOp = mlir::dyn_cast_or_null(oper->getOperand(operand_index).getDefiningOp())) { + auto cstType = mlir::cast(cstOp.getOutput().getType()); + auto newContentAttr = cstOp.getContentAttr().transform().rescale(scale).get(); + mlir::OpBuilder builder(cstOp); + auto newCstOp = builder.create(cstOp.getLoc(), cstType, std::move(newContentAttr)); + oper->setOperand(operand_index, newCstOp); + return true; + } + return false; +} + +bool propagateToConstWeights(float scale, mlir::Operation* oper) { + if (!mlir::isa(oper)) { + return false; + } + + VPUX_THROW_WHEN(oper->getNumOperands() < 2, "Multiply, MatMul, Convolution Ops must have at least 2 operands"); + + size_t expected_count = 0; + // For multiply, matmul and convolution multiply weights. + llvm::SmallVector operandIndices; + if (hasConstWeightsOpAtIndex(oper, 1)) { + operandIndices.push_back(1); + ++expected_count; + } + + // For convolution multiply biases as well. 
+ if (mlir::isa(oper) && oper->getNumOperands() > 2) { + if (hasConstWeightsOpAtIndex(oper, 2)) { + operandIndices.push_back(2); + ++expected_count; + } + } + if (0 == expected_count) { + // no const ops; + return false; + } + + size_t processed_count = 0; + for (size_t i = 0; i < operandIndices.size(); ++i) { + auto index = operandIndices[i]; + if (fuseConstScale(oper, index, scale)) { + ++processed_count; + } + } + VPUX_THROW_WHEN(expected_count != processed_count, + "ERROR: Could not fuse all const operands or found a mix of const and non-const operands"); + return expected_count == processed_count; +} + +bool propagateToConstWeightsDown(const MulPropData& mop) { + if (propagateToConstWeights(mop.scale, mop.user)) { + return true; + } + return false; +} + +bool propagateToConstWeightsUp(const MulPropData& mop) { + auto oper = mop.user->getOperand(mop.user_operand_index).getDefiningOp(); + if (propagateToConstWeights(mop.scale, oper)) { + return true; + } + return false; +} + +// STEP 5: Handle Multiply, Matmul and Conv +void handleMultFamilyDown(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + llvm::DenseSet processedIndices; + if (propagateToConstWeightsDown(mulProps[mopIndex])) { + mulProps[mopIndex].active = false; + return; + } + moveMulToOutput(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); +} + +// STEP 5: Handle Multiply, Matmul and Conv +void handleMultFamilyUp(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + llvm::DenseSet processedIndices; + if (propagateToConstWeightsUp(mulProps[mopIndex])) { + mulProps[mopIndex].active = false; + return; + } + mulProps[mopIndex].active = false; + auto toperand = mulProps[mopIndex].user->getOperand(mulProps[mopIndex].user_operand_index).getDefiningOp(); + if (!toperand) { + return; + } + VPUX_THROW_WHEN(toperand->getNumOperands() < 2, "Found multiply op with less than two operands"); + const size_t secondOperandIndex = 1; + mulProps.emplace_back(MulPropData(TraversalDir::UP, 
mulProps[mopIndex].scale, true, toperand, secondOperandIndex)); + mulQ.push(mulProps.size() - 1); + + if (mlir::isa(toperand) && toperand->getNumOperands() > 2) { + // propagate to biases as well. + const size_t thirdOperandIndex = 2; + mulProps.emplace_back( + MulPropData(TraversalDir::UP, mulProps[mopIndex].scale, true, toperand, thirdOperandIndex)); + mulQ.push(mulProps.size() - 1); + } +} + +void handleMemoryOpDown(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + moveMulToOutput(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); +} + +void handleMemoryOpUp(size_t mopIndex, llvm::SmallVector& mulProps, std::queue& mulQ) { + moveMulToAllInputs(mopIndex, mulProps, mulQ, mulProps[mopIndex].scale); +} + +// Traverse the subgraph and determine where the multiply operations are to be introduced. +// The introduced multiply operations are propagated through the graph. +// During propagation some of the operations may become inactive and MulPropData for those +// operations is updated accordingly. +// Further for FakeQuantize operations, the scaling information is stored in OpParamData. +// Once the traversal is done, there is information to update the input graph by introducing +// the multiply operations and updating the FQ parameters. +mlir::LogicalResult traverseSubgraph(llvm::SmallVector& subgraph, + llvm::SmallVector& mulProps, + llvm::DenseMap& opMetaData) { + auto front = subgraph.front(); + auto rc = rescaleCoefficient(front); + std::queue mulQ; + + for (size_t i = 0; i < front->getNumOperands(); ++i) { + auto oper = front->getOperand(i).getDefiningOp(); + if (auto cstOp = mlir::dyn_cast_or_null(oper)) { + continue; + } + mulProps.emplace_back(MulPropData(TraversalDir::UP, 1.0f / rc, true, front, i)); + mulQ.push(mulProps.size() - 1); + } + + // Code reuse: + // Just use a fake MulProp and reuse moveMulToOutput. 
+ const size_t dont_care_op_index = 0; + auto frontMp = MulPropData(TraversalDir::DOWN, rc, true, front, dont_care_op_index); + llvm::DenseSet ignoredUsers; + moveMulToOutputWithFilter(frontMp, mulProps, mulQ, rc, ignoredUsers); + frontMp.active = false; + + opMetaData[front] = OpParamdata(1.0f / rc, front, TraversalDir::INVALID); + + // Mechanism to break a circular dependency. + // Keep a count of number of times an operation is added to mulQ. + // If it exceeds a threshold return failure. + llvm::DenseMap visitCount; + auto increment_visit_count = [&](mlir::Operation* op) { + if (visitCount.contains(op)) { + visitCount[op] += 1; + return; + } + visitCount[op] = 1; + }; + increment_visit_count(front); + auto visits_limit_exceeded = [&](mlir::Operation* op) { + if (!op) { + return false; + } + const size_t count_threshold = 20; + increment_visit_count(op); + return visitCount[op] > count_threshold; + }; + + // The traversal does a breadth first search starting with the first + // node which was breaking FP16 threshold. For each operation encountered + // during traversal we do one or more of the following: + // - create a MulProp and add it to queue for further propagation. + // - create or update metadata for an operation in opMetaData, for e.g., to update FQ nodes. + // - Nothing - either we are at a stop node or an unrecognized operation. 
+ while (!mulQ.empty()) { + auto itop = mulQ.front(); + + mulQ.pop(); + if (!mulProps[itop].active) { + continue; + } + if (visits_limit_exceeded(mulProps[itop].user)) { + return mlir::failure(); + } + + auto tuser = mulProps[itop].user; + VPUX_THROW_WHEN(!tuser || mulProps[itop].user_operand_index >= mulProps[itop].user->getNumOperands(), + "User and user operand index must be valid for an active mul prop"); + auto toperandVal = tuser->getOperand(mulProps[itop].user_operand_index); + VPUX_THROW_WHEN(!toperandVal, "The operand value cannot be null for an active op"); + auto toperand = toperandVal.getDefiningOp(); + if (visits_limit_exceeded(toperand)) { + return mlir::failure(); + } + + if (mulProps[itop].traversalDir == TraversalDir::DOWN) { + llvm::TypeSwitch(tuser) + .Case([&](IE::AddOp) { + handleAddSubDown(itop, mulProps, mulQ); + }) + .Case([&](IE::SubtractOp) { + handleAddSubDown(itop, mulProps, mulQ); + }) + .Case([&](IE::FakeQuantizeOp) { + handleFQOpDown(itop, opMetaData, subgraph, mulProps, mulQ); + }) + .Case([&](IE::MultiplyOp) { + handleMultFamilyDown(itop, mulProps, mulQ); + }) + .Case([&](IE::ConvolutionOp) { + handleMultFamilyDown(itop, mulProps, mulQ); + }) + .Case([&](IE::MatMulOp) { + handleMultFamilyDown(itop, mulProps, mulQ); + }) + // Const::DeclareOps are handled in CreateMultiplyOp + //.Case([&](auto constOp) { + // top.active = false; + // opMetaData.try_emplace(constOp, Metadata(top.scale, top.scale, constOp, nullptr, + // top.traversalDir)); subgraph.push_back(constOp); + //}) + .Default([&](mlir::Operation* op) { + if (isStopOperation(op)) { + // STEP 2 + mulProps[itop].active = false; + } + if (isMemoryOp(op)) { + handleMemoryOpDown(itop, mulProps, mulQ); + } else { + // Unsupported yet. + } + }); + } else if (toperand && mulProps[itop].traversalDir == TraversalDir::UP) { + // Note if toperand is nullptr then the defining op is null we just need to create a mul op using the value. 
+ // That is why the if statement above is different compared to down traversal. + llvm::TypeSwitch(toperand) + .Case([&](IE::AddOp) { + handleAddSubUp(itop, mulProps, mulQ); + }) + .Case([&](IE::SubtractOp) { + handleAddSubUp(itop, mulProps, mulQ); + }) + .Case([&](IE::FakeQuantizeOp) { + handleFQOpUp(itop, opMetaData, subgraph, mulProps, mulQ); + }) + .Case([&](IE::MultiplyOp) { + handleMultFamilyUp(itop, mulProps, mulQ); + }) + .Case([&](IE::ConvolutionOp) { + handleMultFamilyUp(itop, mulProps, mulQ); + }) + .Case([&](IE::MatMulOp) { + handleMultFamilyUp(itop, mulProps, mulQ); + }) + // .Case([&](Const::DeclareOp constOp) { + // Const::DeclareOps are handled in CreateMultiplyOp + // top.active = false; + // opMetaData.try_emplace(constOp, Metadata(top.scale, top.scale, constOp, nullptr, + // top.traversalDir)); subgraph.push_back(constOp); + // }) + .Default([&](mlir::Operation* op) { + if (isStopOperation(op)) { + // STEP 2 + // nothing to do here, just stop propagation + } else if (isMemoryOp(op)) { + handleMemoryOpUp(itop, mulProps, mulQ); + } else { + // Unsupported yet. + } + }); + } + } + + return mlir::success(); +} + +// For FakeQuantize nodes with parameters outside FP16 range this pass rescales the parameters and introduces +// multiply operations through the graph, as outlined in the ticket E#171489. +mlir::LogicalResult FakeQdqParamsRewriter::matchAndRewrite(IE::FakeQuantizeOp fakeQuantizeOp, + mlir::PatternRewriter& rewriter) const { + auto levels = fakeQuantizeOp.getLevels(); + + // Maximum number of levels that don't exceeds I8/U8 storage type . TODO: E#169024 adjust logic for int16 quant + // levels. 
+ if (!levels.has_value() || *levels <= QuantizationLevels::QUANT_LEVELS_8BIT) { + return matchFailed(rewriter, fakeQuantizeOp, + "Skipping AdjustFQParams pass for quantization range < i8 {0} at {1}", + fakeQuantizeOp->getName(), fakeQuantizeOp->getLoc()); + } + + if (!IE::isPerTensorFQ({fakeQuantizeOp}) || !isFqRangeOutOfBounds(fakeQuantizeOp)) { + return matchFailed(rewriter, fakeQuantizeOp, "Skipping AdjustFQParams pass as FQ {0} at {1} is in range", + fakeQuantizeOp->getName(), fakeQuantizeOp->getLoc()); + } + // Followup: Generalize current implementation to: + // per-channel and multi-channel FakeQuantize. Look at E#177612 + + llvm::SmallVector subgraph; + llvm::DenseMap subgraphMetaData; + subgraph.push_back(fakeQuantizeOp); + + llvm::SmallVector mulProps; + + // PHASE 1: Traverse subgraph starting from the matched FakeQuantize Op and determine where graph should be changed. + if (mlir::failed(traverseSubgraph(subgraph, mulProps, subgraphMetaData))) { + return matchFailed(rewriter, fakeQuantizeOp, "Graph Traversal Failed, for FQ: {0}, loc: {1}", + fakeQuantizeOp->getName(), fakeQuantizeOp->getLoc()); + } + + // PHASE 2: Update the input graph by introducing multiply operations and/or updating FakeQuantize nodes. 
+ if (mlir::failed(createMulsAndUpdateOps(subgraph, mulProps, rewriter, getContext(), subgraphMetaData))) { + return matchFailed(rewriter, fakeQuantizeOp, "Create Multiply Ops Failed, for FQ: {0}, loc: {1}", + fakeQuantizeOp->getName(), fakeQuantizeOp->getLoc()); + } + + return mlir::success(); +} + +// +// AdjustFakeQdqParams +// +class AdjustFakeQdqParamsPass final : public IE::impl::AdjustFakeQdqParamsBase { +public: + explicit AdjustFakeQdqParamsPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void AdjustFakeQdqParamsPass::safeRunOnFunc() { + auto& ctx = getContext(); + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, _log); + + auto func = getOperation(); + if (mlir::failed(mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createAdjustFakeQdqParamsPass +// + +std::unique_ptr vpux::IE::createAdjustFakeQdqParamsPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_quantize_params.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_quantize_params.cpp index 96d0c76ce3..3a68bca8eb 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_quantize_params.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_fake_quantize_params.cpp @@ -6,11 +6,13 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" +#include 
"vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_groupconv_shape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_groupconv_shape.cpp index 49f96556a2..910a83c074 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_groupconv_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_groupconv_shape.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/attributes/attributes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_input_shape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_input_shape.cpp index a2443e3dbd..03fbb001f0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_input_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_input_shape.cpp @@ -3,26 +3,38 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" +#include +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_ADJUSTINPUTSHAPE #define GEN_PASS_DEF_ADJUSTINPUTSHAPE @@ -361,7 +373,7 @@ bool ExpandEltwisePattern::init() { buffSizes.push_back(_unExpandedShape.totalSize() * outputType.getElemTypeSize()); } - const auto arch = VPU::getArch(_eltwiseOp); + const auto arch = config::getArch(_eltwiseOp); auto requiredCMXSize = vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffSizes).count(); return requiredCMXSize > totalAvailableCMXSize; }; @@ -480,6 +492,11 @@ mlir::LogicalResult ExpandEltwisePattern::rewrite(mlir::PatternRewriter& rewrite _log.trace("Converting unexpanded shape {0} to new aligned shape {1}", _unExpandedShape, _newExpandedShape); + if (isPerAxisQuant(_eltwiseOp->getResult(0))) { + _log.trace("Per axis quantization is not 
supported for replace by ShapeCast"); + return mlir::failure(); + } + auto getOwnerIgnoreQuantizeCast = [&](mlir::OpOperand& opOperand) -> mlir::Operation* { auto ownerOp = opOperand.getOwner(); while (auto quantizeCastOp = mlir::dyn_cast(ownerOp)) { @@ -1368,43 +1385,44 @@ mlir::LogicalResult AdjustMemPermuteRewriter::matchAndRewrite(IE::MemPermuteOp l mlir::PatternRewriter& rewriter) const { _log.trace("[{0}] Got '{1}' at '{2}'", this->getDebugName(), layerOp->getName(), layerOp->getLoc()); - const auto inputShape = getShape(layerOp.getInput()); - if (inputShape.size() != 4) { + const auto inputType = mlir::cast(layerOp.getInput().getType()); + const auto outputType = mlir::cast(layerOp.getOutput().getType()); + const auto inputShape = inputType.getShape(); + + const int64_t SUPPORTED_RANK = 4; + if (inputShape.size() != SUPPORTED_RANK) { return mlir::failure(); } - const auto memPerm = DimsOrder::fromAffineMap(layerOp.getMemPerm()); + const auto expectedOutOrder = DimsOrder::NCHW; + if (outputType.getDimsOrder() != expectedOutOrder || inputType.getDimsOrder() != expectedOutOrder) { + return mlir::failure(); + } - // This rewriter only process MemPermute with 2 non-one dims, and it will be processed by ConvertMemPermuteToOp - // pass. Otherwise such MemPermute cannot convert to Pool. And for such MemPermute. And for such MemPermute with 2 - // non-one dims, only 2 cases, NxCx1x1 and 1xCxHx1. Both of them are reshaped to 1x1xHxW. + // This rewriter only process MemPermute with NCHW layout on the lowest two dims, and it will be processed by + // ConvertMemPermuteToOp pass with MemPermuteNCHWInNCHWOutNCWHPerm case. 
+ const auto memPerm = DimsOrder::fromAffineMap(layerOp.getMemPerm()); if (!((inputShape[Dims4D::Act::N] == 1 && inputShape[Dims4D::Act::W] == 1 && memPerm == DimsOrder::NHCW) || - (inputShape[Dims4D::Act::H] == 1 && inputShape[Dims4D::Act::W] == 1 && memPerm == DimsOrder::CNHW))) { + (inputShape[Dims4D::Act::H] == 1 && inputShape[Dims4D::Act::W] == 1 && memPerm == DimsOrder::CNHW) || + (memPerm == DimsOrder::NHWC))) { return mlir::failure(); } - const auto memPermuteOutputOrder = mlir::cast(layerOp.getOutput().getType()).getDimsOrder(); - const auto memPermuteInputOrder = mlir::cast(layerOp.getInput().getType()).getDimsOrder(); - const auto expectedOutOrder = DimsOrder::NCHW; - if (memPermuteOutputOrder != expectedOutOrder || memPermuteInputOrder != expectedOutOrder) { + if (inputShape[Dims4D::Act::N] == 1 && inputShape[Dims4D::Act::H] == 1) { + // If input is 1xCx1xW, then it can be convert to pooling directly return mlir::failure(); } - const auto inputType = mlir::cast(layerOp.getInput().getType()); - const auto outputType = mlir::cast(layerOp.getOutput().getType()); - auto getNonOneDims = [](ShapeRef shape) { - Shape resultShape; - llvm::copy_if(shape, std::back_inserter(resultShape), [](int64_t elem) { - return elem != 1; - }); - return resultShape; - }; - const auto inputNonOneDims = getNonOneDims(getShape(layerOp.getInput())); - if (inputNonOneDims.size() != 2) { + if (!isSuitableToAdjustMemPermuteShape(inputType, outputType, layerOp.getMemPerm())) { return mlir::failure(); } - const auto newShape = Shape({1, 1, inputNonOneDims[Dims4D::Act::N], inputNonOneDims[Dims4D::Act::C]}); + auto [mergedPermutation, mergedMemShape] = vpux::getMergedPermutationAndShape(inputType, layerOp.getMemPerm()); + for (size_t i = 0; i < inputType.getShape().size() - mergedMemShape.size() + 1; ++i) { + mergedMemShape.insert(mergedMemShape.begin(), 1); + } + const auto newShape = Shape(mergedMemShape); + auto inputShapeCastOp = rewriter.create(layerOp.getLoc(), 
inputType.changeShape(newShape), layerOp.getInput(), getIntArrayAttr(layerOp.getContext(), newShape)); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_layouts.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_layouts.cpp index 254fd2733a..64f7951847 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_layouts.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_layouts.cpp @@ -4,9 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/utils/core/dense_map.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_maxpool_input_shape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_maxpool_input_shape.cpp index 01cc2f88ac..b4e3a70959 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_maxpool_input_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_maxpool_input_shape.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/factors.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_mem_permute_around_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_mem_permute_around_op.cpp index 29f18ed1e8..3969a3a393 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_mem_permute_around_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_mem_permute_around_op.cpp @@ -4,10 +4,14 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" - +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_nce_ops_with_i32_inputs.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_nce_ops_with_i32_inputs.cpp index 83a5194608..6dd1bacd9f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_nce_ops_with_i32_inputs.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_nce_ops_with_i32_inputs.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_non_zero_fake_quant.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_non_zero_fake_quant.cpp index 
27c95123de..8a4c6f5b45 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_non_zero_fake_quant.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_non_zero_fake_quant.cpp @@ -4,16 +4,17 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include -#include #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_ADJUSTNONZEROFAKEQUANT #define GEN_PASS_DEF_ADJUSTNONZEROFAKEQUANT diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_scale_shift_for_dw_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_scale_shift_for_dw_conv.cpp index ec5783ba0b..dafeb3767a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_scale_shift_for_dw_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_scale_shift_for_dw_conv.cpp @@ -5,12 +5,12 @@ #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_software_ops_precision.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_software_ops_precision.cpp index 6e11789b4c..ae1ef4edda 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_software_ops_precision.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/adjust_software_ops_precision.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/align_scales.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/align_scales.cpp index a17d4eb60c..0bf0686ef5 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/align_scales.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/align_scales.cpp @@ -2,8 +2,11 @@ // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" @@ -11,7 +14,6 @@ #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" - #include "vpux/utils/core/numeric.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/apply_dynamic_boundary_correction.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/apply_dynamic_boundary_correction.cpp index ab34446148..00ae69e0db 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/apply_dynamic_boundary_correction.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/apply_dynamic_boundary_correction.cpp @@ -3,11 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reify_shape.hpp" -#include "vpux/compiler/utils/infer_output_shape.hpp" +#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_add.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_add.cpp index 3f37b80f79..0c47492cb1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_add.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_add.cpp @@ -3,14 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include namespace vpux::IE { #define 
GEN_PASS_DECL_BROADCASTINPUTFORADD diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_multiply.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_multiply.cpp index b8b030a0bd..0f01611751 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_multiply.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/broadcast_input_for_multiply.cpp @@ -3,18 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include namespace vpux::IE { #define GEN_PASS_DECL_BROADCASTINPUTFORMULTIPLY diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/cleanup_fq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/cleanup_fq.cpp index b22887d90c..9d99ce765d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/cleanup_fq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/cleanup_fq.cpp @@ -4,17 +4,18 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include 
"vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" #include #include - -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" - namespace vpux::IE { #define GEN_PASS_DECL_CLEANUPFQ #define GEN_PASS_DEF_CLEANUPFQ diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_nf4_weights_pattern.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_nf4_weights_pattern.cpp index 85a9048128..c981fb5232 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_nf4_weights_pattern.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_nf4_weights_pattern.cpp @@ -3,16 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_weights_dequantization.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_weights_dequantization.cpp index 3dffaf7ae2..a5e9e21aca 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_weights_dequantization.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/consolidate_weights_dequantization.cpp @@ -5,7 +5,8 @@ #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_assign_and_read_value.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_assign_and_read_value.cpp index ce69d03d10..67e14c5ae9 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_assign_and_read_value.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_assign_and_read_value.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/analysis.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_avg_pool_to_dw_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_avg_pool_to_dw_conv.cpp index b44c83a68f..cf0a2d0669 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_avg_pool_to_dw_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_avg_pool_to_dw_conv.cpp @@ -3,17 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_batched_layer_to_1n.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_batched_layer_to_1n.cpp index 1a1f0586ee..3b1a671a9e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_batched_layer_to_1n.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_batched_layer_to_1n.cpp @@ -3,13 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" 
#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_bilinear_to_strided_concat_and_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_bilinear_to_strided_concat_and_conv.cpp index 9135d9ca72..10a4c859bc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_bilinear_to_strided_concat_and_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_bilinear_to_strided_concat_and_conv.cpp @@ -4,12 +4,15 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" @@ -395,7 +398,7 @@ mlir::LogicalResult ConvertBilinearToStridedConcatAndConvPass::BilinearInterpola // if it can be convert to CMX concat, the performance can be improved, but if not, // the extra convolution will decrease the performance, so here add the size check with output. 
input, // weights, and weights table to make it more accurate - const auto arch = VPU::getArch(origOp); + const auto arch = config::getArch(origOp); const auto weightTable = vpux::VPU::NCEInvariant::getWeightsTableSize(inputShape[Dims4D::Act::C]); int64_t outSize = type.getTotalAllocSize().count() / scaleH; int64_t inSize = outSize / scaleW; diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_branches_concat_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_branches_concat_to_conv.cpp index 9783d72173..f3af74cce0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_branches_concat_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_branches_concat_to_conv.cpp @@ -3,14 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" @@ -19,6 +20,8 @@ #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_CONVERTBRANCHESCONCATTOCONV #define GEN_PASS_DEF_CONVERTBRANCHESCONCATTOCONV diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_broadcast_to_tile.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_broadcast_to_tile.cpp index e4aaab2e1e..6667a935c9 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_broadcast_to_tile.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_broadcast_to_tile.cpp @@ -3,23 +3,21 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" - #include namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_convbackpropdata_to_transposedconv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_convbackpropdata_to_transposedconv.cpp index 9a8eea05eb..b2ed8852c8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_convbackpropdata_to_transposedconv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_convbackpropdata_to_transposedconv.cpp @@ -4,15 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include 
"vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include #include @@ -233,7 +230,32 @@ mlir::LogicalResult GroupConvolutionBackpropDataConversion::matchAndRewrite(IE:: auto filterOp = filterTensor.getDefiningOp(); if (filterOp == nullptr) { - return matchFailed(rewriter, origOp, "Unable to find filter constant operation"); + auto filterTensorType = mlir::cast(filterTensor.getType()); + auto permutation = to_small_vector(filterTensorType.getDimsOrder().toPermutation() | transformed([](Dim dim) { + return checked_cast(dim.ind()); + })); + std::swap(permutation[IE::GROUP_TRANSPOSED_CONV_C_IN_DIM_INDEX], + permutation[IE::GROUP_TRANSPOSED_CONV_C_OUT_DIM_INDEX]); + auto orderAttr = mlir::AffineMapAttr::get(mlir::AffineMap::getPermutationMap(permutation, getContext())); + auto transposeOp = rewriter.create(appendLoc(origOp->getLoc(), "_transpose"), filterTensor, + /*order=*/nullptr, orderAttr); + + const auto rank = filterTensorType.getRank(); + const auto axes = SmallVector{rank - 2, rank - 1}; + const auto axesAttr = getIntArrayAttr(getContext(), axes); + IE::ReverseModeAttr modeAttr = IE::ReverseModeAttr::get(getContext(), IE::ReverseMode::INDEX); + auto reverseOp = rewriter.create(appendLoc(origOp->getLoc(), "_reverse"), + transposeOp.getOutput(), nullptr, axesAttr, modeAttr); + + auto newFilter = reverseOp.getOutput(); + + rewriter.replaceOpWithNewOp( + origOp, origOp.getInput(), newFilter, origOp.getOutputShape(), origOp.getStridesAttr(), + origOp.getPadsBeginAttr(), origOp.getPadsEndAttr(), origOp.getDilationsAttr(), + origOp.getSpatialOutputPaddingAttr(), /*postOp=*/nullptr, /*clamp=*/nullptr, /*outputPadding=*/nullptr, + /*inputPadding=*/nullptr); + + return mlir::success(); } 
// Reverse IC and OC dimensions in filter constant: diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_depth2space.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_depth2space.cpp index 49d085bed1..a81bdba23e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_depth2space.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_depth2space.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_divide_to_multiply.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_divide_to_multiply.cpp index c44171ed73..19566ac860 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_divide_to_multiply.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_divide_to_multiply.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" 
#include "vpux/utils/logger/logger.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_dynamic_dequantize_to_dequantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_dynamic_dequantize_to_dequantize.cpp index 1cd9cbe6a6..dfb6f7a2c0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_dynamic_dequantize_to_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_dynamic_dequantize_to_dequantize.cpp @@ -5,15 +5,12 @@ #include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include "vpux/utils/core/numeric.hpp" namespace vpux::IE { @@ -80,6 +77,13 @@ bool isOptimizableDynamicDequantizeOp(IE::DynamicDequantizeOp origOp) { return false; } + auto quantStorageType = uniformType.getStorageType(); + if (quantStorageType.isInteger(CHAR_BIT)) { + auto quantCastOp = origOp.getInput().getDefiningOp(); + if (quantCastOp == nullptr) { + return false; + } + } return true; } @@ -317,7 +321,9 @@ mlir::LogicalResult ConvertDynamicDequantizeToDequantize::matchAndRewrite(IE::Dy return matchFailed(rewriter, origOp, "not a valid FC pattern"); } auto fcOp = isValidPattern.value(); - + auto inputType = mlir::cast(origOp.getInput().getType()); + auto uniformType = mlir::cast(inputType.getElementType()); + auto quantStorageType = uniformType.getStorageType(); // reshape the scale const auto fcOutShape = getShape(fcOp.getOutput()).raw(); const auto scaleSize = 
getShape(origOp.getScale()).totalSize(); @@ -327,6 +333,40 @@ mlir::LogicalResult ConvertDynamicDequantizeToDequantize::matchAndRewrite(IE::Dy outShapeAttr) .getOutput(); + if (quantStorageType.isInteger(CHAR_BIT)) { + // Rescale scales by x16 and weights by /16 to scale down the I8 weights into I4 range to avoid overflow in the + // following MatMul/FullyConnected/Convolution. See details in #E161479 + + const auto rescaler = 0.0625; + const auto descaler = 16.f; // 1/rescaler + + auto quantCastOp = origOp.getInput().getDefiningOp(); + auto quantTypeI8Rescaled = mlir::quant::UniformQuantizedType::get( + mlir::quant::QuantizationFlags::Signed, getSInt8Type(uniformType.getContext()), + uniformType.getExpressedType(), rescaler, + /*zp=*/0, /*min=*/uniformType.getStorageTypeMin(), + /*max=*/uniformType.getStorageTypeMax()); + auto quantCastOpI8Rescaled = + rewriter.create(appendLoc(origOp->getLoc(), "_quant_cast_i8_with_rescaler"), + quantCastOp.getInput(), quantTypeI8Rescaled); + rewriter.replaceOp(quantCastOp, quantCastOpI8Rescaled.getOutput()); + + const auto rescalerBaseType = + mlir::RankedTensorType::get({1}, mlir::Float16Type::get(uniformType.getContext())); + const auto rescalerBaseAttr = + Const::createConstContent(rescalerBaseType, ArrayRef(vpux::type::float16(descaler))); + const auto rescalerContentAttr = Const::ContentAttr::get(rescalerBaseAttr); + auto rescalerConstAttr = rescalerContentAttr.transform().get(); + const auto rescalerType = mlir::cast(rescalerContentAttr.getType()); + auto rescalerConstOp = rewriter.create(appendLoc(origOp->getLoc(), "_rescaler"), rescalerType, + std::move(rescalerConstAttr)); + + auto multiplyWithRescaler = + rewriter.create(appendLoc(origOp->getLoc(), "_rescale_scaler"), scale, rescalerConstOp, + IE::AutoBroadcastType::NUMPY, nullptr, nullptr, nullptr, nullptr); + scale = multiplyWithRescaler; + } + // insert a multiply post FC rewriter.setInsertionPointAfter(fcOp); auto multiply = diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_expand_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_expand_to_conv.cpp index 89c322b0fb..7bc7299845 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_expand_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_expand_to_conv.cpp @@ -3,13 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_extract_image_patches.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_extract_image_patches.cpp index 0dd48ddc1c..bf7602473a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_extract_image_patches.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_extract_image_patches.cpp @@ -2,9 +2,13 @@ // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_fc_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_fc_to_conv.cpp index f466e84db5..d4093bd4e1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_fc_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_fc_to_conv.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_elements_to_gather.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_elements_to_gather.cpp index 6a8b6e4e35..e1d7da8636 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_elements_to_gather.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_elements_to_gather.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_to_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_to_slice.cpp index 09912fd878..e00704dd11 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_to_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_gather_to_slice.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grn_to_normalize_l2.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grn_to_normalize_l2.cpp index 796f296ef5..6bba72445d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grn_to_normalize_l2.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grn_to_normalize_l2.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/types.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_groupconv_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_groupconv_to_conv.cpp index 916578ee22..a673b519bf 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_groupconv_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_groupconv_to_conv.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -// - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_groupconv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_groupconv.cpp index 8f552883c2..2e94ce2062 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_groupconv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_groupconv.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_transposedconv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_transposedconv.cpp index 4e8393f6e3..53f91562ca 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_transposedconv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_grouptransposedconv_to_transposedconv.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_large_conv_to_multi_conv_with_add.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_large_conv_to_multi_conv_with_add.cpp index 494342e8ef..e351a64c6e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_large_conv_to_multi_conv_with_add.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_large_conv_to_multi_conv_with_add.cpp @@ -4,13 +4,13 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include 
"vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_matmul_to_conv_and_permutecast.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_matmul_to_conv_and_permutecast.cpp index 0558e788a8..bcd993a68c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_matmul_to_conv_and_permutecast.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_matmul_to_conv_and_permutecast.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mem_permute_to_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mem_permute_to_op.cpp index 6c8d844820..9ce75b2118 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mem_permute_to_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mem_permute_to_op.cpp @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" @@ -14,11 +13,14 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_CONVERTMEMPERMUTETOOPPASS #define GEN_PASS_DEF_CONVERTMEMPERMUTETOOPPASS @@ -131,28 +133,25 @@ SmallVector> calculateConversions(ShapeRef originInp return newMaxPoolOrder; } -bool isSwPermuteEfficient(IE::MemPermuteOp memPermuteOp) { - auto arch = VPU::getArch(memPermuteOp); - auto inType = mlir::cast(memPermuteOp.getInput().getType()); - auto outType = mlir::cast(memPermuteOp.getOutput().getType()); - return VPUIP::satisfiesOptimizedMemPermute(arch, inType, outType); -} - -bool isLegalConvertToPool(IE::MemPermuteOp memPermuteOp, mlir::AffineMap memPermMap, mlir::MLIRContext* ctx, - int64_t numClusters, StringRef debugName, Logger log) { +bool isLegalConvertToPool(NDTypeInterface inputType, NDTypeInterface outputType, mlir::Operation* parentOp, + mlir::AffineMap memPermMap, mlir::MLIRContext* ctx, int64_t numClusters, StringRef debugName, + config::ArchKind arch, Logger log) { // Pooling op does not support dynamic shapes, // so we fail transformation if any of the input or output shapes are dynamic. 
- if (IE::hasDynamicTensors(memPermuteOp.getOperation())) { + auto isDynamic = [](NDTypeInterface type) { + return mlir::isa(type) || mlir::isa(type); + }; + + if (isDynamic(inputType) || isDynamic(outputType)) { log.trace("MemPermuteOp has dynamic tensors"); return false; } - const auto inOrder = DimsOrder::fromValue(memPermuteOp.getInput()); - const auto inShape = getShape(memPermuteOp.getInput()); - const auto inMemShape = inOrder.toMemoryOrder(inShape); + const auto inShape = inputType.getShape(); + const auto inMemShape = inputType.getMemShape(); // E-128307: Replace with using a robust NCE-Op supported datatype checking mechanism - const auto elementType = mlir::cast(memPermuteOp.getType()).getElementType(); + const auto elementType = outputType.getElementType(); if (elementType.isSignedInteger() || elementType.isUnsignedInteger()) { log.trace("NCE MaxPool does not support signed or unsigned integer"); return false; @@ -179,7 +178,7 @@ bool isLegalConvertToPool(IE::MemPermuteOp memPermuteOp, mlir::AffineMap memPerm return false; } - if (auto expandOp = memPermuteOp.getInput().getDefiningOp()) { + if (auto expandOp = mlir::dyn_cast_or_null(parentOp)) { auto inType = mlir::cast(expandOp.getInput().getType()); auto outType = mlir::cast(expandOp.getResult().getType()); const auto isExpandAtChannel = inType.getShape()[Dims4D::Act::C] != outType.getShape()[Dims4D::Act::C]; @@ -215,10 +214,9 @@ bool isLegalConvertToPool(IE::MemPermuteOp memPermuteOp, mlir::AffineMap memPerm for (const auto idx : irange(inShape.size())) { poolInLogicShape[poolInOrder.dimAt(idx)] = inMemShape[MemDim(idx)]; } - auto poolInputType = mlir::cast(memPermuteOp.getOutput().getType()); const auto IC = poolInLogicShape[Dims4D::Act::C]; - const auto alignedChannel = VPU::NCEInvariant::getAlignment(poolInputType.getElementType()); + const auto alignedChannel = VPU::NCEInvariant::getAlignment(outputType.getElementType()); if (IC % alignedChannel != 0) { auto conversionMap = 
calculateConversions(targetInShape, alignedChannel, targetOrder); auto hasSmallHeightNum = [&](const std::pair& map) { @@ -234,20 +232,30 @@ bool isLegalConvertToPool(IE::MemPermuteOp memPermuteOp, mlir::AffineMap memPerm return false; } - auto perAxisType = mlir::dyn_cast(poolInputType.getElementType()); + auto perAxisType = mlir::dyn_cast(outputType.getElementType()); if (perAxisType && perAxisType.getQuantizedDimension() == Dims4D::Act::C.ind()) { log.trace("It's illegal to reshape perAxisType when quantizeDim is also IC"); return false; } } - if (isSwPermuteEfficient(memPermuteOp)) { + if (VPUIP::satisfiesOptimizedMemPermute(arch, inputType, outputType)) { log.trace("Software memPermute is more efficient"); return false; } return true; } +bool isLegalConvertToPool(IE::MemPermuteOp memPermuteOp, mlir::AffineMap memPermMap, mlir::MLIRContext* ctx, + int64_t numClusters, StringRef debugName, Logger log) { + auto inputType = mlir::cast(memPermuteOp.getInput().getType()); + auto outputType = mlir::cast(memPermuteOp.getOutput().getType()); + auto arch = config::getArch(memPermuteOp); + + return isLegalConvertToPool(inputType, outputType, memPermuteOp.getInput().getDefiningOp(), memPermMap, ctx, + numClusters, debugName, arch, log); +} + // // MemPermuteRewriter // @@ -578,15 +586,24 @@ class ConvertMemPermuteToPermuteQuantize final : public mlir::OpRewritePattern(origOp.getInput().getType()); - const auto outType = mlir::cast(origOp.getOutput().getType()); + auto outType = mlir::cast(origOp.getOutput().getType()); const auto inOrder = inType.getDimsOrder(); auto memPerm = origOp.getMemPerm(); bool inPermuteCastRequired = false; - if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, outType)) { - // There is a chance to convert memPermuteOp to permuteQuantizeOp after inserting a permuteCastOp + bool outPermuteCastRequired = false; + if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, memPerm)) { + // There is a chance to convert memPermuteOp to permuteQuantizeOp 
after inserting a permuteCastOp for input inType = inType.changeDimsOrder(DimsOrder::NCHW); - memPerm = vpux::getPermutationFromOrders(DimsOrder::NCHW, outType.getDimsOrder(), origOp.getContext()); + memPerm = vpux::getPermutationFromOrders(DimsOrder::NCHW, DimsOrder::NHWC, origOp.getContext()); inPermuteCastRequired = true; + + const auto outOrder = outType.getDimsOrder(); + if (outOrder != DimsOrder::NHWC) { + // There is a chance to convert memPermuteOp to permuteQuantizeOp after inserting permuteCastOp for output + outType = outType.changeDimsOrder(DimsOrder::NHWC); + outType = outType.changeShape(inType.getShape()); + outPermuteCastRequired = true; + } } const auto isLegalReorderOp = [&]() { @@ -607,10 +624,6 @@ mlir::LogicalResult ConvertMemPermuteToPermuteQuantize::matchAndRewrite(IE::MemP const auto alignment = VPU::NCEInvariant::getAlignment(inType.getElementType()); const auto inShape = inType.getShape(); - if (inShape[Dims4D::Act::C] % alignment == 0) { - _log.trace("It's more performant to be MaxPool"); - return false; - } // Avoid introducing Expand and Slice if (inShape[Dims4D::Act::H] * inShape[Dims4D::Act::W] % alignment != 0) { @@ -628,7 +641,7 @@ mlir::LogicalResult ConvertMemPermuteToPermuteQuantize::matchAndRewrite(IE::MemP const auto& ctx = origOp.getContext(); auto curInput = origOp.getInput(); - // Insert permuteCastOp + // Insert permuteCastOp for input if (inPermuteCastRequired) { const auto inMemPerm = vpux::getPermutationFromOrders(inOrder, DimsOrder::NCHW, ctx); auto inPermuteCastOp = @@ -641,13 +654,153 @@ mlir::LogicalResult ConvertMemPermuteToPermuteQuantize::matchAndRewrite(IE::MemP const auto noPadBeginEnd = SmallVector(outType.getRank(), 0); auto permuteQuantizeOp = rewriter.create( - appendLoc(origOp->getLoc(), "PermuteQuantize"), origOp.getOutput().getType(), curInput, - origOp.getDstOrderAttr(), mlir::AffineMapAttr::get(memPerm), dstElemTypeAttr, - getIntArrayAttr(ctx, noPadBeginEnd), getIntArrayAttr(ctx, noPadBeginEnd)); + 
appendLoc(origOp->getLoc(), "PermuteQuantize"), outType, curInput, + mlir::AffineMapAttr::get(DimsOrder::NHWC.toAffineMap(ctx)), mlir::AffineMapAttr::get(memPerm), + dstElemTypeAttr, getIntArrayAttr(ctx, noPadBeginEnd), getIntArrayAttr(ctx, noPadBeginEnd)); _log.trace("convert to PermuteQuantize {0}", origOp->getLoc()); - rewriter.replaceOp(origOp, permuteQuantizeOp.getOutput()); + // Insert pemuteCastOp for output + if (outPermuteCastRequired) { + auto identityMap = + mlir::AffineMap::getMultiDimIdentityMap(checked_cast(inType.getShape().size()), ctx); + auto outPermuteCastOp = rewriter.create(origOp.getLoc(), permuteQuantizeOp.getOutput(), + origOp.getDstOrder(), identityMap); + inferReturnTypes(outPermuteCastOp, InferShapedTypeMode::ALL); + + rewriter.replaceOp(origOp, outPermuteCastOp); + } else { + rewriter.replaceOp(origOp, permuteQuantizeOp.getOutput()); + } + + return mlir::success(); +} + +// +// AdaptTwoDimOnlyPermuteForDPU +// + +class AdaptTwoDimOnlyPermuteForDPU final : public mlir::OpRewritePattern { +public: + AdaptTwoDimOnlyPermuteForDPU(mlir::MLIRContext* ctx, int64_t numClusters, Logger log) + : mlir::OpRewritePattern(ctx), _numClusters(numClusters), _log(log) { + this->setDebugName("AdaptTwoDimOnlyPermuteForDPU"); + } + +private: + mlir::LogicalResult matchAndRewrite(IE::MemPermuteOp origOp, mlir::PatternRewriter& rewriter) const final; + +private: + int64_t _numClusters; + Logger _log; +}; + +mlir::LogicalResult AdaptTwoDimOnlyPermuteForDPU::matchAndRewrite(IE::MemPermuteOp origOp, + mlir::PatternRewriter& rewriter) const { + _log.trace("[{0}] Got '{1}' at '{2}'", getDebugName(), origOp->getName(), origOp->getLoc()); + + auto ctx = rewriter.getContext(); + auto nestedLog = _log.nest(); + auto inputType = mlir::cast(origOp.getInput().getType()); + auto outputType = mlir::cast(origOp.getOutput().getType()); + + constexpr int64_t SUPPORTED_RANK = 4; + if (inputType.getRank() != SUPPORTED_RANK) { + return matchFailed(nestedLog, rewriter, origOp, 
"Unsupported rank '{0}' != '{1}'", inputType.getRank(), + SUPPORTED_RANK); + } + + if (mlir::isa(inputType.getElementType()) || + mlir::isa(outputType.getElementType())) { + return matchFailed(nestedLog, rewriter, origOp, "Input and/or output data type is quantized per axis."); + } + + const auto inMemShape = inputType.getMemShape(); + const auto memPermMap = origOp.getMemPerm(); + if (isTrivialPermute(inMemShape, memPermMap)) { + return matchFailed(nestedLog, rewriter, origOp, "MemPermuteOp is trivial, no need to optimize."); + } + + const auto numOnes = std::count_if(inMemShape.begin(), inMemShape.end(), [](auto val) { + return val == 1; + }); + + if (numOnes != 2) { + return matchFailed(nestedLog, rewriter, origOp, + "Shape is unsupported for this optimization: there are {0} dims == 1; expected = 2", + numOnes); + } + + if (inputType.getShape()[Dims4D::Act::N] == 1) { + return matchFailed(nestedLog, rewriter, origOp, "No need to apply rewriter: batch is 1"); + } + + if (isLegalConvertToPool(origOp, memPermMap, ctx, _numClusters, getDebugName(), nestedLog)) { + return matchFailed(nestedLog, rewriter, origOp, "Op is already legal to convert to Pool"); + } + + // Aim to have 2 dims permuted among themselves + auto [mergedPermutation, mergedMemShape] = vpux::getMergedPermutationAndShape(inputType, memPermMap, 2); + if (mergedMemShape.size() != 2 || mergedPermutation != SmallVector{1, 0}) { + return matchFailed(nestedLog, rewriter, origOp, + "Merged memShape is too large, should have rank 2 (actual = {0}) and mergedPermutation " + "should be {1, 0}", + mergedMemShape.size()); + } + + const auto newShape = Shape({1, mergedMemShape[1], mergedMemShape[0], 1}); + auto newMemPermuteInput = inputType.changeShape(newShape).changeDimsOrder(DimsOrder::NHWC); + auto newMemPermuteOutput = outputType.changeShape(newShape).changeDimsOrder(DimsOrder::NCHW); + + // MemPermutation between NHWC -> NCHW is {0, 3, 1, 2} + const auto newMemPerm = DimsOrder::NWCH.toAffineMap(ctx); + + 
auto arch = config::getArch(origOp); + // Passing nullptr as parent op, since the parent op will be a PermuteCast that has not yet been created + if (!isLegalConvertToPool(newMemPermuteInput, newMemPermuteOutput, /*parentOp = */ nullptr, newMemPerm, ctx, + _numClusters, getDebugName(), arch, nestedLog)) { + return matchFailed( + nestedLog, rewriter, origOp, + "MemPermute after adaptation will not be a supported DPU permute. Do not apply transformation."); + } + + auto getMemPerm = [&ctx](ArrayRef inputMemShape, ArrayRef outputMemShape) -> mlir::AffineMap { + SmallVector perm; + for (auto oIdx : irange(outputMemShape.size())) { + for (auto iIdx : irange(inputMemShape.size())) { + if (inputMemShape[iIdx] != outputMemShape[oIdx]) { + continue; + } + + if (llvm::find(perm, iIdx) != perm.end()) { + continue; + } + + perm.push_back(static_cast(iIdx)); + break; + } + } + + return mlir::AffineMap::getPermutationMap(perm, ctx); + }; + + auto inputPermCastMemPerm = getMemPerm(inMemShape.raw(), newMemPermuteInput.getMemShape().raw()); + auto inPermCast = rewriter.create(appendLoc(origOp.getLoc(), "_in_perm_cast"), + newMemPermuteInput, origOp.getInput(), + DimsOrder::NHWC.toAffineMap(ctx), inputPermCastMemPerm); + + auto dpuFriendlyPermute = + rewriter.create(origOp->getLoc(), newMemPermuteOutput, inPermCast.getOutput(), + DimsOrder::NCHW.toAffineMap(ctx), newMemPerm); + + const auto outMemShape = outputType.getMemShape(); + auto outputPermCastMemPerm = getMemPerm(newMemPermuteOutput.getMemShape().raw(), outMemShape.raw()); + auto outPermCast = rewriter.create( + appendLoc(origOp.getLoc(), "_out_perm_cast"), outputType, dpuFriendlyPermute.getOutput(), + outputType.getDimsOrder().toAffineMap(ctx), outputPermCastMemPerm); + + rewriter.replaceOp(origOp, outPermCast); + nestedLog.trace("Successfully adapted MemPermute to be convertible to DPU op."); return mlir::success(); } @@ -685,6 +838,7 @@ void ConvertMemPermuteToOpPass::safeRunOnFunc() { auto numClusters = 
tileOp.getCount(); mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, numClusters, _log); patterns.add(&ctx, numClusters, _log); patterns.add(&ctx, numClusters, _log); patterns.add(&ctx, numClusters, _log); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_min_max_to_clamp.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_min_max_to_clamp.cpp index 7ea3f2e4c2..61342fd889 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_min_max_to_clamp.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_min_max_to_clamp.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mvn6_to_mvn1.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mvn6_to_mvn1.cpp index 3f09a72a98..4994a0ff5c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mvn6_to_mvn1.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_mvn6_to_mvn1.cpp @@ -4,10 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nce_ops_to_4d.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nce_ops_to_4d.cpp index 5d1bb879fa..a0d7d75243 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nce_ops_to_4d.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nce_ops_to_4d.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nearest_to_broadcast_or_strided_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nearest_to_broadcast_or_strided_concat.cpp index 7b2df39b0c..a56cb6501e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nearest_to_broadcast_or_strided_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_nearest_to_broadcast_or_strided_concat.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include 
"vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_non_constant_pad_to_slice_and_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_non_constant_pad_to_slice_and_concat.cpp index d6659dc4f1..7fea3c4c64 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_non_constant_pad_to_slice_and_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_non_constant_pad_to_slice_and_concat.cpp @@ -3,11 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_pad_to_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_pad_to_concat.cpp index 619dbc6cc8..314f9c7b6f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_pad_to_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_pad_to_concat.cpp @@ -3,12 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_paddings_to_floor_mode.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_paddings_to_floor_mode.cpp index 4b59f45aff..e4c8472e6e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_paddings_to_floor_mode.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_paddings_to_floor_mode.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_parallel_slices_to_gather.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_parallel_slices_to_gather.cpp index 5c199c4625..2380da9265 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_parallel_slices_to_gather.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_parallel_slices_to_gather.cpp @@ -5,9 +5,11 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -315,7 +317,7 @@ void 
ConvertParallelSlicesToGatherPass::safeRunOnFunc() { auto func = getOperation(); auto& ctx = getContext(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); _maxGatherDMAIndicesListLength = VPU::getGatherDMAMaxIndicesListLength(arch); _maxGatherDMAElementSize = VPU::getGatherDMAMaxElementSize(arch); mlir::RewritePatternSet patterns(&ctx); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_power_to_mult.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_power_to_mult.cpp index bc465371ca..51dc09cb03 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_power_to_mult.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_power_to_mult.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_fp16.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_fp16.cpp index ea023e16c5..7501ded4af 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_fp16.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_fp16.cpp @@ -3,10 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/convert_op_types.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" @@ -28,7 +35,7 @@ using namespace IE; namespace { -// E#160869: The compiler must have a much more general solution: either all +// E#160872: The compiler must have a much more general solution: either all // fp16 values must come out as HALF_MAX / HALF_MIN or none. the preferred way // seems to be "all". However, clamping non-splats produced inaccurate results // according to tests. This needs to be debugged properly to understand why. The diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_i32.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_i32.cpp index 72fc9e6e8a..1f33c83e1b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_i32.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_precision_to_i32.cpp @@ -4,11 +4,19 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include 
"vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/convert_op_types.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" -#include "vpux/compiler/utils/types.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.cpp index 2f803049da..522483eeb0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.cpp @@ -5,13 +5,10 @@ #include "vpux/compiler/dialect/IE/transforms/passes/convert_quantize_ops_to_nce_ops.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/transforms/factories/convert_quantize_ops_to_nce_ops_strategy_getter.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/const/dialect.hpp" -#include "vpux/compiler/utils/passes.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_sum_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_sum_to_conv.cpp index 18f1ea70c3..114ae6333e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_sum_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_sum_to_conv.cpp @@ -5,10 +5,12 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_to_pooling.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_to_pooling.cpp index c74c41920a..d46270619a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_to_pooling.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reduce_to_pooling.cpp @@ -3,12 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reorder_to_permute_quantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reorder_to_permute_quantize.cpp index 940d1a87a7..e3ad3bb674 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reorder_to_permute_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reorder_to_permute_quantize.cpp @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -45,9 +44,9 @@ mlir::LogicalResult FusePermuteRewrite::matchAndRewrite(IE::ReorderOp origOp, ml const auto outOrder = DimsOrder::fromValue(origOp.getOutput()); auto curInput = origOp.getInput(); const auto inType = mlir::cast(origOp.getInput().getType()); - const auto outType = mlir::cast(origOp.getOutput().getType()); - if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, outType)) { - // There is a chance to convert reorderOp to permuteQuantizeOp after inserting a permuteCastOp + const auto origMemPerm = vpux::getPermutationFromOrders(inOrder, outOrder, origOp->getContext()); + if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, origMemPerm) && outOrder == DimsOrder::NHWC) { + // There is a chance to convert reorderOp to permuteQuantizeOp after inserting a permuteCastOp for input const auto inMemPerm = vpux::getPermutationFromOrders(inOrder, DimsOrder::NCHW, 
origOp->getContext()); auto inPermuteCastOp = rewriter.create(appendLoc(origOp->getLoc(), "PermuteCast"), origOp.getInput(), @@ -122,8 +121,11 @@ bool hasQuantizedAvgPoolUserToPropagate(IE::ReorderOp reorder) { bool ConvertReorderToPermuteQuantizePass::isSupportedReorder(IE::ReorderOp reorder, Logger log) const { auto inType = mlir::cast(reorder.getInput().getType()); const auto outType = mlir::cast(reorder.getOutput().getType()); - if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, outType)) { - // There is a chance to convert reorderOp to permuteQuantizeOp after inserting a permuteCastOp + const auto inOrder = inType.getDimsOrder(); + const auto outOrder = outType.getDimsOrder(); + const auto origMemPerm = vpux::getPermutationFromOrders(inOrder, outOrder, reorder->getContext()); + if (IE::canConvertToNCHWInOrderWithPermuteCast(inType, origMemPerm) && outOrder == DimsOrder::NHWC) { + // There is a chance to convert reorderOp to permuteQuantizeOp after inserting a permuteCastOp for input inType = inType.changeDimsOrder(DimsOrder::NCHW); } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reverse_to_dw_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reverse_to_dw_conv.cpp index 4557922b53..8c8e1e166e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reverse_to_dw_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_reverse_to_dw_conv.cpp @@ -5,11 +5,14 @@ #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include 
"vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scalar_to_tensor.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scalar_to_tensor.cpp index 1504d804c8..914bda67de 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scalar_to_tensor.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scalar_to_tensor.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scaleshift_to_depthwise.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scaleshift_to_depthwise.cpp index f56710e085..09fefac3ad 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scaleshift_to_depthwise.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scaleshift_to_depthwise.cpp @@ -5,7 +5,9 @@ #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/scale_shift_utils.hpp" #include 
"vpux/compiler/dialect/const/attributes/content.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scatterndupdate_to_strided_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scatterndupdate_to_strided_concat.cpp index ba92037c4b..23e24bfb3e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scatterndupdate_to_strided_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_scatterndupdate_to_strided_concat.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_sdpa_to_online_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_sdpa_to_online_sdpa.cpp index 8ae9ece841..3cbfad929a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_sdpa_to_online_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_sdpa_to_online_sdpa.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shape_to_4d.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shape_to_4d.cpp index 05ffcd22a0..ba33064888 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shape_to_4d.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shape_to_4d.cpp @@ -4,34 +4,32 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/roll_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/attributes.hpp" 
#include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/error.hpp" -#include "vpux/utils/core/range.hpp" - -#include -#include -#include - -#include -#include -#include - namespace vpux::IE { #define GEN_PASS_DECL_CONVERTSHAPETO4D #define GEN_PASS_DEF_CONVERTSHAPETO4D @@ -48,11 +46,12 @@ using MergeMapItem = SmallVector; using MergeMap = SmallVector; bool isTrivialDim(int64_t dim) { - // Note: Dynamic dims are marked by negative values. + // Note: Dynamic dims are marked by `mlir::ShapedType::kDynamic`. return dim == 1; } -void alignShapeToReferenceShapeSize(size_t refSize, SmallVector& shape, bool extendOnH) { +template +void alignShapeToReferenceShapeSize(size_t refSize, ShapeType& shape, bool extendOnH) { VPUX_THROW_UNLESS(refSize >= shape.size(), "The reference shape size({0}) < shape size({1})", refSize, shape.size()); const size_t diff = refSize - shape.size(); @@ -62,7 +61,7 @@ void alignShapeToReferenceShapeSize(size_t refSize, SmallVector& shape, "Extend on H does not support reference shape size({0}) and shape size({1})", refSize, shape.size()); if (diff == 2) { - shape.insert(shape.end(), 1, 1); + shape.push_back(1); } shape.insert(shape.begin() + 2, 1, 1); } else { @@ -71,37 +70,55 @@ void alignShapeToReferenceShapeSize(size_t refSize, SmallVector& shape, } } -int64_t getBalancedDimIndexFromShape(SmallVector shape) { +template +void alignShapes(ShapeType1& smallShape, ShapeType2& largeShape, bool autoBroadcast) { + if (autoBroadcast) { + alignShapeToReferenceShapeSize(largeShape.size(), smallShape, false); + } else { + if (smallShape.size() == 1 && smallShape[Dim(0)] == largeShape[Dim(1)]) { + // Some operations need to map their channels first. e.g. 
PRelu + smallShape.insert(smallShape.begin(), 1); + smallShape.append(largeShape.size() - 2, 1); + } else { + alignShapeToReferenceShapeSize(largeShape.size(), smallShape, false); + } + } +} + +int64_t getBalancedDimIndex(ShapeRef shape) { int64_t dimH = 1; int64_t dimW = 1; - int64_t dimIndex = 0; - while (!shape.empty()) { + size_t front = 0; + size_t back = shape.size(); + + while (front < back) { if (dimW < dimH) { - dimW *= shape.back(); - shape.pop_back(); + --back; + dimW *= shape[Dim(back)]; } else { - dimH *= shape.front(); - shape.erase(shape.begin()); - dimIndex++; + dimH *= shape[Dim(front)]; + ++front; } } - return dimIndex; + return front; } -SmallVector alignShapeWithDimMap(ArrayRef originShape, const MergeMap& mapper) { - SmallVector retNewShape; +template +auto alignShapeWithDimMap(const ShapeType& originShape, const MergeMap& mapper) { + auto retNewShape = makeShape(originShape, 0, 1); for (const auto& dims : mapper) { - int64_t dimSize = 1; + typename ShapeType::ValueType dimSize = 1; for (auto i : dims) { - dimSize *= originShape[i]; + dimSize *= originShape[Dim(i)]; } retNewShape.push_back(dimSize); } return retNewShape; } -SmallVector alignShapeTo4D(ArrayRef originShape, const MergeMap& mapper, bool extendOnH) { - auto newShape = extendOnH ? SmallVector(originShape) : alignShapeWithDimMap(originShape, mapper); +template +auto alignShapeTo4D(const ShapeType& originShape, const MergeMap& mapper, bool extendOnH) { + auto newShape = extendOnH ? copyShape(originShape) : alignShapeWithDimMap(originShape, mapper); alignShapeToReferenceShapeSize(TARGET_TENSOR_DIM, newShape, extendOnH); return newShape; } @@ -114,7 +131,7 @@ MergeMap getTrivialMap(size_t size) { return mapper; } -MergeMap getDimMapWithFirstGreater1DimAsC(SmallVector shape) { +MergeMap getDimMapWithFirstGreater1DimAsC(Shape shape) { const int64_t maxDim = checked_cast(shape.size()); // Try to convert great than 4D shape to 3D. 
// In this way, to promise @@ -142,35 +159,36 @@ MergeMap getDimMapWithFirstGreater1DimAsC(SmallVector shape) { shape.erase(shape.begin(), shape.begin() + nextDimCIndex); // Convert shape to 2D, and make the value of 2 Dims close to each other - const auto splitDimIndex = getBalancedDimIndexFromShape(std::move(shape)) + nextDimCIndex; + const auto splitDimIndex = getBalancedDimIndex(shape) + nextDimCIndex; retMapper.push_back(irange(nextDimCIndex, splitDimIndex)); retMapper.push_back(irange(splitDimIndex, maxDim)); return retMapper; } -MergeMap getDimMapGeneric(ArrayRef shape) { +MergeMap getDimMapGeneric(ShapeRef shape) { MergeMap dimMapper; if (shape.size() > TARGET_TENSOR_DIM) { - return getDimMapWithFirstGreater1DimAsC(to_small_vector(shape)); + return getDimMapWithFirstGreater1DimAsC(Shape(shape)); } return getTrivialMap(shape.size()); } -MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef input2) { - auto shapeSize1 = std::accumulate(input1.begin(), input1.end(), (int64_t)1, std::multiplies()); - auto shapeSize2 = std::accumulate(input2.begin(), input2.end(), (int64_t)1, std::multiplies()); +MergeMap getDimMergeMapWith2Inputs(ShapeRef input1, ShapeRef input2) { + const auto shapeSize1 = input1.totalSize(); + const auto shapeSize2 = input2.totalSize(); // Find the origin input and broadcast shape // The large size shape is the origin input // The small size shape is the shape that needs to be broadcast in some planes auto maxShape = (shapeSize1 > shapeSize2) ? input1 : input2; auto planeShape = (shapeSize1 > shapeSize2) ? 
input2 : input1; - auto getMergeMap = [](ArrayRef fullShape, ArrayRef planeShape, auto condition) { + auto getMergeMap = [](ShapeRef fullShape, ShapeRef planeShape, auto condition) { MergeMap dimMap; SmallVector inputDimsTmp; for (size_t i = 0; i < fullShape.size(); i++) { - auto compareVal = condition(i, fullShape); - if (compareVal == planeShape[i]) { + const auto d = Dim(i); + auto compareVal = condition(d, fullShape); + if (compareVal == planeShape[d]) { inputDimsTmp.push_back(i); } else { if (inputDimsTmp.size() > 1) { @@ -185,10 +203,10 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i return dimMap; }; - auto sameDimCondition = [](size_t i, ArrayRef shape) { - return shape[i]; + auto sameDimCondition = [](Dim d, ShapeRef shape) { + return shape[d]; }; - auto planeDimCondition = [](size_t, ArrayRef) { + auto planeDimCondition = [](Dim, ShapeRef) { return 1; }; @@ -204,7 +222,7 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i // Dim(0, 1) 4x3 and Dim(2, 3, 4) 13x13x2 can merge together. // Inputs: tensor<1x2x3x4x5x6xf16>, tensor<1x2x1x4x5x1xf16> // Dim(0, 1) 1x2, Dim(2) 3, Dim(3, 4) 4x5 and Dim(5) 6 can merge together. 
- auto calculateMergeMap = [&](ArrayRef fullShape, ArrayRef planeShape) { + auto calculateMergeMap = [&](ShapeRef fullShape, ShapeRef planeShape) { auto mergeInSameDims = getMergeMap(fullShape, planeShape, sameDimCondition); auto mergeInPlaneDims = getMergeMap(fullShape, planeShape, planeDimCondition); MergeMap dimsCanMerge; @@ -243,10 +261,11 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i return dimsCanMerge; }; - auto getSubShape = [](ArrayRef shape, ArrayRef map) { - SmallVector retShape; - for (auto& dims : map) { - retShape.push_back(shape[dims]); + auto getSubShape = [](ShapeRef shape, ArrayRef map) { + Shape retShape; + retShape.reserve(map.size()); + for (auto& dim : map) { + retShape.push_back(shape[Dim(dim)]); } return retShape; }; @@ -266,9 +285,9 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i dimsCanMerge = calculateMergeMap(maxShape, planeShape); } - auto isAllOne = [](const MergeMapItem& item, ArrayRef planeShape) { + auto isAllOne = [](const MergeMapItem& item, ShapeRef planeShape) { return std::all_of(item.begin(), item.end(), [&](int64_t dim) { - return planeShape[dim] == 1; + return planeShape[Dim(dim)] == 1; }); }; switch (dimsCanMerge.size()) { @@ -278,7 +297,7 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i } case 2: { auto expandMapTo3D = [&](auto mapIt) { - auto newReshapeDim = getBalancedDimIndexFromShape(getSubShape(maxShape, *mapIt)); + auto newReshapeDim = getBalancedDimIndex(getSubShape(maxShape, *mapIt)); SmallVector dimTmp(mapIt->begin(), mapIt->begin() + newReshapeDim); mapIt->erase(mapIt->begin(), mapIt->begin() + newReshapeDim); dimsCanMerge.insert(mapIt, dimTmp); @@ -321,7 +340,7 @@ MergeMap getDimMergeMapWith2Inputs(ArrayRef input1, ArrayRef i return dimsCanMerge; } -MergeMap getDimMergeMapWith3Inputs(ArrayRef input1, ArrayRef inputLow, ArrayRef outLow) { +MergeMap getDimMergeMapWith3Inputs(ShapeRef input1, ShapeRef inputLow, ShapeRef outLow) { // Handle 3 input shapes // 
input: AxBxCxDxF // in_low: 1xBx1x1x1 @@ -336,7 +355,7 @@ MergeMap getDimMergeMapWith3Inputs(ArrayRef input1, ArrayRef i return dim > 1; }; - auto getDimIdx = [&](ArrayRef dims) -> int64_t { + auto getDimIdx = [&](ShapeRef dims) -> int64_t { auto firstMoreThanOneIt = std::find_if(dims.begin(), dims.end(), moreThanOnePredicate); VPUX_THROW_WHEN(firstMoreThanOneIt == dims.end(), "The shape size is 1, should not enter this case."); return std::distance(dims.begin(), firstMoreThanOneIt); @@ -374,7 +393,7 @@ MergeMap getDimMergeMapWith3Inputs(ArrayRef input1, ArrayRef i MergeMapItem item; for (int64_t dimIdx = 0; dimIdx < checked_cast(newShape.size()); dimIdx++) { item.append(mergeMapTmp[dimIdx]); - if (newShape[dimIdx] > 1) { + if (newShape[Dim(dimIdx)] > 1) { mergeMapRet.push_back(item); item.clear(); } @@ -388,49 +407,64 @@ MergeMap getDimMergeMapWith3Inputs(ArrayRef input1, ArrayRef i } MergeMap extendInputShapeTo4D(IE::FakeQuantizeOp origOp) { - auto inputLowScaleShape = to_small_vector(getShape(origOp.getInputLow())); - auto outputLowScaleShape = to_small_vector(getShape(origOp.getOutputLow())); - const auto inputShape = to_small_vector(getShape(origOp.getInput())); - const auto ref1ElemShape = SmallVector(inputShape.size(), 1); + auto inputLowShape = Shape(getShape(origOp.getInputLow())); + auto outputLowShape = Shape(getShape(origOp.getOutputLow())); - alignShapeToReferenceShapeSize(inputShape.size(), inputLowScaleShape, false); - alignShapeToReferenceShapeSize(inputShape.size(), outputLowScaleShape, false); + const auto inputShape = callOnShapeOf(origOp.getInput().getType(), [](const auto& shape) { + return reifyShape(shape); + }); + const auto ref1ElemShape = makeShape(inputShape, inputShape.size(), 1); - if (inputLowScaleShape == outputLowScaleShape) { - return getDimMergeMapWith2Inputs(inputShape, inputLowScaleShape); + alignShapeToReferenceShapeSize(inputShape.size(), inputLowShape, false); + alignShapeToReferenceShapeSize(inputShape.size(), outputLowShape, 
false); + + if (inputLowShape == outputLowShape) { + return getDimMergeMapWith2Inputs(inputShape, inputLowShape); + } + if (ref1ElemShape == inputLowShape) { + return getDimMergeMapWith2Inputs(inputShape, outputLowShape); } - if (ref1ElemShape == inputLowScaleShape) { - return getDimMergeMapWith2Inputs(inputShape, outputLowScaleShape); + if (ref1ElemShape == outputLowShape) { + return getDimMergeMapWith2Inputs(inputShape, inputLowShape); } - if (ref1ElemShape == outputLowScaleShape) { - return getDimMergeMapWith2Inputs(inputShape, inputLowScaleShape); + return getDimMergeMapWith3Inputs(inputShape, inputLowShape, outputLowShape); +} + +template +mlir::Value createReshape(mlir::OpBuilder& builder, mlir::Location loc, mlir::Value input, + const ShapeType& outputShape) { + const auto ctx = builder.getContext(); + if constexpr (isStaticShape) { + return builder.createOrFold(loc, input, /*shape=*/nullptr, /*special_zero=*/false, + getIntArrayAttr(ctx, outputShape.raw())); + } else { + return IE::createDynamicReshape(builder, loc, input, shapeCast(outputShape)); } - return getDimMergeMapWith3Inputs(inputShape, inputLowScaleShape, outputLowScaleShape); } mlir::Value reshapeInputWithMergeMap(mlir::PatternRewriter& rewriter, mlir::Location loc, size_t referenceShapeSize, mlir::Value origInput, const MergeMap& map, bool extendOnH) { - auto inShape = to_small_vector(getShape(origInput)); - - // Note: ensure the rank of the current shape is aligned to the "reference" - // shape (the shape that was used to calculate the merge map). this - // guarantees we don't have buffer overflows due to mege map using indices - // outside of current shape's rank. 
- alignShapeToReferenceShapeSize(referenceShapeSize, inShape, extendOnH); - - auto constInputShape = alignShapeTo4D(inShape, map, extendOnH); - const auto constInputShapeAttr = getIntArrayAttr(rewriter.getContext(), constInputShape); - - return rewriter.createOrFold(loc, origInput, nullptr, false, constInputShapeAttr); + return callOnShapeOf(origInput.getType(), [&](const auto& shape) { + auto outShape = copyShape(shape); + // Note: ensure the rank of the current shape is aligned to the "reference" + // shape (the shape that was used to calculate the merge map). this + // guarantees we don't have buffer overflows due to mege map using indices + // outside of current shape's rank. + alignShapeToReferenceShapeSize(referenceShapeSize, outShape, extendOnH); + + auto constInputShape = alignShapeTo4D(outShape, map, extendOnH); + return createReshape(rewriter, loc, origInput, constInputShape); + }); } -void tryAndConvert2NCEShape(SmallVector& shape1, SmallVector& shape2, MergeMap& map) { +template +void tryAndConvert2NCEShape(ShapeType1& shape1, ShapeType2& shape2, MergeMap& map) { // 4D Multiply shape 1x1x1xM need convert Shape to 1xMx1x1 // // TODO: // This logic is a litte same as AdaptShapesForScaleShiftPass. 
// May combine them into 1 pass and abandon the AdaptShapesForScaleShiftPass - const auto nonTrivialDimPredicate = [](const int64_t dim) -> bool { + const auto nonTrivialDimPredicate = [](const auto dim) -> bool { return dim > 1; }; const auto nonTrivialShape1Dims = std::count_if(shape1.begin(), shape1.end(), nonTrivialDimPredicate); @@ -440,26 +474,24 @@ void tryAndConvert2NCEShape(SmallVector& shape1, SmallVector& (nonTrivialShape1Dims == 0 && nonTrivialShape2Dims == 0)) { return; } - auto findFirstNonTrivialIndex = [&](auto shape) { + + auto findFirstNonTrivialDim = [&](const auto& shape) { const auto firstIt = std::find_if(shape.begin(), shape.end(), nonTrivialDimPredicate); - return std::distance(shape.begin(), firstIt); + return Dim(std::distance(shape.begin(), firstIt)); }; - int64_t firstNonTrivialIndex; // Find the first non-trivial index from 2 input shapes - firstNonTrivialIndex = (findFirstNonTrivialIndex(shape1) <= findFirstNonTrivialIndex(shape2)) - ? findFirstNonTrivialIndex(shape1) - : findFirstNonTrivialIndex(shape2); + const auto firstNonTrivialDim = std::min(findFirstNonTrivialDim(shape1), findFirstNonTrivialDim(shape2)); // Already at DimC - if (firstNonTrivialIndex == 1) { + if (firstNonTrivialDim.ind() == 1) { return; } if (map.size() < 4) { map.insert(map.begin(), 4 - map.size(), {}); } - std::swap(shape1[1], shape1[firstNonTrivialIndex]); - std::swap(shape2[1], shape2[firstNonTrivialIndex]); - std::swap(map[1], map[firstNonTrivialIndex]); + std::swap(shape1[Dim(1)], shape1[firstNonTrivialDim]); + std::swap(shape2[Dim(1)], shape2[firstNonTrivialDim]); + std::swap(map[1], map[firstNonTrivialDim.ind()]); } // Merge all adjacent axis and non-axis dimensions @@ -568,6 +600,15 @@ std::optional, SmallVector>> getAdjusted if (newInshape.size() == TARGET_TENSOR_DIM) { return std::pair{std::move(newInshape), std::move(newRepeats)}; } + + if (origInRank == DimsGroups5D::Act::numDims && origOutRank == DimsGroups5D::Act::numDims && + repeatRank == 
DimsGroups5D::Act::numDims) { + return std::nullopt; + } + + if (newInshape.size() == DimsGroups5D::Act::numDims && newRepeats.size() == DimsGroups5D::Act::numDims) { + return std::pair{std::move(newInshape), std::move(newRepeats)}; + } } return std::nullopt; @@ -740,37 +781,42 @@ mlir::LogicalResult GenericConverter::convertWith2Inputs(ConcreteOp extendOnH = false; } - // Align dims - if (shapeOneVector.size() != shapeTwoVector.size()) { - extendOnH = false; - auto maxSize = std::max(shapeOneVector.size(), shapeTwoVector.size()); - auto& smallShape = (shapeOneVector.size() > shapeTwoVector.size()) ? shapeTwoVector : shapeOneVector; - auto& bigShape = (shapeOneVector.size() > shapeTwoVector.size()) ? shapeOneVector : shapeTwoVector; - SmallVector expanedShape(maxSize, 1); - if (origOp->hasAttr("auto_broadcast")) { - alignShapeToReferenceShapeSize(bigShape.size(), smallShape, false); - } else { - // Some operations need to map their channels first. e.g. PRelu - if ((smallShape.size() == 1) && (smallShape[0] == bigShape[1])) { - expanedShape[1] = smallShape[0]; - smallShape.swap(expanedShape); - } else { - alignShapeToReferenceShapeSize(bigShape.size(), smallShape, false); + const auto [newIn1, newIn2, dimsCanMerge] = callOnShapeOf(input1.getType(), [&](const auto& inShapeRef1) { + return callOnShapeOf(input2.getType(), [&](const auto& inShapeRef2) { + auto inShape1 = copyShape(inShapeRef1); + auto inShape2 = copyShape(inShapeRef2); + + // Align dims + if (inShape1.size() != inShape2.size()) { + extendOnH = false; + const auto autoBroadcast = origOp->hasAttr("auto_broadcast"); + if (inShape1.size() < inShape2.size()) { + alignShapes(inShape1, inShape2, autoBroadcast); + } else { + alignShapes(inShape2, inShape1, autoBroadcast); + } } - } - } - auto dimsCanMerge = getDimMergeMapWith2Inputs(shapeOneVector, shapeTwoVector); - auto newInputShape1 = alignShapeTo4D(shapeOneVector, dimsCanMerge, extendOnH); - auto newInputShape2 = alignShapeTo4D(shapeTwoVector, 
dimsCanMerge, extendOnH); + auto dimsCanMerge = getDimMergeMapWith2Inputs(reifyShape(inShape1), reifyShape(inShape2)); + auto newInShape1 = alignShapeTo4D(inShape1, dimsCanMerge, extendOnH); + auto newInShape2 = alignShapeTo4D(inShape2, dimsCanMerge, extendOnH); - if (std::is_same::value) { - tryAndConvert2NCEShape(newInputShape1, newInputShape2, dimsCanMerge); - } - auto newIn1 = rewriter.createOrFold(takeOpLoc(origOp, "reshape_lhs"), operands[0], nullptr, false, - getIntArrayAttr(this->getContext(), newInputShape1)); - auto newIn2 = rewriter.createOrFold(takeOpLoc(origOp, "reshape_rhs"), operands[1], nullptr, false, - getIntArrayAttr(this->getContext(), newInputShape2)); + if constexpr (std::is_same_v) { + tryAndConvert2NCEShape(newInShape1, newInShape2, dimsCanMerge); + } + + auto newIn1 = operands[0]; + auto newIn2 = operands[1]; + if (newInShape1 != inShapeRef1) { + newIn1 = createReshape(rewriter, takeOpLoc(origOp, "reshape_lhs"), newIn1, newInShape1); + } + if (newInShape2 != inShapeRef2) { + newIn2 = createReshape(rewriter, takeOpLoc(origOp, "reshape_rhs"), newIn2, newInShape2); + } + + return std::make_tuple(newIn1, newIn2, std::move(dimsCanMerge)); + }); + }); SmallVector newOperands; newOperands.push_back(newIn1); @@ -780,19 +826,27 @@ mlir::LogicalResult GenericConverter::convertWith2Inputs(ConcreteOp auto* newOp = rewriter.clone(*origOp, mapper); SmallVector newResults; + newResults.reserve(newOp->getResults().size()); + for (auto result : newOp->getResults()) { - auto resultNDI = mlir::cast(result.getType()); - auto resultShape = to_small_vector(resultNDI.getShape()); - result.setType(resultNDI.changeShape(ShapeRef(alignShapeTo4D(resultShape, dimsCanMerge, extendOnH)))); - const auto outputShapeAttr = getIntArrayAttr(rewriter.getContext(), resultShape); - auto resultReshapeOp = rewriter.createOrFold( - takeOpLoc(origOp, StringLiteral("reshape_out_{0}"), newResults.size()), result, nullptr, false, - outputShapeAttr); - if (result == resultReshapeOp) { 
- newResults.push_back(result); - } else { - newResults.push_back(resultReshapeOp.template getDefiningOp().getOutput()); - } + const auto resultType = mlir::cast(result.getType()); + auto resultReshapeOp = callOnShapeOf( + resultType, + [&](const auto& shape, const MergeMap& dimsCanMerge) { + auto newOutShape = alignShapeTo4D(copyShape(shape), dimsCanMerge, extendOnH); + result.setType(resultType.changeTypeComponents( + TypeComponents() + .setShapeWithRepresentation(std::move(newOutShape)) + .setDimsOrder(DimsOrder::fromNumDims(TARGET_TENSOR_DIM)))); + + return createReshape(rewriter, + takeOpLoc(origOp, StringLiteral("reshape_out_{0}"), newResults.size()), result, + shape); + }, + dimsCanMerge); + + const auto newResult = resultReshapeOp == result ? result : resultReshapeOp.getDefiningOp()->getResult(0); + newResults.push_back(newResult); } rewriter.replaceOp(origOp, newResults); @@ -840,13 +894,13 @@ mlir::LogicalResult FakeQuantizeConverter::matchAndRewrite(IE::FakeQuantizeOp or takeOpLoc(origOp, "fq_in"), inputReshape, inputLow, inputHigh, outputLow, outputHigh, origOp.getLevelsAttr(), origOp.getLowFpTypeAttr(), origOp.getAutoBroadcastAttr()); - const auto outputShapeAttr = getIntArrayAttr(getContext(), getShape(origOp.getOutput())); - auto outReshape = rewriter.replaceOpWithNewOp(origOp, newFakeQuantizeOp.getOutput(), nullptr, false, - outputShapeAttr); - extendOpLoc(outReshape, "reshape_out"); + auto outReshape = callOnShapeOf(origOp.getOutput().getType(), [&](const auto& shape) { + return createReshape(rewriter, takeOpLoc(origOp, "reshape_out"), newFakeQuantizeOp.getOutput(), shape); + }); - _log.trace("[{0}] Replaced with 'IE::FakeQuantize'", getDebugName()); + rewriter.replaceOp(origOp, outReshape); + _log.trace("[{0}] Replaced with 'IE::FakeQuantize'", getDebugName()); return mlir::success(); } @@ -882,7 +936,7 @@ mlir::LogicalResult TopKOpConverter::matchAndRewrite(IE::TopKOp origOp, OpAdapto } // Deduce the new TopK aix from map table - const auto 
inShape = to_small_vector(getShape(origOp.getInput())); + const auto inShape = getShape(origOp.getInput()); MergeMap mergeMap; SmallVector tempMap; @@ -902,7 +956,7 @@ mlir::LogicalResult TopKOpConverter::matchAndRewrite(IE::TopKOp origOp, OpAdapto const auto newAxisAttr = getIntAttr(origOp->getContext(), newAxis); - const auto newInShapeAttr = getIntArrayAttr(this->getContext(), alignShapeTo4D(inShape, mergeMap, false)); + const auto newInShapeAttr = getIntArrayAttr(this->getContext(), alignShapeTo4D(inShape, mergeMap, false).raw()); const auto newInReshape = rewriter.createOrFold(takeOpLoc(origOp, "reshape_in"), origOp.getInput(), nullptr, false, newInShapeAttr); @@ -1923,27 +1977,6 @@ mlir::LogicalResult LSTMSequenceConverter::matchAndRewrite(IE::LSTMSequenceOp or mlir::ConversionPatternRewriter& rewriter) const { const auto ctx = rewriter.getContext(); - auto createDynamicReshape = [&](mlir::Value value, ShapeRef newShape, std::string_view suffix) -> mlir::Value { - const auto valueShape = getShape(value); - auto newInputDataShape = to_small_vector(valueShape); - - auto boundedType = mlir::dyn_cast(value.getType()); - VPUX_THROW_UNLESS(boundedType != nullptr, "Expected to get BoundedTensorType at {0}", value.getLoc()); - auto newInputDataBounds = boundedType.getBounds(); - - const auto newInputDataShapeAttr = getIntArrayAttr(ctx, newInputDataShape); - const auto newInputDataBoundsAttr = getIntArrayAttr(ctx, newInputDataBounds); - - const auto newInputDataShapeRank = checked_cast(newShape.size()); - const auto dataType = mlir::RankedTensorType::get({newInputDataShapeRank}, getSInt32Type(ctx)); - auto newInputDataShapeValues = IE::replaceDynamicDimsWithValue(to_small_vector(newShape), -1); - - const auto shapeTensor = - Const::createConst(rewriter, value.getLoc(), dataType, ArrayRef(newInputDataShapeValues)); - return rewriter.createOrFold(appendLoc(value.getLoc(), suffix), value, shapeTensor, - newInputDataShapeAttr, newInputDataBoundsAttr); - }; - auto 
reshapeValue = [&](mlir::Value value, ShapeRef newShape, const std::string& suffix) -> mlir::Value { const auto valueShape = getShape(value); if (valueShape == newShape) { @@ -1951,7 +1984,11 @@ mlir::LogicalResult LSTMSequenceConverter::matchAndRewrite(IE::LSTMSequenceOp or } if (valueShape.isDynamic()) { - return createDynamicReshape(value, newShape, suffix); + const auto boundedType = mlir::dyn_cast(value.getType()); + VPUX_THROW_WHEN(boundedType == nullptr, "Expected to get BoundedTensorType at {0}", value.getLoc()); + + return IE::createDynamicReshape(rewriter, appendLoc(value.getLoc(), suffix), value, + boundedType.getDynamicShape()); } else { return rewriter.createOrFold(appendLoc(value.getLoc(), suffix), value, nullptr, false, getIntArrayAttr(ctx, newShape)); @@ -3191,6 +3228,64 @@ mlir::LogicalResult ReverseConverter::matchAndRewrite(IE::ReverseOp origOp, OpAd return mlir::success(); } +// +// NormalizeL2Converter +// + +class NormalizeL2Converter final : public mlir::OpConversionPattern { + using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; + +public: + NormalizeL2Converter(mlir::TypeConverter& typeConverter, mlir::MLIRContext* ctx, Logger log) + : mlir::OpConversionPattern(typeConverter, ctx), _log(log) { + } + +public: + mlir::LogicalResult matchAndRewrite(IE::NormalizeL2Op origOp, OpAdaptor newArgs, + mlir::ConversionPatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +mlir::LogicalResult NormalizeL2Converter::matchAndRewrite(IE::NormalizeL2Op origOp, OpAdaptor, + mlir::ConversionPatternRewriter& rewriter) const { + _log.trace("Process Operation '{0}' at '{1}", origOp->getName(), origOp->getLoc()); + + const auto inType = mlir::cast(origOp->getOperand(0).getType()); + auto newShape = to_small_vector(inType.getShape()); + auto newAxes = parseIntArrayAttr(origOp.getAxesValue().value()); + auto inRank = newShape.size(); + + if (inRank > TARGET_TENSOR_DIM) { + std::tie(newShape, newAxes) = getMergedShapeAndAxes(newShape, 
newAxes); + inRank = newShape.size(); + } + + if (inRank < TARGET_TENSOR_DIM) { + const int64_t newDims = TARGET_TENSOR_DIM - inRank; + newShape.insert(newShape.begin(), newDims, 1); + for (auto& axis : newAxes) { + axis += newDims; + } + } + + const auto newShapeAttr = getIntArrayAttr(origOp->getContext(), newShape); + const auto axisValueAttr = getIntArrayAttr(origOp->getContext(), newAxes); + const auto outShapeAttr = getIntArrayAttr(origOp->getContext(), getShape(origOp.getResult())); + + const auto inReshape = rewriter.createOrFold(takeOpLoc(origOp, "reshape_in"), origOp.getOperand(0), + nullptr, false, newShapeAttr); + + auto newNormalizeOp = rewriter.create( + origOp->getLoc(), inReshape, /*axes*/ nullptr, axisValueAttr, origOp.getEpsAttr(), origOp.getEpsModeAttr()); + + auto outReshape = rewriter.replaceOpWithNewOp(origOp, newNormalizeOp.getResult(), nullptr, false, + outShapeAttr); + extendOpLoc(outReshape, "reshape_out"); + return mlir::success(); +} + // // safeRunOnFunc // @@ -3199,6 +3294,7 @@ auto buildReshapeMaterializer(StringRef locSuffix) { const auto reshapeFunc = [=](mlir::OpBuilder& builder, mlir::RankedTensorType dstType, mlir::ValueRange inputs, mlir::Location loc) -> mlir::Value { VPUX_THROW_UNLESS(inputs.size() == 1, "Got wrong number of inputs : {0}", inputs.size()); + const auto newLoc = appendLoc(loc, locSuffix); // TODO: E#-171827 It might be beneficial to use AffineReshape for all cases, as it has well defined semantics // for per-axis quantized types and layout information propagation. 
@@ -3207,8 +3303,10 @@ auto buildReshapeMaterializer(StringRef locSuffix) { const auto outShapeAttr = builder.getI64ArrayAttr(dstType.getShape()); if (!isPerAxisQuantized) { - return builder.createOrFold(appendLoc(loc, locSuffix), inputs.front(), nullptr, false, - outShapeAttr); + if (const auto boundedType = mlir::dyn_cast(dstType)) { + return IE::createDynamicReshape(builder, newLoc, inputs.front(), boundedType.getDynamicShape()); + } + return builder.createOrFold(newLoc, inputs.front(), nullptr, false, outShapeAttr); } // If we have a per-axis quantized type, we need to use AffineReshapeOp because it can properly handle the axis @@ -3238,8 +3336,7 @@ auto buildReshapeMaterializer(StringRef locSuffix) { } const auto dimMappingAttr = getIntArrayOfArray(dstType.getContext(), dimMapping); - return builder.createOrFold(appendLoc(loc, locSuffix), inputs.front(), dimMappingAttr, - outShapeAttr); + return builder.createOrFold(newLoc, inputs.front(), dimMappingAttr, outShapeAttr); }; return reshapeFunc; } @@ -3250,9 +3347,26 @@ void ConvertShapeTo4DPass::safeRunOnFunc() { mlir::TypeConverter typeConverter; typeConverter.addConversion([](vpux::NDTypeInterface type) { - SmallVector shape = to_small_vector(type.getShape()); - auto dimMapper = getDimMapGeneric(shape); - return type.changeShape(ShapeRef(alignShapeTo4D(shape, dimMapper, false))); + return callOnShapeOf(type, [&](const auto& shape) { + const auto dimMapper = getDimMapGeneric(reifyShape(shape)); + auto newShape = alignShapeTo4D(shape, dimMapper, false); + if (shape == newShape) { + return type; + } + + auto elemType = type.getElementType(); + if (const auto qElemType = mlir::dyn_cast(elemType)) { + VPUX_THROW_UNLESS(newShape.size() >= shape.size(), "Unexpected rank reduction: {0} -> {1}", shape, + newShape); + const auto rankDiff = newShape.size() - shape.size(); + elemType = changeAxis(qElemType, qElemType.getQuantizedDimension() + rankDiff); + } + + return type.changeTypeComponents(TypeComponents() + 
.setShapeWithRepresentation(std::move(newShape)) + .setElementType(elemType) + .setDimsOrder(DimsOrder::fromNumDims(TARGET_TENSOR_DIM))); + }); }); typeConverter.addSourceMaterialization(buildReshapeMaterializer("source")); typeConverter.addTargetMaterialization(buildReshapeMaterializer("target")); @@ -3261,12 +3375,14 @@ void ConvertShapeTo4DPass::safeRunOnFunc() { mlir::TypeConverter scaleShiftTypeConverter; // TODO: #143748 consider change ConvertScaleShiftToDWPass scaleShiftTypeConverter.addConversion([](vpux::NDTypeInterface type) { - SmallVector shape = to_small_vector(type.getShape()); - if (shape.size() == 3 && shape[0] == 1 && shape[1] > 1) { - return type.changeShape(Shape{shape[0], shape[1], 1, shape[2]}); + const auto shape = type.getShape(); + if (shape.size() == 3 && shape[Dim(0)] == 1 && shape[Dim(1)] > 1) { + auto newShape = copyShape(shape); + newShape.insert(newShape.begin() + 2, 1); // 1xHxW -> 1xHx1xW + return type.changeShape(newShape); } - auto dimMapper = getDimMapGeneric(shape); - return type.changeShape(ShapeRef(alignShapeTo4D(shape, dimMapper, false))); + auto dimMapper = getDimMapGeneric(reifyShape(shape)); + return type.changeShape(alignShapeTo4D(shape, dimMapper, false)); }); scaleShiftTypeConverter.addSourceMaterialization(buildReshapeMaterializer("scale_shift_source")); scaleShiftTypeConverter.addTargetMaterialization(buildReshapeMaterializer("scale_shift_target")); @@ -3487,6 +3603,18 @@ void ConvertShapeTo4DPass::safeRunOnFunc() { return mergedShape.size() > TARGET_TENSOR_DIM; }; + auto isLegalNormalizeL2Op = [&](IE::NormalizeL2Op op) { + const auto inShape = mlir::cast(op.getOperand(0).getType()).getShape(); + const auto outShape = mlir::cast(op.getResult().getType()).getShape(); + if (inShape.size() == TARGET_TENSOR_DIM && outShape.size() == TARGET_TENSOR_DIM) { + return true; + } + + const auto axes = parseIntArrayAttr(op.getAxesValueAttr()); + const auto mergedInputShape = getMergedShapeAndAxes(to_small_vector(inShape), 
axes).first; + return mergedInputShape.size() > TARGET_TENSOR_DIM; + }; + mlir::ConversionTarget target(ctx); target.addLegalDialect(); target.addLegalDialect(); @@ -3591,6 +3719,7 @@ void ConvertShapeTo4DPass::safeRunOnFunc() { target.addDynamicallyLegalOp(isLegalQuantOp); target.addDynamicallyLegalOp(isLegalQuantOp); target.addDynamicallyLegalOp(isLegalQuantOp); + target.addDynamicallyLegalOp(isLegalNormalizeL2Op); mlir::RewritePatternSet patterns(&ctx); patterns.add(&ctx, _log); @@ -3692,6 +3821,7 @@ void ConvertShapeTo4DPass::safeRunOnFunc() { patterns.add(typeConverter, &ctx, _log); patterns.add(typeConverter, &ctx, _log); patterns.add(typeConverter, &ctx, _log); + patterns.add(typeConverter, &ctx, _log); if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { signalPassFailure(); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shuffle_chanels.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shuffle_chanels.cpp index 8edaad47bd..9c501da3f6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shuffle_chanels.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_shuffle_chanels.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_space2depth.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_space2depth.cpp index 51413971f3..60a2995d0a 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_space2depth.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_space2depth.cpp @@ -3,12 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_split_concat_to_transpose.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_split_concat_to_transpose.cpp index 2fc418ebfa..42a7600820 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_split_concat_to_transpose.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_split_concat_to_transpose.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_strided_slice_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_strided_slice_to_conv.cpp index 52b8445306..986c45d587 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_strided_slice_to_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_strided_slice_to_conv.cpp @@ 
-3,17 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_subtract_to_negative_add.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_subtract_to_negative_add.cpp index c232bc5fff..33d6b0c0ce 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_subtract_to_negative_add.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_subtract_to_negative_add.cpp @@ -3,10 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_dequantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_dequantize.cpp index 9a65c77a99..d6590a0145 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_dequantize.cpp @@ -5,16 +5,14 @@ #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - -#include - -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include #include namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mem_permute.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mem_permute.cpp index 98f695ccf6..c1c8305dd5 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mem_permute.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mem_permute.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp index 
43800cc000..0381a607c9 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_mixed_precision.cpp @@ -4,12 +4,14 @@ // #include "vpux/compiler/dialect/IE/transforms/passes/convert_to_mixed_precision.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/NPU37XX/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/utils/core/numeric.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_palletization_lut.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_palletization_lut.cpp index 043c3c5344..1f1db61b61 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_palletization_lut.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_palletization_lut.cpp @@ -4,10 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/factories/convert_to_palletization_lut_strategy_getter.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { #define GEN_PASS_DECL_CONVERTTOPALLETIZATIONLUT diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_scale_shift.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_scale_shift.cpp index a1b32f9069..d96068d619 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_scale_shift.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_scale_shift.cpp @@ -3,13 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -300,6 +302,36 @@ class ConvertMultiplyToScaleShift : public mlir::OpRewritePattern 8192 on NPU40xx and older is faster on SHAVE"); + return false; + } + + // Operations benefit from running on DPU when channel dimension size is less than + // 1.5x(experimental value) the standard limit + // E-171794 will introduce a comprehensive solution for choosing between different executors + constexpr double DPU_BENEFIT_FACTOR = 1.5; + const bool isBenefitOnDPU = + dimCShape < static_cast(VPU::NCEInvariant::VPU_DIMENSION_LIMIT * DPU_BENEFIT_FACTOR); + // Operations that do not need to be broadcasted can be decided to execute on DPU(NCEEltwise) or + // SHAVE(VPU.Multiply) in later passes + const bool needBroadcast = activationShape != weightsShape; + if (needBroadcast && isBenefitOnDPU) { + log.trace("Operations that need to be broadcasted with C dimension > 8192 can be converted to ScaleShift"); + return true; + } + + return false; +} + mlir::LogicalResult ConvertMultiplyToScaleShift::matchAndRewrite(IE::MultiplyOp mulOp, mlir::PatternRewriter& rewriter) const { 
_log.trace("Got op {0} at {1}", mulOp->getName(), mulOp->getLoc()); @@ -319,14 +351,14 @@ mlir::LogicalResult ConvertMultiplyToScaleShift::matchAndRewrite(IE::MultiplyOp auto mulOutShape = getShape(mulOp.getOutput()); auto weightsShape = getShape(weightsInput); + auto activationShape = getShape(activationInput); // Activation shape and scaleShift output shape should be consistent - if (getShape(activationInput) != mulOutShape) { + if (activationShape != mulOutShape) { return mlir::failure(); } - if (mulOutShape[Dim(Dims4D::Act::C)] > VPU::NCEInvariant::VPU_DIMENSION_LIMIT) { - _log.trace("Multiply with C Dim > 8192 will not be converted to ScaleShift since it is faster on Shave."); + if (!isBeneficialToConvertMultiplyToScaleShift(activationShape, weightsShape, mulOutShape, mulOp, _log)) { return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_spatial_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_spatial_op.cpp index e3b917ca45..bab940f967 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_spatial_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_to_spatial_op.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/roll_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_transposedconv_to_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_transposedconv_to_conv.cpp index 2c75bd294b..3802e11503 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_transposedconv_to_conv.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_transposedconv_to_conv.cpp @@ -3,12 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_upsampling_to_strided_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_upsampling_to_strided_concat.cpp index df16a0e2cb..669478f004 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_upsampling_to_strided_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_upsampling_to_strided_concat.cpp @@ -4,13 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/range.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_variadic_split_to_strided_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_variadic_split_to_strided_slice.cpp 
index be5fffb00d..be21932c06 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_variadic_split_to_strided_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_variadic_split_to_strided_slice.cpp @@ -5,11 +5,10 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_i4.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_i4.cpp index b5ccd49276..d26e214288 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_i4.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_i4.cpp @@ -4,10 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_u8.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_u8.cpp index 7b560f5081..680c875cd1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_u8.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/convert_weights_to_u8.cpp @@ -4,10 +4,11 @@ // #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/debatcher.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/debatcher.cpp index 4573028c6d..0ee2171299 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/debatcher.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/debatcher.cpp @@ -3,20 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/batch.hpp" -#include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/utils/core/dense_map.hpp" -#include "vpux/utils/core/format.hpp" + +#include namespace vpux::IE { #define GEN_PASS_DECL_DEBATCHER diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_cell.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_cell.cpp index 8cb0523409..6ea96dc25b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_cell.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_cell.cpp @@ -5,7 +5,13 @@ #include 
"vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_sequence.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_sequence.cpp index ed3831eae1..fe3ee6c245 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_sequence.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_gru_sequence.cpp @@ -3,28 +3,31 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/utils/rewriter.hpp" - #include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include 
"vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/small_vector.hpp" #include #include +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_DECOMPOSEGRUSEQUENCE #define GEN_PASS_DEF_DECOMPOSEGRUSEQUENCE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_incremental_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_incremental_sdpa.cpp index cc09ddffeb..8911238b8a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_incremental_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_incremental_sdpa.cpp @@ -3,19 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include 
"vpux/utils/core/error.hpp" +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_DECOMPOSEINCREMENTALSDPA #define GEN_PASS_DEF_DECOMPOSEINCREMENTALSDPA diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_cell.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_cell.cpp index 09a66df01a..a5a8d4e2d9 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_cell.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_cell.cpp @@ -3,15 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_DECOMPOSELSTMCELL #define GEN_PASS_DEF_DECOMPOSELSTMCELL diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_sequence.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_sequence.cpp index 51ecdbd923..aa87d3b045 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_sequence.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_lstm_sequence.cpp @@ -3,26 +3,32 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/small_vector.hpp" #include +#include #include +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_DECOMPOSELSTMSEQUENCE #define GEN_PASS_DEF_DECOMPOSELSTMSEQUENCE @@ -120,10 +126,15 @@ mlir::Value padWeightsAndBiases(mlir::PatternRewriter& rewriter, mlir::Location // %shape_of_cst = IE.ShapeOf(%cst_0) -> 1x2x512x512 auto shapeOfCst = rewriter.create(appendLoc(loc, "_shapeOfCst"), newWeights, dataTypeShapeOf); - // %slice_cst = IE.Slice %shape_of_cst [0] [2] -> 1x2 - auto sliceCst = rewriter.create(appendLoc(loc, "_sliceCst"), shapeOfCst, - /*begins=*/rewriter.getI64ArrayAttr({0}), - /*sizes=*/rewriter.getI64ArrayAttr({2})); + // %slice_cst = IE.Slice %shape_of_cst [0] [1] -> 1 + auto sliceBatchCst = rewriter.create(appendLoc(loc, "_sliceCst"), shapeOfCst, + /*begins=*/rewriter.getI64ArrayAttr({0}), + /*sizes=*/rewriter.getI64ArrayAttr({1})); + // TODO: here we use a knowledge that LSTMSequence will be multiclustered for the dynamic shape case + // by the number of directions, so we set the channel number to 1 + auto constDataType = mlir::RankedTensorType::get({1}, getSInt64Type(ctx)); + auto channelsCst = + Const::createConst(rewriter, appendLoc(loc, "_channelsNum"), constDataType, ArrayRef{1}); // %shape_of_0 = IE.ShapeOf(%0) -> 1x1x?x512 auto shapeOfInput = rewriter.create(appendLoc(loc, "_shapeOfInput"), 
newInputData, dataTypeShapeOf); @@ -145,46 +156,15 @@ mlir::Value padWeightsAndBiases(mlir::PatternRewriter& rewriter, mlir::Location // %concat = IE.Concat(%slice_cst, %slice_0, %slice_3) -> 1x2x?x512 const auto axisAttr = getIntAttr(ctx, 0); SmallVector concatInputs; - concatInputs.push_back(sliceCst); + concatInputs.push_back(sliceBatchCst); + concatInputs.push_back(channelsCst); concatInputs.push_back(sliceInput); concatInputs.push_back(sliceAdd); - auto concat = rewriter.create(appendLoc(loc, "_concat"), concatInputs, axisAttr); - - mlir::Value dynReshapeOperand = matMulInputOp.getOutput(); - auto operandType = mlir::cast(matMulInputOp.getOutput().getType()); - if (!IE::isDynamicDataContiguous(Shape(addShape), operandType.getDimsOrder())) { - // StridedSlice operation - const SmallVector begins(addShapeRank, 0); - const SmallVector strides(addShapeRank, 1); - - const auto beginsAttr = getIntArrayAttr(ctx, begins); - const auto stridesAttr = getIntArrayAttr(ctx, strides); - - const SmallVector empty(addShapeRank, 0); - const auto emptyAttr = getIntArrayAttr(ctx, empty); - - auto sliceOp = rewriter.create(appendLoc(loc, "_inputDataSlice"), - /*data=*/matMulInputOp.getOutput(), - /*begins=*/nullptr, - /*ends=*/concat.getOutput(), - /*strides=*/nullptr, - /*beginsAttr=*/beginsAttr, - /*endsAttr=*/nullptr, - /*stridesAttr=*/stridesAttr, - /*beginMask=*/emptyAttr, - /*endMask=*/emptyAttr, - /*newAxisMask=*/emptyAttr, - /*shrinkAxisMask=*/emptyAttr, - /*ellipsisMask=*/emptyAttr); - dynReshapeOperand = sliceOp.getOutput(); - } - // StridedSlice infers its output as tensor when any of the begins, ends, or strides are unknown at - // compile time. DynamicReshape resolves the mismatch between the output of StridedSlice and the input of - // mlir.return. 
- auto reshapedAddOp = rewriter.create(appendLoc(loc, "_reshapedSlice"), - /*data=*/dynReshapeOperand, - /*shape=*/concat.getOutput(), + auto newShapeOp = rewriter.create(appendLoc(loc, "_newShape"), concatInputs, axisAttr); + auto reshapedAddOp = rewriter.create(appendLoc(loc, "_reshapedAdd"), + /*data=*/matMulInputOp.getOutput(), + /*shape=*/newShapeOp.getOutput(), /*output_shape=*/shapeAttr, /*output_bounds=*/boundedShapeAttr); @@ -274,7 +254,47 @@ mlir::LogicalResult ExtractWeightsAndBiasesFromLSTMSequenceRewriter::matchAndRew loc, reshapedAddOp, op.getInitialHiddenState(), op.getInitialCellState(), nullptr, op.getReccurenceWeights(), newBiasesOp, op.getSequenceLengthAttr(), op.getDirectionAttr()); - rewriter.replaceOp(op, newLSTMSequenceOp); + mlir::Value newOutputHiddenValues = newLSTMSequenceOp.getOutputHiddenValues(); + // if user of newOutputHiddenValues is DynamicReshape, it means that output shape is propagated + // by DynamicReshape + if (auto dynReshape = mlir::dyn_cast(*op.getOutputHiddenValues().getUsers().begin())) { + rewriter.setInsertionPoint(dynReshape); + + // while LSTMSequence can work with strided data, it is not guaranteed following operations will do the + // same, so we insert StridedSlice and DynamicReshape to pack data without strides + auto reshapedLSTMSequenceOp = rewriter.create( + appendLoc(loc, "_reshapedLSTMSequence"), newLSTMSequenceOp.getOutputHiddenValues(), + dynReshape.getShape(), dynReshape.getOutputShapeAttr(), dynReshape.getOutputBoundsAttr(), + /*only_set_shape*/ true); + + auto rank = mlir::cast(newLSTMSequenceOp.getOutputHiddenValues().getType()).getRank(); + const SmallVector begins(rank, 0); + const SmallVector strides(rank, 1); + + const auto beginsAttr = getIntArrayAttr(ctx, begins); + const auto stridesAttr = getIntArrayAttr(ctx, strides); + + const SmallVector empty(rank, 0); + const auto emptyAttr = getIntArrayAttr(ctx, empty); + auto sliceOp = rewriter.create(appendLoc(loc, "_denseDataLSTMSequence"), + 
/*data=*/reshapedLSTMSequenceOp.getOutput(), + /*begins=*/nullptr, + /*ends=*/dynReshape.getShape(), + /*strides=*/nullptr, + /*beginsAttr=*/beginsAttr, + /*endsAttr=*/nullptr, + /*stridesAttr=*/stridesAttr, + /*beginMask=*/emptyAttr, + /*endMask=*/emptyAttr, + /*newAxisMask=*/emptyAttr, + /*shrinkAxisMask=*/emptyAttr, + /*ellipsisMask=*/emptyAttr); + newOutputHiddenValues = sliceOp.getOutput(); + } + + rewriter.replaceAllUsesWith(op.getResults(), + mlir::ValueRange{newOutputHiddenValues, newLSTMSequenceOp.getOutputHiddenState(), + newLSTMSequenceOp.getOutputCellState()}); } else { auto matMulInputOp = rewriter.create(appendLoc(loc, "_matMul"), newInputData, newWeights, false, true, nullptr); @@ -418,29 +438,6 @@ mlir::LogicalResult BaseDecomposeLSTMSequenceBidirectionalRewriter::decompose(IE return mlir::success(); } -// -// DecomposeDynamicLSTMSequenceBidirectionalRewriter -// - -// Decompose a bidirectional dynamic LSTMSequence into one forward and one reverse operator. -// This optimization allows us to skip the extra StridedSlice operations that are added as -// part of the ExtractWeightsAndBiasesFromLSTMSequenceRewriter pass. 
- -class DecomposeDynamicLSTMSequenceBidirectionalRewriter final : public BaseDecomposeLSTMSequenceBidirectionalRewriter { -public: - DecomposeDynamicLSTMSequenceBidirectionalRewriter(mlir::MLIRContext* ctx, mlir::PatternBenefit benefit, Logger log) - : BaseDecomposeLSTMSequenceBidirectionalRewriter(ctx, benefit, std::move(log)) { - this->setDebugName("DecomposeDynamicLSTMSequenceBidirectionalRewriter"); - } - - mlir::LogicalResult matchAndRewrite(IE::LSTMSequenceOp op, mlir::PatternRewriter& rewriter) const final { - if (!IE::hasDynamicTensors(op)) { - return mlir::failure(); - } - return decompose(op, rewriter, true); - } -}; - // // DecomposeLSTMSequenceBidirectionalRewriter // @@ -458,7 +455,7 @@ class DecomposeLSTMSequenceBidirectionalRewriter final : public BaseDecomposeLST mlir::LogicalResult matchAndRewrite(IE::LSTMSequenceOp op, mlir::PatternRewriter& rewriter) const final { // At this stage this optimization will not be needed in case of dynamic shapes. - if (VPU::LSTMSequenceOp::isSupported(op) || IE::hasDynamicTensors(op)) { + if (VPU::LSTMSequenceOp::isSupported(op)) { return mlir::failure(); } return decompose(op, rewriter, false); @@ -588,16 +585,15 @@ void DecomposeLSTMSequencePass::safeRunOnFunc() { // To explicitly control the patterns exec order to assure dependency // benefitLevels[0] is highest benefit level and represent the relative pattern is the first one to run - const uint32_t levelCount = 4; + const uint32_t levelCount = 3; const auto benefitLevels = getBenefitLevels(levelCount); mlir::RewritePatternSet patterns(&ctx); // In the dynamic case, decompose bidirectional LSTMSequence first to simplify handling of dynamic shapes // and avoid complex slicing operations. This makes subsequent optimizations easier. 
- patterns.add(&ctx, benefitLevels[0], _log); - patterns.add(&ctx, benefitLevels[1], _log); - patterns.add(&ctx, benefitLevels[2], _log); - patterns.add(&ctx, benefitLevels[3], _log); + patterns.add(&ctx, benefitLevels[0], _log); + patterns.add(&ctx, benefitLevels[1], _log); + patterns.add(&ctx, benefitLevels[2], _log); auto func = getOperation(); if (mlir::failed(mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), getDefaultGreedyRewriteConfig()))) { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_multi_zp_quantization_pattern.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_multi_zp_quantization_pattern.cpp index 8619817f4c..7eb2aee789 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_multi_zp_quantization_pattern.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_multi_zp_quantization_pattern.cpp @@ -3,13 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_normalize_l2.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_normalize_l2.cpp index 7073bb2d03..e9fbcc5641 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_normalize_l2.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_normalize_l2.cpp @@ -4,9 +4,11 @@ // #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_online_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_online_sdpa.cpp index c551303cb9..4f202b77bf 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_online_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/decompose_online_sdpa.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/dedebatcher.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/dedebatcher.cpp index 22c7c7db5a..f6b9649832 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/dedebatcher.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/dedebatcher.cpp @@ -4,12 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/batch.hpp" -#include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/format.hpp" namespace vpux::IE { #define GEN_PASS_DECL_DEDEBATCHER diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/dequantize_const.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/dequantize_const.cpp index e73f12480d..b43287cb41 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/dequantize_const.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/dequantize_const.cpp @@ -4,9 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/dilated_conv_convert.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/dilated_conv_convert.cpp index ca17f6b48e..fe55b59951 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/dilated_conv_convert.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/dilated_conv_convert.cpp @@ -3,10 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" 
+#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/dump_statistics_of_ie_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/dump_statistics_of_ie_ops.cpp index 03ab71a6a3..0f6ce09bba 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/dump_statistics_of_ie_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/dump_statistics_of_ie_ops.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/duplicate_fq_across_function_calls.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/duplicate_fq_across_function_calls.cpp index 42e0a858d1..4d420d2f2b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/duplicate_fq_across_function_calls.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/duplicate_fq_across_function_calls.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -22,7 +24,6 @@ #include #include #include -#include "vpux/compiler/dialect/net/IR/ops.hpp" namespace vpux::IE { #define 
GEN_PASS_DECL_DUPLICATEFQACROSSFUNCTIONCALLS diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/eltwise_fake_quant_fusion.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/eltwise_fake_quant_fusion.cpp index 820a90e4a1..fac28dc36d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/eltwise_fake_quant_fusion.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/eltwise_fake_quant_fusion.cpp @@ -4,9 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_channels.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_channels.cpp index 697cddced8..bd7b0e59ec 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_channels.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_channels.cpp @@ -5,12 +5,15 @@ #include "vpux/compiler/dialect/IE/transforms/passes/expand_activation_channels.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include 
"vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/core/numeric.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_width.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_width.cpp index e3fa3bfa2f..67ccd65ea1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_width.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_activation_width.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_softmax_axis.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_softmax_axis.cpp new file mode 100644 index 0000000000..88e2301a9d --- /dev/null +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/expand_softmax_axis.cpp @@ -0,0 +1,209 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/core/numeric.hpp" + +namespace vpux::IE { +#define GEN_PASS_DECL_EXPANDSOFTMAXAXIS +#define GEN_PASS_DEF_EXPANDSOFTMAXAXIS +#include "vpux/compiler/dialect/IE/passes.hpp.inc" +} // namespace vpux::IE + +using namespace vpux; +#define ALIGNMENT_REQUIREMENT_IN_ELEMENTS 16 +namespace { + +// The condition for pads is that it should be only over the inner most dimension, at the end +// Softmax will support a maximum of 31 elements padded over inner most dimension +bool arePadsValid(const llvm::SmallVector& padsBegin, const llvm::SmallVector& padsEnd) { + if (padsBegin.size() != padsEnd.size()) { + return false; + } + + size_t rank = padsBegin.size(); + for (size_t i = 0; i < rank - 1; i++) { + if (padsBegin[i] != 0 || padsEnd[i] != 0) { + return false; + } + } + if (padsBegin[rank - 1] != 0) { + return false; + } + int64_t paddedDim = padsEnd[rank - 1]; + if (paddedDim < 32) { + return true; + } + return false; +} + +// +// SimplifyReshapes +// Remove rehsapes like reshape ([1,N*M,X,Y] -> [N,M,X,Y]) -> SoftMax -> reshape ([N,M,X,Y] -> [1,N*M,X,Y]) and replace +// to +// [1,N*M,X,Y] -> SoftMax -> [1,N*M,X,Y] + +class SimplifyReshapes final : public mlir::OpRewritePattern { +public: + SimplifyReshapes(mlir::MLIRContext* ctx, Logger log): 
mlir::OpRewritePattern(ctx), _log(log) { + } + +public: + mlir::LogicalResult matchAndRewrite(IE::SoftMaxOp softMaxOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +mlir::LogicalResult SimplifyReshapes::matchAndRewrite(IE::SoftMaxOp softMaxOp, mlir::PatternRewriter& rewriter) const { + // 1. check if softmax input is reshape + auto inReShapeOp = softMaxOp.getInput().getDefiningOp(); + if (inReShapeOp == nullptr) { + _log.trace("[{0}] SoftmaxOp '{1}' has no reshape input", getDebugName(), softMaxOp->getName()); + return mlir::failure(); + } + if (!softMaxOp.getInput().hasOneUse()) { + _log.trace("[{0}] SoftmaxOp '{1}' has multiple users", getDebugName(), softMaxOp->getName()); + return mlir::failure(); + } + + // 2. check if softmax output is reshape + if (!softMaxOp.getOutput().hasOneUse()) { + _log.trace("[{0}] SoftmaxOp '{1}' has multiple users", getDebugName(), softMaxOp->getName()); + return mlir::failure(); + } + + auto outReShapeOp = mlir::dyn_cast(*softMaxOp.getOutput().getUsers().begin()); + if (outReShapeOp == nullptr) { + _log.trace("[{0}] SoftmaxOp '{1}' has no reshape output", getDebugName(), softMaxOp->getName()); + return mlir::failure(); + } + + // 3. check that inReShapeOp input and outReShapeOp output are the same shape + auto inReShapeInputType = mlir::cast(inReShapeOp.getInput().getType()); + auto outReShapeOutputType = mlir::cast(outReShapeOp.getOutput().getType()); + if (inReShapeInputType.getShape() != outReShapeOutputType.getShape()) { + _log.trace("[{0}] SoftmaxOp '{1}' has different reshape input and output shapes", getDebugName(), + softMaxOp->getName()); + return mlir::failure(); + } + + // 4. 
check that inReshapeOp reshapes from [1,N*M,X,Y] to [N,M,X,Y] + const auto inReShapeInputShape = inReShapeInputType.getShape().toValues(); + const auto inReShapeOutputShape = + mlir::cast(inReShapeOp.getOutput().getType()).getShape().toValues(); + if (inReShapeInputShape.size() != 4 || inReShapeOutputShape.size() != 4) { + _log.trace("[{0}] SoftmaxOp '{1}' has unsupported reshape input or output shape", getDebugName(), + softMaxOp->getName()); + return mlir::failure(); + } + if (inReShapeInputShape[Dims4D::Act::N] != 1) { + _log.trace("[{0}] SoftmaxOp '{1}' has unsupported reshape input shape N dimension: {2}", getDebugName(), + softMaxOp->getName(), inReShapeInputShape[Dims4D::Act::N]); + return mlir::failure(); + } + if (inReShapeInputShape[Dims4D::Act::C] != + inReShapeOutputShape[Dims4D::Act::N] * inReShapeOutputShape[Dims4D::Act::C]) { + _log.trace("[{0}] SoftmaxOp '{1}' has unsupported reshape input shape C dimension: {2} != {3} * {4}", + getDebugName(), softMaxOp->getName(), inReShapeInputShape[Dims4D::Act::C], + inReShapeOutputShape[Dims4D::Act::N], inReShapeOutputShape[Dims4D::Act::C]); + return mlir::failure(); + } + + // 5. 
remove inReShapeOp and outReShapeOp and use their input and output as softmax input and output + rewriter.setInsertionPointAfter(inReShapeOp); + auto newSoftMaxOp = rewriter.replaceOpWithNewOp(inReShapeOp, inReShapeOp.getInput(), + softMaxOp.getAxisInd(), softMaxOp.getPadSizeAttr()); + outReShapeOp.replaceAllUsesWith(newSoftMaxOp.getOperation()); + + return mlir::success(); +} + +// ExpandSoftmaxAxisPass +// + +class ExpandSoftmaxAxisPass final : public IE::impl::ExpandSoftmaxAxisBase { +public: + explicit ExpandSoftmaxAxisPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void ExpandSoftmaxAxisPass::safeRunOnFunc() { + auto func = getOperation(); + auto& ctx = getContext(); + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, _log); + + if (mlir::failed(mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } + func->walk([&](IE::ExpandOp expandOp) { + auto smOp = expandOp.getOperand().getDefiningOp(); + if (!smOp) { + return; + } + if (!smOp.getOutput().hasOneUse() && !smOp.getInput().hasOneUse()) { + return; + } + + auto sliceOp = smOp->getOperand(0).getDefiningOp(); + if (!sliceOp) { + return; + } + + const auto inType = mlir::cast(smOp->getOperand(0).getType()); + const auto inShape = inType.getShape().toValues(); + const long int inRank = checked_cast(inType.getRank()); + const long int axis = smOp.getAxisInd(); + const auto axisDim = inShape[Dim(axis)]; + if (axis != inRank - 1) { + return; + } + + if (!expandOp.getPadsBegin() || !expandOp.getPadsEnd()) { + return; + } + llvm::SmallVector padsBegin = parseIntArrayAttr(expandOp.getPadsBegin()); + llvm::SmallVector padsEnd = parseIntArrayAttr(expandOp.getPadsEnd()); + if (!arePadsValid(padsBegin, padsEnd)) { + return; + } + int64_t padValue = padsEnd[axis] + (smOp.getPadSize().has_value() ? 
smOp.getPadSize().value() : 0); + if (axisDim % ALIGNMENT_REQUIREMENT_IN_ELEMENTS == 0 && + ((axisDim + padValue) % ALIGNMENT_REQUIREMENT_IN_ELEMENTS) != 0) { + return; + } + auto builder = mlir::OpBuilder(smOp); + auto newSMOp = builder.create(appendLoc(smOp->getLoc(), "_padded"), sliceOp->getOperand(0), + smOp.getAxisIndAttr(), getIntAttr(builder.getContext(), padValue)); + expandOp->replaceAllUsesWith(newSMOp); + }); +} + +} // namespace + +// +// createExpandSoftmaxAxisPass +// + +std::unique_ptr vpux::IE::createExpandSoftmaxAxisPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fold_activation_before_fq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fold_activation_before_fq.cpp index f3799c8775..386611727c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fold_activation_before_fq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fold_activation_before_fq.cpp @@ -6,7 +6,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_activation_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_activation_ops.cpp index 12c6ef1adf..a8cba273b6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_activation_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_activation_ops.cpp @@ -4,9 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_conv_with_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_conv_with_slice.cpp index 7005582bef..1d28c5044c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_conv_with_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_conv_with_slice.cpp @@ -5,12 +5,12 @@ #include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" namespace vpux::IE { #define GEN_PASS_DECL_FUSECONVWITHSLICE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_dynamic_quantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_dynamic_quantize.cpp index 6aa857608c..23b2ed002e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_dynamic_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_dynamic_quantize.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_fq_and_mul.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_fq_and_mul.cpp index 3c5032c38e..64d563377b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_fq_and_mul.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_fq_and_mul.cpp @@ -6,9 +6,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_input_scale_shift.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_input_scale_shift.cpp index bc5f658773..64066a6e27 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_input_scale_shift.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_input_scale_shift.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_m2i_batchnorm.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_m2i_batchnorm.cpp index c95849f9bf..218c8d6f8d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_m2i_batchnorm.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_m2i_batchnorm.cpp @@ -3,12 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mem_permute.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mem_permute.cpp index 4d26292703..d3190e42bd 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mem_permute.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mem_permute.cpp @@ -6,7 +6,9 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include 
"vpux/compiler/dialect/IE/utils/slice_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mvn6_scale_bias.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mvn6_scale_bias.cpp index 3084d45728..0f152456f6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mvn6_scale_bias.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_mvn6_scale_bias.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { @@ -61,7 +63,7 @@ class FuseMvn6ScaleBias final : public mlir::OpRewritePattern { buffSizes.push_back(Byte(actSize)); } - const auto arch = VPU::getArch(origOp); + const auto arch = config::getArch(origOp); auto totalAvailCMXSize = vpux::VPU::getTotalCMXSize(origOp).count(); auto neededCMXSize = vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffSizes).count(); if (neededCMXSize >= totalAvailCMXSize) { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_op_with_quantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_op_with_quantize.cpp index 1dc343ca35..15c4cc4848 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_op_with_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_op_with_quantize.cpp @@ -4,9 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include 
"vpux/compiler/dialect/IE/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_outstanding_quant.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_outstanding_quant.cpp index b9aece248e..0b02be4139 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_outstanding_quant.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_outstanding_quant.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/factories/fuse_outstanding_quant_strategy_getter.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_pad_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_pad_ops.cpp index b1304037d5..97d4a301dc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_pad_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_pad_ops.cpp @@ -3,12 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_permute_quantize.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_permute_quantize.cpp index c535ef2060..bc3481a9c6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_permute_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_permute_quantize.cpp @@ -3,11 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_quantized_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_quantized_ops.cpp index 66a6db1f85..8658c755c9 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_quantized_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_quantized_ops.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/factories/fuse_quantized_ops_strategy_getter.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_reshape_mvn.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_reshape_mvn.cpp index 99065dd149..79111c8654 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_reshape_mvn.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_reshape_mvn.cpp @@ -4,9 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -209,23 +212,19 @@ bool ReshapeMVNPattern::init() { log.trace("Match failed: [Reorder]->MVN"); return false; } - mlir::Operation* reshape2 = getTargetOpWithSpecificLayoutAndSingleUser( - reorder3.getInput().getDefiningOp(), DimsOrder::NCHW, DimsOrder::NCHW); + auto reshape2 = getTargetOpWithSpecificLayoutAndSingleUser(reorder3.getInput().getDefiningOp(), + DimsOrder::NCHW, DimsOrder::NCHW); if (reshape2 == nullptr) { - reshape2 = getTargetOpWithSpecificLayoutAndSingleUser(reorder3.getInput().getDefiningOp(), - DimsOrder::NCHW, DimsOrder::NCHW); - if (reshape2 == nullptr) { - log.trace("Match failed: [Reshape]->Reorder->MVN"); - return false; - } + log.trace("Match failed: [Reshape]->Reorder->MVN"); + return false; } - auto reorder1 = getTargetOpWithSpecificLayoutAndSingleUser(reshape2->getOperand(0).getDefiningOp(), + auto reorder1 = getTargetOpWithSpecificLayoutAndSingleUser(reshape2.getInput().getDefiningOp(), DimsOrder::NHWC, DimsOrder::NCHW); if (reorder1 != nullptr) { _patternIn = reorder1.getInput(); } else { - auto concatOp = mlir::dyn_cast_or_null(reshape2->getOperand(0).getDefiningOp()); + auto concatOp = mlir::dyn_cast_or_null(reshape2.getInput().getDefiningOp()); if (concatOp == nullptr) { 
log.trace("Match failed: [Reorder]->Reshape->Reorder->MVN"); @@ -257,30 +256,12 @@ bool ReshapeMVNPattern::init() { _patternOut = reorder4.getOutput(); - // MVN(NHWC) -> Reorder4(NCHW) -> Reshape5(NCHW) - mlir::Operation* reshape5 = getTargetOpWithSpecificLayoutAndSingleUser( - *(_patternOut.getUsers().begin()), DimsOrder::NCHW, DimsOrder::NCHW); - if (reshape5 == nullptr) { - reshape5 = getTargetOpWithSpecificLayoutAndSingleUser(*(_patternOut.getUsers().begin()), - DimsOrder::NCHW, DimsOrder::NCHW); - if (reshape5 == nullptr) { - log.trace("Match failed: MVN->Reorder->[Reshape]"); - return false; - } - } - _patternOut = reshape5->getResult(0); - - // Check Reshapes are symmetrical - const auto patternInType = mlir::cast(_patternIn.getType()); - _newChannelSize = patternInType.getShape()[Dims4D::Act::C]; - const auto outType = mlir::cast(_patternOut.getType()); - const auto outC = outType.getShape()[Dims4D::Act::C]; - if (_newChannelSize != outC) { - // Check [Optional Part] pattern - // [ -> Reorder(NHWC) -> GroupConv(NHWC) -> Reorder(NCHW) -> Reshape(NCHW)] - // To be removed after E#123528 gets implemented - auto reorderPreGc = getTargetOpWithSpecificLayoutAndSingleUser(*(_patternOut.getUsers().begin()), - DimsOrder::NCHW, DimsOrder::NHWC); + // Check [Optional Part] pattern + // [AffineReshape(NCHW) -> Reorder(NHWC) -> GroupConv(NHWC) -> Reorder(NCHW)] + if (auto affineReshape = getTargetOpWithSpecificLayoutAndSingleUser( + *(_patternOut.getUsers().begin()), DimsOrder::NCHW, DimsOrder::NCHW)) { + auto reorderPreGc = getTargetOpWithSpecificLayoutAndSingleUser( + *(affineReshape.getOutput().getUsers().begin()), DimsOrder::NCHW, DimsOrder::NHWC); if (reorderPreGc == nullptr) { log.trace("Match failed: MVN->AffineReshape->[Reorder]"); return false; @@ -290,51 +271,48 @@ bool ReshapeMVNPattern::init() { *(reorderPreGc.getOutput().getUsers().begin()), DimsOrder::NHWC, DimsOrder::NHWC); if (!isSupportedGroupConv()) { - log.trace("Match failed: 
MVN->Reorder->Reshape->Reorder->[GroupConv]"); + log.trace("Match failed: MVN->AffineReshape->Reorder->[GroupConv]"); return false; } auto reorderPostGc = getTargetOpWithSpecificLayoutAndSingleUser( *(_groupConvOp.getOutput().getUsers().begin()), DimsOrder::NHWC, DimsOrder::NCHW); if (reorderPostGc == nullptr) { - log.trace("Match failed: MVN->Reorder->Reshape->Reorder->GroupConv->[Reorder]"); + log.trace("Match failed: MVN->AffineReshape->Reorder->GroupConv->[Reorder]"); return false; } - auto reshapePostGc = getTargetOpWithSpecificLayoutAndSingleUser( - *(reorderPostGc.getOutput().getUsers().begin()), DimsOrder::NCHW, DimsOrder::NCHW); - if (reshapePostGc == nullptr) { - log.trace("Match failed: MVN->Reorder->Reshape->Reorder->GroupConv->Reorder->[Reshape]"); - return false; - } - - _patternOut = reshapePostGc.getOutput(); - const auto newOutType = mlir::cast(_patternOut.getType()); - const auto newOutC = newOutType.getShape()[Dims4D::Act::C]; - if (_newChannelSize != newOutC) { - log.trace("Match failed: Pattern's input channel and output channel are not equal"); - return false; - } + _patternOut = reorderPostGc.getOutput(); } // Back to common pattern - // MVN(NHWC) -> Reorder4(NCHW) -> Reshape5(NCHW) -> [Optional Part](NCHW) -> Reorder6(NHWC) -> Output(NHWC) - auto reorder6 = getTargetOpWithSpecificLayoutAndSingleUser(*(_patternOut.getUsers().begin()), - DimsOrder::NCHW, DimsOrder::NHWC); + // MVN(NHWC) -> Reorder4(NCHW) -> [Optional Part](NCHW) -> Reshape5(NCHW) -> Reorder6(NHWC) -> Output(NHWC) + auto reshape5 = getTargetOpWithSpecificLayoutAndSingleUser(*(_patternOut.getUsers().begin()), + DimsOrder::NCHW, DimsOrder::NCHW); + if (reshape5 == nullptr) { + log.trace("Match failed: MVN->Reorder->(Optional Part)->[Reshape]"); + return false; + } + + auto reorder6 = getTargetOpWithSpecificLayoutAndSingleUser( + *(reshape5.getOutput().getUsers().begin()), DimsOrder::NCHW, DimsOrder::NHWC); if (reorder6 == nullptr) { - log.trace("Match failed: 
MVN->Reorder->Reshape->(Optional Part)->[Reorder]"); + log.trace("Match failed: MVN->Reorder->(Optional Part)->Reshape->[Reorder]"); return false; } + _patternOut = reorder6.getOutput(); // Check patern input and output has the same type - const auto patternOutType = mlir::cast(_patternOut.getType()); + auto patternInType = mlir::cast(_patternIn.getType()); + auto patternOutType = mlir::cast(_patternOut.getType()); if (patternInType != patternOutType) { log.trace("Mismatching pattern input type {0} and output type {1}", patternInType, patternOutType); return false; } // Checks for C reshape value + _newChannelSize = patternInType.getShape()[Dims4D::Act::C]; _origChannelSize = mvnInType.getShape()[Dims4D::Act::C]; if ((_newChannelSize % _origChannelSize) || (_newChannelSize <= _origChannelSize)) { log.trace("Expecting in-C to be a multiple of reshaped-C, got {0}, {1}", _newChannelSize, _origChannelSize); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rms_norm.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rms_norm.cpp index 67afa0e152..430c3f371f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rms_norm.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rms_norm.cpp @@ -5,7 +5,11 @@ #include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/power_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rope.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rope.cpp index d37a408574..a985fe62c6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rope.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_rope.cpp @@ -3,11 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_scales_to_accumulate.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_scales_to_accumulate.cpp index 36f1296b99..83518add24 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_scales_to_accumulate.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_scales_to_accumulate.cpp @@ -4,7 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_sdpa.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_sdpa.cpp index dfe66d222c..90f238adce 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/fuse_sdpa.cpp @@ -3,11 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { @@ -195,9 +199,9 @@ bool isLegalSDPA(mlir::Value inputQ) { void FuseSDPAPass::safeRunOnFunc() { auto func = getOperation(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); // Force to fuse only on 40XX for now - if (arch != VPU::ArchKind::NPU40XX) { + if (arch != config::ArchKind::NPU40XX) { return; } func->walk([&](IE::SDPAOp sdpaOp) { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_asymmetric_strides.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_asymmetric_strides.cpp index 2b4f5863c4..e316ae6206 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_asymmetric_strides.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_asymmetric_strides.cpp @@ -3,15 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" - #include "vpux/utils/core/func_ref.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_eltwise_with_small_height.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_eltwise_with_small_height.cpp index ee25cf5e91..7c1dd8b6a6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_eltwise_with_small_height.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_eltwise_with_small_height.cpp @@ -3,22 +3,25 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Support/LLVM.h" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include +#include +#include +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Support/LLVM.h" + namespace vpux::IE { #define GEN_PASS_DECL_HANDLEELTWISEWITHSMALLHEIGHT #define GEN_PASS_DEF_HANDLEELTWISEWITHSMALLHEIGHT diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_exclude_pad_for_avg_pool.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_exclude_pad_for_avg_pool.cpp index 20a1a00ea4..452c1ad804 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_exclude_pad_for_avg_pool.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_exclude_pad_for_avg_pool.cpp @@ -3,11 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_kernels.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_kernels.cpp index 092976cd3f..c856d95f26 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_kernels.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_kernels.cpp @@ -3,11 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp" #include 
"vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_pads.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_pads.cpp index aeb69b08cb..f19d828621 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_pads.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_pads.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_strides.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_strides.cpp index c439362734..82d5805157 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_strides.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_large_strides.cpp @@ -3,15 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" - #include "vpux/utils/core/func_ref.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_u16_fake_quantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_u16_fake_quantize.cpp index 680ab90c8e..3f51085305 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_u16_fake_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/handle_u16_fake_quantize.cpp @@ -4,11 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/adaptive_stripping_utils.hpp" - #include "vpux/compiler/dialect/IE/utils/quantization.hpp" +#include "vpux/compiler/dialect/VPU/utils/adaptive_stripping_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/input_quantization_restoration.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/input_quantization_restoration.cpp index 5d8ebafed2..ff93c2e650 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/input_quantization_restoration.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/input_quantization_restoration.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" @@ -27,15 +28,12 @@ constexpr int levels = 
QuantizationLevels::QUANT_LEVELS_8BIT; class InputQuantizationRestoration final : public IE::impl::InputQuantizationRestorationBase { public: - explicit InputQuantizationRestoration(Logger log): _log(log) { + explicit InputQuantizationRestoration(Logger log) { Base::initLogger(log, Base::getArgumentName()); } private: void safeRunOnFunc() final; - -private: - Logger _log; }; // diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp index d926f92bbf..0137fbcf35 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_identity_pool_before_op.cpp @@ -5,7 +5,10 @@ #include "vpux/compiler/dialect/IE/transforms/passes/insert_identity_pool_before_op.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_reorder_before_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_reorder_before_concat.cpp index 44403ae8d7..4bbb682f05 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_reorder_before_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/insert_reorder_before_concat.cpp @@ -3,10 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/utils/concat_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_dilated_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_dilated_conv.cpp index 6fea66d941..f71051bfb8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_dilated_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_dilated_conv.cpp @@ -3,15 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - -#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_nd_mempermute.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_nd_mempermute.cpp index f5509395c6..0837febd3d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_nd_mempermute.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_nd_mempermute.cpp @@ -4,13 +4,13 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include 
"vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include - namespace vpux::IE { #define GEN_PASS_DECL_LEGALIZENDMEMPERMUTE #define GEN_PASS_DEF_LEGALIZENDMEMPERMUTE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_reify_result_shapes_residuals.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_reify_result_shapes_residuals.cpp index e175c1e30c..23159a878a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_reify_result_shapes_residuals.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/legalize_reify_result_shapes_residuals.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" @@ -25,6 +25,8 @@ #include #include +#include + namespace vpux::IE { #define GEN_PASS_DECL_LEGALIZEREIFYRESULTSHAPESRESIDUALS #define GEN_PASS_DEF_LEGALIZEREIFYRESULTSHAPESRESIDUALS diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/load_external_kernel_resources.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/load_external_kernel_resources.cpp index 8b0f3234d2..8323dc4d4f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/load_external_kernel_resources.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/load_external_kernel_resources.cpp @@ -10,8 +10,8 @@ #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include 
"vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/compiler/act_kernels/shave_binary_resources.h" @@ -54,7 +54,7 @@ void LoadExternalKernelResources::safeRunOnFunc() { } auto& shaveBinResources = ShaveBinaryResources::getInstance(); - auto archEncoding = shaveBinResources.getSwKernelArchString((VPU::getArch(func))); + auto archEncoding = shaveBinResources.getSwKernelArchString((config::getArch(func))); // Keep track of loaded kernel resources in order to avoid multiple retrivals std::set loadedKernels; diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/log_op_optimizations.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/log_op_optimizations.cpp index 208c4f5a54..dd80ca72c2 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/log_op_optimizations.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/log_op_optimizations.cpp @@ -4,7 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/loop_outliner.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/loop_outliner.cpp index 62d665be7d..664b3c3b5d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/loop_outliner.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/loop_outliner.cpp @@ -4,15 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" 
#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/dense_map.hpp" #include "vpux/utils/core/format.hpp" #include "vpux/utils/core/range.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp index 6f76334bfd..c8c133000f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.cpp @@ -4,14 +4,22 @@ // #include "vpux/compiler/dialect/IE/transforms/passes/map_bilinear_interpolate_on_DPU.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/factories/map_bilinear_interpolate_on_dpu_strategy_getter.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/utils/core/numeric.hpp" +namespace vpux::IE { +#define GEN_PASS_DECL_MAPBILINEARINTERPOLATEONDPU +#define GEN_PASS_DEF_MAPBILINEARINTERPOLATEONDPU +#include "vpux/compiler/dialect/IE/passes.hpp.inc" +} // namespace vpux::IE + using namespace vpux; namespace { @@ -359,3 +367,76 @@ bool vpux::IE::isLegalInterpolateOp(IE::InterpolateOp op, bool 
interpolateAsSEOp return false; } + +namespace { + +// +// MapBilinearInterpolateOnDPUPass +// + +class MapBilinearInterpolateOnDPUPass final : + public IE::impl::MapBilinearInterpolateOnDPUBase { +public: + explicit MapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, Logger log) + : _interpolateAsSEOp(interpolateAsSEOp), _log(log) { + _log.setName(Base::getArgumentName()); + } + + mlir::LogicalResult initialize(mlir::MLIRContext* ctx) override; + +private: + void safeRunOnFunc() override; + + bool _interpolateAsSEOp = false; + Logger _log; +}; + +mlir::LogicalResult MapBilinearInterpolateOnDPUPass::initialize(mlir::MLIRContext* ctx) { + if (mlir::failed(Base::initialize(ctx))) { + return mlir::failure(); + } + + // When this parameter has a value, it probably comes from LIT test. + // Override the default + if (interpolateAsSEOp.hasValue()) { + _interpolateAsSEOp = interpolateAsSEOp.getValue(); + } + + return mlir::success(); +} + +void MapBilinearInterpolateOnDPUPass::safeRunOnFunc() { + auto& ctx = getContext(); + const auto func = getOperation(); + const auto logCb = [&](const formatv_object_base& msg) { + _log.trace("{0}", msg.str()); + }; + + auto strategy = IE::createMapBilinearInterpolateOnDPUStrategy(func, _interpolateAsSEOp); + + mlir::ConversionTarget target(ctx); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + strategy->prepareInterpolate(target, logCb); + + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, _log); + + if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createMapBilinearInterpolateOnDPUPass +// + +std::unique_ptr vpux::IE::createMapBilinearInterpolateOnDPUPass(const bool interpolateAsSEOp, Logger log) { + return std::make_unique(interpolateAsSEOp, log); +} diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/matmul_inputs_to_2d.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/matmul_inputs_to_2d.cpp index 08305ad7b9..990da5c437 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/matmul_inputs_to_2d.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/matmul_inputs_to_2d.cpp @@ -3,29 +3,24 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/matmul.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/small_vector.hpp" -#include "vpux/utils/core/type/float16.hpp" + +#include +#include +#include +#include namespace vpux::IE { #define GEN_PASS_DECL_MATMULINPUTSTO2D diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fake_quant.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fake_quant.cpp index af06bfd22f..483a8c0819 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fake_quant.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fake_quant.cpp @@ -4,9 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fully_connected.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fully_connected.cpp index 1d2bca432b..b7d60f3038 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fully_connected.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_fully_connected.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" @@ -1045,24 +1048,36 @@ std::optional> MergeFullyConnectedForDQPattern auto validateBranches = [this](mlir::Value inSource, mlir::Value weightsSource, ShapeRef matMulOutShape, SmallVector& unrolledMatMulBranch, IE::ConcatOp expectedOutConcat) { - for (auto inputUser : inSource.getUsers()) { - auto maybeInputSlice = mlir::dyn_cast(inputUser); - if (maybeInputSlice == nullptr || !maybeInputSlice->hasOneUse()) { - _log.trace("SliceOp is not found"); + for (auto input : expectedOutConcat.getInputs()) { + auto reshapeOp = input.getDefiningOp(); + if 
(reshapeOp == nullptr || !reshapeOp->hasOneUse()) { + _log.trace("The ReshapeOp is not found"); return false; } - auto maybeMatMul = getSingleUser(maybeInputSlice); - if (!maybeMatMul.has_value()) { + auto matmulOp = reshapeOp.getInput().getDefiningOp(); + if (matmulOp == nullptr || !matmulOp->hasOneUse()) { + _log.trace("The FullyConnectedOp is not found"); return false; } - auto matMul = maybeMatMul.value(); - auto weights = getMatMulWeights(matMul, matMulOutShape, expectedOutConcat); + auto sliceOp = matmulOp.getInput().getDefiningOp(); + if (sliceOp == nullptr || !sliceOp->hasOneUse()) { + _log.trace("The sliceOp is not found"); + return false; + } + + if (sliceOp.getSource() != inSource) { + _log.trace("The input source is not the same"); + return false; + } + + auto weights = getMatMulWeights(matmulOp, matMulOutShape, expectedOutConcat); if (!weights.has_value() || weights.value()->getOperand(0) != weightsSource) { + _log.trace("The weights source is not the same"); return false; } - unrolledMatMulBranch.push_back({maybeInputSlice, weights.value(), matMul}); + unrolledMatMulBranch.push_back({sliceOp, weights.value(), matmulOp}); } auto weightsSplitUserSize = std::distance(weightsSource.getUsers().begin(), weightsSource.getUsers().end()); @@ -1254,12 +1269,12 @@ mlir::Value MergeFullyConnectedForDQPatternWithDequantize::buildNewMatMulInput(A inSliceOffsets[1] = checked_cast(batchOffset) * matMulInShape[Dim(1)]; SmallVector inSliceSizes = to_small_vector(getShape(source)); inSliceSizes[1] = checked_cast(batchSize) * matMulInShape[Dim(1)]; - auto slice = rewriter.create(appendLoc(source.getLoc(), "_slice_{0}", batchIdx), source, + auto slice = rewriter.create(appendLoc(matMul.getLoc(), "_slice_{0}", batchIdx), source, getIntArrayAttr(ctx, inSliceOffsets), getIntArrayAttr(ctx, inSliceSizes)); SmallVector newInputShape{checked_cast(batchSize), matMulInShape[Dim(1)]}; const auto reshapeOutShapeAttr = getIntArrayAttr(ctx, newInputShape); - return 
rewriter.createOrFold(appendLoc(slice.getLoc(), "_reshape"), slice, nullptr, false, + return rewriter.createOrFold(appendLoc(matMul.getLoc(), "_reshape"), slice, nullptr, false, reshapeOutShapeAttr); } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_parallel_fully_connected.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_parallel_fully_connected.cpp index 08da7aa4e0..40972fd615 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_parallel_fully_connected.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_parallel_fully_connected.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" @@ -18,7 +21,6 @@ #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/loop.hpp" namespace vpux::IE { #define GEN_PASS_DECL_MERGEPARALLELFULLYCONNECTED diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_tile_with_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_tile_with_slice.cpp index 4e0cdcdd51..d43fb8cd1b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_tile_with_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_tile_with_slice.cpp @@ -5,13 +5,13 @@ #include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" namespace vpux::IE { #define GEN_PASS_DECL_MERGETILEWITHSLICE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_weights_shared_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_weights_shared_conv.cpp index 40357217ad..56cbf067ae 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_weights_shared_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/merge_weights_shared_conv.cpp @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/locations_verifier.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_dynamic_dequantize_to_user.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_dynamic_dequantize_to_user.cpp index effb169a9a..b3c692b1e2 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_dynamic_dequantize_to_user.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_dynamic_dequantize_to_user.cpp @@ -5,8 +5,11 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_multiply_divide_post_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_multiply_divide_post_op.cpp index 37bd2ae7d0..e665a87d58 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_multiply_divide_post_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_multiply_divide_post_op.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_permute_post_eltwise.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_permute_post_eltwise.cpp index 8e3d70f5c8..d83f406f71 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/move_permute_post_eltwise.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/move_permute_post_eltwise.cpp @@ -3,16 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_MOVEPERMUTEPOSTELTWISE #define GEN_PASS_DEF_MOVEPERMUTEPOSTELTWISE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/mvn_fusion.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/mvn_fusion.cpp index 63b90a4657..4bc984f4a4 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/mvn_fusion.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/mvn_fusion.cpp @@ -4,7 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/normalize_L2_fusion.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/normalize_L2_fusion.cpp index 3e7d6b9729..9302a38b6c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/normalize_L2_fusion.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/normalize_L2_fusion.cpp @@ -3,10 +3,12 @@ // SPDX-License-Identifier: 
Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/opt_dynamic_eltwise_with_shapeof.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/opt_dynamic_eltwise_with_shapeof.cpp index 30fda6e8e0..8eac79bc1a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/opt_dynamic_eltwise_with_shapeof.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/opt_dynamic_eltwise_with_shapeof.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_avg_pool_with_unaligned_channels.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_avg_pool_with_unaligned_channels.cpp index 4123e86272..a6f84c506c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_avg_pool_with_unaligned_channels.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_avg_pool_with_unaligned_channels.cpp @@ -3,9 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" @@ -14,6 +15,8 @@ #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/checked_cast.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_OPTIMIZEAVGPOOLWITHUNALIGNEDCHANNELS #define GEN_PASS_DEF_OPTIMIZEAVGPOOLWITHUNALIGNEDCHANNELS diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_concat_with_conv.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_concat_with_conv.cpp index 8ea8c51744..88c857a5d6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_concat_with_conv.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_concat_with_conv.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" @@ -13,11 +15,12 @@ #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define 
GEN_PASS_DECL_OPTIMIZECONCATWITHCONV #define GEN_PASS_DEF_OPTIMIZECONCATWITHCONV diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_identity_pools.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_identity_pools.cpp index 985b55c0eb..2909fa22f6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_identity_pools.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_identity_pools.cpp @@ -3,15 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" -#include "vpux/compiler/utils/attributes_properties_conversion.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_OPTIMIZEIDENTITYPOOL #define GEN_PASS_DEF_OPTIMIZEIDENTITYPOOL diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_op_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_op_slice.cpp index dce0cc9e6d..ef7d8fa491 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_op_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_op_slice.cpp @@ -4,10 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include 
"vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_parallel_layers.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_parallel_layers.cpp index 1b35ab7440..6c78a36221 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_parallel_layers.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_parallel_layers.cpp @@ -3,15 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" - -#include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_precision_across_function_calls.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_precision_across_function_calls.cpp index 26c5fd7e93..ec02d3d8c8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_precision_across_function_calls.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_precision_across_function_calls.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/func_dialect.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders.cpp index 8b58cd6cdc..1af33384d0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders.cpp @@ -3,23 +3,27 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp" - -#include -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" +#include 
"vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/dense_map.hpp" #include "vpux/utils/core/range.hpp" + +#include + namespace vpux::IE { #define GEN_PASS_DECL_OPTIMIZEREORDERS #define GEN_PASS_DEF_OPTIMIZEREORDERS @@ -1584,7 +1588,8 @@ mlir::LogicalResult ReorderWithReadValue::matchAndRewrite(IE::ReadValueOp origRe auto newReorderOp = rewriter.create(origReorderOp->getLoc(), origReadValueOp.getInput(), origReorderOp.getDstOrderAttr()); - rewriter.replaceOpWithNewOp(origReorderOp, newReorderOp.getOutput(), origReadValueOp.getName()); + rewriter.replaceOpWithNewOp(origReorderOp, newReorderOp.getOutput(), origReadValueOp.getName(), + origReadValueOp.getElementTypeAttr(), origReadValueOp.getShapeAttr()); // erase readvalue ops which has no more nodes next rewriter.eraseOp(origReadValueOp); @@ -1702,6 +1707,159 @@ mlir::LogicalResult ReorderWithHWAdd::matchAndRewrite(IE::ReorderOp return mlir::success(); } +// +// ReorderWithHWAddSlice +// +// The beneficial pattern: +// +// Reorder Reorder or Const Reorder Reorder +// \ / | | +// Add => Reorder Reorder +// | | | +// (QuantizeCast) LayoutCast LayoutCast or Const(changed dims order) +// | \ / +// Slice Add +// | +// LayoutCast +// | +// (QuantizeCast) +// | +// Reorder +// | +// Slice + +class ReorderWithHWAddSlice final : public mlir::OpRewritePattern { +public: + ReorderWithHWAddSlice(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { + this->setDebugName("ReorderWithHWAddSlice"); + } + + mlir::LogicalResult matchAndRewrite(IE::AddOp origOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +mlir::LogicalResult ReorderWithHWAddSlice::matchAndRewrite(IE::AddOp origOp, mlir::PatternRewriter& rewriter) const { + _log.trace("[{0}] Got IE::AddOp at {1}", this->getDebugName(), 
origOp->getLoc()); + + // E#122076: ReorderWithHWAdd only supports HW AddOp (DimOrder::NHWC) who could be converted to NCE.Eltwise.Add + // ReorderWithHWAdd should only be a temporary solution. ReorderWithHWAdd rewriter should work for any DimOrder + const auto targetInOutOrder = DimsOrder::NHWC; + const auto origDimsOrder = DimsOrder::fromValue(origOp.getOutput()); + if (origDimsOrder != targetInOutOrder) { + return mlir::failure(); + } + + // Check [Reorder] - Add + auto input1Op = origOp.getInput1().getDefiningOp(); + auto input2Op = origOp.getInput2().getDefiningOp(); + IE::ReorderOp inputReorder = nullptr; + if ((inputReorder = mlir::dyn_cast_or_null(input1Op))) { + // Check if input2 is Reorder or Constant + if (!mlir::isa_and_nonnull(input2Op)) { + return mlir::failure(); + } + } else if ((inputReorder = mlir::dyn_cast_or_null(input2Op))) { + // Check if input1 is Constant + if (!mlir::isa_and_nonnull(input1Op)) { + return mlir::failure(); + } + } else { + // Both inputs are not Reorder + return mlir::failure(); + } + + // Check no branches + if (!hasOneUniqueUser(input1Op) || !hasOneUniqueUser(input2Op)) { + return mlir::failure(); + } + + bool bothInputsSame = inputReorder.getResult().hasOneUse() ? 
false : true; + + // Check Reorder - Add - [(QuantizeCast)] + auto consumerOp = *(origOp.getResult().getUsers().begin()); + auto quantCast = mlir::dyn_cast(consumerOp); + if (quantCast != nullptr) { + consumerOp = *(consumerOp->getResult(0).getUsers().begin()); + } + + // Check Reorder - Add - (QuantizeCast) - [Slice] + if (mlir::dyn_cast(consumerOp) == nullptr) { + return mlir::failure(); + } + + auto sliceParent = consumerOp->getOperand(0); + const auto origShape = getShape(sliceParent); + const auto newDimsOrder = DimsOrder::fromValue(inputReorder.getInput()); + const auto reorderInputPerm = newDimsOrder.toPermutation(); + const auto reorderOutputPerm = origDimsOrder.toPermutation(); + + // Get Slice DMA width + auto getSliceWidth = [](ShapeRef shapeBeforeSlice, ShapeRef shapeAfterSlice, DimArr permutation) -> int64_t { + int64_t width = 1; + for (int i = shapeBeforeSlice.size() - 1; i > 0; i--) { + auto dim = permutation[i]; + width *= shapeAfterSlice[dim]; + if (shapeBeforeSlice[dim] != shapeAfterSlice[dim]) { // Slice dimension + break; + } + } + return width; + }; + + for (auto* user : llvm::make_early_inc_range(sliceParent.getUsers())) { + // All users must be Slices + auto userSliceOp = mlir::dyn_cast(user); + if (userSliceOp == nullptr) { + return mlir::failure(); + } + + // Test adding Reorder and make sure the added Reorder-Slice can be further optimized into Slice-PermuteCast + auto sliceShape = getShape(userSliceOp.getResult()); + if (!isTrivialReorder(newDimsOrder, origDimsOrder, sliceShape)) { + return mlir::failure(); + } + + // After Reorder-Slice swap, the new slice should not have a smaller DMA width than the original + int64_t origSliceWidth = getSliceWidth(origShape, sliceShape, reorderOutputPerm); + int64_t newSliceWidth = getSliceWidth(origShape, sliceShape, reorderInputPerm); + if (newSliceWidth < origSliceWidth) { + return mlir::failure(); + } + } + + // Pattern matched + const auto origOrderMap = 
origDimsOrder.toAffineMap(rewriter.getContext()); + const auto newOrderMap = newDimsOrder.toAffineMap(rewriter.getContext()); + auto reorderInput1 = rewriter.createOrFold(origOp->getLoc(), origOp.getInput1(), newOrderMap); + auto newIn1 = rewriter.create(origOp.getLoc(), reorderInput1, origOrderMap); + auto newIn2 = newIn1; + if (!bothInputsSame) { + auto reorderInput2 = rewriter.createOrFold(origOp->getLoc(), origOp.getInput2(), newOrderMap); + newIn2 = rewriter.create(origOp.getLoc(), reorderInput2, origOrderMap); + } + mlir::Value newAdd = rewriter.create( + origOp.getLoc(), origOp.getType(), newIn1, newIn2, origOp.getAutoBroadcastAttr(), origOp.getPostOpAttr(), + origOp.getClampAttr(), origOp.getOutputPaddingAttr(), origOp.getInputPaddingAttr()); + _log.trace("New AddOp: {0}", newAdd); + auto newOut = rewriter.create(origOp.getLoc(), newAdd, newOrderMap); + auto newReorderOp = rewriter.create(origOp->getLoc(), newOut, origOrderMap); + + if (quantCast != nullptr) { + auto outputTypeQuantize = mlir::cast(quantCast.getType()); + auto outElemType = outputTypeQuantize.getElementType(); + auto newQuantCast = rewriter.create(origOp.getLoc(), newReorderOp, outElemType); + sliceParent.replaceAllUsesWith(newQuantCast.getResult()); + _log.trace("Replace by IE::QuantizeCastOp: {0}", newQuantCast); + } else { + sliceParent.replaceAllUsesWith(newReorderOp.getResult()); + _log.trace("Replace by IE::ReorderOp {0}", newReorderOp); + } + + return mlir::success(); +} + // // ReorderWithGroupConv // @@ -1970,6 +2128,7 @@ void OptimizeReordersPass::safeRunOnFunc() { patterns.add(&ctx, _log, _seOpsEnabled, _seExperimentalOpsEnabled); patterns.add(&ctx, _log); patterns.add>(&ctx, _log); + patterns.add(&ctx, _log); patterns.add(&ctx, _log); IE::ReorderOp::getCanonicalizationPatterns(patterns, &ctx); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders_across_function_calls.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders_across_function_calls.cpp index 782065e5cf..bf6c56916a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders_across_function_calls.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_reorders_across_function_calls.cpp @@ -4,11 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/func_dialect.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/utils/core/format.hpp" namespace vpux::IE { #define GEN_PASS_DECL_OPTIMIZEREORDERSACROSSFUNCTIONCALLS diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_expand.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_expand.cpp index 2d0c6a8157..0eb7968b51 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_expand.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_expand.cpp @@ -4,12 +4,12 @@ // #include "vpux/compiler/dialect/IE/transforms/passes/optimize_slice_expand.hpp" -#include -#include #include #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" @@ -18,6 +18,9 @@ #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/range.hpp" +#include +#include + using namespace vpux; // IsLegal functions for support operations diff --git 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_with_stride.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_with_stride.cpp index 160bf41552..fc2d9bfb59 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_with_stride.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_slice_with_stride.cpp @@ -3,21 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/numeric.hpp" #include #include @@ -378,8 +375,8 @@ bool doesSliceConcatMeetRequirement(IE::SliceOp sliceOp, IE::ConcatOp concatOp) return false; } - // If slice's input has other users, slice cannot fuse into convolution. - if (!sliceOp.getSource().hasOneUse()) { + // If slice's input/output has other users, slice cannot fuse into convolution. 
+ if (!sliceOp.getSource().hasOneUse() || !sliceOp.getResult().hasOneUse()) { return false; } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_tile_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_tile_op.cpp index db6111f29a..a0a23d7529 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_tile_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_tile_op.cpp @@ -3,10 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_unaligned_qdq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_unaligned_qdq.cpp index 77ba92dab2..3404aa9603 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_unaligned_qdq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/optimize_unaligned_qdq.cpp @@ -4,9 +4,11 @@ // #include - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include 
"vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/outliner.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/outliner.cpp index f8c077b458..59aeeee285 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/outliner.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/outliner.cpp @@ -4,12 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/dense_map.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/override_tile_executor_num.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/override_tile_executor_num.cpp index 8e4bfa8a47..07e74166d7 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/override_tile_executor_num.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/override_tile_executor_num.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/utils/batch.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/pad_dynamic_inputs.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/pad_dynamic_inputs.cpp index 95c10f7288..a400e21453 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/pad_dynamic_inputs.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/pad_dynamic_inputs.cpp @@ -4,7 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/per_axis_fq_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/per_axis_fq_concat.cpp index bb9d76b59c..fe83dcdce1 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/per_axis_fq_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/per_axis_fq_concat.cpp @@ -4,13 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" - #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_generic.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_generic.cpp index 825d8e5486..b99a4225bb 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_generic.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_generic.cpp @@ -3,20 +3,24 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include - -#include #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/reify_shape.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "mlir/Dialect/Arith/IR/Arith.h" +#include +#include +#include +#include + +#include namespace vpux::IE { #define GEN_PASS_DECL_POPULATEDYNAMICDIMENSIONSGENERIC diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_hw.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_hw.cpp index 4c4325007d..b9dcb7846d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_hw.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/populate_dynamic_dimensions_hw.cpp @@ -4,17 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reify_shape.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/error.hpp" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" +#include namespace vpux::IE { #define GEN_PASS_DECL_POPULATEDYNAMICDIMENSIONSHW diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_affine_reshape.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_affine_reshape.cpp index 51ff7c2104..ab142ea14d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_affine_reshape.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_affine_reshape.cpp @@ -4,26 +4,28 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include #include +#include #include + #include namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_dequant_through_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_dequant_through_concat.cpp index 821f7ee2ae..86bfb075f3 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_dequant_through_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_dequant_through_concat.cpp @@ -4,13 +4,14 @@ // 
#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include #include + #include namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_fq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_fq.cpp index 9bc3d04c56..7294c81bf5 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_fq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_fq.cpp @@ -4,6 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_before_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_before_op.cpp index 58931cdc96..0f27977bd7 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_before_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_before_op.cpp @@ -4,12 +4,19 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -1065,6 +1072,12 @@ mlir::LogicalResult MoveThroughShapeCast::matchAndRewrite(IE::ShapeCastOp shapeC return mlir::failure(); } + if (isSuitableToAdjustMemPermuteShape(mlir::cast(memPermuteOp.getInput().getType()), + mlir::cast(memPermuteOp.getOutput().getType()), + memPermuteOp.getMemPerm())) { + return mlir::failure(); + } + const auto origReshapeInType = mlir::cast(shapeCastOp->getOperand(0).getType()); const auto origReshapeOutType = mlir::cast(shapeCastOp->getResult(0).getType()); const auto origReshapeInShape = origReshapeInType.getShape(); diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_eltwise.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_eltwise.cpp index b5ea8de61a..c29590c9ed 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_eltwise.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_eltwise.cpp @@ -4,9 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" @@ -449,13 +452,45 @@ mlir::LogicalResult OptimizeShapeCastedEltwise::matchAndRewrite(IE::MemPermuteOp const SmallVector branches = eltwiseOp->getOperands(); - auto newAlignedShape = getNewAlignedShapeForPermuteCast(eltwiseOp, memPermuteOp); - if (!newAlignedShape.has_value()) { + auto newAlignedShapeValue = getNewAlignedShapeForPermuteCast(eltwiseOp, memPermuteOp); + if (!newAlignedShapeValue.has_value()) { return matchFailed(_log, rewriter, memPermuteOp, "The shape is not channel aligned"); } + const auto newAlignedShape = newAlignedShapeValue.value(); - SmallVector newAddInputs; + auto attr = eltwiseOp->getAttr("auto_broadcast"); + auto autoBroadcastType = IE::AutoBroadcastType::NONE_OR_EXPLICIT; + if (auto autoBroadcastAttr = mlir::dyn_cast_or_null(attr)) { + autoBroadcastType = autoBroadcastAttr.getValue(); + } + auto inferredOutShape = IE::broadcastEltwiseShape(newAlignedShape[0].raw(), newAlignedShape[1].raw(), + autoBroadcastType, eltwiseOp->getLoc()); + if (mlir::failed(inferredOutShape)) { + return matchFailed(_log, rewriter, memPermuteOp, + "Inferred shape for eltwise operation failed when propagating MemPermute"); + } + + SmallVector newShapeCastShape = newAlignedShape; + const auto& leftShape = newAlignedShape[0].raw(); + const auto& rightShape = newAlignedShape[1].raw(); + if (leftShape[0] != 1 && rightShape[0] != 1) { + newShapeCastShape[0][Dims4D::Act::N] = 1; + newShapeCastShape[1][Dims4D::Act::N] = 1; + bool isBroadcasted = false; + for (size_t i = 1; i < leftShape.size(); ++i) { + const auto leftMul = leftShape[i] * leftShape[0]; + const auto rightMul = rightShape[i] * 
rightShape[0]; + // If the left and right shapes are equal, we can merge them. + if (leftMul == rightMul && !isBroadcasted) { + newShapeCastShape[0][Dim(i)] = leftMul; + newShapeCastShape[1][Dim(i)] = rightMul; + isBroadcasted = true; + } + } + } + + SmallVector newAddInputs; for (size_t inputIdx = 0; inputIdx < branches.size(); inputIdx++) { auto branchInput = branches[inputIdx]; @@ -475,22 +510,18 @@ mlir::LogicalResult OptimizeShapeCastedEltwise::matchAndRewrite(IE::MemPermuteOp // IE.MemPermute -> IE.ShapeCast -> IE.Add -> IE.ShapeCast -> IE.MemPermute // the ShapeCast input will be replaced with PermuteCast: // IE.MemPermute -> IE.MemPermute -> IE.PermuteCast -> IE.Add -> ... - newInput = createNewInputWithAlignedShape(newMemPermuteOp, eltwiseOp, newAlignedShape.value()[inputIdx], - rewriter); + newInput = createNewInputWithAlignedShape(newMemPermuteOp, eltwiseOp, newAlignedShape[inputIdx], rewriter); } - if (newAlignedShape.value()[inputIdx][Dims4D::Act::N] != 1) { - auto newInputAlignedShape = getShape(newInput).raw(); - SmallVector newInputShape = {1, newInputAlignedShape[1], - newInputAlignedShape[0] * newInputAlignedShape[2], - newInputAlignedShape[3]}; - newInput = rewriter.createOrFold(memPermuteOp.getLoc(), newInput, - getIntArrayAttr(rewriter.getContext(), newInputShape)); + if (newAlignedShape[inputIdx][Dims4D::Act::N] != 1) { + newInput = rewriter.createOrFold( + memPermuteOp.getLoc(), newInput, + getIntArrayAttr(rewriter.getContext(), newShapeCastShape[inputIdx])); } newAddInputs.push_back(newInput); } - createNewOutputWithAlignedShape(memPermuteOp, eltwiseOp, newAlignedShape.value().back(), newAddInputs, rewriter); + createNewOutputWithAlignedShape(memPermuteOp, eltwiseOp, newAlignedShape.back(), newAddInputs, rewriter); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_softmax.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_softmax.cpp 
index 7208420c3a..8724b2c9fd 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_softmax.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_mem_permute_through_softmax.cpp @@ -4,11 +4,12 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/softmax_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_op_through_batch_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_op_through_batch_concat.cpp index 266333f99d..fe3010b0e4 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_op_through_batch_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_op_through_batch_concat.cpp @@ -3,10 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include 
"vpux/compiler/utils/attributes_utils.hpp" @@ -118,7 +121,38 @@ mlir::LogicalResult PropagateSoftmax::matchAndRewrite(IE::SoftMaxOp origOp, mlir } return llvm::all_of(concatOp.getInputs(), [&](mlir::Value input) { - return getShape(input) == getShape(input2); + // The experiment indicates that performance is highly influenced by the size of Concat input. For small + // sizes of the Concat input this rewrite will fragment too much the tensors resulting in a large number of + // small tasks of which scheduling cost will overcome the vertical fusion benefits. The empirical threshold + // for Concat input size indicated by experiments is 1 MB + constexpr Byte MIN_CONCAT_INPUT_SIZE = 1_MB; + const Byte inputSize = getTotalSize(input); + if (inputSize < MIN_CONCAT_INPUT_SIZE) { + return false; + } + auto inputShape = getShape(input); + auto addInput2Shape = getShape(input2); + int64_t inputShapeRank = inputShape.size(); + int64_t addInput2ShapeRank = addInput2Shape.size(); + // Check if Add input operands have equal shapes or broadcastable shapes + if (inputShapeRank != addInput2ShapeRank) { + int64_t maxRank = std::max(inputShapeRank, addInput2ShapeRank); + // Iterate through the dimensions in reverse order + for (int64_t i = 0; i < maxRank; ++i) { + int64_t inputDimIdx = inputShapeRank - 1 - i; + int64_t addInput2DimIdx = addInput2ShapeRank - 1 - i; + // Get dimension value and if the index is outside the range consider the dimension as equal to 1 + int64_t inputDim = (inputDimIdx < 0) ? 1 : inputShape[Dim(inputDimIdx)]; + int64_t addInput2Dim = (addInput2DimIdx < 0) ?
1 : addInput2Shape[Dim(addInput2DimIdx)]; + // The dimensions must be equal + if (inputDim != addInput2Dim) { + return false; + } + } + return true; + } + // Check that the shapes are equal + return inputShape == addInput2Shape; }); }; if (maybeAddOp != nullptr && !isValidAddOp(maybeAddOp, concatOp)) { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_permute_cast_through_dequantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_permute_cast_through_dequantize.cpp index 0374cdaa9c..0ca8ee4494 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_permute_cast_through_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_permute_cast_through_dequantize.cpp @@ -2,14 +2,14 @@ // Copyright (C) 2024-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // - -#include -#include "vpux/compiler/NPU37XX/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_PROPAGATEPERMUTECASTTHROUGHDEQUANTIZE #define GEN_PASS_DEF_PROPAGATEPERMUTECASTTHROUGHDEQUANTIZE diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_quantize_dequantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_quantize_dequantize.cpp index d8366033ae..37ae935d26 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_quantize_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_quantize_dequantize.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_shape_cast.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_shape_cast.cpp index 1a77f9981b..770538b4e8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_shape_cast.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_shape_cast.cpp @@ -3,10 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_transpose.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_transpose.cpp index 7d4964c496..6da0d8804d 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_transpose.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/propagate_transpose.cpp @@ -3,18 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp" - #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/passes.hpp" -#include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/reassociate_multiply.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/reassociate_multiply.cpp index 16093d5677..881f3c0c75 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/reassociate_multiply.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/reassociate_multiply.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" - -#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_quantdequant_seq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_quantdequant_seq.cpp index 701bd11030..241a9caddb 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_quantdequant_seq.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_quantdequant_seq.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_view_like_ops_chain.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_view_like_ops_chain.cpp index 9b6960df96..75c1d0880c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_view_like_ops_chain.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/remove_view_like_ops_chain.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_matmul_inputs.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_matmul_inputs.cpp index ab736ab387..a0b0ab500a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_matmul_inputs.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_matmul_inputs.cpp @@ -3,10 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/matmul.hpp" +#include 
"vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_max_pool.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_max_pool.cpp index 7b69d81297..36ea103169 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_max_pool.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/reshape_max_pool.cpp @@ -3,8 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/utils/attributes.hpp" @@ -12,7 +15,6 @@ #include #include -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" namespace vpux::IE { #define GEN_PASS_DECL_RESHAPEMAXPOOL diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_scatter_update_by_transpose.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_scatter_update_by_transpose.cpp index c3534cc18d..515522f52f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_scatter_update_by_transpose.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_scatter_update_by_transpose.cpp @@ -2,10 +2,9 @@ // Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_strided_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_strided_slice.cpp index 98c06c2599..146b44d25e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_strided_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/resolve_strided_slice.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/expand_with_layer_rewriter.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/expand_with_layer_rewriter.cpp index 259db787cf..5e5413332f 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/expand_with_layer_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/expand_with_layer_rewriter.cpp @@ -4,8 +4,7 @@ // #include "vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp" -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include 
"vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/propagate_transpose_affine_reshape_common.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/propagate_transpose_affine_reshape_common.cpp index 71511d3aa1..72b5b2bce5 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/propagate_transpose_affine_reshape_common.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/rewriters/propagate_transpose_affine_reshape_common.cpp @@ -4,18 +4,23 @@ // #include "vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp" -#include -#include -#include -#include -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/utils/core/error.hpp" +#include + +#include +#include +#include + namespace vpux { namespace IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/run_convert_on_dpu.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/run_convert_on_dpu.cpp index 5dc1ecffb9..5501bf01ca 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/run_convert_on_dpu.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/run_convert_on_dpu.cpp @@ -3,18 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include +#include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/interfaces/fuse_convert_to_dpu_checker.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" + +#include +#include +#include namespace vpux::IE { #define GEN_PASS_DECL_RUNF16TOF32CONVERTONDPU @@ -53,7 +53,7 @@ void RunF16ToF32ConvertOnDPUPass::fuseWithParentDPUOp(IE::ConvertOp convert, mli void RunF16ToF32ConvertOnDPUPass::safeRunOnFunc() { auto func = getOperation(); - auto parentCheck = IE::createFuseConvertToDPUChecker(VPU::getArch(func)); + auto parentCheck = IE::createFuseConvertToDPUChecker(config::getArch(func)); auto nestedLog = _log.nest(); SmallVector f16Tof32Converts = {}; diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/shrink_matmul_groups.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/shrink_matmul_groups.cpp index 58eea715b7..daac9a1fc2 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/shrink_matmul_groups.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/shrink_matmul_groups.cpp @@ -3,19 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/matmul.hpp" -#include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include + namespace vpux::IE { #define GEN_PASS_DECL_SHRINKMATMULGROUPS #define GEN_PASS_DEF_SHRINKMATMULGROUPS @@ -133,13 +134,13 @@ bool checkMatMul(IE::MatMulOp origOp) { return false; } - // Restrict to Dim H = 1, otherwise it would break the VF pattern for MatMul-Add-Softmax-MatMul in LLM - // TODO: Remove the restriction, see E#138709 - if (lhsShape[H] != 1) { - return false; + if (lhsShape[H] == 1) { + return true; } - return true; + // The optimization will break the VF pattern for MatMul-Add-Softmax-MatMul in LLM, but VF pattern needs + // unrolled MatMul, not grouped MatMul. So here we check if it is beneficial for group MatMul. 
+ return isGroupedMatMulBeneficial(origOp, lhsShape, rhsShape); } bool checkTranspose(IE::TransposeOp transposeOp) { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_bilinear_into_H_and_W.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_bilinear_into_H_and_W.cpp index dc445e4ed6..0c864156c6 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_bilinear_into_H_and_W.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_bilinear_into_H_and_W.cpp @@ -4,7 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_conv_with_multiple_fq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_conv_with_multiple_fq.cpp index bcc9283f11..0faf4a0aaf 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_conv_with_multiple_fq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_conv_with_multiple_fq.cpp @@ -3,16 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include #include +#include + namespace vpux::IE { #define GEN_PASS_DECL_SPLITCONVWITHMULTIPLEFQ #define 
GEN_PASS_DEF_SPLITCONVWITHMULTIPLEFQ diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_fake_quant.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_fake_quant.cpp index 565bf51a2a..3994d74ec7 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_fake_quant.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_fake_quant.cpp @@ -4,12 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" - #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_interpolate_axes.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_interpolate_axes.cpp index a5f0b58854..06fd479fb0 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/split_interpolate_axes.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/split_interpolate_axes.cpp @@ -4,9 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_reshape_kind_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_reshape_kind_ops.cpp new file mode 100644 index 0000000000..816fed6cb7 --- /dev/null +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_reshape_kind_ops.cpp @@ -0,0 +1,153 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +namespace vpux::IE { +#define GEN_PASS_DECL_SWAPCONVERTWITHRESHAPEKINDOPS +#define GEN_PASS_DEF_SWAPCONVERTWITHRESHAPEKINDOPS +#include "vpux/compiler/dialect/IE/passes.hpp.inc" +} // namespace vpux::IE + +using namespace vpux; + +namespace { + +// +// SwapConvertWithReshapeKindOps +// + +class SwapConvertWithReshapeKindOps final : + public IE::impl::SwapConvertWithReshapeKindOpsBase { +public: + explicit SwapConvertWithReshapeKindOps(Logger log): _log(log) { + _log.setName(Base::getArgumentName()); + } + +public: + class OpSwapConverter; + +private: + void safeRunOnFunc() final; + +private: + Logger _log; +}; + +bool isReshapeKindOp(mlir::Operation* op) { + if (op == nullptr) { + return false; + } + return mlir::isa(op); +} + +// For OV 2.0 API U8 we can have: +// NetworkInput (NCHW) -> Convert -> Transpose-> FQ . Because of this lately after propagate quantize +// dequantize pass and fuse convert with quantize pass, will be needed to propagate the quantizeCast +// quantParams to Transpose. We want to avoid this. Also in the end this Transpose will be done as +// PermuteCast. 
+ +// Output Case: +// Convert -> N reshapeKindOps -> return => N reshapeKindOps -> Convert -> return + +bool canBeSwapped(IE::ConvertOp origOp, mlir::Operation* swapOp) { + return (origOp.getDstElemType().isUnsignedInteger(8) && + mlir::isa(*swapOp->getResult(0).getUsers().begin())) || + mlir::isa(origOp.getInput()); +} + +void swapOps(IE::ConvertOp origOp, mlir::Operation* swapOp, mlir::PatternRewriter& rewriter) { + const auto origDataType = mlir::cast(origOp.getInput().getType()); + auto swapDataType = mlir::cast(swapOp->getResult(0).getType()); + const auto newDataType = swapDataType.changeElemType(origDataType.getElementType()); + + rewriter.setInsertionPointAfter(swapOp); + auto newConvert = rewriter.create(origOp->getLoc(), swapOp->getResult(0), origOp.getDstElemType()); + swapOp->getResult(0).replaceAllUsesExcept(newConvert.getOutput(), + llvm::SmallPtrSet{newConvert}); + origOp->replaceAllUsesWith(mlir::ValueRange(origOp.getInput())); + swapOp->getResult(0).setType(newDataType); + rewriter.eraseOp(origOp); +} + +mlir::Operation* findLastReshapeKindOp(mlir::Operation* swapOp) { + while (isReshapeKindOp(*swapOp->getResult(0).getUsers().begin())) { + swapOp = *swapOp->getResult(0).getUsers().begin(); + } + return swapOp; +} + +// +// OpSwapConverter +// + +class SwapConvertWithReshapeKindOps::OpSwapConverter final : public mlir::OpRewritePattern { +public: + OpSwapConverter(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { + } + +public: + mlir::LogicalResult matchAndRewrite(IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +mlir::LogicalResult SwapConvertWithReshapeKindOps::OpSwapConverter::matchAndRewrite( + IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const { + if (!origOp.getOutput().hasOneUse()) { + return mlir::failure(); + } + + auto swapOp = *origOp.getOutput().getUsers().begin(); + auto swapOpLoop = swapOp; + + if (isReshapeKindOp(swapOp)) { + if 
(canBeSwapped(origOp, swapOp)) { + swapOps(origOp, swapOp, rewriter); + return mlir::success(); + } + + // Handle intermediate reshape kind ops + swapOp = findLastReshapeKindOp(swapOp); + + if (!mlir::isa(*swapOp->getResult(0).getUsers().begin())) { + return mlir::failure(); + } + + // Process swap Convert loop for reshape kind ops + while (isReshapeKindOp(*swapOpLoop->getResult(0).getUsers().begin())) { + swapOps(origOp, swapOpLoop, rewriter); + return mlir::success(); + } + } + + return mlir::failure(); +} + +void SwapConvertWithReshapeKindOps::safeRunOnFunc() { + auto func = getOperation(); + + auto& ctx = getContext(); + + mlir::RewritePatternSet patterns(&ctx); + patterns.add(&ctx, _log); + + if (mlir::failed(mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr vpux::IE::createSwapConvertWithReshapeKindOpsPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_sw_op.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_sw_op.cpp index a5b5fab703..3ce34326d2 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_sw_op.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_sw_op.cpp @@ -3,13 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include 
"vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" +#include "vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" + #include "vpux/compiler/utils/rewriter.hpp" #include @@ -30,20 +37,17 @@ namespace { constexpr int64_t EXPERIMENTAL_F32_FUSION_THRESHOLD = 36000; // -// SwapConvertWithSWOp +// SwapSWOpWithConvert // -class SwapConvertWithSWOp final : public IE::impl::SwapConvertWithSWOpBase { +class SwapSWOpWithConvert final : public mlir::OpRewritePattern { public: - explicit SwapConvertWithSWOp(Logger log): _log(log) { - _log.setName(Base::getArgumentName()); + SwapSWOpWithConvert(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { + this->setDebugName("SwapSWOpWithConvert"); } -public: - class OpSwapConverter; - private: - void safeRunOnFunc() final; + mlir::LogicalResult matchAndRewrite(IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const final; private: Logger _log; @@ -52,24 +56,10 @@ class SwapConvertWithSWOp final : public IE::impl::SwapConvertWithSWOpBase(op); } -// -// OpSwapConverter -// -class SwapConvertWithSWOp::OpSwapConverter final : public mlir::OpRewritePattern { -public: - OpSwapConverter(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { - } +mlir::LogicalResult SwapSWOpWithConvert::matchAndRewrite(IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const { + _log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); -public: - mlir::LogicalResult matchAndRewrite(IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const final; - -private: - Logger _log; -}; - -mlir::LogicalResult SwapConvertWithSWOp::OpSwapConverter::matchAndRewrite(IE::ConvertOp origOp, - mlir::PatternRewriter& rewriter) const { const auto convertInput = 
origOp.getInput(); mlir::Operation* nceOp = convertInput.getDefiningOp(); @@ -95,10 +85,111 @@ mlir::LogicalResult SwapConvertWithSWOp::OpSwapConverter::matchAndRewrite(IE::Co return mlir::success(); } -void SwapConvertWithSWOp::safeRunOnFunc() { - auto func = getOperation(); +// +// SwapConvertWithEltwiseOp +// + +template +class SwapConvertWithEltwiseOp final : public mlir::OpRewritePattern { +public: + SwapConvertWithEltwiseOp(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { + this->setDebugName("SwapConvertWithEltwiseOp"); + } + +public: + mlir::LogicalResult matchAndRewrite(EltwiseOp origOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; +}; + +// If pattern like EltwiseOp[si32] -> ConvertOp[si32, fp16], since EltwiseOp with IntegerType cannot convert to DPU +// task, if we swap EltwiseOp with ConvertOp, the EltwiseOp will be converted to DPU task. + +/* Rewrite the pattern from: + Input Const + | | + \ / + Eltwise (IntegerType, will not be converted to DPU task) + | + ConvertOp (IntegerType to Float16Type) + + to: + Input Const (changeShapeAndElemTypeAttr, IntegerType to Float16Type) + | | + ConvertOp (IntegerType to Float16Type) | + \ / + EltwiseOp (Float16Type, will be converted to DPU task) + */ +template +mlir::LogicalResult SwapConvertWithEltwiseOp::matchAndRewrite(EltwiseOp origOp, + mlir::PatternRewriter& rewriter) const { + _log.trace("Got '{0}' at '{1}'", origOp->getName(), origOp->getLoc()); + + if (!origOp.getOutput().hasOneUse()) { + return mlir::failure(); + } + auto convertOp = mlir::dyn_cast(*origOp.getOutput().getUsers().begin()); + if (convertOp == nullptr) { + return mlir::failure(); + } + + auto convertInElemType = mlir::cast(convertOp.getInput().getType()).getElementType(); + auto convertOutElemType = mlir::cast(convertOp.getOutput().getType()).getElementType(); + if (!mlir::isa(convertInElemType) || !mlir::isa(convertOutElemType)) { + return mlir::failure(); + } + + if 
(mlir::failed(IE::getConstParentOp(origOp.getInput2()))) { + return mlir::failure(); + } + + // Experimental number to determine if swapping ConvertOp with EltwiseOp is beneficial. + constexpr int BENEFICIAL_SIZE = 1024; + auto shapeSize = vpux::details::calcTotalShapeSize(getShape(origOp.getOutput())); + if (shapeSize < BENEFICIAL_SIZE) { + return mlir::failure(); + } + + auto newConvert = rewriter.create(convertOp->getLoc(), origOp.getInput1(), convertOutElemType); + + auto constInput = origOp.getInput2().template getDefiningOp(); + auto biasContentAttr = + constInput.transformContentAttr().changeShapeAndElemType(getShape(constInput), convertOutElemType).get(); + auto newBiasValue = + rewriter.create(origOp.getLoc(), biasContentAttr.getType(), std::move(biasContentAttr)) + .getResult(); + mlir::IRMapping mapper; + mapper.map(origOp->getOperands(), SmallVector{newConvert.getOutput(), newBiasValue}); + auto* newOp = rewriter.clone(*origOp, mapper); + vpux::inferReturnTypes(newOp, vpux::InferShapedTypeMode::ALL); + + convertOp.replaceAllUsesWith(newOp->getResult(0)); + + return mlir::success(); +} + +// +// SwapConvertWithSWOp +// + +class SwapConvertWithSWOp final : public IE::impl::SwapConvertWithSWOpBase { +public: + explicit SwapConvertWithSWOp(Logger log): _log(log) { + _log.setName(Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; + +private: + Logger _log; +}; + +void SwapConvertWithSWOp::safeRunOnFunc() { auto& ctx = getContext(); + auto func = getOperation(); const auto isLegalOp = [](IE::ConvertOp op) -> bool { auto inputElemType = mlir::cast(op.getInput().getType()).getElementType(); @@ -150,11 +241,20 @@ void SwapConvertWithSWOp::safeRunOnFunc() { target.addDynamicallyLegalOp(isLegalOp); mlir::RewritePatternSet patterns(&ctx); - patterns.add(&ctx, _log); - + patterns.add(&ctx, _log); if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { signalPassFailure(); } + + mlir::RewritePatternSet 
eltwisePatterns(&ctx); + eltwisePatterns.add>(&ctx, _log); + eltwisePatterns.add>(&ctx, _log); + eltwisePatterns.add>(&ctx, _log); + if (mlir::failed(mlir::applyPatternsAndFoldGreedily(func, std::move(eltwisePatterns), + getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + return; + } } } // namespace diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_transpose_reshape.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_transpose_reshape.cpp deleted file mode 100644 index 8da9cbe8ac..0000000000 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_convert_with_transpose_reshape.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include - -namespace vpux::IE { -#define GEN_PASS_DECL_SWAPCONVERTWITHTRANSPOSERESHAPE -#define GEN_PASS_DEF_SWAPCONVERTWITHTRANSPOSERESHAPE -#include "vpux/compiler/dialect/IE/passes.hpp.inc" -} // namespace vpux::IE - -using namespace vpux; - -namespace { - -// -// SwapConvertWithTransposeReshape -// - -class SwapConvertWithTransposeReshape final : - public IE::impl::SwapConvertWithTransposeReshapeBase { -public: - explicit SwapConvertWithTransposeReshape(Logger log): _log(log) { - _log.setName(Base::getArgumentName()); - } - -public: - class OpSwapConverter; - -private: - void safeRunOnFunc() final; - -private: - Logger _log; -}; - -bool isReshapeKindOp(mlir::Operation* op) { - if (op == nullptr) { - return false; - } - return mlir::isa(op); -} - -// -// OpSwapConverter -// - -class SwapConvertWithTransposeReshape::OpSwapConverter final : public mlir::OpRewritePattern { -public: - OpSwapConverter(mlir::MLIRContext* ctx, Logger log): mlir::OpRewritePattern(ctx), _log(log) { - } - -public: - mlir::LogicalResult 
matchAndRewrite(IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const final; - -private: - Logger _log; -}; - -mlir::LogicalResult SwapConvertWithTransposeReshape::OpSwapConverter::matchAndRewrite( - IE::ConvertOp origOp, mlir::PatternRewriter& rewriter) const { - auto swapOp = *origOp.getOutput().getUsers().begin(); - if (isReshapeKindOp(swapOp)) { - const auto origDataType = mlir::cast(origOp.getInput().getType()); - auto swapDataType = mlir::cast(swapOp->getResult(0).getType()); - const auto newDataType = swapDataType.changeElemType(origDataType.getElementType()); - - rewriter.setInsertionPointAfter(swapOp); - auto newConvert = - rewriter.create(origOp->getLoc(), swapOp->getResult(0), origOp.getDstElemType()); - swapOp->getResult(0).replaceAllUsesExcept(newConvert.getOutput(), - llvm::SmallPtrSet{newConvert}); - origOp->replaceAllUsesWith(mlir::ValueRange(origOp.getInput())); - swapOp->getResult(0).setType(newDataType); - rewriter.eraseOp(origOp); - } - - return mlir::success(); -} - -void SwapConvertWithTransposeReshape::safeRunOnFunc() { - auto func = getOperation(); - - auto& ctx = getContext(); - - const auto isLegalOp = [](IE::ConvertOp op) -> bool { - if (!op.getOutput().hasOneUse()) { - return true; - } - auto childOp = *op.getOutput().getUsers().begin(); - if (isReshapeKindOp(childOp)) { - // For OV 2.0 API U8 we can have: - // NetworkInput (NCHW) -> Convert -> Transpose-> FQ . Because of this lately after propagate quantize - // dequantize pass and fuse convert with quantize pass, will be needed to propagate the quantizeCast - // quantParams to Transpose. We want to avoid this. Also in the end this Transpose will be done as - // PermuteCast. 
- return !mlir::isa(op.getInput()); - } - - return true; - }; - - mlir::ConversionTarget target(ctx); - target.addDynamicallyLegalOp(isLegalOp); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - - mlir::RewritePatternSet patterns(&ctx); - patterns.add(&ctx, _log); - - if (mlir::failed(mlir::applyPartialConversion(func, target, std::move(patterns)))) { - signalPassFailure(); - } -} - -} // namespace - -std::unique_ptr vpux::IE::createSwapConvertWithTransposeReshapePass(Logger log) { - return std::make_unique(log); -} diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_fake_quant_with_reshape_and_strided_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_fake_quant_with_reshape_and_strided_slice.cpp index 49d65c7043..4266a6b8bc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_fake_quant_with_reshape_and_strided_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_fake_quant_with_reshape_and_strided_slice.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mem_permute_and_expand.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mem_permute_and_expand.cpp index 32968c5d57..29ec461fd3 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mem_permute_and_expand.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mem_permute_and_expand.cpp @@ -4,7 +4,7 @@ // #include 
"vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/transforms/rewriters/expand_with_layer_rewriter.hpp" #include "vpux/compiler/utils/permute_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mvn_with_transopose.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mvn_with_transopose.cpp index e74074b7e7..224b07d29b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mvn_with_transopose.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_mvn_with_transopose.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operation_with_gather.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operation_with_gather.cpp index 988d0f363b..acda1db4df 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operation_with_gather.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operation_with_gather.cpp @@ -5,7 +5,9 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include 
"vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operations.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operations.cpp index f6b8bd9b39..66a6b687fe 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operations.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_operations.cpp @@ -4,7 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/transforms/rewriters/propagate_transpose_affine_reshape_common.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" @@ -633,11 +637,6 @@ mlir::LogicalResult SwapAffineReshapeFakeQuantize::matchAndRewrite(IE::FakeQuant fakeQuantizeOp->getLoc(), affineReshapeOp.getInput(), fakeQuantizeOp.getInputLow(), fakeQuantizeOp.getInputHigh(), fakeQuantizeOp.getOutputLow(), fakeQuantizeOp.getOutputHigh(), fakeQuantizeOp.getLevelsAttr(), fakeQuantizeOp.getLowFpTypeAttr(), fakeQuantizeOp.getAutoBroadcastAttr()); - // Similar to GeLU, FakeQuantizeOp::inferReturnTypeComponents also doesn't forward layout info - // so need to set manually - auto dimsOrder = DimsOrder::fromValue(newFakeQuantizeOp.getInput()); - auto newOutType = mlir::cast(newFakeQuantizeOp.getOutput().getType()).changeDimsOrder(dimsOrder); - newFakeQuantizeOp->getResult(0).setType(newOutType); auto newAffineReshapeOp = rewriter.create( affineReshapeOp.getLoc(), newFakeQuantizeOp.getOutput(), affineReshapeOp.getDimMappingAttr(), diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_pad.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_pad.cpp index 0602f32de6..caf324e479 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_pad.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_pad.cpp @@ -4,10 +4,9 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" - #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_concat.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_concat.cpp index 0931953719..1aec53d0ba 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_concat.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_concat.cpp @@ -4,9 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_fq.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_fq.cpp index 9c7a2edbcd..cf280c2a34 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_fq.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_transpose_with_fq.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include 
"vpux/compiler/dialect/IE/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_viewop_and_clamp.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_viewop_and_clamp.cpp index 0db851dd7f..3a55491496 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_viewop_and_clamp.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swap_viewop_and_clamp.cpp @@ -4,8 +4,11 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/swish_fusion.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/swish_fusion.cpp index 7ccafd43df..c4b515110b 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/swish_fusion.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/swish_fusion.cpp @@ -4,9 +4,10 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_incremental_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_incremental_sdpa.cpp index f550e79138..7b6ed98511 100644 --- 
a/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_incremental_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_incremental_sdpa.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_online_sdpa.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_online_sdpa.cpp index 0ff6552575..4f0d49cc05 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_online_sdpa.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/tile_online_sdpa.cpp @@ -3,15 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_TILEONLINESDPA #define GEN_PASS_DEF_TILEONLINESDPA diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/transpose_to_permute_cast.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/transpose_to_permute_cast.cpp index 4e4f9cc298..a2656e5741 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/transpose_to_permute_cast.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/transpose_to_permute_cast.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/transpose_op_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_branches.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_branches.cpp index d1a21f6e1d..98d22a934a 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_branches.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_branches.cpp @@ -3,13 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" - -#include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_ops.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_ops.cpp index 399f22f08e..827826d1fc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_ops.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/uniquify_ops.cpp @@ -3,10 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include 
"vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_batch.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_batch.cpp index f01d19a309..9c86923a9c 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_batch.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_batch.cpp @@ -5,13 +5,15 @@ #include "vpux/compiler/dialect/IE/transforms/passes/unroll_batch.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" #include #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_conv3d_to_conv2d.cpp 
b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_conv3d_to_conv2d.cpp index 39b20ff967..5468422c06 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_conv3d_to_conv2d.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_conv3d_to_conv2d.cpp @@ -3,12 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" - -#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_fully_connected.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_fully_connected.cpp index 26d1bd73ae..3bca3e4e2e 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_fully_connected.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_fully_connected.cpp @@ -3,18 +3,25 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/IE/locations.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include +#include + namespace vpux::IE { #define GEN_PASS_DECL_UNROLLFULLYCONNECTED #define GEN_PASS_DEF_UNROLLFULLYCONNECTED diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_group_quantize.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_group_quantize.cpp index 8e8a22d5c1..80d6e325b7 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_group_quantize.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_group_quantize.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_reducemin_all_axes.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_reducemin_all_axes.cpp index 5e032dd704..c44c8d0a40 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_reducemin_all_axes.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_reducemin_all_axes.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include 
"vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_tensor_iterator.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_tensor_iterator.cpp index 8e21881417..d5aefa1cd8 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_tensor_iterator.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/unroll_tensor_iterator.cpp @@ -4,14 +4,14 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/utils/core/dense_map.hpp" #include "vpux/utils/core/range.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/upstream_slice.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/upstream_slice.cpp index d3a5e8b401..892db33bee 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/upstream_slice.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/upstream_slice.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include 
"vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/transforms/passes/use_user_precision.cpp b/src/vpux_compiler/src/dialect/IE/transforms/passes/use_user_precision.cpp index ee780b3a2d..e25e930cbc 100644 --- a/src/vpux_compiler/src/dialect/IE/transforms/passes/use_user_precision.cpp +++ b/src/vpux_compiler/src/dialect/IE/transforms/passes/use_user_precision.cpp @@ -3,12 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/transforms/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" -#include "vpux/compiler/utils/IE/locations.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::IE { diff --git a/src/vpux_compiler/src/dialect/IE/utils/act_shave_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/act_shave_utils.cpp new file mode 100644 index 0000000000..029d05e9be --- /dev/null +++ b/src/vpux_compiler/src/dialect/IE/utils/act_shave_utils.cpp @@ -0,0 +1,17 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/utils/act_shave_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" + +namespace vpux { +namespace IE { + +bool isActShaveKernel(mlir::Operation* operation) { + return VPU::NCEInvariant::isSupported(operation, Logger::global()).failed(); +} + +} // namespace IE +} // namespace vpux diff --git a/src/vpux_compiler/src/dialect/IE/utils/adjust_layouts_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/adjust_layouts_utils.cpp index 5597aaa6c2..34d573c66d 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/adjust_layouts_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/adjust_layouts_utils.cpp @@ -3,11 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/adjust_layout_utils.hpp" +#include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/core/numeric.hpp" #include #include diff --git a/src/vpux_compiler/src/dialect/IE/utils/broadcast_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/broadcast_utils.cpp index f5f4aa91ae..b25e082160 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/broadcast_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/broadcast_utils.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/utils/const_attributes.cpp 
b/src/vpux_compiler/src/dialect/IE/utils/const_attributes.cpp index 5896368c16..43e29f5ab5 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/const_attributes.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/const_attributes.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/src/dialect/IE/utils/convert_op_types.cpp b/src/vpux_compiler/src/dialect/IE/utils/convert_op_types.cpp index cc096f02ed..2b0b2e98fa 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/convert_op_types.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/convert_op_types.cpp @@ -4,9 +4,9 @@ // #include "vpux/compiler/dialect/IE/utils/convert_op_types.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/utils/IE/locations.hpp" +#include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/IE/utils/convolution_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/convolution_utils.cpp index b8f295ccc8..d01d23fb47 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/convolution_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/convolution_utils.cpp @@ -4,10 +4,12 @@ // #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include 
"vpux/compiler/dialect/VPU/utils/conv_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" #include "vpux/utils/logger/logger.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/utils/dynamic_shape_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/dynamic_shape_utils.cpp index 7504f34a27..ee31fa1802 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/dynamic_shape_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/dynamic_shape_utils.cpp @@ -5,7 +5,7 @@ #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" namespace vpux { @@ -79,6 +79,32 @@ Shape extractShape(const DimsMaskedShape& shape) { return shape.toReifiedShape(); } +Shape reifyShape(ShapeRef shape) { + VPUX_THROW_WHEN(shape.isDynamic(), "Tried to reify a dynamic shape without known bounds: {0}", shape); + return Shape(shape); +} + +Shape reifyShape(BoundedShapeRef shape) { + return shape.toReifiedShape(); +} + +Shape reifyShape(DimsMaskedShapeRef shape) { + return shape.toReifiedShape(); +} + +Shape reifyShape(const Shape& shape) { + VPUX_THROW_WHEN(shape.isDynamic(), "Tried to reify a dynamic shape without known bounds: {0}", shape); + return shape; +} + +Shape reifyShape(const BoundedShape& shape) { + return shape.toReifiedShape(); +} + +Shape reifyShape(const DimsMaskedShape& shape) { + return shape.toReifiedShape(); +} + std::tuple splitShapeAndRepresentation(const Shape& shape) { return std::make_tuple(shape, Bounds(), DynamicDimsMask()); } diff --git a/src/vpux_compiler/src/dialect/IE/utils/elem_type_info_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/elem_type_info_utils.cpp index 
2d06e1f58e..8bd936d463 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/elem_type_info_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/elem_type_info_utils.cpp @@ -4,14 +4,16 @@ // #include "vpux/compiler/dialect/IE/utils/elem_type_info_utils.hpp" -#include - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp" #include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" +#include + using namespace vpux; using namespace IE; diff --git a/src/vpux_compiler/src/dialect/IE/utils/expand_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/expand_utils.cpp index 6bbb465190..5756d06d13 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/expand_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/expand_utils.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/IE/utils/expand_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/convolution_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/utils/fake_quantize_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/fake_quantize_utils.cpp index a48cde57f0..cb0b20c9f1 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/fake_quantize_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/fake_quantize_utils.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/IE/utils/fake_quantize_utils.hpp" #include "vpux/compiler/core/types/quantile_float/types.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include 
"vpux/compiler/dialect/IE/utils/broadcast_utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" @@ -347,7 +348,6 @@ mlir::FailureOr WeightsDequantizeStructureInfo:: WeightsDequantizeStructureInfo info(log); const auto status = info.initializeStructure(origOp); if (mlir::succeeded(status)) { - log.trace("Match succeeded"); return info; } return mlir::failure(); @@ -358,7 +358,6 @@ mlir::FailureOr WeightsDequantizeStructureInfo:: WeightsDequantizeStructureInfo info(log); const auto status = info.initializeStructure(origOp); if (mlir::succeeded(status)) { - log.trace("Match succeeded"); return info; } return mlir::failure(); diff --git a/src/vpux_compiler/src/dialect/IE/utils/handle_kernels_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/handle_kernels_utils.cpp index 236476eef7..3124eee444 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/handle_kernels_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/handle_kernels_utils.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/IE/utils/handle_kernels_utils.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/utils/interpolate_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/interpolate_utils.cpp index 8fec0eb33e..ad639f7e64 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/interpolate_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/interpolate_utils.cpp @@ -5,14 +5,14 @@ #include "vpux/compiler/dialect/IE/utils/interpolate_utils.hpp" #include "vpux/compiler/core/attributes/dims_order.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/numeric.hpp" -#include 
"vpux/compiler/utils/error.hpp" - namespace vpux { namespace IE { diff --git a/src/vpux_compiler/src/dialect/IE/utils/matmul.cpp b/src/vpux_compiler/src/dialect/IE/utils/matmul.cpp index d08cc4b1a4..856cc80d46 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/matmul.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/matmul.cpp @@ -2,12 +2,12 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // + #include "vpux/compiler/dialect/IE/utils/matmul.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/quantization.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/utils/permute_infer.cpp b/src/vpux_compiler/src/dialect/IE/utils/permute_infer.cpp index 442a73570a..a301cd8e0d 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/permute_infer.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/permute_infer.cpp @@ -4,8 +4,10 @@ // #include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" - #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/utils/quantization.hpp" void inferPermuteReturnTypeComponents(mlir::Value input, mlir::AffineMap mem_perm, mlir::AffineMap dst_order, SmallVectorImpl& inferredReturnShapes, diff --git a/src/vpux_compiler/src/dialect/IE/utils/permute_quantize_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/permute_quantize_utils.cpp index 7d12a9d1e9..ba55fc65ac 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/permute_quantize_utils.cpp +++ 
b/src/vpux_compiler/src/dialect/IE/utils/permute_quantize_utils.cpp @@ -4,8 +4,11 @@ // #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/utils/pooling_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" + using namespace vpux; bool IE::isLegalReorderAddPattern(IE::ReorderOp origOp) { @@ -192,11 +195,12 @@ bool IE::isODUPermuteEffectiveForShape(const ShapeRef shape, const int64_t align return IH * IW % alignment == 0 || IW >= minimalEffectiveWidth; } -bool IE::canConvertToNCHWInOrderWithPermuteCast(vpux::NDTypeInterface inType, vpux::NDTypeInterface outType) { +bool IE::canConvertToNCHWInOrderWithPermuteCast(vpux::NDTypeInterface inType, mlir::AffineMap memPerm) { const auto inOrder = inType.getDimsOrder(); const auto inShape = inType.getShape(); - const auto outOrder = outType.getDimsOrder(); - return inOrder == DimsOrder::CNHW && inShape[Dims4D::Act::N] == 1 && outOrder == DimsOrder::NHWC; + + return inOrder == DimsOrder::CNHW && inShape[Dims4D::Act::N] == 1 && + DimsOrder::fromAffineMap(memPerm) == DimsOrder::CHWN; } bool IE::checkNCEPermuteShapeCompatibility(ShapeRef inShape, ShapeRef outShape, int64_t alignment) { diff --git a/src/vpux_compiler/src/dialect/IE/utils/quantization.cpp b/src/vpux_compiler/src/dialect/IE/utils/quantization.cpp index 9c89d81dfa..637b3a408e 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/quantization.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/quantization.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/IE/utils/quantization.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git 
a/src/vpux_compiler/src/dialect/IE/utils/reify_shape.cpp b/src/vpux_compiler/src/dialect/IE/utils/reify_shape.cpp index f708ca1ba6..79622c8333 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/reify_shape.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/reify_shape.cpp @@ -7,8 +7,7 @@ #include #include -#include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/transforms/passes.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/reify_shape.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" @@ -19,7 +18,6 @@ #include #include -#include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/IE/utils/reshape_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/reshape_utils.cpp index 5814b22757..210ede4a62 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/reshape_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/reshape_utils.cpp @@ -5,6 +5,10 @@ #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + #include namespace { @@ -235,5 +239,20 @@ Shape getNewShapeAfterStrideFolding(ShapeRef origShape, int64_t SX) { return newShape; } +mlir::Value createDynamicReshape(mlir::OpBuilder& builder, mlir::Location loc, mlir::Value input, + BoundedShape outputShape) { + const auto ctx = builder.getContext(); + const auto shape = extractShape(outputShape); + const auto bounds = outputShape.toRepresentation(); + const auto outputRank = checked_cast(outputShape.size()); + const auto shapeType = mlir::RankedTensorType::get({outputRank}, getSInt32Type(ctx)); + const auto shapeValues = IE::replaceDynamicDimsWithValue(shape.raw(), -1); + + const auto shapeTensor = Const::createConst(builder, appendLoc(loc, "shape"), shapeType, 
shapeValues); + return builder.createOrFold(loc, input, shapeTensor, builder.getI64ArrayAttr(shape.raw()), + builder.getI64ArrayAttr(bounds.raw()), + /*only_set_shape=*/true); +} + } // namespace IE } // namespace vpux diff --git a/src/vpux_compiler/src/dialect/IE/utils/resources.cpp b/src/vpux_compiler/src/dialect/IE/utils/resources.cpp index 38dece76ba..98f09bf375 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/resources.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/resources.cpp @@ -41,7 +41,6 @@ IE::MemoryResourceOp getReservedMemoryResource(mlir::ModuleOp mainModule, mlir:: mlir::SymbolRefAttr memSpace); SmallVector getReservedMemoryResource(mlir::ModuleOp mainModule, mlir::StringLiteral reservedMemorySection); - } // namespace vpux::IE::details // @@ -68,14 +67,13 @@ bool vpux::IE::isNceTile(mlir::SymbolRefAttr executor) { // // MemoryResourceOp // - IE::MemoryResourceOp vpux::IE::details::addAvailableMemory(mlir::Region& region, mlir::SymbolRefAttr memSpace, Byte size) { VPUX_THROW_UNLESS(size.count() > 0, "Trying to set zero size of memory kind '{0}'", memSpace); const auto byteSizeAttr = getIntAttr(region.getContext(), size.count()); auto builder = mlir::OpBuilder::atBlockBegin(®ion.front()); return builder.create(mlir::UnknownLoc::get(region.getContext()), memSpace.getLeafReference(), - byteSizeAttr, nullptr); + byteSizeAttr, /*offset*/ nullptr); } IE::MemoryResourceOp vpux::IE::addAvailableMemory(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace, Byte size) { @@ -155,6 +153,50 @@ SmallVector vpux::IE::getReservedMemoryResources(mlir::Mod return resMemVec; } +namespace { +size_t getReservedMemoryStartOffset(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace) { + size_t startMemoryOffset = 0; + + // DDR reserved memory starts at 0. 
Adjustment is required only for CMX + auto cmxSpaceAttr = mlir::SymbolRefAttr::get(mainModule.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + if (memSpace == cmxSpaceAttr) { + startMemoryOffset = IE::getAvailableMemory(mainModule, cmxSpaceAttr).size().count(); + for (auto resource : IE::getReservedMemoryResources(mainModule, memSpace)) { + VPUX_THROW_WHEN(!resource.getOffset().has_value(), "reserved memory without offset"); + size_t offset = resource.getOffset().value(); + if (offset < startMemoryOffset) { + startMemoryOffset = offset; + } + } + } + + return startMemoryOffset; +} + +size_t getReservedMemoryEndOffset(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace) { + // CMX reserved memory always ends at the CMX end. + auto cmxSpaceAttr = mlir::SymbolRefAttr::get(mainModule.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + if (memSpace == cmxSpaceAttr) { + return IE::getAvailableMemory(mainModule, cmxSpaceAttr).size().count() - 1; + } + + size_t endMemoryOffset = 0; + for (auto resource : IE::getReservedMemoryResources(mainModule, memSpace)) { + VPUX_THROW_WHEN(!resource.getOffset().has_value(), "reserved memory without offset value"); + size_t offset = resource.getOffset().value() + resource.getByteSize(); + if (offset > endMemoryOffset) { + endMemoryOffset = offset; + } + } + + return endMemoryOffset; +} +} // namespace + +size_t vpux::IE::getReservedMemorySize(mlir::ModuleOp mainModule, mlir::SymbolRefAttr memSpace) { + return getReservedMemoryEndOffset(mainModule, memSpace) - getReservedMemoryStartOffset(mainModule, memSpace) + 1; +} + IE::MemoryResourceOp vpux::IE::details::addReservedMemoryResource(mlir::ModuleOp mainModule, mlir::StringLiteral reservedMemorySection, mlir::SymbolRefAttr memSpace, int64_t size) { @@ -168,6 +210,20 @@ IE::MemoryResourceOp vpux::IE::details::addReservedMemoryResource(mlir::ModuleOp } auto resMemBuilder = mlir::OpBuilder::atBlockBegin(resMemTable.getBody()); + auto cmxSpaceAttr = 
mlir::SymbolRefAttr::get(mainModule.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + // For DDR - reserve memory at the beginning of DDR space + // For CMX - reserve at the end of CMX space. This is done to satisfy the requirement of SW kernel + // data prefetching. When prefetching SW kernel can exceed the input buffer size potentially reading + // the memory outside the CMX range. To prevent this compiler reserves 1KiB of CMX at the end so that + // at worst reserved, but accessible, memory is read by SW kernel. + size_t offset = 0; + if (memSpace == cmxSpaceAttr) { + offset = getReservedMemoryStartOffset(mainModule, memSpace); + VPUX_THROW_WHEN(static_cast(offset) < size, "Out of CMX memory for reservation"); + offset -= size; + } else { + offset = getReservedMemoryEndOffset(mainModule, memSpace); + } auto resMemModule = resMemTable.lookupSymbol(reservedMemorySection); if (resMemModule == nullptr) { @@ -186,7 +242,8 @@ IE::MemoryResourceOp vpux::IE::details::addReservedMemoryResource(mlir::ModuleOp auto innerBuilder = mlir::OpBuilder::atBlockBegin(resMemModule.getBody()); return innerBuilder.create(mlir::UnknownLoc::get(resMemModule.getContext()), - memSpace.getLeafReference(), byteSizeAttr, nullptr); + memSpace.getLeafReference(), byteSizeAttr, + getIntAttr(mainModule.getContext(), offset)); }; IE::MemoryResourceOp vpux::IE::details::getReservedMemoryResource(mlir::ModuleOp mainModule, @@ -245,19 +302,9 @@ SmallVector> vpux::IE::getReservedMemOffsetAndSize // Check for reserved memory which memory scheduler should take into account // so that they not overlap with other buffers. Those reserved resource might be related // to handling of additional special features (e.g. 
DMA HW profiling) - auto reservedMemoryResources = IE::getReservedMemoryResources(module, memSpaceAttr); - if (!reservedMemoryResources.empty()) { - // Put all reserved resources starting from 0 if they were not assigned any address - size_t resMemOffset = 0; - for (auto& resMem : reservedMemoryResources) { - auto resMemSize = resMem.getByteSize(); - resMemOffset = resMem.getOffset().value_or(resMemOffset); - reservedMemVec.push_back(std::make_pair(resMemOffset, resMemSize)); - if (!resMem.getOffset().has_value()) { - resMem.setOffsetAttr(getIntAttr(module->getContext(), resMemOffset)); - } - resMemOffset += resMemSize; - } + for (auto& resMem : IE::getReservedMemoryResources(module, memSpaceAttr)) { + VPUX_THROW_UNLESS(resMem.getOffset().has_value(), "reserved memory resource without offset value"); + reservedMemVec.push_back(std::make_pair(resMem.getOffset().value(), resMem.getByteSize())); } return reservedMemVec; @@ -316,21 +363,20 @@ SmallVector vpux::IE::getSWKernelPrefetchingReservedMemory } // -// SW Kernel cache prefetching reserved memory +// Dummy SW Kernel prefetch reserved memory // -IE::MemoryResourceOp vpux::IE::setSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule, - mlir::SymbolRefAttr memSpace, int64_t size) { - return details::addReservedMemoryResource(mainModule, swKernelCachePrefetchingResMemModuleName, memSpace, size); -} - -IE::MemoryResourceOp vpux::IE::getSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp mainModule, - mlir::SymbolRefAttr memSpace) { - return details::getReservedMemoryResource(mainModule, swKernelCachePrefetchingResMemModuleName, memSpace); +IE::MemoryResourceOp vpux::IE::setDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule, + mlir::SymbolRefAttr memSpace, + int64_t size) { + return details::addReservedMemoryResource(mainModule, dummySwKernelsForInstructionPrefetchResMemModuleName, + memSpace, size); } -SmallVector vpux::IE::getSWKernelCachePrefetchingReservedMemory(mlir::ModuleOp 
mainModule) { - return details::getReservedMemoryResource(mainModule, swKernelCachePrefetchingResMemModuleName); +IE::MemoryResourceOp vpux::IE::getDummySwKernelsForInstructionPrefetchReservedMemory(mlir::ModuleOp mainModule, + mlir::SymbolRefAttr memSpace) { + return details::getReservedMemoryResource(mainModule, dummySwKernelsForInstructionPrefetchResMemModuleName, + memSpace); } // diff --git a/src/vpux_compiler/src/dialect/IE/utils/shape_infer.cpp b/src/vpux_compiler/src/dialect/IE/utils/shape_infer.cpp index dd36b1a72c..6e0c0b16fc 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/shape_infer.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/shape_infer.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/factors.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/IE/utils/slice_utils.cpp b/src/vpux_compiler/src/dialect/IE/utils/slice_utils.cpp index 036acf49e2..c820fcb42e 100644 --- a/src/vpux_compiler/src/dialect/IE/utils/slice_utils.cpp +++ b/src/vpux_compiler/src/dialect/IE/utils/slice_utils.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/utils/core/range.hpp" namespace vpux { namespace IE { diff --git a/src/vpux_compiler/src/dialect/VPU/IR/attributes.cpp b/src/vpux_compiler/src/dialect/VPU/IR/attributes.cpp index a99088348b..a23e33ff6b 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/attributes.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/attributes.cpp @@ -4,22 +4,17 @@ // #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/utils/IE/private_properties.hpp" - -#include "vpux/compiler/core/attributes/stride_reqs.hpp" -#include 
"vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/IR/attributes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" #include "vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/platform_resources.hpp" - -#include "vpux/utils/core/error.hpp" +#include "vpux/utils/IE/private_properties.hpp" #include "vpux/utils/core/mem_size.hpp" #include "vpux/utils/core/numeric.hpp" @@ -44,32 +39,11 @@ void VPU::VPUDialect::registerAttributes() { >(); } -// -// Run-time resources -// - -namespace { - -constexpr StringLiteral derateFactorAttrName = "VPU.derateFactor"; -constexpr StringLiteral bandwidthAttrName = "VPU.bandwidth"; /*!< This attribute corresponds to a single JSON field - nested at header>resources>memory_bandwidth>number in the deserialized version of the blob. 
- */ - -} // namespace - -StringLiteral vpux::VPU::getMemoryDerateAttrName() { - return derateFactorAttrName; -} - -StringLiteral vpux::VPU::getMemoryBandwidthAttrName() { - return bandwidthAttrName; -} - -uint32_t vpux::VPU::getMaxArchDPUClusterNum(ArchKind arch) { +uint32_t vpux::VPU::getMaxArchDPUClusterNum(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUX37XX_MAX_DPU_GROUPS; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPUX40XX_MAX_DPU_GROUPS; default: VPUX_THROW("Unsupported architecture '{0}'", arch); @@ -77,23 +51,23 @@ uint32_t vpux::VPU::getMaxArchDPUClusterNum(ArchKind arch) { } uint32_t vpux::VPU::getMaxArchDPUClusterNum(mlir::Operation* op) { - return VPU::getMaxArchDPUClusterNum(VPU::getArch(op)); + return VPU::getMaxArchDPUClusterNum(config::getArch(op)); } -uint32_t vpux::VPU::getMaxDMAPorts(ArchKind arch) { +uint32_t vpux::VPU::getMaxDMAPorts(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUX37XX_MAX_DMA_PORTS; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPUX40XX_MAX_DMA_PORTS; default: VPUX_THROW("Unsupported architecture '{0}'", arch); } } -double vpux::VPU::getDMABandwidth(ArchKind arch, VPU::RevisionID rev) { +double vpux::VPU::getDMABandwidth(config::ArchKind arch, config::RevisionID rev) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUNN::get_dram_bandwidth_MBps(VPUNN::VPUDevice::VPU_2_7) / VPU::getDpuFrequency(arch, rev); default: if (VPUNN::PerformanceMode::forceLegacy_G4) { @@ -108,14 +82,14 @@ double vpux::VPU::getNCEThroughput() { return 8000000.0; } -unsigned int vpux::VPU::getDpuFrequency(vpux::VPU::ArchKind arch, vpux::VPU::RevisionID rev) { +unsigned int vpux::VPU::getDpuFrequency(vpux::config::ArchKind arch, vpux::config::RevisionID rev) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case 
config::ArchKind::NPU37XX: return VPUNN::get_dpu_fclk(VPUNN::VPUDevice::VPU_2_7); /*!< The value 1300 corresponds to Highvcc of dpuclk. (See VPUX37XX HAS #voltage-and-frequency-targets section). */ - case VPU::ArchKind::NPU40XX: - if (rev >= VPU::RevisionID::REVISION_B) { + case config::ArchKind::NPU40XX: + if (rev >= config::RevisionID::REVISION_B) { return 1850; // MHz; TODO: switch to the value from vpunn, once this frequency is implemented. E#127567 } return VPUNN::get_dpu_fclk(VPUNN::VPUDevice::VPU_4_0); @@ -125,14 +99,14 @@ unsigned int vpux::VPU::getDpuFrequency(vpux::VPU::ArchKind arch, vpux::VPU::Rev } double vpux::VPU::getDmaBandwidthGBps(mlir::ModuleOp module) { - const ArchKind arch = getArch(module); + const auto arch = config::getArch(module); return getDmaBandwidthGBps(arch); } -double vpux::VPU::getDmaBandwidthGBps(vpux::VPU::ArchKind arch) { +double vpux::VPU::getDmaBandwidthGBps(vpux::config::ArchKind arch) { double BW = 0; switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: BW = VPUNN::get_dram_bandwidth_MBps(VPUNN::VPUDevice::VPU_2_7); // 27000 MB/s break; default: @@ -148,19 +122,23 @@ double vpux::VPU::getDmaBandwidthGBps(vpux::VPU::ArchKind arch) { return BW; } +// NOTE: This function is expected to be called only after all CMX memory reservation. Byte vpux::VPU::getTotalCMXSize(mlir::ModuleOp module) { - auto cmxRes = IE::getAvailableMemory(module, VPU::MemoryKind::CMX_NN); - // This function is used to determine the best tile size. It tries to put maximum data in CMX. - // Available CMX memory is decreased by two profilingBufferSize even if profiling is disabled + // Available CMX memory will be decreased by the size of statically allocated reserved buffers(all reservations + // need to happen before this function is called) and by dynamic profling buffers which are not represented in + // the IR. 
Available CMX memory is decreased by two dynamicProfilingBufferSize even if profiling is disabled // because we want to get exactly same compiled networks with profiling enabled and disabled. // Two buffer sizes are required in case when profiling allocates new buffer and old buffer // is still not disposed. Second buffer can be treated as an optimisation that prevents spilling. - int64_t profilingBufferSize = vpux::VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE + - vpux::VPUIP::HW_DPU_PROFILING_MAX_BUFFER_SIZE + - vpux::VPUIP::HW_ACT_SHAVE_PROFILING_MAX_BUFFER_SIZE; + int64_t dynamicProfilingBufferSize = + vpux::VPUIP::HW_DPU_PROFILING_MAX_BUFFER_SIZE + vpux::VPUIP::HW_ACT_SHAVE_PROFILING_MAX_BUFFER_SIZE; - return cmxRes.size() - Byte(2 * profilingBufferSize); + auto cmxSpaceAttr = mlir::SymbolRefAttr::get(module.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + auto cmxSize = IE::getAvailableMemory(module, VPU::MemoryKind::CMX_NN).size(); + auto reservedCMXSize = IE::getReservedMemorySize(module, cmxSpaceAttr); + + return cmxSize - Byte(reservedCMXSize) - Byte(2 * dynamicProfilingBufferSize); } Byte vpux::VPU::getTotalCMXSize(mlir::Operation* op) { @@ -172,7 +150,7 @@ Byte vpux::VPU::getTotalCMXFragmentationAwareSize(mlir::ModuleOp module) { mlir::SymbolRefAttr::get(module.getContext(), VPU::CMX_NN_FragmentationAware)); VPUX_THROW_UNLESS(cmxRes != nullptr, "Can't get information about {0} memory", VPU::CMX_NN_FragmentationAware); - const ArchKind arch = getArch(module); + const auto arch = config::getArch(module); // This function is used to determine the best tile size. It tries to put maximum data in CMX. // Available CMX memory is decreased by two profilingBufferSize even if profiling is disabled @@ -181,7 +159,7 @@ Byte vpux::VPU::getTotalCMXFragmentationAwareSize(mlir::ModuleOp module) { // is still not disposed. Second buffer can be treated as an optimisation that prevents spilling. 
const int64_t profilingBufferSize = vpux::VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE + vpux::VPUIP::HW_DPU_PROFILING_MAX_BUFFER_SIZE + - ((arch == VPU::ArchKind::NPU37XX) ? vpux::VPUIP::HW_ACT_SHAVE_PROFILING_MAX_BUFFER_SIZE : 0); + ((arch == config::ArchKind::NPU37XX) ? vpux::VPUIP::HW_ACT_SHAVE_PROFILING_MAX_BUFFER_SIZE : 0); return cmxRes.size() - Byte(2 * profilingBufferSize); } @@ -194,268 +172,6 @@ Byte vpux::VPU::getTotalCMXVFPipelineFragmentationAwareSize(mlir::Operation* op) return Byte(static_cast(getTotalCMXSize(op).count()) * vpux::FRAGMENTATION_AVOID_RATIO_VF_PIPELINING); } -// -// ArchKind -// - -namespace { - -constexpr StringLiteral archAttrName = "VPU.arch"; - -constexpr Byte DDR_HEAP_SIZE = 64000_MB; - -struct Resources { - int numOfDPUGroups = 1; - std::optional numOfDMAPorts = std::nullopt; - std::optional availableCMXMemory = std::nullopt; - - Resources(int numOfDPUGroups, std::optional numOfDMAPorts, std::optional availableCMXMemory) - : numOfDPUGroups(numOfDPUGroups), numOfDMAPorts(numOfDMAPorts), availableCMXMemory(availableCMXMemory) { - } -}; - -struct SetResoursesFuncs { - using AddExecutorFuncType = FuncRef; - using AddTileExecutorFuncType = FuncRef; - using AddSubExecutorFuncType = FuncRef; - using AddMemoryFuncType = FuncRef; - using AddMemoryWithAttrsFuncType = FuncRef; - using AddInnerMemoryFuncType = FuncRef; - using AddInnerMemoryWithAttrsFuncType = - FuncRef; - - AddExecutorFuncType addExecutor; - AddTileExecutorFuncType addTileExecutor; - AddSubExecutorFuncType addSubExecutor; - AddMemoryFuncType addMemory; - AddMemoryWithAttrsFuncType addMemoryWithAttrs; - AddInnerMemoryFuncType addInnerMemory; - AddInnerMemoryWithAttrsFuncType addInnerMemoryWithAttrs; - - SetResoursesFuncs(AddExecutorFuncType addExecutor, AddTileExecutorFuncType addTileExecutor, - AddSubExecutorFuncType addSubExecutor, AddMemoryFuncType addMemory, - AddMemoryWithAttrsFuncType addMemoryWithAttrs, AddInnerMemoryFuncType addInnerMemory, - 
AddInnerMemoryWithAttrsFuncType addInnerMemoryWithAttrs) - : addExecutor(addExecutor), - addTileExecutor(addTileExecutor), - addSubExecutor(addSubExecutor), - addMemory(addMemory), - addMemoryWithAttrs(addMemoryWithAttrs), - addInnerMemory(addInnerMemory), - addInnerMemoryWithAttrs(addInnerMemoryWithAttrs) { - } -}; - -void setArch(mlir::ModuleOp module, VPU::ArchKind kind, const Resources& res, const SetResoursesFuncs& funcs, - bool allowCustom) { - VPUX_THROW_WHEN(!allowCustom && module->hasAttr(archAttrName), - "Architecture is already defined, probably you run '--init-compiler' twice"); - - if (!module->hasAttr(archAttrName)) { - module->setAttr(archAttrName, VPU::ArchKindAttr::get(module.getContext(), kind)); - } - - auto numOfDPUGroups = res.numOfDPUGroups; - auto numOfDMAPorts = res.numOfDMAPorts; - auto availableCMXMemory = res.availableCMXMemory; - - const auto getNumOfDMAPortsVal = [&](int maxDmaPorts) { - int numOfDMAPortsVal = numOfDMAPorts.has_value() ? numOfDMAPorts.value() : maxDmaPorts; - return numOfDMAPortsVal; - }; - - IE::TileResourceOp nceCluster; - - const auto ddrSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), stringifyEnum(VPU::MemoryKind::DDR)); - const auto cmxSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); - const auto cmxFragAwareSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), VPU::CMX_NN_FragmentationAware); - - switch (kind) { - case VPU::ArchKind::NPU37XX: { - const auto workspaceCMXSize = - availableCMXMemory.has_value() ? availableCMXMemory.value() : VPUX37XX_CMX_WORKSPACE_SIZE; - const auto workspaceFragmentationAwareSize = - availableCMXMemory.has_value() - ? 
Byte(static_cast(availableCMXMemory.value().count()) * FRAGMENTATION_AVOID_RATIO) - : VPUX37XX_CMX_WORKSPACE_FRAGMENTATION_AWARE_SIZE; - - funcs.addMemoryWithAttrs(ddrSymbolAttr, DDR_HEAP_SIZE, 0.6, 8); - - // Have NN_DMA as shared resource across clusters - funcs.addExecutor(VPU::ExecutorKind::DMA_NN, getNumOfDMAPortsVal(VPUX37XX_MAX_DMA_PORTS)); - nceCluster = funcs.addTileExecutor(numOfDPUGroups); - funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::DPU, 1); - funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_NN, 1); - funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_ACT, 2); - funcs.addInnerMemoryWithAttrs(nceCluster, cmxSymbolAttr, workspaceCMXSize, 1.0, 32); - funcs.addInnerMemory(nceCluster, cmxFragAwareSymbolAttr, workspaceFragmentationAwareSize); - - break; - } - case VPU::ArchKind::NPU40XX: { - const auto workspaceCMXSize = - availableCMXMemory.has_value() ? availableCMXMemory.value() : VPUX40XX_CMX_WORKSPACE_SIZE; - const auto workspaceFragmentationAwareSize = - availableCMXMemory.has_value() - ? 
Byte(static_cast(availableCMXMemory.value().count()) * FRAGMENTATION_AVOID_RATIO) - : VPUX40XX_CMX_WORKSPACE_FRAGMENTATION_AWARE_SIZE; - - funcs.addMemoryWithAttrs(ddrSymbolAttr, DDR_HEAP_SIZE, 0.6, 64); - - // Have NN_DMA as shared resource across clusters - auto numClusters = numOfDPUGroups; - funcs.addExecutor(VPU::ExecutorKind::DMA_NN, - getNumOfDMAPortsVal(std::min(numClusters, VPUX40XX_MAX_DMA_PORTS))); - funcs.addExecutor(VPU::ExecutorKind::M2I, 1); - nceCluster = funcs.addTileExecutor(numClusters); - funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::DPU, 1); - funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_ACT, 2); - funcs.addInnerMemoryWithAttrs(nceCluster, cmxSymbolAttr, workspaceCMXSize, 1.0, 64); - funcs.addInnerMemory(nceCluster, cmxFragAwareSymbolAttr, workspaceFragmentationAwareSize); - - break; - } - default: - VPUX_THROW("Unsupported architecture '{0}'", kind); - } - - VPUX_THROW_WHEN(!allowCustom && nceCluster.hasProcessorFrequency(), - "Processor frequencyis already defined, probably you run '--init-compiler' twice"); -} - -} // namespace - -void vpux::VPU::setArch(mlir::ModuleOp module, ArchKind kind, int numOfDPUGroups, std::optional numOfDMAPorts, - std::optional availableCMXMemory, bool allowCustomValues) { - const auto addExecutor = [&](VPU::ExecutorKind kind, size_t count) { - VPUX_THROW_WHEN(!allowCustomValues && IE::hasExecutor(module, kind), - "Available executor kind '{0}' was already added", kind); - if (IE::hasExecutor(module, kind)) { - return IE::getAvailableExecutor(module, kind); - } - - return IE::addAvailableExecutor(module, kind, count); - }; - - const auto addTileExecutor = [&](size_t count) { - VPUX_THROW_WHEN(!allowCustomValues && IE::hasTileExecutor(module), "Available tile executor was already added"); - if (IE::hasTileExecutor(module)) { - return IE::getTileExecutor(module); - } - - return IE::addTileExecutor(module, count); - }; - - const auto addSubExecutor = [&](IE::TileResourceOp tileResOp, 
VPU::ExecutorKind kind, size_t count) { - VPUX_THROW_WHEN(!allowCustomValues && tileResOp.hasSubExecutor(kind), - "Available executor kind '{0}' was already added", kind); - if (tileResOp.hasSubExecutor(kind)) { - return tileResOp.getSubExecutor(kind); - } - - return tileResOp.addSubExecutor(kind, count); - }; - - const auto addAvailableMemory = [&](mlir::SymbolRefAttr memSpace, Byte size) { - VPUX_THROW_WHEN(!allowCustomValues && IE::hasAvailableMemory(module, memSpace), - "Available memory kind '{0}' was already added", memSpace); - if (IE::hasAvailableMemory(module, memSpace)) { - return IE::getAvailableMemory(module, memSpace); - } - - return IE::addAvailableMemory(module, memSpace, size); - }; - - const auto addMemWithAttrs = [&](mlir::SymbolRefAttr memSpace, Byte size, double derateFactor, size_t bandwidth) { - auto mem = addAvailableMemory(memSpace, size); - if (!mem->hasAttr(derateFactorAttrName)) { - mem->setAttr(derateFactorAttrName, getFPAttr(module.getContext(), derateFactor)); - } - - if (!mem->hasAttr(bandwidthAttrName)) { - mem->setAttr(bandwidthAttrName, getIntAttr(module.getContext(), bandwidth)); - } - }; - - const auto addInnerAvailableMemory = [&](IE::TileResourceOp tileResOp, mlir::SymbolRefAttr memSpace, Byte size) { - VPUX_THROW_WHEN(!allowCustomValues && tileResOp.hasAvailableMemory(memSpace), - "Available memory kind '{0}' was already added", memSpace); - if (tileResOp.hasAvailableMemory(memSpace)) { - return tileResOp.getAvailableMemory(memSpace); - } - - return tileResOp.addAvailableMemory(memSpace, size); - }; - - const auto addInnerAvailableMemoryWithAttrs = [&](IE::TileResourceOp tileResOp, mlir::SymbolRefAttr memSpace, - Byte size, double derateFactor, size_t bandwidth) { - auto mem = addInnerAvailableMemory(tileResOp, memSpace, size); - if (!mem->hasAttr(derateFactorAttrName)) { - mem->setAttr(derateFactorAttrName, getFPAttr(module.getContext(), derateFactor)); - } - - if (!mem->hasAttr(bandwidthAttrName)) { - 
mem->setAttr(bandwidthAttrName, getIntAttr(module.getContext(), bandwidth)); - } - }; - - ::Resources res(numOfDPUGroups, numOfDMAPorts, availableCMXMemory); - ::SetResoursesFuncs funcs(addExecutor, addTileExecutor, addSubExecutor, addAvailableMemory, addMemWithAttrs, - addInnerAvailableMemory, addInnerAvailableMemoryWithAttrs); - - return ::setArch(module, kind, res, funcs, allowCustomValues); -} - -VPU::ArchKind vpux::VPU::getArch(mlir::Operation* op) { - auto module = getModuleOp(op); - - if (auto attr = module->getAttr(archAttrName)) { - VPUX_THROW_UNLESS(mlir::isa(attr), - "Module attribute '{0}' has unsupported value '{1}'", archAttrName, attr); - return mlir::cast(attr).getValue(); - } - - return VPU::ArchKind::UNKNOWN; -} - -// To discern between VPUX3XXX and later on architectures -bool vpux::VPU::isArchVPUX3XXX(VPU::ArchKind arch) { - return (arch == VPU::ArchKind::NPU37XX); -} - -// -// RevisionID -// - -namespace { - -constexpr StringLiteral revisionIDAttrName = "VPU.revisionID"; - -} // namespace - -void vpux::VPU::setRevisionID(mlir::ModuleOp module, RevisionID revisionID) { - module->setAttr(revisionIDAttrName, VPU::RevisionIDAttr::get(module.getContext(), revisionID)); -} - -bool vpux::VPU::hasRevisionID(mlir::ModuleOp module) { - return module->hasAttr(revisionIDAttrName); -} - -VPU::RevisionID vpux::VPU::getRevisionID(mlir::Operation* op) { - auto module = getModuleOp(op); - - if (module->hasAttr(revisionIDAttrName)) { - if (auto attr = module->getAttr(revisionIDAttrName)) { - VPUX_THROW_UNLESS(mlir::isa(attr), - "Module attribute '{0}' has unsupported value '{1}'", revisionIDAttrName, attr); - - return mlir::cast(attr).getValue(); - } - } - - return VPU::RevisionID::REVISION_NONE; -} - // // PaddingAttr // @@ -822,7 +538,7 @@ bool vpux::VPU::isDistributionWithExplicitShapesAndOffsets(const VPU::Distributi } bool vpux::VPU::isUniformDistributedSegmentsSupported(mlir::Operation* op) { - return !VPU::isArchVPUX3XXX(VPU::getArch(op)); + return 
!config::isArchVPUX3XXX(config::getArch(op)); } // diff --git a/src/vpux_compiler/src/dialect/VPU/IR/dialect.cpp b/src/vpux_compiler/src/dialect/VPU/IR/dialect.cpp index f7bae2e805..d3d485dd8c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/dialect.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/dialect.cpp @@ -12,6 +12,7 @@ #include "vpux/compiler/dialect/core/IR/dialect.hpp" #include "vpux/compiler/dialect/net/IR/dialect.hpp" +#include #include #include diff --git a/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/distribution_info.cpp b/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/distribution_info.cpp index abe4b17e41..76fba4af5d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/distribution_info.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/distribution_info.cpp @@ -4,12 +4,13 @@ // #include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/utils/attributes.hpp" +#include "vpux/utils/core/optional.hpp" -namespace vpux { -namespace VPU { -vpux::VPU::DistributionInfo vpux::VPU::DistributionInfo::getClassFromAttr( - vpux::VPU::DistributionInfoAttr distributionAttr) { +using namespace vpux; + +VPU::DistributionInfo vpux::VPU::DistributionInfo::getClassFromAttr(vpux::VPU::DistributionInfoAttr distributionAttr) { if (distributionAttr == nullptr) { return {}; } @@ -47,7 +48,7 @@ vpux::VPU::DistributionInfo vpux::VPU::DistributionInfo::getClassFromAttr( memoryOffsets, equalMemoryAndComputeView); } -vpux::VPU::DistributionInfoAttr vpux::VPU::DistributionInfo::getAttrFromClass( +VPU::DistributionInfoAttr vpux::VPU::DistributionInfo::getAttrFromClass( mlir::MLIRContext* ctx, const vpux::VPU::DistributionInfo& distribution) { auto modeAttr = vpux::VPU::DistributionModeAttr::get(ctx, distribution.getDistributionMode()); auto numClustersAttr = vpux::getIntAttr(ctx, distribution.getNumClusters()); @@ -85,5 +86,39 @@ 
vpux::VPU::DistributionInfoAttr vpux::VPU::DistributionInfo::getAttrFromClass( computeShapesAttr, computeOffsetsAttr, memoryShapesAttr, memoryOffsetsAttr, equalMemoryAndComputeViewAttr); } -} // namespace VPU -} // namespace vpux + +void VPU::DistributionInfo::printFormat(llvm::raw_ostream& stream) const { + printTo(stream, "\n#VPU.DistributedTensor", _equalMemoryAndComputeView); +} diff --git a/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/padding_native.cpp b/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/padding_native.cpp new file mode 100644 index 0000000000..1ec0ae0c54 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/IR/native_attributes/padding_native.cpp @@ -0,0 +1,42 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/IR/native_attributes/padding_native.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/utils/attributes.hpp" + +using namespace vpux; + +VPU::Padding VPU::Padding::getClassFromAttr(PaddingAttr paddingAttr) { + if (paddingAttr == nullptr) { + return {}; + } + + auto left = paddingAttr.getLeft().getInt(); + auto right = paddingAttr.getRight().getInt(); + auto top = paddingAttr.getTop().getInt(); + auto bottom = paddingAttr.getBottom().getInt(); + + return Padding(left, right, top, bottom); +} + +VPU::PaddingAttr VPU::Padding::getAttrFromClass(mlir::MLIRContext* ctx, const Padding& padding) { + auto topAttr = vpux::getIntAttr(ctx, padding.top); + auto bottomAttr = vpux::getIntAttr(ctx, padding.bottom); + auto leftAttr = vpux::getIntAttr(ctx, padding.left); + auto rightAttr = vpux::getIntAttr(ctx, padding.right); + + return PaddingAttr::get(ctx, leftAttr, rightAttr, topAttr, bottomAttr); +}; + +void VPU::Padding::printFormat(llvm::raw_ostream& stream) const { + std::unordered_map map; + map["left"] = left; + map["right"] = right; + map["top"] = top; + map["bottom"] = bottom; + printTo(stream, "pads = "); + 
vpux::MapFormatProvider::format(map, stream, {}); +} diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/abs.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/abs.cpp index e6f646074e..f9a81fa2f2 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/abs.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/abs.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -66,7 +67,7 @@ bool vpux::VPU::AbsOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/accumulate.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/accumulate.cpp index 225a2e6eb9..9da11ce97f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/accumulate.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/accumulate.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -86,7 +87,7 @@ bool vpux::VPU::AccumulateOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/add.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/add.cpp index 4ebda6b084..c37746b42f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/add.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/add.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -83,7 +84,7 @@ bool vpux::VPU::AddOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/and.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/and.cpp index ec2b153746..e6429d4821 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/and.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/and.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -78,7 +79,7 @@ bool vpux::VPU::AndOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/avgpool.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/avgpool.cpp index 685b1f5e0f..b1ffcfad62 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/avgpool.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/avgpool.cpp @@ -3,13 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" - -#include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/range.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_and.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_and.cpp index b0d0f3b2cf..3358b079ef 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_and.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_and.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/utils/core/checked_cast.hpp" @@ -83,7 +84,7 @@ bool vpux::VPU::BitwiseAndOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_not.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_not.cpp index a3075b8f7c..cea851db48 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_not.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_not.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -70,7 +71,7 @@ bool vpux::VPU::BitwiseNotOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_or.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_or.cpp index c24476a5b7..478ae4375e 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_or.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_or.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/utils/core/checked_cast.hpp" @@ -82,7 +83,7 @@ bool vpux::VPU::BitwiseOrOp::fitIntoCMX(llvm::ArrayRef bu auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_xor.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_xor.cpp index 927d4b5e74..048ab28c23 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_xor.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/bitwise_xor.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/utils/core/checked_cast.hpp" @@ -83,7 +84,7 @@ bool vpux::VPU::BitwiseXorOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/ceiling.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/ceiling.cpp index 5db82f7317..abc45dd66d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/ceiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/ceiling.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -67,7 +67,7 @@ bool vpux::VPU::CeilingOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/clamp.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/clamp.cpp index c6693affeb..bc78857059 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/clamp.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/clamp.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -67,7 +68,7 @@ bool vpux::VPU::ClampOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/concat.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/concat.cpp index 20ada97f4a..a8d2b5ba0d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/concat.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/concat.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" @@ -11,12 +11,12 @@ #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/sw_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include @@ -710,23 +710,11 @@ mlir::LogicalResult FuseConcatsWithDifferentAxes::matchAndRewrite(VPU::ConcatOp return false; } auto concatAxes = getConcatAxesFromOffsets(concatOp, getShape(concatOp.getResult())); - if (concatAxes.size() != 1) { - return false; - } return llvm::all_of(op->getUsers(), [&](const auto& user) { auto sliceOp = mlir::dyn_cast(user); if (sliceOp == nullptr || !sliceOp->hasOneUse()) { return false; } - auto inShape = getShape(sliceOp.getSource()); - auto outShape = getShape(sliceOp.getResult()); - 
auto diffAxesNum = llvm::count_if(irange(inShape.size()), [&](auto idx) { - return inShape[Dim(idx)] != outShape[Dim(idx)]; - }); - if (diffAxesNum != 1) { - return false; - } - auto nextUser = *sliceOp->getUsers().begin(); if (!mlir::isa(nextUser)) { return false; @@ -1063,7 +1051,7 @@ bool VPU::ConcatOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strategy, Sibl auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -1072,7 +1060,7 @@ bool vpux::VPU::ConcatOp::fitIntoCMX(vpux::NDTypeInterface output, Byte reserved auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); SmallVector buffers{output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/convert.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/convert.cpp index 9202b52ad5..726b7e79f3 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/convert.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/convert.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -104,7 +105,7 @@ bool vpux::VPU::ConvertOp::fitIntoCMX(llvm::ArrayRef 
buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/convolution.cpp index b4c9f699d3..73ff56e0b5 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/convolution.cpp @@ -3,16 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include +#include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/copy.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/copy.cpp index 763c131889..12a3adf97f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/copy.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/copy.cpp @@ -24,7 +24,7 @@ mlir::LogicalResult vpux::VPU::CopyOp::inferReturnTypes(mlir::MLIRContext* ctx, const auto ndInType = mlir::dyn_cast(copyOp.getInput().getType()); if (ndInType == nullptr) { - return errorAt(loc, "IE::CopyOp operand must have vpux::NDTypeInterface type"); + return errorAt(loc, "CopyOp operand must have vpux::NDTypeInterface type"); } IndexedSymbolAttr outMemSpace = nullptr; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/cos.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/cos.cpp index 
1d916c55ca..1e73f5416d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/cos.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/cos.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -63,7 +63,7 @@ bool vpux::VPU::CosOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/cum_sum.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/cum_sum.cpp index 7874f90804..34922205ab 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/cum_sum.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/cum_sum.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -103,7 +103,7 @@ bool vpux::VPU::CumSumOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/depth_to_space.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/depth_to_space.cpp index 94f0f9b38f..61159cab74 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/depth_to_space.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/depth_to_space.cpp @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/checked_cast.hpp" @@ -73,7 +72,7 @@ bool vpux::VPU::DepthToSpaceOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/dequantize.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/dequantize.cpp index 236c9f0379..16d52df64c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/dequantize.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/dequantize.cpp @@ -3,11 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -81,7 +80,7 @@ bool vpux::VPU::DequantizeOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/detection_output_sort.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/detection_output_sort.cpp index b666544172..6ddd103d1c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/detection_output_sort.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/detection_output_sort.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -173,7 +174,7 @@ bool vpux::VPU::DetectionOutputSortOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/dynamic_dequantize.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/dynamic_dequantize.cpp index 0109ed5b3d..e334ab6b08 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/dynamic_dequantize.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/dynamic_dequantize.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -116,7 +117,7 @@ bool vpux::VPU::DynamicDequantizeOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/equal.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/equal.cpp index 9daa5245f3..7ef14f0efe 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/equal.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/equal.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -73,7 +73,7 @@ bool vpux::VPU::EqualOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/exp.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/exp.cpp index 26b40069a2..359d2e5559 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/exp.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/exp.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -63,7 +63,7 @@ bool vpux::VPU::ExpOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/fake_quantize.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/fake_quantize.cpp index 77873f2326..0da9718dc5 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/fake_quantize.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/fake_quantize.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" @@ -130,7 +130,7 @@ bool vpux::VPU::FakeQuantizeOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/floor.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/floor.cpp index 858889ae10..a552f2377c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/floor.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/floor.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -67,7 +67,7 @@ bool vpux::VPU::FloorOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather.cpp index 68026d77e9..b93732909d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather.cpp @@ -8,10 +8,10 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/gather_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" -#include "vpux/compiler/utils/dynamic_shape_propagation.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/checked_cast.hpp" @@ -285,7 +285,7 @@ bool vpux::VPU::GatherOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/gatherND.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/gatherND.cpp index 6e2d2adcd4..cdd7ac21c3 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/gatherND.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/gatherND.cpp @@ -8,8 +8,8 @@ #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" @@ -256,7 +256,7 @@ bool vpux::VPU::GatherNDOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_dma.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_dma.cpp index f436cfd216..af5709f251 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_dma.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_dma.cpp @@ -5,7 +5,12 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/gather_dma_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" using namespace vpux; @@ -28,7 +33,8 @@ mlir::LogicalResult vpux::VPU::GatherDMAOp::inferReturnTypes(mlir::MLIRContext*, auto outputShape = inputShape.toValues(); outputShape[Dim(axis)] = indicesShape[Dim(axis)]; - auto outType = inputType.changeShape(Shape(std::move(outputShape))); + auto outType = mlir::RankedTensorType::get(to_small_vector(outputShape), inputType.getElementType(), + createTensorAttrFromType(inputType)); inferredReturnTypes.push_back(outType); return mlir::success(); @@ -111,3 +117,84 @@ mlir::FailureOr vpux::VPU::GatherDMAOp::getTilingStrategy(TilingMo log.trace("Isolated tiling strategy: {0}", nTilesOnDimforGather); return fillDividedTiles(baseOp, nTilesOnDimforGather, outputShape); } + +// +// ClusteredOpInterface +// + +bool vpux::VPU::GatherDMAOp::checkStrategyCompatibility(VPU::MultiClusterStrategy 
strategy, size_t numTiles) { + const auto indicesShape = getShape(getIndices()); + const auto outputShape = getShape(getOutput()); + if (strategy == VPU::MultiClusterStrategy::SplitOverHeight) { + return indicesShape[Dims4D::Act::H] == 1 && outputShape[Dims4D::Act::H] >= checked_cast(numTiles); + } + + if (strategy == VPU::MultiClusterStrategy::SplitOverWidth) { + return indicesShape[Dims4D::Act::W] == 1 && outputShape[Dims4D::Act::W] >= checked_cast(numTiles); + } + + return false; +} + +vpux::VPU::DistributionInfo vpux::VPU::GatherDMAOp::getExplicitDistributionInfoAttr( + vpux::ShapeRef shape, vpux::VPU::DistributionMode distributionMode, ArrayRef numTiles, + const int64_t numClusters, ArrayRef alignment, const bool uniformDistributedSegments, + const vpux::VPU::OverlapDistributionParams& /*overlapParams*/) { + VPUX_THROW_UNLESS(distributionMode != VPU::DistributionMode::OVERLAPPED, + "Overlapped distribution mode is not supported for GatherDMAOp"); + + return getNonOverlappedDistributedNative(shape, distributionMode, numTiles, numClusters, alignment, + uniformDistributedSegments); +} + +vpux::NDTypeInterface vpux::VPU::GatherDMAOp::getDistributedTypeForOpOperand(mlir::OpOperand& operand, + bool hasExplicitDistributedAttr, + SiblingOpsAnalysis& siblingsAnalysis) { + auto clusteredOp = mlir::cast(getOperation()); + auto origOp = mlir::cast(getOperation()); + + if (operand.get() == origOp.getInput()) { + return mlir::dyn_cast(origOp.getInput().getType()); + } + if (operand.get() == origOp.getIndices()) { + return getDistributedTypeFromInput(clusteredOp, operand.get(), VPU::DistributionMode::DUPLICATED, {}, {}, + VPU::MultiClusterStrategy::Clustering, hasExplicitDistributedAttr, + siblingsAnalysis); + } + + VPUX_THROW("Failed to compute distributed type for op operand {0}", clusteredOp); + return nullptr; +} + +bool vpux::VPU::GatherDMAOp::fitIntoCMX(llvm::ArrayRef buffers, Byte reservedMem) { + VPUX_THROW_UNLESS(buffers.size() == 2, + "GatherDMAOp has 2 inputs and 
1 output, and we only need to fit indices and output in CMX, but" + "the number of buffer is { 0 } ", + buffers.size()); + + SmallVector buffersSize; + std::transform(buffers.begin(), buffers.end(), std::back_inserter(buffersSize), [](const auto buffer) { + return buffer.getTotalAllocSize(); + }); + auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() + : getTotalCMXFragmentationAwareSize(getOperation()).count(); + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + + reservedMem.count() <= + totalAvailableCMXSize; +} + +bool vpux::VPU::GatherDMAOp::fitIntoCMX(vpux::NDTypeInterface indices, vpux::NDTypeInterface output, Byte reservedMem) { + SmallVector buffers = {indices.getTotalAllocSize(), output.getTotalAllocSize()}; + + auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() + : getTotalCMXFragmentationAwareSize(getOperation()).count(); + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + + reservedMem.count() <= + totalAvailableCMXSize; +} + +bool vpux::VPU::GatherDMAOp::fitIntoCMX(vpux::NDTypeInterface indices, vpux::NDTypeInterface output) { + return fitIntoCMX(indices, output, Byte(0)); +} diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_elements.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_elements.cpp index 9aa02ecf18..6d219224ae 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_elements.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/gather_elements.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/gather_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -126,7 +127,7 @@ bool 
vpux::VPU::GatherElementsOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/generic_sw_layer.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/generic_sw_layer.cpp index 79c05902a9..8d957d58c4 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/generic_sw_layer.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/generic_sw_layer.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include #include @@ -21,33 +19,6 @@ using namespace vpux; // E#152917 Analyze & settle on GenericSwLayerOp integration & vpux interface usage -// -// SWOpInterface -// - -bool VPU::GenericSwLayerOp::fitIntoCMX(llvm::ArrayRef buffers, Byte reservedMem) { - SmallVector buffersSize; - - llvm::transform(buffers, std::back_inserter(buffersSize), [](const auto buffer) { - return buffer.getTotalAllocSize(); - }); - - auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() - : getTotalCMXFragmentationAwareSize(getOperation()).count(); - - return VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + - reservedMem.count() <= - totalAvailableCMXSize; -} - -bool VPU::GenericSwLayerOp::fitIntoCMX(llvm::ArrayRef buffers) { - return fitIntoCMX(buffers, Byte(0)); -} - -bool VPU::GenericSwLayerOp::supportCycleCostCalculation() { - return false; -} - // // SymbolUserOpInterface // diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/greater.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/greater.cpp index ba0b809025..91142364cb 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/greater.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/greater.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -78,7 +78,7 @@ bool vpux::VPU::GreaterOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/greater_equal.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/greater_equal.cpp index 6a737b2079..56e2547678 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/greater_equal.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/greater_equal.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -80,7 +80,7 @@ bool vpux::VPU::GreaterEqualOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/grid_sample.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/grid_sample.cpp index d8c4c7348a..362f4d66db 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/grid_sample.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/grid_sample.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -165,7 +165,7 @@ bool vpux::VPU::GridSampleOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/group_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/group_convolution.cpp index d7a10cf466..d948d3b246 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/group_convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/group_convolution.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; @@ -68,7 +70,7 @@ bool vpux::VPU::GroupConvolutionOp::fitIntoCMX(vpux::NDTypeInterface input, vpux auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/gru_gates.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/gru_gates.cpp index 06a7a2f017..ebeb4d6679 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/gru_gates.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/gru_gates.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -112,7 +112,7 @@ bool vpux::VPU::GRUGatesOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/hard_sigmoid.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/hard_sigmoid.cpp index 71ecdda64c..884ccb8a7b 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/hard_sigmoid.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/hard_sigmoid.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -69,7 +70,7 @@ bool vpux::VPU::HardSigmoidOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/hswish.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/hswish.cpp index ef55e0bec9..8b56fe1aa9 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/hswish.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/hswish.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -61,7 +62,7 @@ bool vpux::VPU::HSwishOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/interpolate.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/interpolate.cpp index 76ab9138cd..efec07534c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/interpolate.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/interpolate.cpp @@ -12,6 +12,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/utils/core/error.hpp" @@ -140,7 +141,7 @@ bool vpux::VPU::InterpolateOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/inverse.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/inverse.cpp index 94dbd3da12..16c3e7c749 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/inverse.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/inverse.cpp @@ -2,7 +2,6 @@ // Copyright (C) 2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // - #include #include "vpux/compiler/dialect/VPU/IR/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/layout_cast.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/layout_cast.cpp index 7ee1d8088d..391327690e 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/layout_cast.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/layout_cast.cpp @@ -89,3 +89,21 @@ mlir::FailureOr> vpux::VPU::LayoutC return std::make_pair(mlir::cast(dstType.changeTypeComponents(typeComponents)), castedOutputDistribution.value()); } + +// +// TilingViewLikeOpInterface +// + +vpux::InputTiling vpux::VPU::LayoutCastOp::backInferTileInfo(const vpux::TileInfo& outputTile, vpux::Logger) { + SmallVector inputTiles; + const auto inputShape = getShape(getInput()); + VPUX_THROW_UNLESS(inputShape.size() == outputTile.shape.size(), + "Can't tile LayoutCast operation at '{0}', which has operands with different rank", + this->getLoc()); + inputTiles.push_back(outputTile); + return TilingInfo{inputTiles}; +} + +void vpux::VPU::LayoutCastOp::adjustAttrs(const TilingInfo&, const TileInfo&, ShapeRef) { + // Do nothing +} diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/less.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/less.cpp index 5a37a0ab65..c923c2e4ea 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/less.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/less.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -78,7 +78,7 @@ bool vpux::VPU::LessOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/log.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/log.cpp index 612e61cef1..c97932cb87 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/log.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/log.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -66,7 +67,7 @@ bool vpux::VPU::LogOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/log_softmax.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/log_softmax.cpp index 6d322a78fc..25cb7dbb38 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/log_softmax.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/log_softmax.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -139,7 +139,7 @@ bool vpux::VPU::LogSoftmaxOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/logical_not.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/logical_not.cpp index d975a95595..5a7e512975 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/logical_not.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/logical_not.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -65,7 +65,7 @@ bool vpux::VPU::LogicalNotOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/loop_select.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/loop_select.cpp index 9001fd7f14..9e5ef5d1e1 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/loop_select.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/loop_select.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_cell.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_cell.cpp index 3f2dc15af0..13f9c66d16 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_cell.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_cell.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -30,8 +32,8 @@ mlir::LogicalResult vpux::VPU::LSTMCellOp::inferReturnTypes(mlir::MLIRContext* c namespace { -bool isSupported(VPU::ArchKind arch, ShapeRef inputDataShape, ShapeRef initialHiddenStateShape) { - auto maxHiddenSize = getMaxLstmCellHiddenSizeConstant(arch); +bool isSupported(config::ArchKind arch, ShapeRef inputDataShape, ShapeRef initialHiddenStateShape) { + auto maxHiddenSize = VPU::getMaxLstmCellHiddenSizeConstant(arch); // shave implementation allow reduced size. Bigger size can be map on DPU. // Cost model can be interrogate. 
constexpr int64_t maxInputSize(256); @@ -55,5 +57,5 @@ bool isSupported(VPU::ArchKind arch, ShapeRef inputDataShape, ShapeRef initialHi // bool vpux::VPU::LSTMCellOp::isSupported(vpux::IE::LSTMCellOp op) { - return ::isSupported(VPU::getArch(op), getShape(op.getInputData()), getShape(op.getInitialHiddenState())); + return ::isSupported(config::getArch(op), getShape(op.getInputData()), getShape(op.getInitialHiddenState())); } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_gates.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_gates.cpp index 8910f2b553..4b25cddd2a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_gates.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_gates.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -126,7 +126,7 @@ bool vpux::VPU::LSTMGatesOp::fitIntoCMX(llvm::ArrayRef bu auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_sequence.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_sequence.cpp index f7000d0d98..02dcc0e046 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_sequence.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/lstm_sequence.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" @@ -12,6 +13,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -108,8 +110,9 @@ mlir::Value createIntermediateSumsBuffer(mlir::OpBuilder& rewriter, int64_t hidd const auto dpuWeightTableSize = vpux::VPU::NCEInvariant::getWeightsTableSize(hiddenSize) * lstmNumberOfGates; int64_t size = dpuWeightTableSize.count() + lstmIntermediateMultiplicationBuffersize + - VPU::getDpuDebugDataSize(VPU::getArch(module)) + VPU::getDPUVariantDataSize(VPU::getArch(module)) + - VPU::getDPUInvariantDataSize(VPU::getArch(module)); + VPU::getDpuDebugDataSize(config::getArch(module)) + + VPU::getDPUVariantDataSize(config::getArch(module)) + + VPU::getDPUInvariantDataSize(config::getArch(module)); 
size = size / sizeof(int32_t); // int32_t type format const auto shape = Shape{1, 1, 1, size}; @@ -119,8 +122,8 @@ mlir::Value createIntermediateSumsBuffer(mlir::OpBuilder& rewriter, int64_t hidd auxIndicesType, ArrayRef(0)); } -bool isSupported(VPU::ArchKind arch, ShapeRef initialHiddenStateShape, bool useDpu) { - auto maxHiddenSize = getMaxLstmSequenceHiddenSizeConstant(arch); +bool isSupported(config::ArchKind arch, ShapeRef initialHiddenStateShape, bool useDpu) { + auto maxHiddenSize = VPU::getMaxLstmSequenceHiddenSizeConstant(arch); // shave implementation allow reduced size. Bigger size can and are map on DPU. if (initialHiddenStateShape.back() > maxHiddenSize) { @@ -171,9 +174,9 @@ void vpux::VPU::LSTMSequenceOp::build(::mlir::OpBuilder& odsBuilder, ::mlir::Ope vpux::IE::RNNSequenceDirectionAttr direction, vpux::VPU::MultiClusterStrategyAttr multiClusterStrategy) { const auto module = getModule(odsBuilder); - auto useDpu = VPU::getShaveControlsDpu(VPU::getArch(module)); + auto useDpu = VPU::getShaveControlsDpu(config::getArch(module)); // extra alignment condition should be meet in order to run on internal on dpu. - useDpu = useDpu ? ::isSupported(VPU::getArch(module), getShape(initialHiddenState), useDpu) : useDpu; + useDpu = useDpu ? ::isSupported(config::getArch(module), getShape(initialHiddenState), useDpu) : useDpu; mlir::BoolAttr useDpuAttr(nullptr); useDpuAttr = useDpu ? 
mlir::BoolAttr::get(odsBuilder.getContext(), useDpu) : useDpuAttr; build(odsBuilder, odsState, inputData, initialHiddenState, initialCellState, reccurenceWeights, biases, @@ -184,7 +187,7 @@ bool vpux::VPU::LSTMSequenceOp::isSupported(vpux::IE::LSTMSequenceOp op, bool us if (op.getReccurenceWeights().getDefiningOp() == nullptr) { return false; } - return ::isSupported(VPU::getArch(op), getShape(op.getInitialHiddenState()), useDpu); + return ::isSupported(config::getArch(op), getShape(op.getInitialHiddenState()), useDpu); } // @@ -235,7 +238,7 @@ bool vpux::VPU::LSTMSequenceOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_color_convert.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_color_convert.cpp index 8014b9e352..c01efe67bc 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_color_convert.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_color_convert.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -19,7 +21,8 @@ bool vpux::VPU::M2IColorConvertOp::fitIntoCMX(mlir::Operation* op, vpux::NDTypeI reservedMem.count() == 0 ? 
getTotalCMXSize(op).count() : getTotalCMXFragmentationAwareSize(op).count(); // Note: for 1xPlane config, 1st input fully dictates the size SmallVector buffers = {input.getTotalAllocSize(), output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(op), buffers).count() + reservedMem.count() <= + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(op), buffers).count() + + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_norm.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_norm.cpp index cf94e44ed6..0d403ad35d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_norm.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_norm.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -18,7 +20,8 @@ bool vpux::VPU::M2INormOp::fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(op).count() : getTotalCMXFragmentationAwareSize(op).count(); SmallVector buffers = {input.getTotalAllocSize(), output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(op), buffers).count() + reservedMem.count() <= + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(op), buffers).count() + + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_resize.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_resize.cpp index d7f21bab56..96f200de11 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_resize.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/m2i_resize.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -18,7 +20,8 @@ bool vpux::VPU::M2IResizeOp::fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterfa auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(op).count() : getTotalCMXFragmentationAwareSize(op).count(); SmallVector buffers = {input.getTotalAllocSize(), output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(op), buffers).count() + reservedMem.count() <= + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(op), buffers).count() + + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/matmul.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/matmul.cpp index bacafd337a..31c7a1a36a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/matmul.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/matmul.cpp @@ -7,11 +7,12 @@ #include "vpux/compiler/core/attributes/dim.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -95,9 +96,9 @@ mlir::LogicalResult vpux::VPU::MatMulOp::inferReturnTypes(mlir::MLIRContext* ctx namespace { -bool isSupported(VPU::ArchKind arch, ShapeRef input1Shape, ShapeRef input2Shape, bool transposeA = false, +bool isSupported(config::ArchKind arch, ShapeRef input1Shape, ShapeRef input2Shape, bool transposeA = false, bool transposeB = false) { - if (arch == VPU::ArchKind::NPU37XX) { // All platforms except MTL can use this + if (arch == config::ArchKind::NPU37XX) { // All platforms except MTL can use this return false; } @@ -135,7 +136,7 @@ bool isSupported(VPU::ArchKind arch, ShapeRef input1Shape, ShapeRef input2Shape, // bool vpux::VPU::MatMulOp::isSupported(vpux::IE::MatMulOp 
matmulOp) { - return ::isSupported(VPU::getArch(matmulOp), getShape(matmulOp.getInput1()), getShape(matmulOp.getInput2()), + return ::isSupported(config::getArch(matmulOp), getShape(matmulOp.getInput1()), getShape(matmulOp.getInput2()), matmulOp.getTransposeA(), matmulOp.getTransposeB()); } @@ -149,7 +150,7 @@ mlir::LogicalResult vpux::VPU::MatMulOp::verify() { } const auto operation = getOperation(); - const auto arch = VPU::getArch(operation); + const auto arch = config::getArch(operation); if (::isSupported(arch, getShape(getInput1()), getShape(getInput2()))) { return mlir::success(); } @@ -226,7 +227,7 @@ bool vpux::VPU::MatMulOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/maximum.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/maximum.cpp index ce58be25bd..d59886f74c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/maximum.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/maximum.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -77,7 +77,7 @@ bool vpux::VPU::MaximumOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/maxpool.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/maxpool.cpp index 5f77b8dec0..8273831cf5 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/maxpool.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/maxpool.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/utils/pad_extract.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mem_permute.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mem_permute.cpp index 8e172bb241..58b34abd59 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mem_permute.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mem_permute.cpp @@ -3,16 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" - #include "vpux/compiler/utils/permute_utils.hpp" +#include + using namespace vpux; namespace { @@ -111,7 +112,7 @@ void 
vpux::VPU::MemPermuteOp::build(::mlir::OpBuilder& odsBuilder, ::mlir::Opera bool vpux::VPU::MemPermuteOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { auto inputType = mlir::cast(getInput().getType()); auto outputType = mlir::cast(getOutput().getType()); - if (VPUIP::satisfiesOptimizedMemPermute(VPU::getArch(getOperation()), inputType, outputType)) { + if (VPUIP::satisfiesOptimizedMemPermute(config::getArch(getOperation()), inputType, outputType)) { // Optimal MemPermute kernel is most performant with SOH // Should remove this experimental condition when shave cost is supported // Track E#170850 @@ -148,7 +149,7 @@ bool vpux::VPU::MemPermuteOp::fitIntoCMX(llvm::ArrayRef b auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/minimum.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/minimum.cpp index 0897d8dbd8..e660519c02 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/minimum.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/minimum.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -77,7 +77,7 @@ bool vpux::VPU::MinimumOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mish.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mish.cpp index da1569e5d7..17c8c39a5f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mish.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mish.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -63,7 +63,7 @@ bool vpux::VPU::MishOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/multiply.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/multiply.cpp index 6abb5cf4bb..5fddb4b9c2 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/multiply.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/multiply.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -80,7 +80,7 @@ bool vpux::VPU::MultiplyOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn.cpp index 5066c9dcd7..a194b913ca 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -92,7 +93,7 @@ bool vpux::VPU::MVNOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_mean_var.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_mean_var.cpp index 43844e1a79..a2843bb39e 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_mean_var.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_mean_var.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -182,7 +183,7 @@ bool vpux::VPU::MVN1MeanVarOp::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_normalize.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_normalize.cpp index 2a4f68f138..174118a55d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_normalize.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn1_normalize.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -85,7 +86,7 @@ bool vpux::VPU::MVN1NormalizeOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -116,7 +116,7 @@ bool vpux::VPU::MVN1SumOp::buffsFitIntoCMX(mlir::ModuleOp module, vpux::NDTypeIn SmallVector buffersSize; buffersSize.push_back(in.getTotalAllocSize()); buffersSize.push_back(out.getTotalAllocSize()); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto totalAvailableCMXSize = getTotalCMXSize(module).count(); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffersSize).count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn6.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn6.cpp index 69272c1b2b..446e7304fb 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn6.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/mvn6.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -91,7 +91,7 @@ bool vpux::VPU::MVN6Op::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_average_pool.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_average_pool.cpp index d86894d50f..f10616b41d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_average_pool.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_average_pool.cpp @@ -5,6 +5,8 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" @@ -12,13 +14,12 @@ #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; @@ -36,7 +37,7 @@ bool vpux::VPU::NCEAveragePoolOp::fitIntoCMX(vpux::NDTypeInterface input, vpux:: auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); SmallVector buffers = {input.getTotalAllocSize(), output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -94,7 +95,7 @@ bool vpux::VPU::NCEAveragePoolOp::isSupported(IE::AvgPoolOp op, LogCb logCb, boo } if (checkLayout) { - const auto arch = getArch(op); + const auto arch = config::getArch(op); if (!NCEInvariant::checkLayouts(op->getOperandTypes(), op->getResultTypes(), arch, 1, logCb)) { return false; } @@ -109,10 +110,10 @@ bool vpux::VPU::NCEAveragePoolOp::isSupported(IE::AvgPoolOp op, LogCb logCb, boo mlir::LogicalResult vpux::VPU::NCEAveragePoolOp::verify() { const auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -220,7 +221,7 @@ mlir::FailureOr vpux::VPU::NCEAveragePoolOp::getTilingStrategy(Til // bool vpux::VPU::NCEAveragePoolOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = VPU::getArch(getOperation()); + const auto arch = config::getArch(getOperation()); const auto outputType = mlir::cast(getOutput().getType()); const auto batchSize = outputType.getShape()[Dims4D::Act::N]; @@ -276,7 +277,7 @@ bool VPU::NCEAveragePoolOp::isOperationSplitOverHeightCompatible(const vpux::Til auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, true, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputType, inputShape, numTiles, 
true, config::getArch(nceOp.getOperation())); } bool VPU::NCEAveragePoolOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, ShapeRef axis) { diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp index 86e3f8b7df..ea1ba1494e 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp @@ -5,21 +5,22 @@ // +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" - -#include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/utils/empty_node.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" #include +#include using namespace vpux; @@ -47,7 +48,7 @@ bool vpux::VPU::NCECompressConvolutionOp::fitIntoCMX(vpux::NDTypeInterface input auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -77,10 +78,10 @@ static mlir::LogicalResult verifyConv(mlir::Location loc, mlir::Operation* op, mlir::LogicalResult vpux::VPU::NCECompressConvolutionOp::verify() { auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -217,7 +218,7 @@ mlir::FailureOr vpux::VPU::NCECompressConvolutionOp::getTilingStra // bool vpux::VPU::NCECompressConvolutionOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = VPU::getArch(getOperation()); + const auto arch = config::getArch(getOperation()); const auto outputType = mlir::cast(getOutput().getType()); const auto batchSize = outputType.getShape()[Dims4D::Act::N]; if (batchSize > 1 && batchSize <= VPU::getMaxArchDPUClusterNum(arch)) { @@ -271,7 +272,7 @@ bool VPU::NCECompressConvolutionOp::isOperationSplitOverHeightCompatible(const v auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, config::getArch(nceOp.getOperation())); } bool VPU::NCECompressConvolutionOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, @@ -318,7 +319,7 @@ bool VPU::NCECompressConvolutionOp::doesLayerFitIntoCMX(VPU::MultiClusterStrateg auto totalAvailableCMXSize = 
reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_convolution.cpp index ea2ed4ef26..892ee3d460 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_convolution.cpp @@ -3,21 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" - -#include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" #include 
"vpux/compiler/utils/infer_output_shape.hpp" @@ -54,7 +53,7 @@ bool vpux::VPU::NCEConvolutionOp::fitIntoCMX(vpux::NDTypeInterface input, vpux:: auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -85,18 +84,19 @@ static mlir::LogicalResult verifyConv(mlir::Location loc, mlir::Operation* op, V } static mlir::LogicalResult verifyWeightTables(VPU::NCEConvolutionOp op) { - if (op.getWeightsTable() == nullptr) { - return errorAt(op, "weightsTable is required for NCEConvolutionOp"); + if ((op.getWeightTableDataPtr() || op.getWeightTableSpPtr() || op.getWeightTableScale() || + op.getWeightTableBias() || op.getWeightZeroPoints())) { + return errorAt(op, "Only weightsTable can be populated for NCEConvolutionOp"); } return mlir::success(); } mlir::LogicalResult vpux::VPU::NCEConvolutionOp::verify() { auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -278,7 +278,7 @@ mlir::FailureOr vpux::VPU::NCEConvolutionOp::getTilingStrategy(Til // bool vpux::VPU::NCEConvolutionOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = VPU::getArch(getOperation()); + const auto arch = config::getArch(getOperation()); auto nceOp = mlir::cast(getOperation()); const auto isCompatible = VPU::isSEPConvCompatibleWithClusterStrategy(nceOp, strategy); @@ -349,7 +349,7 @@ bool VPU::NCEConvolutionOp::isOperationSplitOverHeightCompatible(const 
vpux::Til auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, config::getArch(nceOp.getOperation())); } bool VPU::NCEConvolutionOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, ShapeRef axis) { @@ -397,7 +397,7 @@ bool VPU::NCEConvolutionOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strate ? VPU::getTotalCMXSize(getOperation()).count() : VPU::getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(VPU::getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -596,3 +596,30 @@ mlir::LogicalResult vpux::VPU::NCEConvolutionOp::verifyConvCMX(mlir::Location lo return mlir::success(); } + +mlir::LogicalResult vpux::VPU::NCEConvolutionOp::reifyResultShapes( + mlir::OpBuilder& builder, mlir::ReifiedRankedShapedTypeDims& reifiedReturnShapes) { + // Parse attributes + const auto strides = parseIntArrayAttr(getStrides()); + + const auto padTop = getPad().getTop().getValue().getSExtValue(); + const auto padBottom = getPad().getBottom().getValue().getSExtValue(); + const auto padLeft = getPad().getLeft().getValue().getSExtValue(); + const auto padRight = getPad().getRight().getValue().getSExtValue(); + + const auto dataPaddingAbove = SmallVector({padTop, padLeft}); + const auto dataPaddingBelow = SmallVector({padBottom, padRight}); + + auto kernelShape = mlir::cast(getFilter().getType()).getShape(); + SmallVector kernelSize{kernelShape[Dims4D::Filter::KY], kernelShape[Dims4D::Filter::KX]}; + + // Compute output shape using utility + auto outShape = reifyConvPoolTensors(builder, getInput(), getOutput(), getFilter(), kernelSize, 
strides, + dataPaddingAbove, dataPaddingBelow, getLoc()); + if (mlir::failed(outShape)) { + return outShape; + } + + reifiedReturnShapes.emplace_back(std::move(outShape.value())); + return mlir::success(); +} diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_depth_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_depth_convolution.cpp index ddda93334f..7f0b74d5c4 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_depth_convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_depth_convolution.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" @@ -16,10 +17,11 @@ #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dilated_utils.hpp" -#include "vpux/compiler/utils/empty_node.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" @@ -40,7 +42,7 @@ bool vpux::VPU::NCEDepthConvolutionOp::fitIntoCMX(vpux::NDTypeInterface input, v auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - auto arch = getArch(getOperation()); + auto arch = config::getArch(getOperation()); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -119,7 +121,7 @@ bool vpux::VPU::NCEDepthConvolutionOp::isSupported(IE::GroupConvolutionOp op, Lo } if (checkLayout) { - const auto arch = getArch(op); + const auto arch = config::getArch(op); if (!NCEInvariant::checkLayouts(op->getOperandTypes(), op->getResultTypes(), arch, 2, logCb)) { return false; } @@ -173,10 +175,10 @@ mlir::LogicalResult verifyDepthConv(mlir::Location loc, mlir::Operation* op, mlir::LogicalResult vpux::VPU::NCEDepthConvolutionOp::verify() { const auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -389,7 +391,7 @@ bool VPU::NCEDepthConvolutionOp::isOperationSplitOverHeightCompatible(const vpux auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, true, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputType, inputShape, numTiles, true, config::getArch(nceOp.getOperation())); } bool VPU::NCEDepthConvolutionOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, @@ -428,7 +430,7 @@ bool VPU::NCEDepthConvolutionOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy s auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - auto arch = getArch(getOperation()); + auto arch = config::getArch(getOperation()); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_eltwise.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_eltwise.cpp index 32d455d171..52ef127c37 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_eltwise.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_eltwise.cpp @@ -3,17 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" - -#include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" @@ -44,7 +44,7 @@ bool vpux::VPU::NCEEltwiseOp::fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDT : getTotalCMXFragmentationAwareSize(getOperation()).count(); SmallVector buffers = {input1.getTotalAllocSize(), input2.getTotalAllocSize(), output.getTotalAllocSize()}; - return 
vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -53,7 +53,7 @@ bool vpux::VPU::NCEEltwiseOp::fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDT auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); SmallVector buffers = {input1.getTotalAllocSize(), input2.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -191,7 +191,7 @@ bool VPU::NCEEltwiseOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strategy, strategy, siblingsAnalysis))); } - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_interpolate.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_interpolate.cpp index 7b39dbf7fe..ba89c590b5 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_interpolate.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_interpolate.cpp @@ -5,9 +5,10 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include 
"vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" @@ -16,10 +17,11 @@ #include "vpux/compiler/dialect/VPU/utils/nce_interpolate_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" - -#include "vpux/compiler/utils/empty_node.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include +#include using namespace vpux; @@ -91,7 +93,7 @@ mlir::LogicalResult vpux::VPU::NCEInterpolateOp::verify() { } bool isNCEInterpolateSupported(vpux::NDTypeInterface inputType, vpux::NDTypeInterface outputType, - IE::InterpolateAttr attr, VPU::ArchKind arch, bool checkLayout, + IE::InterpolateAttr attr, config::ArchKind arch, bool checkLayout, bool checkChannelAlignment, bool checkBatch, mlir::Operation* op, vpux::LogCb logCb) { // TODO E#71403: remove dimension check auto dimOver8K = [](ShapeRef shape) { @@ -235,7 +237,7 @@ bool VPU::NCEInterpolateOp::isSupported(IE::InterpolateOp op, vpux::LogCb logCb, auto inputType = mlir::cast(op.getInput().getType()); auto outputType = mlir::cast(op.getOutput().getType()); - return isNCEInterpolateSupported(inputType, outputType, op.getAttr(), VPU::getArch(op), checkChannelAlignment, + return isNCEInterpolateSupported(inputType, outputType, op.getAttr(), config::getArch(op), checkChannelAlignment, checkLayout, checkBatch, op, logCb); } @@ -244,7 +246,7 @@ bool VPU::NCEInterpolateOp::isSupported(VPU::InterpolateOp op, vpux::LogCb logCb auto inputType = mlir::cast(op.getInput().getType()); auto outputType = mlir::cast(op.getOutput().getType()); - return isNCEInterpolateSupported(inputType, outputType, op.getAttr(), VPU::getArch(op), checkChannelAlignment, + return isNCEInterpolateSupported(inputType, outputType, op.getAttr(), config::getArch(op), 
checkChannelAlignment, checkLayout, checkBatch, op, logCb); } @@ -339,7 +341,7 @@ bool VPU::NCEInterpolateOp::isOperationSplitOverHeightCompatible(const vpux::Til auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, VPU::getArch(nceOp.getOperation())); + return isSOHSupportedByDPU(inputType, inputShape, numTiles, false, config::getArch(nceOp.getOperation())); } bool VPU::NCEInterpolateOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, ShapeRef axis) { @@ -377,7 +379,7 @@ bool VPU::NCEInterpolateOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strate auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -468,7 +470,7 @@ bool vpux::VPU::NCEInterpolateOp::fitIntoCMX(vpux::NDTypeInterface input, vpux:: auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_matmul.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_matmul.cpp index b2a722fb28..d0f53975e7 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_matmul.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_matmul.cpp @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/utils/matmul.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" @@ -13,13 +15,12 @@ #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" - -#include "vpux/compiler/core/attributes/shape.hpp" -#include "vpux/compiler/core/layers.hpp" #include "vpux/utils/logger/logger.hpp" +#include + using namespace vpux; // @@ -80,7 +81,7 @@ mlir::LogicalResult vpux::VPU::NCEMatMulOp::verifyKernel(IE::MatMulOp) { bool doesNCEMatMulFitIntoCMX(vpux::NDTypeInterface inputType, vpux::NDTypeInterface filterType, vpux::NDTypeInterface outputType, mlir::ModuleOp moduleOp, Byte reservedMem) { - auto arch = VPU::getArch(moduleOp); + auto arch = 
config::getArch(moduleOp); auto largestGroupsNumPerCluster = filterType.getShape()[DimsGroups5D::Act::G]; if (auto distType = mlir::dyn_cast(filterType)) { @@ -275,7 +276,7 @@ bool VPU::NCEMatMulOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strategy, S const auto outputType = mlir::cast(nceOp->getResult(0).getType()); auto numClusters = VPU::getOptimalNumClusters(nceOp, outputType.getShape(), strategy); auto mod = getModuleOp(getOperation()); - auto arch = VPU::getArch(mod); + auto arch = config::getArch(mod); auto filterType = mlir::cast(getWeights().getType()); auto largestGroupsNumPerCluster = filterType.getShape()[DimsGroups5D::Act::G]; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_max_pool.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_max_pool.cpp index b991ccfb4e..7b2da45a0a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_max_pool.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_max_pool.cpp @@ -6,6 +6,8 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" @@ -16,10 +18,11 @@ #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" - #include "vpux/compiler/utils/infer_output_shape.hpp" using namespace vpux; @@ -41,7 +44,7 @@ bool 
vpux::VPU::NCEMaxPoolOp::fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTy auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - auto arch = getArch(getOperation()); + auto arch = config::getArch(getOperation()); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -55,7 +58,7 @@ bool vpux::VPU::NCEMaxPoolOp::fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTy // bool vpux::VPU::NCEMaxPoolOp::isSupported(IE::MaxPoolOp op, LogCb logCb, bool checkLayout, bool checkChannelAlignment) { - auto arch = VPU::getArch(op); + auto arch = config::getArch(op); if (op.getType().getRank() != 4) { logCb(formatv("Only 4D tensors are supported")); @@ -112,10 +115,10 @@ bool vpux::VPU::NCEMaxPoolOp::isSupported(IE::MaxPoolOp op, LogCb logCb, bool ch mlir::LogicalResult vpux::VPU::NCEMaxPoolOp::verify() { const auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -247,7 +250,7 @@ mlir::FailureOr vpux::VPU::NCEMaxPoolOp::getTilingStrategy(TilingM // bool vpux::VPU::NCEMaxPoolOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = VPU::getArch(getOperation()); + const auto arch = config::getArch(getOperation()); const auto outputType = mlir::cast(getOutput().getType()); const auto batchSize = outputType.getShape()[Dims4D::Act::N]; @@ -303,7 +306,7 @@ bool VPU::NCEMaxPoolOp::isOperationSplitOverHeightCompatible(const vpux::TileInf auto tileOp = IE::getTileExecutor(moduleOp); const auto numTiles = tileOp.getCount(); - return isSOHSupportedByDPU(inputType, inputShape, numTiles, true, VPU::getArch(nceOp.getOperation())); 
+ return isSOHSupportedByDPU(inputType, inputShape, numTiles, true, config::getArch(nceOp.getOperation())); } bool VPU::NCEMaxPoolOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape, ShapeRef offset, ShapeRef axis) { @@ -343,7 +346,7 @@ bool VPU::NCEMaxPoolOp::doesLayerFitIntoCMX(VPU::MultiClusterStrategy strategy, auto totalAvailableCMXSize = reservedMem.count() == 0 ? getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - auto arch = getArch(getOperation()); + auto arch = config::getArch(getOperation()); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -440,3 +443,29 @@ mlir::LogicalResult vpux::VPU::NCEMaxPoolOp::verifyKernel(IE::MaxPoolOp origOp, return NCEInvariant::verifyKernel(origOp, KY, KX, SY, SX, padTop, padBottom, padLeft, padRight, log); } + +mlir::LogicalResult vpux::VPU::NCEMaxPoolOp::reifyResultShapes(mlir::OpBuilder& builder, + mlir::ReifiedRankedShapedTypeDims& reifiedReturnShapes) { + // Parse attributes + const auto strides = parseIntArrayAttr(getStrides()); + + const auto padTop = getPad().getTop().getValue().getSExtValue(); + const auto padBottom = getPad().getBottom().getValue().getSExtValue(); + const auto padLeft = getPad().getLeft().getValue().getSExtValue(); + const auto padRight = getPad().getRight().getValue().getSExtValue(); + + const auto dataPaddingAbove = SmallVector({padTop, padLeft}); + const auto dataPaddingBelow = SmallVector({padBottom, padRight}); + + const auto kernelSize = parseIntArrayAttr(getKernelSizeAttr()); + + // Compute output shape using utility + auto outShape = reifyConvPoolTensors(builder, getInput(), getOutput(), nullptr, kernelSize, strides, + dataPaddingAbove, dataPaddingBelow, getLoc()); + if (mlir::failed(outShape)) { + return outShape; + } + + reifiedReturnShapes.emplace_back(std::move(outShape.value())); + return mlir::success(); +} diff --git 
a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_permute.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_permute.cpp index 37bd71620e..3f546d73eb 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_permute.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_permute.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/permute_quantize_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" @@ -15,7 +15,8 @@ #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_support.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" @@ -52,7 +53,7 @@ bool vpux::VPU::NCEPermuteOp::fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTy : getTotalCMXFragmentationAwareSize(getOperation()).count(); SmallVector buffers = {input.getTotalAllocSize(), output.getTotalAllocSize()}; - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffers).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -139,10 +140,10 @@ bool vpux::VPU::NCEPermuteOp::isSupported(IE::PermuteQuantizeOp op, LogCb logCb, mlir::LogicalResult vpux::VPU::NCEPermuteOp::verify() { const auto op = getOperation(); - const auto arch = getArch(op); + const auto arch = config::getArch(op); // Skip checks if architecture 
is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -266,13 +267,13 @@ mlir::FailureOr vpux::VPU::NCEPermuteOp::getTilingStrategy(TilingM // bool vpux::VPU::NCEPermuteOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = getArch(getOperation()); + const auto arch = config::getArch(getOperation()); // SOK is only enabled on 40XX+, but 37XX also supports it, need to enable and refactor. // Tracked by: E116491 const auto origInputShape = getShape(this->getInput()); const auto expandedChannels = this->getExpandedChannels(); return strategy == VPU::MultiClusterStrategy::SplitOverHeightOverlapped || - (arch >= VPU::ArchKind::NPU40XX && strategy == VPU::MultiClusterStrategy::SplitOverKernel && + (arch >= config::ArchKind::NPU40XX && strategy == VPU::MultiClusterStrategy::SplitOverKernel && origInputShape[Dims4D::Act::C] == expandedChannels); } @@ -294,10 +295,10 @@ bool VPU::NCEPermuteOp::isOperationSplitOverWidthCompatible(ShapeRef outputShape } bool VPU::NCEPermuteOp::isOperationSplitOverKernelCompatible(ShapeRef outputShape, ShapeRef offset, ShapeRef axis) { - const auto arch = getArch(getOperation()); + const auto arch = config::getArch(getOperation()); const auto origInputShape = getShape(this->getInput()); const auto expandedChannels = this->getExpandedChannels(); - if (arch == VPU::ArchKind::NPU37XX || origInputShape[Dims4D::Act::C] != expandedChannels) { + if (arch == config::ArchKind::NPU37XX || origInputShape[Dims4D::Act::C] != expandedChannels) { return false; } return VPU::isOperationSplitOverKernelCompatible(getOperation(), outputShape, offset, axis); diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_reduce.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_reduce.cpp index 255110f241..10fd45ba39 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_reduce.cpp +++ 
b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_reduce.cpp @@ -4,13 +4,12 @@ #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" - #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/reduce_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" using namespace vpux; @@ -82,7 +81,7 @@ bool vpux::VPU::NCEReduceOp::isSupported(mlir::Operation* op, LogCb logCb, bool } if (checkLayout) { - if (!NCEInvariant::checkLayouts({inputType}, {outputType}, getArch(op), 1, logCb)) { + if (!NCEInvariant::checkLayouts({inputType}, {outputType}, config::getArch(op), 1, logCb)) { return false; } } @@ -98,7 +97,7 @@ bool vpux::VPU::NCEReduceOp::fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTyp auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - auto arch = getArch(getOperation()); + auto arch = config::getArch(getOperation()); return vpux::VPU::calculateAlignedBuffersMemoryRequirement(arch, buffers).count() + reservedMem.count() <= totalAvailableCMXSize; } @@ -134,7 +133,7 @@ mlir::FailureOr vpux::VPU::NCEReduceOp::getTilingStrategy(TilingMo // bool vpux::VPU::NCEReduceOp::checkStrategyCompatibility(VPU::MultiClusterStrategy strategy, size_t) { - const auto arch = VPU::getArch(getOperation()); + const auto arch = config::getArch(getOperation()); const auto outputType = mlir::cast(getOutput().getType()); const auto batchSize = outputType.getShape()[Dims4D::Act::N]; diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/negative.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/negative.cpp index cea4c81b0c..f379cc4d97 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/negative.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/negative.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -64,7 +64,7 @@ bool vpux::VPU::NegativeOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/normalize_l2.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/normalize_l2.cpp index eea71ad106..a0d450476a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/normalize_l2.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/normalize_l2.cpp @@ -7,11 +7,9 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -107,7 +105,7 @@ bool vpux::VPU::NormalizeL2Op::fitIntoCMX(llvm::ArrayRef auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/not_equal.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/not_equal.cpp index cceb7645ed..1825539052 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/not_equal.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/not_equal.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -74,7 +74,7 @@ bool vpux::VPU::NotEqualOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/pad.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/pad.cpp index 6731fecafd..ec20c5c442 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/pad.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/pad.cpp @@ -7,8 +7,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" @@ -169,7 +169,7 @@ bool vpux::VPU::PadOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/populate_weight_table.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/populate_weight_table.cpp index 02c7acfdd8..b529a0884f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/populate_weight_table.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/populate_weight_table.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/dialect/core/types.hpp" using namespace vpux; @@ -92,7 +92,7 @@ bool vpux::VPU::PopulateWeightTableOp::fitIntoCMX(ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/prelu.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/prelu.cpp index ca0df81693..e7e76e0dca 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/prelu.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/prelu.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -124,7 +124,7 @@ bool vpux::VPU::PReluOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/random_uniform.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/random_uniform.cpp index 579db54a93..3db4de120b 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/random_uniform.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/random_uniform.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -119,7 +119,7 @@ bool vpux::VPU::RandomUniformOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/rms.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/rms.cpp index 4fe1e8c809..0e4cb6cc67 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/rms.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/rms.cpp @@ -6,10 +6,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -83,7 +81,7 @@ bool vpux::VPU::RMSOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/roll.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/roll.cpp index b48abc7e2e..42fe4c4189 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/roll.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/roll.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" using namespace vpux; @@ -199,7 +200,7 @@ bool vpux::VPU::RollOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/rope.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/rope.cpp index ff0fab152c..35cb2d9d33 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/rope.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/rope.cpp @@ -6,10 +6,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -68,7 +66,7 @@ bool vpux::VPU::RoPEOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/round.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/round.cpp index e026d4add5..f1bbe8a079 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/round.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/round.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -63,7 +63,7 @@ bool vpux::VPU::RoundOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/sdpa.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/sdpa.cpp index 03ecb957de..64f650061c 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/sdpa.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/sdpa.cpp @@ -6,10 +6,9 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" -#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -83,7 +82,7 @@ bool vpux::VPU::SDPAOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/select.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/select.cpp index 79e6a4a2cb..44ce77e14a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/select.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/select.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -84,7 +84,7 @@ bool vpux::VPU::SelectOp::fitIntoCMX(llvm::ArrayRef buffe auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_cast.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_cast.cpp index 8b21dc111a..8cc839cb66 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_cast.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_cast.cpp @@ -4,16 +4,15 @@ // #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; using namespace VPU; namespace { DimArr getReshapedDims(ShapeCastOp shapeCastOp) { - auto inputShape = getShape(shapeCastOp.getSource()); - auto outputShape = getShape(shapeCastOp.getResult()); + auto inputShape = getShape(shapeCastOp.getInput()); + auto outputShape = getShape(shapeCastOp.getOutput()); DimArr reshapedDims; for (auto i : irange(inputShape.size())) { Dim dim(i); @@ -38,12 +37,12 @@ mlir::LogicalResult vpux::VPU::ShapeCastOp::inferReturnTypes(mlir::MLIRContext* } const auto outShape = parseIntArrayAttr(shapeCast.getShape()); - const auto inType = mlir::cast(shapeCast.getSource().getType()); + const auto inType = mlir::cast(shapeCast.getInput().getType()); auto getDistType = [&](VPU::DistributedTypeInterface inDistInterface) { - const auto arch = VPU::getArch(mlir::isa(operands[0]) - ? operands[0].getParentRegion()->getParentOfType() - : operands[0].getDefiningOp()); + const auto arch = config::getArch(mlir::isa(operands[0]) + ? 
operands[0].getParentRegion()->getParentOfType() + : operands[0].getDefiningOp()); const auto distAttr = VPUIP::getDistributedAttrAfterShapeCast(inDistInterface, outShape, arch); return inDistInterface.changeShapeForExplicitDistribution(ShapeRef(outShape), distAttr); @@ -71,8 +70,8 @@ vpux::InputTiling vpux::VPU::ShapeCastOp::backInferTileInfo(const vpux::TileInfo "ShapeCastOp does not support out tile with shape {0}, offset {1}", outputTile.shape, outputTile.offsets); - const auto inputShape = vpux::getShape(getSource()); - const auto outputShape = vpux::getShape(getResult()); + const auto inputShape = vpux::getShape(getInput()); + const auto outputShape = vpux::getShape(getOutput()); const auto tilingDims = getNonOneDim(outputTile.axis); TileInfo inputTile(inputShape); @@ -116,7 +115,7 @@ bool vpux::VPU::ShapeCastOp::isSupportedTilingDim(DimArrRef tilingDims) { } auto tilingDim = tilingDims.front(); - auto dimOrder = DimsOrder::fromValue(getSource()); + auto dimOrder = DimsOrder::fromValue(getInput()); auto idx0 = checked_cast(dimOrder.dimPos(reshapedDims[0])); auto idx1 = checked_cast(dimOrder.dimPos(reshapedDims[1])); if (std::abs(idx0 - idx1) != 1) { @@ -137,8 +136,8 @@ bool vpux::VPU::ShapeCastOp::isSupportedOutTile(const TileInfo& outTile) { return true; } - auto inputShape = vpux::getShape(getSource()); - auto outputShape = vpux::getShape(getResult()); + auto inputShape = vpux::getShape(getInput()); + auto outputShape = vpux::getShape(getOutput()); auto reshapedDims = getReshapedDims(*this); auto tilingDim = tilingDims.front(); @@ -169,7 +168,7 @@ mlir::FailureOr> vpux::VPU::ShapeCa } const auto srcShape = inType.getShape(); - const auto dstType = mlir::cast(getResult().getType()); + const auto dstType = mlir::cast(getOutput().getType()); const auto outShape = dstType.getShape(); auto mode = distribution.getDistributionMode(); @@ -216,10 +215,10 @@ mlir::FailureOr> vpux::VPU::ShapeCa mlir::OpFoldResult vpux::VPU::ShapeCastOp::fold(FoldAdaptor adaptor) { 
auto operands = adaptor.getOperands(); - auto inputType = mlir::cast(getSource().getType()); - auto outputType = mlir::cast(getResult().getType()); - if (getSource().getType() == getResult().getType()) { - return getSource(); + auto inputType = mlir::cast(getInput().getType()); + auto outputType = mlir::cast(getOutput().getType()); + if (inputType == outputType) { + return getInput(); } VPUX_THROW_UNLESS(!operands.empty(), "Wrong number of operands : {0}", operands.size()); @@ -247,12 +246,12 @@ class FuseShapeCast final : public mlir::OpRewritePattern { }; mlir::LogicalResult FuseShapeCast::matchAndRewrite(VPU::ShapeCastOp origOp, mlir::PatternRewriter& rewriter) const { - auto prevOp = origOp.getSource().getDefiningOp(); + auto prevOp = origOp.getInput().getDefiningOp(); if (prevOp == nullptr) { return mlir::failure(); } - rewriter.replaceOpWithNewOp(origOp, prevOp.getSource(), origOp.getShape()); + rewriter.replaceOpWithNewOp(origOp, prevOp.getInput(), origOp.getShape()); return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_of.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_of.cpp index c575eef6ec..c15579188b 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_of.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/shape_of.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -41,7 +42,7 @@ bool vpux::VPU::ShapeOfOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/sigmoid.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/sigmoid.cpp index 18c85799af..36766d2af7 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/sigmoid.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/sigmoid.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -66,7 +67,7 @@ bool vpux::VPU::SigmoidOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/sign.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/sign.cpp index 9740a87c29..871cb01d7e 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/sign.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/sign.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -66,7 +67,7 @@ bool vpux::VPU::SignOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/sin.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/sin.cpp index 0dbea6197c..d9f5ce537f 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/sin.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/sin.cpp @@ -5,8 +5,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -63,7 +63,7 @@ bool vpux::VPU::SinOp::fitIntoCMX(llvm::ArrayRef buffers, auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/slice.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/slice.cpp index ce7d6d37a7..3e217773da 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/slice.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/slice.cpp @@ -41,7 +41,7 @@ mlir::LogicalResult vpux::VPU::SliceOp::inferReturnTypes(mlir::MLIRContext* ctx, return mlir::failure(); } - const auto origType = mlir::dyn_cast(sliceOp.getSource().getType()); + const auto origType = mlir::dyn_cast(sliceOp.getInput().getType()); if (origType == nullptr) { return errorAt(loc, "VPU::SliceOp operand must have vpux::NDTypeInterface type"); } @@ -123,8 +123,8 @@ mlir::LogicalResult vpux::VPU::SliceOp::inferReturnTypes(mlir::MLIRContext* ctx, mlir::OpFoldResult VPU::SliceOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); - if (getSource().getType() == getResult().getType()) { - return getSource(); + if (getInput().getType() == getOutput().getType()) { + return getInput(); } if (const auto origContent = mlir::dyn_cast_or_null(operands[0])) { @@ -147,7 +147,7 @@ class ComposeSlice final : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; mlir::LogicalResult matchAndRewrite(VPU::SliceOp origOp, mlir::PatternRewriter& rewriter) const final { - auto producerSliceOp = origOp.getSource().getDefiningOp(); + auto producerSliceOp = origOp.getInput().getDefiningOp(); if (producerSliceOp == nullptr) { return mlir::failure(); } @@ -160,8 +160,7 @@ class ComposeSlice final : public mlir::OpRewritePattern { const auto finalOffsetsAttr = getIntArrayAttr(getContext(), 
finalOffsets); const auto finalShapeAttr = origOp.getStaticSizes(); - rewriter.replaceOpWithNewOp(origOp, producerSliceOp.getSource(), finalOffsetsAttr, - finalShapeAttr); + rewriter.replaceOpWithNewOp(origOp, producerSliceOp.getInput(), finalOffsetsAttr, finalShapeAttr); return mlir::success(); } @@ -183,13 +182,13 @@ class RemoveRedundantExpandSlice final : public mlir::OpRewritePattern(); + auto expandOp = sliceOp.getInput().getDefiningOp(); if (expandOp == nullptr) { return mlir::failure(); } const auto origInputShape = getShape(expandOp.getInput()); - const auto origOutputShape = getShape(sliceOp.getResult()); + const auto origOutputShape = getShape(sliceOp.getOutput()); if (origInputShape != origOutputShape) { return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/softmax.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/softmax.cpp index ece73cc043..b788584d9d 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/softmax.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/softmax.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -105,7 +106,7 @@ bool vpux::VPU::SoftMaxOp::fitIntoCMX(llvm::ArrayRef buff auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/sqrt.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/sqrt.cpp index b648ac4279..81b427fad7 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/sqrt.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/sqrt.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -66,7 +67,7 @@ bool vpux::VPU::SqrtOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/subtract.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/subtract.cpp index b647172528..107029c155 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/subtract.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/subtract.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -81,7 +81,7 @@ bool vpux::VPU::SubtractOp::fitIntoCMX(llvm::ArrayRef buf auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/swish.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/swish.cpp index 1456b14aee..080faf1b96 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/swish.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/swish.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -67,7 +68,7 @@ bool vpux::VPU::SwishOp::fitIntoCMX(llvm::ArrayRef buffer auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/tanh.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/tanh.cpp index 93e56b7057..26aaabd9a4 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/tanh.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/tanh.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -62,7 +63,7 @@ bool vpux::VPU::TanhOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/topk.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/topk.cpp index 40c6cf8943..6d85a77908 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/topk.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/topk.cpp @@ -6,8 +6,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes_utils.hpp" using namespace vpux; @@ -182,7 +182,7 @@ bool vpux::VPU::TopKOp::fitIntoCMX(llvm::ArrayRef buffers auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(getOperation()).count() : getTotalCMXFragmentationAwareSize(getOperation()).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(getOperation()), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(getOperation()), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/transposed_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/transposed_convolution.cpp index ed715b929f..fd403d0e20 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/ops/transposed_convolution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/transposed_convolution.cpp @@ -5,7 +5,10 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" +#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" @@ -56,11 +59,19 @@ mlir::LogicalResult vpux::VPU::TransposedConvolutionOp::inferReturnTypes( auto outType = featureType.changeShape(Shape(mlirOutputShape)); inferredReturnTypes.push_back(outType); } else { - const auto mlirOutputShape = - inferTransposedConvBackpropOutputShape(featureShape, filterShape, windowStrides, dataPaddingBelow, - dataPaddingAbove, windowDilations, outputPadding); + const auto inputType = mlir::cast(convBackpropData.getInput().getType()); + const auto filterType = mlir::cast(convBackpropData.getFilter().getType()); - auto outType = featureType.changeShape(Shape(mlirOutputShape)); + const auto inShapeInfo = ShapeInfo::fromNDType(inputType); + const auto filterShapeInfo = ShapeInfo::fromNDType(filterType); + + auto shapeInfo = inferTransposedConvBackpropOutputShapeInfo(inShapeInfo, filterShapeInfo, windowStrides, + 
dataPaddingBelow, dataPaddingAbove, windowDilations, + outputPadding); + + const auto outDesc = + vpux::getTensorAttr(ctx, inputType.getDimsOrder(), /*memSpace=*/nullptr, Bounds(shapeInfo.bounds)); + const auto outType = mlir::RankedTensorType::get(shapeInfo.shape, inputType.getElementType(), outDesc); inferredReturnTypes.push_back(outType); } diff --git a/src/vpux_compiler/src/dialect/VPU/IR/tiling_info.cpp b/src/vpux_compiler/src/dialect/VPU/IR/tiling_info.cpp index 5b30c13df7..b4f588a421 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/tiling_info.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/tiling_info.cpp @@ -4,12 +4,14 @@ // #include "vpux/compiler/dialect/VPU/IR/tiling_info.hpp" -#include #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/utils/core/error.hpp" +#include + namespace vpux::VPU { OutputTiling DetectionOutputSortOpOutputTiling(const vpux::TileInfo& firstOutputTile) { diff --git a/src/vpux_compiler/src/dialect/VPU/IR/types/distributed_tensor.cpp b/src/vpux_compiler/src/dialect/VPU/IR/types/distributed_tensor.cpp index e9823aa8f1..e708b0606b 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/types/distributed_tensor.cpp +++ b/src/vpux_compiler/src/dialect/VPU/IR/types/distributed_tensor.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/IR/types/sparse_tensor.cpp b/src/vpux_compiler/src/dialect/VPU/IR/types/sparse_tensor.cpp index 0eb173ccbc..9da2eac07a 100644 --- a/src/vpux_compiler/src/dialect/VPU/IR/types/sparse_tensor.cpp +++ 
b/src/vpux_compiler/src/dialect/VPU/IR/types/sparse_tensor.cpp @@ -3,18 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" -#include "vpux/utils/core/numeric.hpp" #include -#include - using namespace vpux; // diff --git a/src/vpux_compiler/src/dialect/VPU/interfaces/common_utils/layer_permute_ie.cpp b/src/vpux_compiler/src/dialect/VPU/interfaces/common_utils/layer_permute_ie.cpp index c736d126ae..a43b8d1dc5 100644 --- a/src/vpux_compiler/src/dialect/VPU/interfaces/common_utils/layer_permute_ie.cpp +++ b/src/vpux_compiler/src/dialect/VPU/interfaces/common_utils/layer_permute_ie.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/dialect/VPU/interfaces/common_utils/layer_permute_ie.hpp" - +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sep_utils.hpp" #include "vpux/compiler/utils/permute_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/barrier_variant_constraint.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/barrier_variant_constraint.cpp index 0fe190bca8..250ded2809 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/barrier_variant_constraint.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/barrier_variant_constraint.cpp @@ -11,13 +11,13 @@ using namespace vpux; -VPU::PerBarrierVariantConstraint VPU::getPerBarrierVariantConstraint(VPU::ArchKind arch, +VPU::PerBarrierVariantConstraint VPU::getPerBarrierVariantConstraint(config::ArchKind arch, bool enableWorkloadManagement) { 
switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return VPU::arch37xx::PerBarrierVariantConstraint{}; } - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU40XX: { return VPU::arch40xx::PerBarrierVariantConstraint{enableWorkloadManagement}; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.cpp index a81c142ed6..80d69a100a 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.cpp @@ -5,15 +5,15 @@ #include "vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp" #include "vpux/compiler/NPU40XX/dialect/VPU/impl/convert_ops_to_dma_for_performant_execution_strategy.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" #include using namespace vpux::VPU; std::unique_ptr vpux::VPU::createConvertOpToDMAForPerformantExecutionStrategy( - ArchKind arch) { - if (arch >= VPU::ArchKind::NPU40XX) { + config::ArchKind arch) { + if (arch >= config::ArchKind::NPU40XX) { return std::make_unique(); } // TODO : E#-118296 Other ops and architectures will be enabled. 
diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/frequency_table.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/frequency_table.cpp index 1bd1f8e612..f46880c516 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/frequency_table.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/frequency_table.cpp @@ -11,9 +11,9 @@ using namespace vpux; -VPU::FrequencyTableCb VPU::getFrequencyTable(VPU::ArchKind arch) { +VPU::FrequencyTableCb VPU::getFrequencyTable(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU40XX: { return VPU::arch40xx::getFrequencyTable; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/gather_dma_constants.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/gather_dma_constants.cpp index 50d3669331..02c45078b7 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/gather_dma_constants.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/gather_dma_constants.cpp @@ -8,18 +8,18 @@ using namespace vpux; -size_t VPU::getGatherDMAMaxIndicesListLength(VPU::ArchKind arch) { +size_t VPU::getGatherDMAMaxIndicesListLength(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPU::arch40xx::DMA_MAX_INDICES_LIST_LENGTH; default: VPUX_THROW("Unable to get GatherDMAMaxIndicesListLength for arch {0}", arch); } }; -size_t VPU::getGatherDMAMaxElementSize(VPU::ArchKind arch) { +size_t VPU::getGatherDMAMaxElementSize(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPU::arch40xx::GATHER_DMA_MAX_ELEMENT_SIZE; default: VPUX_THROW("Unable to get GatherDMAMaxElementSize for arch {0}", arch); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/make_ops_with_distributed_tensor_strategy_getter.cpp 
b/src/vpux_compiler/src/dialect/VPU/transforms/factories/make_ops_with_distributed_tensor_strategy_getter.cpp index f12ab08a79..d3d4456407 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/make_ops_with_distributed_tensor_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/make_ops_with_distributed_tensor_strategy_getter.cpp @@ -7,10 +7,8 @@ #include #include "vpux/compiler/NPU37XX/dialect/VPU/impl/make_ops_with_distributed_tensor_strategy.hpp" #include "vpux/compiler/NPU40XX/dialect/VPU/impl/make_ops_with_distributed_tensor_strategy.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/utils/logger/logger.hpp" using namespace vpux; @@ -18,10 +16,10 @@ std::unique_ptr VPU::createMakeOpsWithDistributedTensorSt mlir::func::FuncOp funcOp, const llvm::DenseMap& typeLookup, const llvm::DenseMap>& inputTypeLookup, bool enableExplicitDistributionInfoAttr) { - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return std::make_unique(typeLookup, inputTypeLookup, enableExplicitDistributionInfoAttr); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_kernel_size_constant.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_kernel_size_constant.cpp index 5d6d93fc5b..619ad2e282 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_kernel_size_constant.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_kernel_size_constant.cpp @@ -10,10 +10,10 @@ using namespace vpux; -VPU::MaxKernelSizeConstant VPU::getMaxKernelSizeConstant(VPU::ArchKind arch) { +VPU::MaxKernelSizeConstant VPU::getMaxKernelSizeConstant(config::ArchKind arch) { switch (arch) { - case 
VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return VPU::arch37xx::MaxKernelSizeConstant{}; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.cpp index cab355abca..9ab341c543 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.cpp @@ -6,16 +6,14 @@ #include "vpux/compiler/NPU40XX/dialect/VPU/impl/max_lstm_hidden_size_constant.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/max_lstm_hidden_size_constant.hpp" -#include "vpux/utils/core/error.hpp" - using namespace vpux; constexpr int64_t maxLstmSequenceHiddenSizeConstant = 0; constexpr int64_t maxLstmCellHiddenSizeConstant = 0; -int64_t VPU::getMaxLstmSequenceHiddenSizeConstant(VPU::ArchKind arch) { +int64_t VPU::getMaxLstmSequenceHiddenSizeConstant(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return maxLstmSequenceHiddenSizeConstant; } default: { @@ -24,9 +22,9 @@ int64_t VPU::getMaxLstmSequenceHiddenSizeConstant(VPU::ArchKind arch) { } } -int64_t VPU::getMaxLstmCellHiddenSizeConstant(VPU::ArchKind arch) { +int64_t VPU::getMaxLstmCellHiddenSizeConstant(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return maxLstmCellHiddenSizeConstant; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/mc_strategy_getter.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/mc_strategy_getter.cpp index 4af5c9a25c..eb2fb0e300 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/mc_strategy_getter.cpp +++ 
b/src/vpux_compiler/src/dialect/VPU/transforms/factories/mc_strategy_getter.cpp @@ -9,12 +9,12 @@ using namespace vpux::VPU; -std::unique_ptr vpux::VPU::createMCStrategyGetter(ArchKind arch, int64_t numClusters) { +std::unique_ptr vpux::VPU::createMCStrategyGetter(config::ArchKind arch, int64_t numClusters) { if (numClusters == 1) { return std::make_unique(); } switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return std::make_unique(); } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/nce_sparsity_converters.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/nce_sparsity_converters.cpp index a3f418d7a1..99b82cc205 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/nce_sparsity_converters.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/nce_sparsity_converters.cpp @@ -10,27 +10,27 @@ using namespace vpux; -VPU::NCESparsity::PPEConverterCb VPU::NCESparsity::getPPEConverterCb(VPU::ArchKind arch, +VPU::NCESparsity::PPEConverterCb VPU::NCESparsity::getPPEConverterCb(config::ArchKind arch, [[maybe_unused]] bool isNewWeightTableFormat) { switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: { + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: { return VPU::arch37xx::getScale; } - case VPU::ArchKind::UNKNOWN: + case config::ArchKind::UNKNOWN: default: { VPUX_THROW("Unexpected architecture {0}", arch); } } } -VPU::NCESparsity::BiasConverterCb VPU::NCESparsity::getBiasConverterCb(VPU::ArchKind arch, +VPU::NCESparsity::BiasConverterCb VPU::NCESparsity::getBiasConverterCb(config::ArchKind arch, [[maybe_unused]] bool isNewWeightTableFormat) { switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: return VPU::arch37xx::getBias; - case VPU::ArchKind::UNKNOWN: + case config::ArchKind::UNKNOWN: default: { VPUX_THROW("Unexpected architecture 
{0}", arch); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_controls_dpu.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_controls_dpu.cpp index 5126132b8e..ac9952f8f1 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_controls_dpu.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_controls_dpu.cpp @@ -17,31 +17,31 @@ using namespace vpux; constexpr bool shaveControlsDpuValue = false; -bool VPU::getShaveControlsDpu(VPU::ArchKind arch) { +bool VPU::getShaveControlsDpu(config::ArchKind arch) { (void)arch; return shaveControlsDpuValue; } -size_t VPU::getDpuDebugDataSize(VPU::ArchKind /*arch*/) { +size_t VPU::getDpuDebugDataSize(config::ArchKind /*arch*/) { return sizeof(HwpDpuIduOduData_t); } -size_t VPU::getDPUInvariantDataSize(VPU::ArchKind arch) { +size_t VPU::getDPUInvariantDataSize(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return sizeof(npu37xx::nn_public::VpuDPUInvariantRegisters); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return sizeof(npu40xx::nn_public::VpuDPUInvariantRegisters); default: VPUX_THROW("Unable to get DPUInvariantDataSize for arch {0}", arch); } } -size_t VPU::getDPUVariantDataSize(VPU::ArchKind arch) { +size_t VPU::getDPUVariantDataSize(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return sizeof(npu37xx::nn_public::VpuDPUVariantRegisters); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return sizeof(npu40xx::nn_public::VpuDPUVariantRegisters); default: VPUX_THROW("Unable to get DPUVariantDataSize for arch {0}", arch); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_kernel_info.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_kernel_info.cpp index f1a0a5dffe..8f5fef8c5b 100644 --- 
a/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_kernel_info.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/shave_kernel_info.cpp @@ -6,15 +6,16 @@ #include "vpux/compiler/dialect/VPU/transforms/factories/shave_kernel_info.hpp" #include "vpux/compiler/NPU37XX/dialect/VPU/impl/shave_kernel_info.hpp" #include "vpux/compiler/NPU40XX/dialect/VPU/impl/shave_kernel_info.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/utils/core/error.hpp" using namespace vpux; std::unique_ptr VPU::getShaveKernelInfo(mlir::Operation* op) { - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return std::make_unique(op); } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/factories/sparsity_constraint.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/factories/sparsity_constraint.cpp index fdaac9bee7..b6261ce885 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/factories/sparsity_constraint.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/factories/sparsity_constraint.cpp @@ -11,9 +11,9 @@ using namespace vpux; -VPU::SparsityConstraint VPU::getSparsityConstraint(VPU::ArchKind arch) { +VPU::SparsityConstraint VPU::getSparsityConstraint(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return VPU::arch37xx::SparsityConstraint{}; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes.cpp index 07f4579586..e81cc56b9b 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" namespace vpux::VPU { diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/add_sparsity_map_to_sparse_activations.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/add_sparsity_map_to_sparse_activations.cpp index 442d7dbf3b..32e3fb5697 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/add_sparsity_map_to_sparse_activations.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/add_sparsity_map_to_sparse_activations.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -76,7 +77,7 @@ void AddSparsityMapToSparseActivationsPass::safeRunOnFunc() { auto isSEOnlyOp = mlir::isa(op) && sparseType.getSparsityMap() == nullptr && sparseType.getStorageElementTable() != nullptr; - if (VPU::isSEOnlyWithoutSMSupported(VPU::getArch(op)) && isSEOnlyOp) { + if (VPU::isSEOnlyWithoutSMSupported(config::getArch(op)) && isSEOnlyOp) { return; } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/apply_tiling.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/apply_tiling.cpp index 5e975f7d08..3ba6846f41 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/apply_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/apply_tiling.cpp @@ -63,7 +63,7 @@ mlir::LogicalResult ApplyTiling::matchAndRewrite(VPU::TilingBuilderOpInterface o _log.nest().trace("Applying tiling for op {0} at {1}, tiles: {2}", op->getName(), op->getLoc(), strategy); - auto tilingContext = VPU::createTilingContext(op, strategy, _enableSCFTiling); + auto tilingContext = VPU::createTilingContext(op, _enableSCFTiling); auto tilingResult = tilingContext.applyTiling(rewriter, _log); return tilingResult; diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_dma_reserve_mem.cpp 
b/src/vpux_compiler/src/dialect/VPU/transforms/passes/compress_dma_reserve_mem.cpp similarity index 69% rename from src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_dma_reserve_mem.cpp rename to src/vpux_compiler/src/dialect/VPU/transforms/passes/compress_dma_reserve_mem.cpp index 36c6a37c89..2577101f0c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_dma_reserve_mem.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/compress_dma_reserve_mem.cpp @@ -4,15 +4,16 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/utils/compression_utils.hpp" -namespace vpux::VPUIP { +namespace vpux::VPU { #define GEN_PASS_DECL_COMPRESSDMARESERVEMEM #define GEN_PASS_DEF_COMPRESSDMARESERVEMEM -#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" -} // namespace vpux::VPUIP +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU using namespace vpux; @@ -22,7 +23,7 @@ namespace { // CompressDmaReserveMemPass // -class CompressDmaReserveMemPass final : public VPUIP::impl::CompressDmaReserveMemBase { +class CompressDmaReserveMemPass final : public VPU::impl::CompressDmaReserveMemBase { public: explicit CompressDmaReserveMemPass(Logger log) { Base::initLogger(log, Base::getArgumentName()); @@ -49,6 +50,6 @@ void CompressDmaReserveMemPass::safeRunOnModule() { // createCompressDmaReserveMemPass // -std::unique_ptr vpux::VPUIP::createCompressDmaReserveMemPass(Logger log) { +std::unique_ptr vpux::VPU::createCompressDmaReserveMemPass(Logger log) { return std::make_unique(log); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_inputs.cpp 
b/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_inputs.cpp new file mode 100644 index 0000000000..b957314fee --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_inputs.cpp @@ -0,0 +1,104 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/weights_separation.hpp" +#include "vpux/compiler/dialect/net/IR/ops.hpp" +#include "vpux/compiler/utils/net/network_info_utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include +#include +#include +#include + +#include + +namespace vpux::VPU { +#define GEN_PASS_DECL_CONCATINITINPUTS +#define GEN_PASS_DEF_CONCATINITINPUTS +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; + +namespace { + +// Returns a unique name for concatenated init inputs. 
+std::string getUniqueConcatenatedNameOfInitInputs(ArrayRef names) { + if (names.size() == 1) { + // Note: preserve the original name for a single argument + return names.front().str(); + } + + llvm::hash_code hashCode = llvm::hash_combine(names); + return formatv("{0}hash_{1}_concat", Const::IMPORTED_WEIGHT_PREFIX, hashCode); +} + +class ConcatInitInputs final : public VPU::impl::ConcatInitInputsBase { +public: + enum class Mode { Unspecified, GenerateMain, GenerateInit, GenerateAll }; + + explicit ConcatInitInputs(const Logger& log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnModule() final; + + size_t concatenateFunctionInputs(mlir::func::FuncOp funcOp); + void updateNetworkInfo(net::NetworkInfoOp netInfo, mlir::func::FuncOp funcOp, size_t newInputsOffset); +}; + +size_t ConcatInitInputs::concatenateFunctionInputs(mlir::func::FuncOp funcOp) { + SmallVector indices(funcOp.getNumArguments(), 0); + std::iota(indices.begin(), indices.end(), 0); + + VPU::obfuscateInputs(_log, appendLoc(funcOp.getLoc(), "obfuscated_inputs"), funcOp, indices, + [](mlir::OpBuilder& builder, mlir::Location loc, mlir::Value input, ArrayRef offsets, + ArrayRef sizes) { + return builder.create(loc, input, offsets, sizes); + }); + return 0; +} + +void ConcatInitInputs::updateNetworkInfo(net::NetworkInfoOp netInfo, mlir::func::FuncOp funcOp, + size_t newInputsOffset) { + OpBuilderLogger builderLog(_log.nest()); + mlir::OpBuilder builder(&getContext(), &builderLog); + + auto& inputsRegion = netInfo.getInputsInfo(); + const auto namesToBeMerged = to_small_vector(inputsRegion.front().getOps() | + transformed([](net::DataInfoOp op) -> StringRef { + return op.getName(); + })); + + // update input types + // Note: preserve original, non-constant inputs information + net::eraseSectionEntries(inputsRegion, newInputsOffset); + builder.setInsertionPointToEnd(&inputsRegion.front()); + + builder.create(appendLoc(netInfo.getLoc(), "concat_in"), + 
getUniqueConcatenatedNameOfInitInputs(namesToBeMerged), + funcOp.getFunctionType().getInput(newInputsOffset)); +} + +void ConcatInitInputs::safeRunOnModule() { + auto moduleOp = getOperation(); + + net::NetworkInfoOp netInfo; + mlir::func::FuncOp entryPointFunc; + net::NetworkInfoOp::getFromModule(moduleOp, netInfo, entryPointFunc); + + const auto offset = concatenateFunctionInputs(entryPointFunc); + updateNetworkInfo(netInfo, entryPointFunc, offset); +} + +} // namespace + +std::unique_ptr vpux::VPU::createConcatInitInputsPass(const Logger& log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_results.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_results.cpp index 64382a8d4e..241788284e 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_results.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/concat_init_results.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/weights_separation.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" +#include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/net/network_info_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" @@ -16,6 +17,7 @@ #include #include #include + #include namespace vpux::VPU { @@ -58,7 +60,7 @@ std::string getUniqueConcatenatedNameOfInitResults(ArrayRef args, const size_t hash = Const::ContentAttr::getTransformationHash(arg.transformations); hashCode = llvm::hash_combine(hashCode, hash); } - return formatv("out_{0}_{1}_hash_{2}_concat", Const::OPENVINO_CONST_PREFIX, initPart, hashCode); + return formatv("{0}{1}_hash_{2}_concat", 
vpux::VPU::INIT_OUTPUT_PREFIX, initPart, hashCode); } // Returns a new ranked tensor without the tensor encoding. @@ -74,6 +76,11 @@ class ConcatInitResults final : public VPU::impl::ConcatInitResultsBase(); VPUX_THROW_UNLESS(info.has_value(), "VPU::WeightsSeparationInfo analysis must be cached"); @@ -194,6 +203,60 @@ void ConcatInitResults::updateNetworkInfoForMain(net::NetworkInfoOp netInfo, mli } } +void ConcatInitResults::updateWrapperMain(mlir::func::FuncOp wrapperFunc, const DerivedWeightsSeparationInfo& info) { + // entry-point is a wrapper function: when dealing with init() call, + // update it as init; otherwise, update as main. + struct { + mlir::func::CallOp initCall = nullptr; + mlir::func::FuncOp initFunc = nullptr; + mlir::func::CallOp mainCall = nullptr; + mlir::func::FuncOp mainFunc = nullptr; + } curr; // current call-site information inside wrapper_main + wrapperFunc.walk([&](mlir::func::CallOp callOp) { + auto funcOp = getCalledFunction(callOp); + if (funcOp.getSymName().starts_with("init")) { // definitely init + assert(curr.initCall == nullptr && "More than 1 init calls found!"); + curr.initCall = callOp; + curr.initFunc = funcOp; + } else { + assert(curr.mainCall == nullptr && "More than 1 main calls found!"); + curr.mainCall = callOp; + curr.mainFunc = funcOp; + } + }); + if (bool emptyInitCase = (curr.initCall == nullptr); emptyInitCase) { + // nothing to do because there's no init and thus main doesn't need to + // be updated. 
+ return; + } + + updateInit(curr.initFunc); + std::ignore = updateTopLevelMain(curr.mainFunc, info); + + // once functions are updated, fix the call-sites + OpBuilderLogger builderLog(_log.nest()); + mlir::OpBuilder builder(wrapperFunc, &builderLog); + + // new init call + builder.setInsertionPoint(curr.initCall); + auto newInitCall = builder.create(curr.initCall.getLoc(), getCalledFunction(curr.initCall), + curr.initCall.getOperands()); + + // new main call + builder.setInsertionPoint(curr.mainCall); + auto newMainCallOperands = to_small_vector(curr.mainCall->getOperands() | filtered([&](mlir::Value x) { + return x.getDefiningOp() != curr.initCall; + })); + newMainCallOperands.push_back(newInitCall.getResult(0)); // Note: guaranteed single result + auto newMainCall = builder.create(curr.mainCall.getLoc(), getCalledFunction(curr.mainCall), + newMainCallOperands); + + // replace + curr.mainCall.replaceAllUsesWith(newMainCall.getResults()); + curr.mainCall.erase(); + curr.initCall.erase(); +} + void ConcatInitResults::safeRunOnModule() { auto moduleOp = getOperation(); if (mlir::failed(deferredInitialize(moduleOp))) { @@ -238,6 +301,10 @@ void ConcatInitResults::safeRunOnModule() { updateNetworkInfoForMain(netInfo, entryPointFunc, offset, info); break; } + case Mode::GenerateAll: { + updateWrapperMain(entryPointFunc, info); + break; + } default: VPUX_THROW("Invalid mode encountered"); } @@ -251,6 +318,8 @@ mlir::LogicalResult ConcatInitResults::initialize(mlir::MLIRContext*) { _mode = Mode::GenerateMain; } else if (modeString == "gen-init") { _mode = Mode::GenerateInit; + } else if (modeString == "gen-all") { + _mode = Mode::GenerateAll; } else { return mlir::failure(); } @@ -279,8 +348,11 @@ mlir::LogicalResult ConcatInitResults::deferredInitialize(mlir::ModuleOp moduleO break; } case Mode::GenerateAll: { - moduleOp->emitError(formatv("{0} mode is not supported", wsExtractionMode.getValue())); - return mlir::failure(); + if (limitSpecified) { + 
moduleOp->emitError(formatv("{0} is not supported in monolithic mode", memoryLimit.getArgStr())); + return mlir::failure(); + } + [[fallthrough]]; } case Mode::GenerateMain: { if (initPartSpecified) { @@ -304,3 +376,8 @@ mlir::LogicalResult ConcatInitResults::deferredInitialize(mlir::ModuleOp moduleO std::unique_ptr vpux::VPU::createConcatInitResultsPass(const Logger& log) { return std::make_unique(log); } + +std::unique_ptr vpux::VPU::createConcatInitResultsPass(StringRef wsExtractionModeString, + const Logger& log) { + return std::make_unique(wsExtractionModeString, log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_const_args_to_multi_constants.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_const_args_to_multi_constants.cpp index c02941959d..7d544e5139 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_const_args_to_multi_constants.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_const_args_to_multi_constants.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_dynamic_to_static_kernels.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_dynamic_to_static_kernels.cpp new file mode 100644 index 0000000000..3f754afd92 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_dynamic_to_static_kernels.cpp @@ -0,0 +1,386 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/net/IR/ops.hpp" +#include "vpux/compiler/utils/logging.hpp" + +#include +#include +#include +#include +#include + +namespace vpux::VPU { +#define GEN_PASS_DECL_CONVERTDYNAMICTOSTATICKERNELS +#define GEN_PASS_DEF_CONVERTDYNAMICTOSTATICKERNELS +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; +using namespace VPU; + +namespace { + +// +// ConvertDynamicToStaticKernelsPass +// +class ConvertDynamicToStaticKernelsPass final : + public VPU::impl::ConvertDynamicToStaticKernelsBase { +public: + explicit ConvertDynamicToStaticKernelsPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnModule() final; + bool adjustIndexIntoDynamicTensor(mlir::scf::ForOp forOp); + + auto isDynamicTensor(mlir::Value value) { + if (auto rankedTensorType = mlir::dyn_cast(value.getType())) { + return !rankedTensorType.hasStaticShape(); + } + return false; + }; + + bool isSupportedDynamicTensorType(mlir::Value value) { + if (auto rankedTensorType = mlir::dyn_cast(value.getType())) { + if (llvm::count(rankedTensorType.getShape(), mlir::ShapedType::kDynamic) == 1) { + return true; + } else { + mlir::emitError(value.getLoc(), "Expected a ranked tensor type with exactly one dynamic dimension"); + } + } + return false; + }; + + void tryAndInferStaticShapes(mlir::Value value, int64_t staticDimSize); + mlir::RankedTensorType inferStaticTensorType(mlir::Value value, int64_t staticDimSize, bool keepBounds = true); + mlir::func::FuncOp createStaticFuncOp(mlir::func::FuncOp dynFuncOp, const mlir::FunctionType& staticFuncType, + mlir::ModuleOp moduleOp, int64_t staticDimSize, bool eraseDynamicFunc = true); + void addConversionCast(mlir::Operation* 
sliceOp, int64_t staticDimSize); +}; + +/** + * @brief Adjusts indices into dynamic tensors within scf.for loops based on the current index and fixed step size. + * When the remaining elements in the current iteration are fewer than the step size, this method backtracks the index + * to ensure extraction of a static-shaped tensor. + * + * For example, given a dynamic tensor of shape <1x1x32x?xfp16> with bounds [1, 1, 32, 1000] and step size 100, + * processing an input tensor of <1x1x32x250xfp16> would normally have the last iteration at index 200, + * but only 50 elements would remain. This method adjusts the index to 150 (extracting a slice + * from offset 150 with size 100), resulting in a static-shaped tensor <1x1x32x100xfp16>. + */ +bool ConvertDynamicToStaticKernelsPass::adjustIndexIntoDynamicTensor(mlir::scf::ForOp forOp) { + auto affineMinOps = forOp.getOps(); + if (affineMinOps.empty()) { + _log.error("affine.min operation not found in the scf.for loop body. Skipping conversion for this loop."); + return false; + } + + if (std::distance(affineMinOps.begin(), affineMinOps.end()) > 1) { + _log.warning("Multiple affine.min operations found in the scf.for loop body !!"); + } + + auto affineMinOp = *affineMinOps.begin(); + mlir::OpBuilder builder(affineMinOp); + builder.setInsertionPointAfter(affineMinOp); + auto minValue = affineMinOp.getResult(); + auto cmpOp = builder.create(forOp.getLoc(), mlir::arith::CmpIPredicate::ne, minValue, + forOp.getStep()); + auto ifOp = builder.create(forOp.getLoc(), minValue.getType(), cmpOp.getResult(), + /*withElseRegion=*/true); + + // Compute and return the adjusted index (backtrack index) in the 'then' region + mlir::OpBuilder thenBuilder = ifOp.getThenBodyBuilder(); + auto stepDiff = thenBuilder.create(ifOp.getLoc(), forOp.getStep(), minValue); + + // Check whether we have enough elements to backtrack + auto canBacktrack = thenBuilder.create(stepDiff.getLoc(), mlir::arith::CmpIPredicate::slt, + forOp.getInductionVar(), 
stepDiff); + thenBuilder.create(canBacktrack.getLoc(), canBacktrack, + "Not enough elements to backtrack in scf.for loop"); + + auto adjustedIndex = thenBuilder.create(ifOp.getLoc(), forOp.getInductionVar(), stepDiff); + thenBuilder.create(ifOp.getLoc(), mlir::ValueRange{adjustedIndex}); + + // return the default induction variable as step size in the 'else' region + mlir::OpBuilder elseBuilder = ifOp.getElseBodyBuilder(); + elseBuilder.create(ifOp.getLoc(), mlir::ValueRange{forOp.getInductionVar()}); + + auto inductionVar = forOp.getInductionVar(); + inductionVar.replaceUsesWithIf(ifOp.getResult(0), [&](mlir::OpOperand& operand) { + return llvm::isa(operand.getOwner()->getDialect()); + }); + minValue.replaceUsesWithIf(forOp.getStep(), [&](mlir::OpOperand& operand) { + return llvm::isa(operand.getOwner()->getDialect()); + }); + + return true; +} + +/** + * @brief This function traverses operations in reverse order, starting from the input operand, + * and collects all operations that are parents of the operand tensors until it reaches slice operations. + * It ensures that all operations contributing to dynamic tensors are captured for further processing. 
+ */ +void ConvertDynamicToStaticKernelsPass::tryAndInferStaticShapes(mlir::Value value, int64_t staticDimSize) { + SmallVector worklist = {value}; + llvm::DenseSet visited, dynamicTensorOps, sliceOps; + while (!worklist.empty()) { + auto op = worklist.pop_back_val(); + auto definingOp = op.getDefiningOp(); + if (definingOp == nullptr || !visited.insert(definingOp).second) { + continue; + } + + // Only traverse ops within the same scf.for region + if (!definingOp->getParentOfType()) { + continue; + } + + if (mlir::isa(definingOp)) { + sliceOps.insert(definingOp); + continue; + } else { + dynamicTensorOps.insert(definingOp); + } + + for (auto operand : definingOp->getOperands()) { + if (isDynamicTensor(operand)) { + worklist.push_back(operand); + } + } + } + + VPUX_THROW_WHEN(sliceOps.empty(), + "No slice operations found in the scf.for loop. Cannot convert dynamic to static shapes"); + + llvm::DenseSet sliceOpsWithConversionCast; + for (auto sliceOp : sliceOps) { + if (!sliceOpsWithConversionCast.insert(sliceOp).second) { + continue; + } + addConversionCast(sliceOp, staticDimSize); + } + + VPUX_THROW_WHEN(dynamicTensorOps.size() > 0, "Unsupported operations with dynamic tensors were found"); +} + +mlir::RankedTensorType ConvertDynamicToStaticKernelsPass::inferStaticTensorType(mlir::Value value, + int64_t staticDimSize, + bool keepBounds) { + auto inputType = mlir::cast(value.getType()); + VPUX_THROW_WHEN(inputType == nullptr, "Expected a ranked tensor type, but got: {0}", value.getType()); + auto shape = inputType.getShape(); + SmallVector staticShape(shape.begin(), shape.end()); + + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == mlir::ShapedType::kDynamic) { + staticShape[i] = staticDimSize; + } + } + + if (auto dictAttr = mlir::dyn_cast_or_null(inputType.getEncoding())) { + SmallVector newAttrs; + for (auto attr : dictAttr) { + if (!keepBounds || attr.getName() != "bounds") { + newAttrs.push_back(attr); + } + } + // Rebuild the static type with all 
attributes except bounds + return mlir::RankedTensorType::get(staticShape, inputType.getElementType(), + mlir::DictionaryAttr::get(inputType.getContext(), newAttrs)); + } + + return mlir::RankedTensorType::get(staticShape, inputType.getElementType()); +} + +void ConvertDynamicToStaticKernelsPass::addConversionCast(mlir::Operation* sliceOp, int64_t staticDimSize) { + auto staticTensorType = inferStaticTensorType(sliceOp->getResult(0), staticDimSize); + VPUX_THROW_WHEN(staticTensorType == nullptr, + "Failed to infer static tensor type for slice operation {0} with static dimension size {1}", + sliceOp->getName(), staticDimSize); + mlir::OpBuilder builder(sliceOp); + builder.setInsertionPointAfter(sliceOp); + auto castOp = builder.create(sliceOp->getLoc(), staticTensorType, sliceOp->getResult(0)); + for (auto user : llvm::make_early_inc_range(sliceOp->getUsers())) { + if (user != castOp.getOperation()) { + user->replaceUsesOfWith(sliceOp->getResult(0), castOp.getResult()); + } + } +} + +mlir::func::FuncOp ConvertDynamicToStaticKernelsPass::createStaticFuncOp(mlir::func::FuncOp dynFuncOp, + const mlir::FunctionType& staticFuncType, + mlir::ModuleOp moduleOp, int64_t staticDimSize, + bool eraseDynamicFunc) { + mlir::OpBuilder builder(moduleOp.getContext()); + auto staticFuncName = dynFuncOp.getName().str() + "_static"; + auto existingFuncOp = moduleOp.lookupSymbol(staticFuncName); + VPUX_THROW_WHEN(existingFuncOp, "Static function with name {0} already exists", staticFuncName); + auto staticFuncOp = builder.create(dynFuncOp.getLoc(), staticFuncName, staticFuncType); + + mlir::IRMapping valueMap; + for (auto& oldBlock : dynFuncOp.getBody()) { + auto* newBlock = staticFuncOp.addEntryBlock(); + + // Map the block arguments from the old block to the new block + for (auto argPair : llvm::zip(oldBlock.getArguments(), newBlock->getArguments())) { + valueMap.map(std::get<0>(argPair), std::get<1>(argPair)); + } + + builder.setInsertionPointToStart(newBlock); + for (auto& oldOp : 
oldBlock.getOperations()) { + auto* newOp = builder.clone(oldOp, valueMap); + for (auto it : llvm::enumerate(oldOp.getResults())) { + size_t idx = it.index(); + mlir::Value result = it.value(); + if (isDynamicTensor(result)) { + auto staticResultType = inferStaticTensorType(result, staticDimSize); + newOp->getResult(idx).setType(staticResultType); + valueMap.map(result, newOp->getResult(idx)); + } + } + } + } + + moduleOp.push_back(staticFuncOp); + staticFuncOp->moveAfter(dynFuncOp); + + if (eraseDynamicFunc && dynFuncOp.use_empty()) { + dynFuncOp.erase(); + } + + return staticFuncOp; +}; + +void ConvertDynamicToStaticKernelsPass::safeRunOnModule() { + auto moduleOp = getOperation(); + net::NetworkInfoOp netInfoOp; + mlir::func::FuncOp mainFuncOp; + net::NetworkInfoOp::getFromModule(moduleOp, netInfoOp, mainFuncOp); + mlir::OpBuilder builder(moduleOp.getContext()); + + mlir::WalkResult walkResult = mainFuncOp.walk([&](mlir::scf::ForOp forOp) { + // Get the list of call operations inside the scf.for loop + SmallVector callOpsWithDynamicTensors; + for (auto op : forOp.getOps()) { + if (vpux::IE::hasDynamicTensors(op)) { + callOpsWithDynamicTensors.push_back(op); + } + } + + if (callOpsWithDynamicTensors.empty()) { + _log.info("No call operations with dynamic tensors found in the scf.for loop. 
Skipping conversion."); + return mlir::WalkResult::advance(); + } + + // Get the step size from the scf.for loop + auto stepValue = forOp.getStep(); + int64_t staticDimSize = 0; + if (auto constOp = stepValue.getDefiningOp()) { + staticDimSize = constOp.value(); + } + + if (staticDimSize <= 0) { + _log.error("Invalid static dimension size: {0}", staticDimSize); + return mlir::WalkResult::interrupt(); + } + + if (!adjustIndexIntoDynamicTensor(forOp)) { + _log.error("Failed to adjust index into dynamic tensor for scf.for loop."); + return mlir::WalkResult::interrupt(); + } + + for (auto callOp : callOpsWithDynamicTensors) { + for (auto operand : callOp.getOperands()) { + if (isDynamicTensor(operand)) { + if (!isSupportedDynamicTensorType(operand)) { + return mlir::WalkResult::interrupt(); + } + tryAndInferStaticShapes(operand, staticDimSize); + } + } + + SmallVector staticFuncInputTypes, staticFuncOutputTypes; + for (auto result : callOp->getResults()) { + if (isDynamicTensor(result)) { + if (!isSupportedDynamicTensorType(result)) { + return mlir::WalkResult::interrupt(); + } + staticFuncOutputTypes.push_back(inferStaticTensorType(result, staticDimSize)); + } else { + staticFuncOutputTypes.push_back(result.getType()); + } + } + + // Collect all input types for the static function type + for (auto operand : callOp.getOperands()) { + staticFuncInputTypes.push_back(operand.getType()); + } + + // Create the static function type + auto staticFuncType = + mlir::FunctionType::get(callOp->getContext(), staticFuncInputTypes, staticFuncOutputTypes); + + // Look up the original function and create a static version + auto dynFuncOp = moduleOp.lookupSymbol(callOp.getCallee()); + if (dynFuncOp == nullptr) { + // If the function is not found, it might be a dynamic function that needs to be created + _log.error("Dynamic function {0} not found in module", callOp.getCallee()); + return mlir::WalkResult::interrupt(); + } + + // Create the new static function + auto staticFuncOp = 
createStaticFuncOp(dynFuncOp, staticFuncType, moduleOp, staticDimSize); + + // Create a call to the new static function + builder.setInsertionPoint(callOp); + auto newCallOp = builder.create(callOp->getLoc(), staticFuncOp.getName(), + staticFuncOutputTypes, callOp.getOperands()); + + // Handle all results from the call op + for (auto resultPair : llvm::zip(callOp->getResults(), newCallOp->getResults())) { + auto oldResult = std::get<0>(resultPair); + auto newResult = std::get<1>(resultPair); + + // Create a cast operation to convert the static shaped tensor returned to dynamic tensor + if (oldResult.getType() != newResult.getType()) { + if (!isDynamicTensor(oldResult)) { + _log.error("Type mismatch between old result {0} and new result {1}", oldResult.getType(), + newResult.getType()); + return mlir::WalkResult::interrupt(); + } + + auto castToDynamicTensor = + builder.create(newCallOp->getLoc(), oldResult.getType(), newResult); + oldResult.replaceAllUsesWith(castToDynamicTensor.getResult()); + } else { + oldResult.replaceAllUsesWith(newResult); + } + } + callOp->erase(); + } + + return mlir::WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) { + signalPassFailure(); + } +} + +} // namespace + +// +// createConvertDynamicToStaticKernelsPass +// + +std::unique_ptr vpux::VPU::createConvertDynamicToStaticKernelsPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_nce_interpolate_to_dw.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_nce_interpolate_to_dw.cpp index e9e1ef4d5f..650a2f1a1e 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_nce_interpolate_to_dw.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_nce_interpolate_to_dw.cpp @@ -23,6 +23,7 @@ #include "vpux/compiler/dialect/VPU/utils/nce_interpolate_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include 
"vpux/compiler/dialect/VPU/utils/ppe_version_config.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" @@ -55,7 +56,7 @@ namespace { - more than 3 (experimental number) tiles on C are needed for SEP DW.Conv to have per cluster workloads with channels in DEPTHWISE_WORKLOAD_SIZES that also meet L1aOpt workload reqs. */ -bool isDepthwiseConvMorePerformant(VPU::NCEInterpolateOp origOp, VPU::ArchKind arch, Logger log) { +bool isDepthwiseConvMorePerformant(VPU::NCEInterpolateOp origOp, config::ArchKind arch, Logger log) { const auto outputType = mlir::cast(origOp.getOutput().getType()); const auto numChannels = outputType.getShape()[Dims4D::Act::C]; const auto inputType = mlir::cast(origOp.getInput().getType()); @@ -313,13 +314,13 @@ class ConvertNCEInterpolateToDWPass final : void safeRunOnFunc() final; void convertToDWConv(VPU::NCEInterpolateOp origOp, VPU::GroupSparseTensorOp groupSparseOp, VPU::StorageElementTableOp storageElementTable, Const::DeclareOp origWeights, - VPU::ArchKind arch) const; + config::ArchKind arch) const; }; void ConvertNCEInterpolateToDWPass::convertToDWConv(VPU::NCEInterpolateOp origOp, VPU::GroupSparseTensorOp groupSparseOp, VPU::StorageElementTableOp storageElementTable, - Const::DeclareOp origWeights, VPU::ArchKind arch) const { + Const::DeclareOp origWeights, config::ArchKind arch) const { auto nestedLog = _log.nest(); mlir::OpBuilder builder(origOp); auto data = groupSparseOp.getData(); @@ -360,7 +361,7 @@ void ConvertNCEInterpolateToDWPass::convertToDWConv(VPU::NCEInterpolateOp origOp void ConvertNCEInterpolateToDWPass::safeRunOnFunc() { auto func = getOperation(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); SmallVector interpsToErase{}; func.walk([&](VPU::NCEInterpolateOp interpOp) { diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_op_to_dma_for_performant_execution.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_op_to_dma_for_performant_execution.cpp index 253d28dd0e..4e8dd1a120 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_op_to_dma_for_performant_execution.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/convert_op_to_dma_for_performant_execution.cpp @@ -7,12 +7,12 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/convert_op_to_dma_for_performant_execution_getter.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include #include #include -#include namespace vpux::VPU { #define GEN_PASS_DECL_CONVERTOPTODMAFORPERFORMANTEXECUTION @@ -43,7 +43,7 @@ void ConvertOpToDMAForPerformantExecutionPass::safeRunOnFunc() { auto func = getOperation(); auto& ctx = getContext(); - const auto arch = VPU::getArch(func); + const auto arch = config::getArch(func); auto conversionStrategy = VPU::createConvertOpToDMAForPerformantExecutionStrategy(arch); mlir::ConversionTarget target(ctx); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/detect_in_place_eltwise.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/detect_in_place_eltwise.cpp index c5e73d20c6..9446475a89 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/detect_in_place_eltwise.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/detect_in_place_eltwise.cpp @@ -59,7 +59,7 @@ mlir::LogicalResult DetectInPlaceEltwise::matchAndRewrite(VPU::NCEEltwiseOp eltw } // sprLUT adds additional dummy DPU task, that writes garbage to the output - // (see AddDummyDPUTaskForSprLUT pass). In case of in-place operation it will + // (see AddDummyDPUTaskForMetadataPrefetch pass). In case of in-place operation it will // write into the input, corrupting its data. 
if (auto ppeAttr = mlir::dyn_cast_or_null(eltwiseOp.getPpeAttr())) { if (ppeAttr.getSprlut()) { diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_reserve_mem.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/dma_profiling_reserve_mem.cpp similarity index 52% rename from src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_reserve_mem.cpp rename to src/vpux_compiler/src/dialect/VPU/transforms/passes/dma_profiling_reserve_mem.cpp index a9bdc93a7d..3814bef667 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_reserve_mem.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/dma_profiling_reserve_mem.cpp @@ -4,17 +4,18 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/core/profiling.hpp" -namespace vpux::VPUIP { +namespace vpux::VPU { #define GEN_PASS_DECL_DMATASKPROFILINGRESERVEMEM #define GEN_PASS_DEF_DMATASKPROFILINGRESERVEMEM -#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" -} // namespace vpux::VPUIP +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU using namespace vpux; @@ -25,43 +26,40 @@ namespace { // class DMATaskProfilingReserveMemPass final : - public VPUIP::impl::DMATaskProfilingReserveMemBase { + public VPU::impl::DMATaskProfilingReserveMemBase { public: - explicit DMATaskProfilingReserveMemPass(DMAProfilingMode profilingMode, Logger log): _profilingMode(profilingMode) { + explicit DMATaskProfilingReserveMemPass(const std::string& enableDMAProfiling, Logger log) + : DMATaskProfilingReserveMemBase({enableDMAProfiling}) { Base::initLogger(log, 
Base::getArgumentName()); } private: - DMAProfilingMode _profilingMode; void safeRunOnModule() final; }; void DMATaskProfilingReserveMemPass::safeRunOnModule() { auto module = getOperation(); auto* ctx = module->getContext(); - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); - if (enableDMAProfiling.hasValue()) { - _profilingMode = getDMAProfilingMode(arch, enableDMAProfiling.getValue()); - } + VPUX_THROW_UNLESS(enableDMAProfiling.hasValue(), "No option"); + auto dmaProfilingMode = getDMAProfilingMode(arch, enableDMAProfiling); auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); auto dmaPortCount = dmaOp.getCount(); - VPUX_THROW_UNLESS(dmaPortCount > 0, "DMA port count should be > 0; it is: {0}", dmaPortCount); + VPUX_THROW_UNLESS(dmaPortCount > 0, "No DMA ports"); VPUX_THROW_UNLESS((VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE % dmaPortCount) == 0, "Reserved memory for DMA profiling cannot be equally split between ports"); - if (_profilingMode == DMAProfilingMode::DISABLED) { - return; + // Small chunk of CMX memory is always reserved in NPU37xx and NPU40xx + if (arch == config::ArchKind::NPU37XX || arch == config::ArchKind::NPU40XX) { + auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); + _log.trace("DMA profiling reserved CMX memory - size: '{0}'", VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE); + IE::setDmaProfilingReservedMemory(module, memSpaceAttr, VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE); } - // Small chunk of CMX memory is always reserved - auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); - _log.trace("DMA profiling reserved CMX memory - size: '{0}'", VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE); - IE::setDmaProfilingReservedMemory(module, memSpaceAttr, VPUIP::HW_DMA_PROFILING_MAX_BUFFER_SIZE); - // Chunk of DDR is reserved if profiling is enabled - if (_profilingMode == DMAProfilingMode::DYNAMIC_HWP) { + if (dmaProfilingMode == 
DMAProfilingMode::DYNAMIC_HWP) { _log.trace("DMA HW profiling reserved DDR memory - size: '{0}'", VPUIP::HW_DMA_PROFILING_ID_LIMIT * VPUIP::HW_DMA_PROFILING_SIZE_BYTES_40XX); auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::DDR)); @@ -76,7 +74,7 @@ void DMATaskProfilingReserveMemPass::safeRunOnModule() { // createDMATaskProfilingReserveMemPass // -std::unique_ptr vpux::VPUIP::createDMATaskProfilingReserveMemPass(DMAProfilingMode dmaProfilingMode, - Logger log) { - return std::make_unique(dmaProfilingMode, log); +std::unique_ptr vpux::VPU::createDMATaskProfilingReserveMemPass(const std::string& enableDMAProfiling, + Logger log) { + return std::make_unique(enableDMAProfiling, log); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/dynamic_tensor_bounds_to_static_shape.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/dynamic_tensor_bounds_to_static_shape.cpp index 66877a6644..ebf6a902bf 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/dynamic_tensor_bounds_to_static_shape.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/dynamic_tensor_bounds_to_static_shape.cpp @@ -9,10 +9,12 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/net/network_info_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/range.hpp" #include +#include #include #include #include @@ -103,12 +105,25 @@ void BoundedTensorsToDynamicDimsMask::safeRunOnModule() { // pipeline, this pass is executed on the main function, which contains host-side code as well. Ideally, this pass // should not operate on the main function in the HostCompile pipeline. This will be refactored in the future. 
// Track: E#168311 + auto hostCompileMode = (config::getCompilationMode(module) == config::CompilationMode::HostCompile); target.addLegalDialect(); - target.addLegalDialect(); target.addLegalOp(); - target.addLegalOp(); + if (hostCompileMode) { + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + } + const auto entryFuncOp = vpux::net::findEntryPointFunc(module, _log); target.addDynamicallyLegalOp([&](mlir::func::FuncOp funcOp) { + if (hostCompileMode && (funcOp == entryFuncOp)) { + _log.trace("Skipping function {0} in HostCompile mode", funcOp.getName()); + return true; + } return typeConverter.isSignatureLegal(funcOp.getFunctionType()); }); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/ensure_nce_ops_size_requirements.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/ensure_nce_ops_size_requirements.cpp index 8465cb1136..eb00e21c76 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/ensure_nce_ops_size_requirements.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/ensure_nce_ops_size_requirements.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" @@ -576,6 +577,18 @@ void EnsureNCEOpsSizeRequirementsPass::safeRunOnFunc() { if (!outSizeWrongDims.empty()) { _log.nest(2).debug("Output size has dims greater than HW requirements: {0}", outSizeWrongDims); } + + if (auto convOp = mlir::dyn_cast(op)) { + auto weightDequantizeOp = convOp.getFilter().getDefiningOp(); + if (weightDequantizeOp != nullptr) { + if (auto clusteredOp = mlir::cast(op)) { + if (!clusteredOp.getMultiClusterStrategy().has_value()) { + // Address VPU::Dequantize -> VPU::NCEConvolution post tiling and vf + 
return true; + } + } + } + } return inSizeWrongDims.empty() && outSizeWrongDims.empty(); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/init_resources.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/init_resources.cpp index e936456670..0b00458c67 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/init_resources.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/init_resources.cpp @@ -4,9 +4,12 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" namespace vpux::VPU { #define GEN_PASS_DECL_INITRESOURCES @@ -37,7 +40,7 @@ class InitResourcesPass final : public VPU::impl::InitResourcesBase _revisionID; int _numOfDPUGroups = 1; @@ -65,7 +68,7 @@ mlir::LogicalResult InitResourcesPass::initializeOptions( } void InitResourcesPass::initializeFromOptions() { - auto archStr = VPU::symbolizeEnum(archOpt.getValue()); + auto archStr = config::symbolizeEnum(archOpt.getValue()); VPUX_THROW_UNLESS(archStr.has_value(), "Unknown VPU architecture : '{0}'", archOpt.getValue()); _arch = archStr.value(); @@ -77,7 +80,7 @@ void InitResourcesPass::initializeFromOptions() { _revisionID = revisionIDOpt.getValue(); } - _numOfDPUGroups = getMaxArchDPUClusterNum(_arch); + _numOfDPUGroups = vpux::VPU::getMaxArchDPUClusterNum(_arch); if (numberOfDPUGroupsOpt.hasValue()) { _numOfDPUGroups = numberOfDPUGroupsOpt.getValue(); } @@ -99,7 +102,7 @@ void InitResourcesPass::safeRunOnModule() { auto module = getOperation(); _log.trace("Set VPU architecture to {0}", _arch); - VPU::setArch(module, _arch, _numOfDPUGroups, _numOfDMAPorts, _availableCMXMemory, _allowCustomValues); + config::setArch(module, _arch, 
_numOfDPUGroups, _numOfDMAPorts, _availableCMXMemory, _allowCustomValues); VPUX_THROW_WHEN(!_allowCustomValues && config::hasCompilationMode(module), "CompilationMode is already defined, probably you run '--init-compiler' twice"); @@ -108,29 +111,29 @@ void InitResourcesPass::safeRunOnModule() { config::setCompilationMode(module, _compilationMode); } - VPUX_THROW_WHEN(!_allowCustomValues && VPU::hasRevisionID(module), + VPUX_THROW_WHEN(!_allowCustomValues && config::hasRevisionID(module), "RevisionID is already defined, probably you run '--init-compiler' twice"); - if (!VPU::hasRevisionID(module)) { + if (!config::hasRevisionID(module)) { if (_revisionID.has_value()) { int revisionIDValue = _revisionID.value(); - std::optional revID = VPU::symbolizeRevisionID(revisionIDValue); + std::optional revID = config::symbolizeRevisionID(revisionIDValue); if (revID.has_value()) { _log.trace("Set RevisionID to {0}", revisionIDValue); - VPU::setRevisionID(module, revID.value()); + config::setRevisionID(module, revID.value()); } else { _log.trace("Set RevisionID to REVISION_NONE"); - VPU::setRevisionID(module, VPU::RevisionID::REVISION_NONE); + config::setRevisionID(module, config::RevisionID::REVISION_NONE); } } else { _log.trace("Set RevisionID to REVISION_NONE"); - VPU::setRevisionID(module, VPU::RevisionID::REVISION_NONE); + config::setRevisionID(module, config::RevisionID::REVISION_NONE); } } auto nceCluster = IE::getTileExecutor(module); if (!nceCluster.hasProcessorFrequency()) { - auto revisionID = VPU::getRevisionID(module); - auto freqMHz = getDpuFrequency(_arch, revisionID); + auto revisionID = config::getRevisionID(module); + auto freqMHz = vpux::VPU::getDpuFrequency(_arch, revisionID); _log.trace("Set DpuFrequency to {0}", freqMHz); nceCluster.setProcessorFrequency(getFPAttr(module.getContext(), freqMHz)); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/introduce_init_function.cpp 
b/src/vpux_compiler/src/dialect/VPU/transforms/passes/introduce_init_function.cpp index 434f290db8..098c563bb0 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/introduce_init_function.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/introduce_init_function.cpp @@ -3,12 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/weights_separation.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/ir_modification.hpp" @@ -24,13 +25,12 @@ #include #include #include -#include -#include -#include - #include +#include #include #include +#include +#include #include @@ -43,6 +43,180 @@ namespace vpux::VPU { using namespace vpux; namespace { +struct InitSpecificLoggerBase { + virtual ~InitSpecificLoggerBase() = default; + virtual void analyzeInitFunction(mlir::func::FuncOp) = 0; + virtual void print(const Logger&) = 0; +}; + +struct InitSpecificNullLogger : InitSpecificLoggerBase { + void analyzeInitFunction(mlir::func::FuncOp) override { + } + void print(const Logger&) override { + } +}; + +// Simple helper that gives write access to the underlying data if and only if a +// specified key has never been seen before by the helper. +template +class Uniqued { + mlir::DenseSet _uniquenessChecker; + T _data{}; + +public: + // Returns a pointer to a mutable data. Returns a nullptr if the data cannot + // be accessed. 
+ T* operator()(const Key& key) { + const bool firstOccurrence = _uniquenessChecker.insert(key).second; + if (firstOccurrence) { + return std::addressof(_data); + } + return nullptr; + } + + // Returns a pointer to an immutable data. + const T* operator->() const { + return std::addressof(_data); + } +}; + +class InitSpecificMetaInfoLogger : public InitSpecificLoggerBase { + struct ConstantInfo { + size_t count{0}; + Byte size{0}; + }; + Uniqued _importedWeights; // weights from original model + Uniqued _availableWeights; // weights still present in pre-init IR + Uniqued _ovConstants; // OV-originated constant ops + Uniqued _usedOvConstants; // supported by weights separation + Uniqued _unusedOvConstants; // supported by weights separation but ignored + Uniqued _unsupportedOvConstants; // unsupported by weights separation + + ConstantInfo _currentInitInputs; + ConstantInfo _currentInitOutputs; + + static double toKb(vpux::Byte bytes); + static double percentify(vpux::Byte n, vpux::Byte m); + +public: + InitSpecificMetaInfoLogger(mlir::ModuleOp moduleOp); + void analyzeInitFunction(mlir::func::FuncOp) override; + void print(const Logger&) override; +}; + +InitSpecificMetaInfoLogger::InitSpecificMetaInfoLogger(mlir::ModuleOp moduleOp) { + // Note: to collect *all* OV-imported weights, use blob manager (backing + // container for dense_resource<>). 
+ const auto& manager = mlir::DenseResourceElementsHandle::getManagerInterface(moduleOp.getContext()); + manager.getBlobManager().getBlobMap( + [&](const llvm::StringMap& allBlobs) { + for (const auto& entry : allBlobs) { + const auto& blob = entry.getValue(); + const auto key = blob.getKey(); + if (!key.starts_with(Const::IMPORTED_WEIGHT_PREFIX)) { + continue; + } + + // Note: blob entries are unique (and so are keys) + if (auto* info = _importedWeights(key)) { + info->count++; + info->size += vpux::Byte(static_cast(blob.getBlob()->getData().size())); + } + } + }); + + moduleOp->walk([&](Const::DeclareOp constOp) { + if (!Const::isOpenVINOConstant(constOp)) { + return; + } + + const auto attr = constOp.getContentAttr(); + if (auto* info = _availableWeights(getResourceName(attr.getBaseContent()))) { + info->count++; + info->size += vpux::getExpectedBufferSize(attr.getBaseContent().getType()); + } + + // Note: due to the nature of the IR, duplicate constants are assumed to + // be fused. However, there's a slight chance that the same constant can + // be used in two different functions? + if (auto* info = _ovConstants(attr)) { + info->count++; + info->size += vpux::getExpectedBufferSize(constOp.getContentAttr().getType()); + } + + // if suitable, recorded into used constants + if (VPU::isSuitableForWeightsSeparation(constOp)) { + if (auto* info = _usedOvConstants(attr)) { + info->count++; + info->size += vpux::getExpectedBufferSize(attr.getType()); + } + return; + } + + // if not suitable, but trivial, recorded into unused constants + // otherwise - unsupported + auto* weightsCategory = + VPU::isTrivialForWeightsSeparation(constOp) ? 
&_unusedOvConstants : &_unsupportedOvConstants; + if (auto* info = (*weightsCategory)(attr)) { + info->count++; + info->size += vpux::getExpectedBufferSize(constOp.getContentAttr().getType()); + } + }); +} + +void InitSpecificMetaInfoLogger::analyzeInitFunction(mlir::func::FuncOp initFunc) { + const auto calculateSize = [](ArrayRef c) { + return std::accumulate(c.begin(), c.end(), vpux::Byte(0), [](vpux::Byte i, mlir::Type argType) { + return i + vpux::getExpectedBufferSize(argType); + }); + }; + _currentInitInputs = {initFunc.getNumArguments(), calculateSize(initFunc.getArgumentTypes())}; + _currentInitOutputs = {initFunc.getNumResults(), calculateSize(initFunc.getResultTypes())}; +} + +void InitSpecificMetaInfoLogger::print(const Logger& log) { + log.info("Summary about constants:"); + auto generalStats = log.nest("general-statistics", 1); + generalStats.info("All imported unique weights: {0} ({1:F} KB)", _importedWeights->count, + toKb(_importedWeights->size)); + generalStats.info("Available unique weights[1]: {0} ({1:F} KB which is {2:P})", _availableWeights->count, + toKb(_availableWeights->size), percentify(_availableWeights->size, _importedWeights->size)); + generalStats.info("Unique weights used by schedule (from available): {0} ({1:F} KB which is {2:P})", + _currentInitInputs.count, toKb(_currentInitInputs.size), + percentify(_currentInitInputs.size, _availableWeights->size)); + + generalStats.info("OV-originated constants[2] in IR: {0} ({1:F} KB)", _ovConstants->count, + toKb(_ovConstants->size)); + generalStats.info("Unused constants[3]: {0} ({1:F} KB which is {2:P})", _unusedOvConstants->count, + toKb(_unusedOvConstants->size), percentify(_unusedOvConstants->size, _ovConstants->size)); + generalStats.info("Unsupported constants[4]: {0} ({1:F} KB which is {2:P})", _unsupportedOvConstants->count, + toKb(_unsupportedOvConstants->size), + percentify(_unsupportedOvConstants->size, _ovConstants->size)); + generalStats.info("Size percentage of *used* 
constants: {0:P}", + percentify(_usedOvConstants->size, _ovConstants->size)); + + generalStats.info("Generated schedule's total I/O size: {0:F} KB", + toKb(_currentInitInputs.size + _currentInitOutputs.size)); + + generalStats.info(""); // dummy line + generalStats.info("[1]: available unique weights - weights that come from original model and are used in the " + "compiled schedule (via constant operations)"); + generalStats.info("[2]: OV-originated constants - constant operations that combine OV weights with transformations " + "(e.g. subview, reorder)"); + generalStats.nest(1).info("Note: the same unique weight could be used in multiple constants"); + generalStats.info("[3]: unused constants - OV-originated constants that are ignored by weights separation (e.g. " + "splats, only trivial transformations)"); + generalStats.info("[4]: unsupported constants - OV-originated constants that have unsupported transformations"); +} + +double InitSpecificMetaInfoLogger::toKb(vpux::Byte bytes) { + constexpr auto multiplier = vpux::MemMultiplier::value; + return static_cast(bytes.count()) / multiplier; +} + +double InitSpecificMetaInfoLogger::percentify(vpux::Byte n, vpux::Byte m) { + return static_cast(n.count()) / static_cast(m.count()); +} // Casts the resulting value to its storage type counterpart. This is normally // done in init and thus in IE dialect. @@ -115,8 +289,8 @@ class IntroduceInitFunctionPass final : public VPU::impl::IntroduceInitFunctionB void setNetworkEntryPointToMain(net::NetworkInfoOp netInfo, const WsArgumentCache& topLevelMainArgCache); // creates new main that calls init and main in sequence. this function // becomes the new entry-point. 
- void buildWrapperOpForInitAndMain(net::NetworkInfoOp netInfo, mlir::func::FuncOp mainFuncOp, - mlir::func::FuncOp initFuncOp, const WsArgumentCache& initArgCache); + mlir::func::CallOp buildWrapperOpForInitAndMain(net::NetworkInfoOp netInfo, mlir::func::FuncOp mainFuncOp, + mlir::func::FuncOp initFuncOp, const WsArgumentCache& initArgCache); mlir::LogicalResult initialize(mlir::MLIRContext* context) final; mlir::LogicalResult deferredInitialize(mlir::ModuleOp moduleOp); @@ -146,7 +320,7 @@ void IntroduceInitFunctionPass::setNetworkEntryPointToInit(net::NetworkInfoOp ne for (auto it : argCache.getSortedArgs()) { const auto& [entry, blockArg] = *it; const auto type = blockArg.getType(); - const auto name = formatv("in_{0}", getResourceName(entry.content)).str(); + const auto name = getResourceName(entry.content); builder.create(appendLoc(netInfo.getLoc(), name), name, type); } @@ -457,9 +631,10 @@ void IntroduceInitFunctionPass::setNetworkEntryPointToMain(net::NetworkInfoOp ne } } -void IntroduceInitFunctionPass::buildWrapperOpForInitAndMain(net::NetworkInfoOp netInfo, mlir::func::FuncOp mainFuncOp, - mlir::func::FuncOp initFuncOp, - const WsArgumentCache& initArgCache) { +mlir::func::CallOp IntroduceInitFunctionPass::buildWrapperOpForInitAndMain(net::NetworkInfoOp netInfo, + mlir::func::FuncOp mainFuncOp, + mlir::func::FuncOp initFuncOp, + const WsArgumentCache& initArgCache) { const auto mainFuncType = mainFuncOp.getFunctionType(); const auto initFuncType = initFuncOp.getFunctionType(); // Note: expect the below to never fail @@ -518,6 +693,8 @@ void IntroduceInitFunctionPass::buildWrapperOpForInitAndMain(net::NetworkInfoOp builder.create(appendLoc(locBase, "_return"), mainCallOp.getResults()); netInfo.setEntryPoint(wrapperFuncOp.getSymName()); + + return initCallOp; } mlir::LogicalResult IntroduceInitFunctionPass::initialize(mlir::MLIRContext*) { @@ -631,7 +808,6 @@ void IntroduceInitFunctionPass::safeRunOnModule() { net::NetworkInfoOp netInfo; 
mlir::func::FuncOp mainFuncOp; net::NetworkInfoOp::getFromModule(moduleOp, netInfo, mainFuncOp); - const auto& wsAnalysis = getAnalysis(); auto tree = VPU::getOutliningRepresentation(mainFuncOp); @@ -643,11 +819,29 @@ void IntroduceInitFunctionPass::safeRunOnModule() { _log.debug("The following operation tree is found:\n{0}\n", stream.str()); } + auto statisticsLogger = [&]() -> std::unique_ptr { + if (_log.isActive(LogLevel::Info)) { + return std::make_unique(moduleOp); + } + return std::make_unique(); + }(); + + // Don't let PackNestedModulesPass put @main and other functions into a submodule. + auto applyNoNestingToAllFunctions = [&]() { + auto ctx = moduleOp.getContext(); + moduleOp.walk([&](mlir::func::FuncOp funcOp) { + funcOp->setAttr("do_not_nest", mlir::UnitAttr::get(ctx)); + }); + }; + switch (_mode) { case Mode::GenerateInit: { auto [initFuncOp, initArgCache, initResults] = buildInitFunction(mainFuncOp, wsAnalysis); setNetworkEntryPointToInit(netInfo, initFuncOp, initArgCache, initResults); eraseMainAndOutlinedFunctions(tree); + + statisticsLogger->analyzeInitFunction(initFuncOp); + statisticsLogger->print(_log); break; } case Mode::GenerateMain: { @@ -656,13 +850,24 @@ void IntroduceInitFunctionPass::safeRunOnModule() { break; } case Mode::GenerateAll: { + applyNoNestingToAllFunctions(); auto [initFuncOp, initArgCache, initResults] = buildInitFunction(mainFuncOp, wsAnalysis); std::ignore = updateMainAndOutlinedFunctions(moduleOp, mainFuncOp, tree); initFuncOp.setPrivate(); mainFuncOp.setPrivate(); - // Don't let PackNestedModulesPass put @main into a submodule. - mainFuncOp->setAttr("do_not_nest", mlir::UnitAttr::get(&getContext())); - buildWrapperOpForInitAndMain(netInfo, mainFuncOp, initFuncOp, initArgCache); + + auto initCallOp = buildWrapperOpForInitAndMain(netInfo, mainFuncOp, initFuncOp, initArgCache); + + // Note: erase empty init to avoid issues further in the pipeline (e.g. 
+ // with feasible allocation) + if (bool emptyInit = initCallOp->getOperands().empty() && initCallOp->getResults().empty(); emptyInit) { + initFuncOp.erase(); + initCallOp.erase(); + break; + } + + statisticsLogger->analyzeInitFunction(initFuncOp); + statisticsLogger->print(_log); break; } default: { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_ops_to_se_nce.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_ops_to_se_nce.cpp index 3230620adf..4ec75e2bad 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_ops_to_se_nce.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_ops_to_se_nce.cpp @@ -125,8 +125,8 @@ mlir::Value createWeightsConstantImpl(vpux::NDTypeInterface inputType, SmallVect return weightsConstOp.getOutput(); } -mlir::Value convertOpToConv(mlir::Operation* origOp, mlir::Value weights, mlir::Value sparseInput, VPU::ArchKind arch, - mlir::PatternRewriter& rewriter) { +mlir::Value convertOpToConv(mlir::Operation* origOp, mlir::Value weights, mlir::Value sparseInput, + config::ArchKind arch, mlir::PatternRewriter& rewriter) { const auto outputType = mlir::cast(origOp->getResult(0).getType()); const auto OC = outputType.getShape()[Dims4D::Act::C]; const auto ppeAttr = VPU::PpeVersionConfig::retrievePPEAttribute(origOp); @@ -171,7 +171,7 @@ mlir::Value convertOpToConv(mlir::Operation* origOp, mlir::Value weights, mlir:: class InterpolateToNCE final : public mlir::OpRewritePattern { public: - InterpolateToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + InterpolateToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("InterpolateToNCE"); } @@ -185,7 +185,7 @@ class InterpolateToNCE final : public mlir::OpRewritePattern mlir::Value createWeightsConstant(VPU::InterpolateOp origOp, mlir::PatternRewriter& rewriter, ArrayRef kernelSize) const; - VPU::ArchKind _arch; + config::ArchKind _arch; Logger 
_log; }; @@ -225,7 +225,7 @@ mlir::Value InterpolateToNCE::createSparseInput(VPU::InterpolateOp origOp, mlir: auto seAttr = mlir::cast(seInterpolateAttr); // Create the StorageElementTable operation - auto arch = VPU::getArch(origOp); + auto arch = config::getArch(origOp); auto sparsityConstraint = VPU::getSparsityConstraint(arch); const int64_t seSize = VPU::getSESize(inputShape[Dims4D::Act::C], sparsityConstraint); const int64_t seDepth = inputShape[Dims4D::Act::C] / seSize; @@ -329,7 +329,7 @@ mlir::LogicalResult InterpolateToNCE::matchAndRewrite(VPU::InterpolateOp origOp, class TransposedConvolutionToNCE final : public mlir::OpRewritePattern { public: - TransposedConvolutionToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + TransposedConvolutionToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("TransposedConvolutionToNCE"); } @@ -343,7 +343,7 @@ class TransposedConvolutionToNCE final : public mlir::OpRewritePattern(seUpsamplingAttr); // Create the StorageElementTable operation - auto arch = VPU::getArch(origOp); + auto arch = config::getArch(origOp); auto sparsityConstraint = VPU::getSparsityConstraint(arch); const int64_t seSize = VPU::getSESize(inputShape[Dims4D::Act::C], sparsityConstraint); const int64_t seDepth = inputShape[Dims4D::Act::C] / seSize; @@ -549,7 +549,7 @@ mlir::LogicalResult TransposedConvolutionToNCE::matchAndRewrite(VPU::TransposedC class DilatedConvolutionToNCE final : public mlir::OpRewritePattern { public: - DilatedConvolutionToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + DilatedConvolutionToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("DilatedConvolutionToNCE"); } @@ -562,7 +562,7 @@ class DilatedConvolutionToNCE final : public mlir::OpRewritePattern(seDilatedConvAttr); // Create the StorageElementTable operation - const auto arch = 
VPU::getArch(origOp); + const auto arch = config::getArch(origOp); const auto sparsityConstraint = VPU::getSparsityConstraint(arch); // Depthwise limitation WL min size is 16, here we only set this minimum value by default, and later the actual @@ -731,12 +731,15 @@ mlir::LogicalResult DilatedConvolutionToNCE::matchAndRewrite(VPU::GroupConvoluti int64_t offsetX = 0; int64_t offsetY = 0; + auto origType = mlir::cast(origOp.getResult().getType()); auto subConvLog = innerLog.nest(); for (auto y : irange(subConvCountY)) { for (auto x : irange(subConvCountX)) { // Get offset for concat. - outputOffsets.emplace_back(SmallVector{0, 0, offsetY, offsetX}); - outputStrides.emplace_back(SmallVector{1, 1, dilateY, dilateX}); + const auto crtOffsets = SmallVector{0, 0, offsetY, offsetX}; + const auto crtStrides = SmallVector{1, 1, dilateY, dilateX}; + outputOffsets.emplace_back(crtOffsets); + outputStrides.emplace_back(crtStrides); // Create sub-convolution. auto sparseInput = createSparseInput(subConvLog, origOp, rewriter, x, y); @@ -746,10 +749,19 @@ mlir::LogicalResult DilatedConvolutionToNCE::matchAndRewrite(VPU::GroupConvoluti weightsTable, strides, padAttr, ppeAttr, rawFilterShape, /* multiClusterStrategyAttr = */ nullptr, origOp.getOutputPaddingAttr(), origOp.getInputPaddingAttr()); - auto originalLayout = mlir::cast(origOp.getResult().getType()).getDimsOrder(); - auto convType = mlir::cast(nceDepthConvolutionOp.getResult().getType()); - nceDepthConvolutionOp.getResult().setType(convType.changeDimsOrder(originalLayout)); + auto tileElemType = origType.getElementType(); + if (const auto perAxisQType = mlir::dyn_cast(tileElemType)) { + tileElemType = + vpux::tileScalesAndZP(perAxisQType, convType.getShape(), Shape(crtOffsets), Shape(crtStrides)); + } + + convType = convType.changeDimsOrder(origType.getDimsOrder()).changeElemType(tileElemType); + + rewriter.modifyOpInPlace(nceDepthConvolutionOp, [&] { + nceDepthConvolutionOp.getResult().setType(convType); + }); + 
subConvolutions.emplace_back(nceDepthConvolutionOp.getResult()); offsetX += 1; @@ -771,7 +783,7 @@ mlir::LogicalResult DilatedConvolutionToNCE::matchAndRewrite(VPU::GroupConvoluti class PadToNCE final : public mlir::OpRewritePattern { public: - PadToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + PadToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("PadToNCE"); } @@ -787,7 +799,7 @@ class PadToNCE final : public mlir::OpRewritePattern { ArrayRef kernelSize) const; mlir::Value convertPadToConv(VPU::PadOp origOp, mlir::Value sparseInput, mlir::PatternRewriter& rewriter) const; - VPU::ArchKind _arch; + config::ArchKind _arch; Logger _log; }; @@ -841,7 +853,7 @@ mlir::Value PadToNCE::createSparseInput(VPU::PadOp origOp, mlir::PatternRewriter auto seAttr = mlir::cast(sePaddingAttr); // Create the StorageElementTable operation - auto arch = VPU::getArch(origOp); + auto arch = config::getArch(origOp); auto sparsityConstraint = VPU::getSparsityConstraint(arch); const int64_t seSize = VPU::getSESize(inputShape[Dims4D::Act::C], sparsityConstraint); const int64_t seDepth = inputShape[Dims4D::Act::C] / seSize; @@ -918,7 +930,7 @@ mlir::LogicalResult PadToNCE::matchAndRewrite(VPU::PadOp origOp, mlir::PatternRe class RollToNCE final : public mlir::OpRewritePattern { public: - RollToNCE(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + RollToNCE(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("RollToNCE"); } @@ -932,7 +944,7 @@ class RollToNCE final : public mlir::OpRewritePattern { mlir::Value createSparseInput(VPU::RollOp origOp, SmallVector axes, SmallVector shift, mlir::PatternRewriter& rewriter) const; - VPU::ArchKind _arch; + config::ArchKind _arch; Logger _log; }; @@ -1060,7 +1072,7 @@ void LowerOpsToSENCEPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); auto 
module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); const auto logCb = [&](const formatv_object_base& msg) { _log.trace("{0}", msg.str()); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_sparsity_ops.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_sparsity_ops.cpp index 55f013f254..0588be558e 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_sparsity_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/lower_sparsity_ops.cpp @@ -18,6 +18,7 @@ #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/dialect/core/dialect.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/sparsity.hpp" #include "vpux/compiler/utils/types.hpp" @@ -309,7 +310,7 @@ mlir::LogicalResult rewriteSparsityOpWithConv(mlir::PatternRewriter& rewriter, m const auto maybeRequantizedOutputType = outputType; - const auto arch = VPU::getArch(origOp); + const auto arch = config::getArch(origOp); auto alignment = vpux::VPU::NCEInvariant::getAlignment(inputType.getElementType()); bool needAlignment = !vpux::VPU::NCEInvariant::isAligned(inputType, alignment, logCb); if (needAlignment) { @@ -509,6 +510,7 @@ void LowerSparsityOpsPass::safeRunOnFunc() { mlir::ConversionTarget target(ctx); target.addIllegalOp(); target.addIllegalOp(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/make_distributed_copies.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/make_distributed_copies.cpp index daaf900a3d..bacdbdaf2d 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/make_distributed_copies.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/make_distributed_copies.cpp @@ 
-6,13 +6,18 @@ #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/dialect.hpp" +#include "vpux/compiler/utils/net/network_info_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include #include #include +#include +#include #include #include #include @@ -91,7 +96,7 @@ mlir::LogicalResult OptimizeShapeCastDistributedCopies::matchAndRewrite(VPU::Sha // Get updated distribution based on the distribution after the shape cast, using the shape before the shape cast auto updatedDistAttr = VPUIP::getDistributedAttrAfterShapeCast( - nextUTOpOutputDistTypeInterface, prevUTOpInputDistTensorType.getShape(), VPU::getArch(origOp)); + nextUTOpOutputDistTypeInterface, prevUTOpInputDistTensorType.getShape(), config::getArch(origOp)); _log.trace("[{0}] Updating output type of: {1}\n\tOld Distribution: {2}\n\tNew Distribution: {3}", getDebugName(), prevUTOp.getInput(), prevUTOpInputDistTensorType.getDistribution(), updatedDistAttr); @@ -176,17 +181,23 @@ void MakeDistributedCopiesPass::safeRunOnFunc() { auto func = getOperation(); auto& ctx = getContext(); + // TODO: The scf/affine/tensor dialects are explicitly marked as legal because, in the case of the HostCompile + // pipeline, this pass is executed on the main function, which contains host-side code as well. Ideally, this pass + // should not operate on the main function in the HostCompile pipeline. This will be refactored in the future. 
+ // Track: E#168311 + bool hostCompileMode = (config::getCompilationMode(func) == config::CompilationMode::HostCompile); + auto entryPointFunc = vpux::net::findEntryPointFunc(func, _log); + if (hostCompileMode && (func == entryPointFunc)) { + return; + } + mlir::ConversionTarget target(ctx); target.addIllegalOp(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalOp(); target.addLegalOp(); - - // TODO: The scf/affine/tensor dialects are explicitly marked as legal because, in the case of the HostCompile - // pipeline, this pass is executed on the main function, which contains host-side code as well. Ideally, this pass - // should not operate on the main function in the HostCompile pipeline. This will be refactored in the future. - // Track: E#168311 target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/move_reflect_pad_to_cmx.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/move_reflect_pad_to_cmx.cpp index 730b04b25e..e94fedf99b 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/move_reflect_pad_to_cmx.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/move_reflect_pad_to_cmx.cpp @@ -4,10 +4,12 @@ // #include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPU { @@ -343,10 +345,10 @@ class MoveReflectPadToCMXPass final : public 
VPU::impl::MoveReflectPadToCMXBase< void MoveReflectPadToCMXPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); auto numDMAPorts = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); auto vpunnCostModel = vpux::VPU::CostModelConfig::createLayerCostModel(arch); - auto vpuDevice = getVPUDeviceType(arch); + auto vpuDevice = vpux::VPU::getVPUDeviceType(arch); func.walk([&](VPU::ConcatOp concatOp) { _log.trace("Found Concat operation '{0}' at '{1}'.", concatOp->getName(), concatOp->getLoc()); @@ -412,13 +414,8 @@ void MoveReflectPadToCMXPass::safeRunOnFunc() { // move concatOp output back to DDR builder.setInsertionPointAfter(concatOp); - const auto concatOpOutBuff = concatOp.getOutput(); - auto concatOpOutBuffType = mlir::cast(concatOpOutBuff.getType()); - auto newOutputType = concatOpOutBuffType.changeMemSpace(VPU::MemoryKind::DDR); - const auto memSpaceDdr = IndexedSymbolAttr::get(concatOp.getContext(), stringifyEnum(VPU::MemoryKind::DDR)); inferReturnTypes(concatOp, vpux::InferShapedTypeMode::ALL); - auto copyOutputToDdr = - builder.create(concatOp.getLoc(), newOutputType, concatOp.getOutput(), memSpaceDdr); + auto copyOutputToDdr = builder.create(concatOp.getLoc(), concatOp.getOutput()); concatOp->getResult(0).replaceAllUsesExcept(copyOutputToDdr, copyOutputToDdr); }); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/multi_cluster_strategy_assignment.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/multi_cluster_strategy_assignment.cpp index a8c3f66e7b..5681053b12 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/multi_cluster_strategy_assignment.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/multi_cluster_strategy_assignment.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" #include 
"vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" #include "vpux/compiler/dialect/VPU/utils/strategy_manager/strategy_manager.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU { #define GEN_PASS_DECL_MULTICLUSTERSTRATEGYASSIGNMENT @@ -92,7 +93,7 @@ void MultiClusterStrategyAssignmentPass::safeRunOnFunc() { return; } - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); auto layerCostModel = VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_concat.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_concat.cpp index 8c48ddcc3f..ffed8102e4 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_concat.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_concat.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPU/utils/concat_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" +#include "vpux/compiler/utils/net/network_info_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/dense_map.hpp" @@ -276,9 +277,10 @@ mlir::LogicalResult EliminateSameSiblingConcat::matchAndRewrite(VPU::ConcatOp or class OptimizeConcatPass final : public VPU::impl::OptimizeConcatBase { public: - explicit OptimizeConcatPass(bool optimizeOnlyOuterConcat, Logger log) { + explicit OptimizeConcatPass(bool optimizeOnlyOuterConcat, bool disablePassOnEntryFunction, Logger log) { Base::initLogger(log, Base::getArgumentName()); _optimizeOnlyOuterConcat = optimizeOnlyOuterConcat; + _disablePassOnEntryFunction = disablePassOnEntryFunction; } private: @@ -286,6 +288,7 @@ class OptimizeConcatPass final : public VPU::impl::OptimizeConcatBase errorHandler) final; void safeRunOnFunc() final; bool 
_optimizeOnlyOuterConcat = false; + bool _disablePassOnEntryFunction = false; }; mlir::LogicalResult OptimizeConcatPass::initializeOptions( @@ -297,6 +300,12 @@ mlir::LogicalResult OptimizeConcatPass::initializeOptions( _log.trace("Overloading optimizeOnlyOuterConcat with an MLIR variable {0}", optimizeOnlyOuterConcat.getValue()); _optimizeOnlyOuterConcat = optimizeOnlyOuterConcat.getValue(); } + + if (disablePassOnEntryFunction.hasValue()) { + _log.trace("Overloading disablePassOnEntryFunction with an MLIR variable {0}", + disablePassOnEntryFunction.getValue()); + _disablePassOnEntryFunction = disablePassOnEntryFunction.getValue(); + } return mlir::success(); } @@ -304,6 +313,12 @@ void OptimizeConcatPass::safeRunOnFunc() { auto func = getOperation(); auto& ctx = getContext(); + auto entryPointFunc = vpux::net::findEntryPointFunc(func, _log); + if (_disablePassOnEntryFunction && (func == entryPointFunc)) { + _log.trace("Skipping function {0} in HostCompile mode", func.getName()); + return; + } + mlir::RewritePatternSet patterns(&ctx); patterns.insert(&ctx, _log, _optimizeOnlyOuterConcat); patterns.insert(&ctx, _log); @@ -320,6 +335,7 @@ void OptimizeConcatPass::safeRunOnFunc() { // createOptimizeConcatPass // -std::unique_ptr vpux::VPU::createOptimizeConcatPass(bool optimizeOnlyOuterConcat, Logger log) { - return std::make_unique(optimizeOnlyOuterConcat, log); +std::unique_ptr vpux::VPU::createOptimizeConcatPass(bool optimizeOnlyOuterConcat, + bool disablePassOnEntryFunction, Logger log) { + return std::make_unique(optimizeOnlyOuterConcat, disablePassOnEntryFunction, log); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_shared_input_copy_for_concat.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_shared_input_copy_for_concat.cpp index 4a776d9ac8..3eea15aa3e 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_shared_input_copy_for_concat.cpp +++ 
b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_shared_input_copy_for_concat.cpp @@ -74,15 +74,6 @@ NDTypeInterface getConcatDistributedType(VPU::DistributedTypeInterface origType, return mlir::cast(origType).changeTypeComponents(typeComponents); } -int64_t getSliceDimSize(VPU::SliceOp sliceOp) { - auto sliceInShape = getShape(sliceOp.getSource()); - auto sliceOutShape = getShape(sliceOp.getResult()); - auto sliceDimSize = llvm::count_if(irange(sliceInShape.size()), [&](auto idx) { - return sliceInShape[Dim(idx)] != sliceOutShape[Dim(idx)]; - }); - return sliceDimSize; -} - // // SharedCopyInputRewriter // @@ -209,10 +200,6 @@ bool SharedCopyInputRewriter::meetConcatPattern(VPU::ConcatOp concatOp) const { if (maybeSliceOp == nullptr || !maybeSliceOp->hasOneUse()) { return false; } - auto sliceDimSize = getSliceDimSize(maybeSliceOp); - if (sliceDimSize != 1) { - return false; - } auto userOp = *maybeSliceOp->getUsers().begin(); if (!isCopyDDR2CMX(userOp)) { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_sparsity_ops.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_sparsity_ops.cpp index 3788d401c9..828842ff65 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_sparsity_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/optimize_sparsity_ops.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/dialect.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include @@ -189,6 +190,7 @@ void OptimizeSparsityOpsPass::safeRunOnFunc() { if (_sparsityProfile != ActivationSparsityProfile::S1) { mlir::ConversionTarget target(ctx); target.addIllegalOp(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/output_pipeline_tiling.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/output_pipeline_tiling.cpp index 70f30e6344..715ecc31c3 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/output_pipeline_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/output_pipeline_tiling.cpp @@ -10,11 +10,9 @@ #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/utils/types.hpp" - #include #include @@ -158,7 +156,7 @@ void OutputPipelineTilingPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); auto layerCostModel = VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/relocate_weight_table_for_reuse.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/relocate_weight_table_for_reuse.cpp index 9bf96ec5ac..d9afea4548 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/relocate_weight_table_for_reuse.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/relocate_weight_table_for_reuse.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" @@ -41,13 +43,9 @@ 
std::tuple, SmallVector> getOffsetsAndWeightsPtrs return {offsets, weightsPtrPerCluster}; } -std::tuple, SmallVector> getOffsetsAndWeightsPtrsForMatMul( - VPU::DistributedTensorType distrType, bool isDistrType) { - if (!isDistrType) { - return {SmallVector(1, 0), SmallVector(1, 0)}; - } +std::tuple, SmallVector> getOffsetsAndWeightsPtrsForMatMul(vpux::NDTypeInterface type) { SmallVector offsets; - const auto shape = distrType.getShape(); + auto shape = type.getShape(); for (auto group : irange(shape[DimsGroups5D::Filter::G])) { offsets.push_back(group * shape[DimsGroups5D::Filter::OC]); } @@ -110,7 +108,7 @@ void RelocateWeightTableForReusePass::safeRunOnFunc() { auto isMatMul = mlir::isa(nceOp); auto [offsets, weightsPtrPerCluster] = - isMatMul ? getOffsetsAndWeightsPtrsForMatMul(weightTableDistrType, isDistrType) + isMatMul ? getOffsetsAndWeightsPtrsForMatMul(weightTableType) : getOffsetsAndWeightsPtrsForConv(weightTableDistrType, isDistrType); auto originalOC = 0; diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/resolve_eltwise_with_z_tiled_workloads.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/resolve_eltwise_with_z_tiled_workloads.cpp index a9083f15d1..8f367e4885 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/resolve_eltwise_with_z_tiled_workloads.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/resolve_eltwise_with_z_tiled_workloads.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/scf_compute_ops_outlining.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/scf_compute_ops_outlining.cpp index 1d70cb12bf..132c01beef 100644 --- 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/scf_compute_ops_outlining.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/scf_compute_ops_outlining.cpp @@ -13,6 +13,8 @@ #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/dense_map.hpp" +#include +#include #include #include @@ -57,8 +59,9 @@ class ScfBlockUpdater final : public ScfOpHierarchy::Visitor { void endVisit(const Node& node) final { auto computeBlocks = node.data().computeBlockVec; for (auto computeBlock : computeBlocks) { - for (auto op : computeBlock) { + for (auto op : llvm::reverse(computeBlock)) { if (!mlir::isa(op)) { + assert(op->getUsers().empty() && "An op with users is not expected"); op->erase(); } } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/set_target_independent_pass_options.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/set_target_independent_pass_options.cpp new file mode 100644 index 0000000000..e74f4885d4 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/set_target_independent_pass_options.cpp @@ -0,0 +1,103 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/adaptive_stripping_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/compressed_convolution_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/sep_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/static_shape_op_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/weights_table_reuse_utils.hpp" +#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/utils/core/error.hpp" + +namespace vpux::VPU { +#define GEN_PASS_DECL_SETTARGETINDEPENDENTPASSOPTIONS +#define GEN_PASS_DEF_SETTARGETINDEPENDENTPASSOPTIONS +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; + +namespace { + +using vpux::VPU::getAttributeFromOption; + +// +// SetTargetIndependentPassOptionsPass +// + +class SetTargetIndependentPassOptionsPass final : + public VPU::impl::SetTargetIndependentPassOptionsBase { +public: + SetTargetIndependentPassOptionsPass() = default; + SetTargetIndependentPassOptionsPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { + Base::initLogger(log, Base::getArgumentName()); + Base::copyOptionValuesFrom(initCompilerOptions); + } + mlir::LogicalResult initialize(mlir::MLIRContext* context) override final; + +private: + void safeRunOnModule() override final; + + bool _allowCustomValues = false; + llvm::SmallVector, /* expected num Opts*/ 10> _optionSet; +}; + +mlir::LogicalResult SetTargetIndependentPassOptionsPass::initialize(mlir::MLIRContext* context) { + _optionSet = { + {VPU::AUTO_PADDING_ODU, 
getAttributeFromOption(context, enableAutoPaddingODU)}, + {VPU::AUTO_PADDING_IDU, getAttributeFromOption(context, enableAutoPaddingIDU)}, + {VPU::REDUCE_SUPPORTED, getAttributeFromOption(context, enableIsReduceSupported)}, + {VPU::FP16_COMPRESSED_CONV, getAttributeFromOption(context, enableFP16CompressedConvolution)}, + {VPU::VPUNN_PRE_SPLIT, getAttributeFromOption(context, enableVPUNNPreSplit)}, + {VPU::ENABLE_SE_PTRS_OPERATIONS, getAttributeFromOption(context, enableSEPtrsOperations)}, + {VPU::ENABLE_EXPERIMENTAL_SE_PTRS_OPERATIONS, + getAttributeFromOption(context, enableExperimentalSEPtrsOperations)}, + {VPU::ENABLE_ADAPTIVE_STRIPPING, getAttributeFromOption(context, enableAdaptiveStripping)}, + {VPU::ENABLE_EXTRA_STATIC_SHAPE_OPS, getAttributeFromOption(context, enableExtraStaticShapeOps)}, + {VPU::WEIGHTS_TABLE_REUSE_MODE, getAttributeFromOption(context, weightsTableReuseMode)}, + }; + + if (allowCustomValues.hasValue()) { + _allowCustomValues = allowCustomValues.getValue(); + } + return mlir::success(); +} + +void SetTargetIndependentPassOptionsPass::safeRunOnModule() { + auto moduleOp = getModuleOp(getOperation()); + auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); + auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); + optionsBuilder = + mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); + + auto* ctx = optionsBuilder.getContext(); + for (const auto& [name, attribute] : _optionSet) { + bool hasPipelineOption = pipelineOptionsOp.lookupSymbol(name) != nullptr; + VPUX_THROW_WHEN(!_allowCustomValues && hasPipelineOption, + "Option {0} is already defined, probably you run '--init-compiler' twice", name); + + if (hasPipelineOption) { + continue; + } + optionsBuilder.create(optionsBuilder.getUnknownLoc(), mlir::StringAttr::get(ctx, name), + attribute); + } +} + +} // namespace + +std::unique_ptr vpux::VPU::createSetTargetIndependentPassOptionsPass() { + return 
std::make_unique(); +} +std::unique_ptr vpux::VPU::createSetTargetIndependentPassOptionsPass( + const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { + return std::make_unique(initCompilerOptions, log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_channels_auto_padding.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_channels_auto_padding.cpp deleted file mode 100644 index c4ac464690..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_channels_auto_padding.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPCHANNELSAUTOPADDING -#define GEN_PASS_DEF_SETUPCHANNELSAUTOPADDING -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupChannelsAutoPaddingPass -// - -class SetupChannelsAutoPaddingPass final : - public VPU::impl::SetupChannelsAutoPaddingBase { -public: - SetupChannelsAutoPaddingPass() = default; - SetupChannelsAutoPaddingPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableAutoPaddingODU(enableAutoPaddingODU), _enableAutoPaddingIDU(enableAutoPaddingIDU) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool 
_enableAutoPaddingODU = false; - bool _enableAutoPaddingIDU = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "ODU auto padding is already defined, probably you run '--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupChannelsAutoPaddingPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupChannelsAutoPaddingPass::initializeFromOptions() { - if (enableAutoPaddingODU.hasValue()) { - _log.trace("Overloading the default value {0} of the '_enableAutoPaddingODU' field to the value {1} " - "of the pass option 'enableAutoPaddingODU' generated by MLIR", - _enableAutoPaddingODU, enableAutoPaddingODU); - _enableAutoPaddingODU = enableAutoPaddingODU; - } - - if (enableAutoPaddingIDU.hasValue()) { - _log.trace("Overloading the default value {0} of the '_enableAutoPaddingIDU' field to the value {1} " - "of the pass option 'enableAutoPaddingIDU' generated by MLIR", - _enableAutoPaddingIDU, enableAutoPaddingIDU); - _enableAutoPaddingIDU = enableAutoPaddingIDU; - } - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupChannelsAutoPaddingPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = 
mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::AUTO_PADDING_ODU, _enableAutoPaddingODU, _allowCustomValues); - addOption(optionsBuilder, pipelineOptionsOp, VPU::AUTO_PADDING_IDU, _enableAutoPaddingIDU, _allowCustomValues); -} - -} // namespace - -// -// createSetupChannelsAutoPaddingPass -// - -std::unique_ptr vpux::VPU::createSetupChannelsAutoPaddingPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupChannelsAutoPaddingPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_adaptive_stripping.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_adaptive_stripping.cpp deleted file mode 100644 index 8724cdcc99..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_adaptive_stripping.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/adaptive_stripping_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPENABLEADAPTIVESTRIPPING -#define GEN_PASS_DEF_SETUPENABLEADAPTIVESTRIPPING -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupEnableAdaptiveStrippingPass -// - -class SetupEnableAdaptiveStrippingPass final : - public VPU::impl::SetupEnableAdaptiveStrippingBase { -public: - SetupEnableAdaptiveStrippingPass() = default; - SetupEnableAdaptiveStrippingPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableAdaptiveStripping(enableAdaptiveStripping) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableAdaptiveStripping = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "Enable Adaptive Stripping is already defined, probably you run '--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = 
optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupEnableAdaptiveStrippingPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupEnableAdaptiveStrippingPass::initializeFromOptions() { - if (enableAdaptiveStripping.hasValue()) { - _log.trace("Overloading the default value {0} of the '_EnableAdaptiveStripping' field to the value {1} " - "of the pass option 'EnableAdaptiveStripping' generated by MLIR", - _enableAdaptiveStripping, enableAdaptiveStripping); - _enableAdaptiveStripping = enableAdaptiveStripping; - } - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupEnableAdaptiveStrippingPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::ENABLE_ADAPTIVE_STRIPPING, _enableAdaptiveStripping, - _allowCustomValues); -} - -} // namespace - -// -// createSetupEnableAdaptiveStrippingPass -// - -std::unique_ptr vpux::VPU::createSetupEnableAdaptiveStrippingPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupEnableAdaptiveStrippingPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_extra_static_shape_ops.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_extra_static_shape_ops.cpp deleted file mode 100644 index 988a05f542..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_extra_static_shape_ops.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// -// Copyright (C) 2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/static_shape_op_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPENABLEEXTRASTATICSHAPEOPS -#define GEN_PASS_DEF_SETUPENABLEEXTRASTATICSHAPEOPS -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupEnableExtraStaticShapeOpsPass -// - -class SetupEnableExtraStaticShapeOpsPass final : - public VPU::impl::SetupEnableExtraStaticShapeOpsBase { -public: - SetupEnableExtraStaticShapeOpsPass() = default; - SetupEnableExtraStaticShapeOpsPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableExtraStaticShapeOps(enableExtraStaticShapeOps) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableExtraStaticShapeOps = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, 
config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "Enable Extra StaticShape ops is already defined, probably you run '--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupEnableExtraStaticShapeOpsPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupEnableExtraStaticShapeOpsPass::initializeFromOptions() { - _log.trace("Overloading the default value {0} of the '_enableExtraStaticShapeOps' field to the value {1} " - "of the pass option 'EnableExtraStaticShapeOps' generated by MLIR", - _enableExtraStaticShapeOps, enableExtraStaticShapeOps); - _enableExtraStaticShapeOps = enableExtraStaticShapeOps; - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupEnableExtraStaticShapeOpsPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::ENABLE_EXTRA_STATIC_SHAPE_OPS, _enableExtraStaticShapeOps, - _allowCustomValues); -} - -} // namespace - -// -// createSetupEnableExtraStaticShapeOpsPass -// - 
-std::unique_ptr vpux::VPU::createSetupEnableExtraStaticShapeOpsPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupEnableExtraStaticShapeOpsPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_fp16_compressed_conv.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_fp16_compressed_conv.cpp deleted file mode 100644 index a34c1c4ac0..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_fp16_compressed_conv.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/compressed_convolution_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPENABLEFP16COMPRESSEDCONV -#define GEN_PASS_DEF_SETUPENABLEFP16COMPRESSEDCONV -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupEnableFP16CompressedConvPass -// - -class SetupEnableFP16CompressedConvPass final : - public VPU::impl::SetupEnableFP16CompressedConvBase { -public: - SetupEnableFP16CompressedConvPass() = default; - SetupEnableFP16CompressedConvPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableFP16CompressedConvolution(enableFP16CompressedConvolution) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void 
safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableFP16CompressedConvolution = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "FP16 Compressed Conv is already defined, probably you run '--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupEnableFP16CompressedConvPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupEnableFP16CompressedConvPass::initializeFromOptions() { - if (enableFP16CompressedConvolution.hasValue()) { - _log.trace("Overloading the default value {0} of the '_enableFP16CompressedConvolution' field to the value {1} " - "of the pass option 'enableFP16CompressedConvolution' generated by MLIR", - _enableFP16CompressedConvolution, enableFP16CompressedConvolution); - _enableFP16CompressedConvolution = enableFP16CompressedConvolution; - } - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupEnableFP16CompressedConvPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - 
optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::FP16_COMPRESSED_CONV, _enableFP16CompressedConvolution, - _allowCustomValues); -} - -} // namespace - -// -// createSetupEnableFP16CompressedConvPass -// - -std::unique_ptr vpux::VPU::createSetupEnableFP16CompressedConvPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupEnableFP16CompressedConvPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_se_ptrs_operations.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_se_ptrs_operations.cpp deleted file mode 100644 index 944b4638d4..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_se_ptrs_operations.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/sep_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPENABLESEPTRSOPERATIONS -#define GEN_PASS_DEF_SETUPENABLESEPTRSOPERATIONS -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupEnableSEPtrsOperationsPass -// - -class SetupEnableSEPtrsOperationsPass final : - public VPU::impl::SetupEnableSEPtrsOperationsBase { -public: - SetupEnableSEPtrsOperationsPass() = default; - SetupEnableSEPtrsOperationsPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableSEPtrsOperations(enableSEPtrsOperations), - _enableExperimentalSEPtrsOperations(enableExperimentalSEPtrsOperations) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableSEPtrsOperations = false; - bool _enableExperimentalSEPtrsOperations = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "Enable SE pointers operations is already defined, probably you run 
'--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupEnableSEPtrsOperationsPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupEnableSEPtrsOperationsPass::initializeFromOptions() { - _log.trace("Overloading the default value {0} of the '_EnableSEPtrsOperations' field to the value {1} " - "of the pass option 'EnableSEPtrsOperations' generated by MLIR", - _enableSEPtrsOperations, enableSEPtrsOperations); - _enableSEPtrsOperations = enableSEPtrsOperations; - - _log.trace("Overloading the default value {0} of the '_enableExperimentalSEPtrsOperations' field to the value {1} " - "of the pass option 'enableExperimentalSEPtrsOperations' generated by MLIR", - _enableExperimentalSEPtrsOperations, enableExperimentalSEPtrsOperations); - _enableExperimentalSEPtrsOperations = enableExperimentalSEPtrsOperations; - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupEnableSEPtrsOperationsPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::ENABLE_SE_PTRS_OPERATIONS, _enableSEPtrsOperations, - _allowCustomValues); - addOption(optionsBuilder, pipelineOptionsOp, VPU::ENABLE_EXPERIMENTAL_SE_PTRS_OPERATIONS, - 
_enableExperimentalSEPtrsOperations, _allowCustomValues); -} - -} // namespace - -// -// createSetupEnableSEPtrsOperationsPass -// - -std::unique_ptr vpux::VPU::createSetupEnableSEPtrsOperationsPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupEnableSEPtrsOperationsPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_vpunn_pre_split.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_vpunn_pre_split.cpp deleted file mode 100644 index 0a97121baf..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_enable_vpunn_pre_split.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// -// Copyright (C) 2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPENABLEVPUNNPRESPLIT -#define GEN_PASS_DEF_SETUPENABLEVPUNNPRESPLIT -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupEnableVPUNNPreSplit -// - -class SetupEnableVPUNNPreSplitPass final : - public VPU::impl::SetupEnableVPUNNPreSplitBase { -public: - SetupEnableVPUNNPreSplitPass() = default; - SetupEnableVPUNNPreSplitPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableVPUNNPreSplit(enableVPUNNPreSplit) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - 
mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableVPUNNPreSplit = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, "{0} defined, probably you run '--init-compiler' twice", - optionName); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupEnableVPUNNPreSplitPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupEnableVPUNNPreSplitPass::initializeFromOptions() { - _log.trace("Overloading the default value {0} of the '_enableVPUNNPreSplit' field to the value {1} " - "of the pass option 'enableVPUNNPreSplit' generated by MLIR", - _enableVPUNNPreSplit, enableVPUNNPreSplit); - _enableVPUNNPreSplit = enableVPUNNPreSplit; - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupEnableVPUNNPreSplitPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - 
mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::VPUNN_PRE_SPLIT, _enableVPUNNPreSplit, _allowCustomValues); -} - -} // namespace - -// -// createSetupEnableVPUNNPreSplitPass -// - -std::unique_ptr vpux::VPU::createSetupEnableVPUNNPreSplitPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupEnableVPUNNPreSplitPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_is_reduce_supported.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_is_reduce_supported.cpp deleted file mode 100644 index 59444208e8..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_is_reduce_supported.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/utils/core/error.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPISREDUCESUPPORTED -#define GEN_PASS_DEF_SETUPISREDUCESUPPORTED -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupIsReduceSupportedPass -// - -class SetupIsReduceSupportedPass final : public VPU::impl::SetupIsReduceSupportedBase { -public: - SetupIsReduceSupportedPass() = default; - SetupIsReduceSupportedPass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _enableIsReduceSupported(enableIsReduceSupported) { - Base::initLogger(log, Base::getArgumentName()); - 
Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - bool _enableIsReduceSupported = false; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, - "IsReduceSupported is already defined, probably you run '--init-compiler' twice"); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::BoolAttr::get(ctx, optionValue)); -} - -mlir::LogicalResult SetupIsReduceSupportedPass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupIsReduceSupportedPass::initializeFromOptions() { - if (enableIsReduceSupported.hasValue()) { - _log.trace("Overloading the default value {0} of the '_enableIsReduceSupported' field to the value {1} " - "of the pass option 'enableIsReduceSupported' generated by MLIR", - _enableIsReduceSupported, enableIsReduceSupported); - _enableIsReduceSupported = enableIsReduceSupported; - } - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupIsReduceSupportedPass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = 
mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::REDUCE_SUPPORTED, _enableIsReduceSupported, _allowCustomValues); -} - -} // namespace - -// -// createSetupIsReduceSupportedPass -// - -std::unique_ptr vpux::VPU::createSetupIsReduceSupportedPass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupIsReduceSupportedPass( - const VPU::InitCompilerOptions& initCompilerOptions, Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_max_kernel_size.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_max_kernel_size.cpp index 9a293c953c..b04b83163a 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_max_kernel_size.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_max_kernel_size.cpp @@ -8,11 +8,10 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/config/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/utils/core/error.hpp" -#include - namespace vpux::VPU { #define GEN_PASS_DECL_SETUPMAXKERNELSIZE #define GEN_PASS_DEF_SETUPMAXKERNELSIZE @@ -90,7 +89,7 @@ void SetupMaxKernelSizePass::safeRunOnModule() { optionsBuilder = mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - auto maxKernelSizeConstant = vpux::VPU::getMaxKernelSizeConstant(VPU::getArch(getOperation())); + auto maxKernelSizeConstant = vpux::VPU::getMaxKernelSizeConstant(config::getArch(getOperation())); auto maxKernelSize = maxKernelSizeConstant.getMaxKernelSize(); 
addConstant(optionsBuilder, pipelineOptionsOp, VPU::MAX_KERNEL_SIZE, maxKernelSize, _allowCustomValues); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_npu_constraint.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_npu_constraint.cpp index 958a58ca49..c3b5cffc74 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_npu_constraint.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_npu_constraint.cpp @@ -8,8 +8,11 @@ #include "vpux/compiler/dialect/VPU/transforms/factories/barrier_variant_constraint.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/config/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/platform_resources.hpp" #include "vpux/compiler/utils/shave.hpp" #include "vpux/utils/core/error.hpp" @@ -34,6 +37,15 @@ class SetupNpuConstraintPass final : public VPU::impl::SetupNpuConstraintBase(VPU::WORKLOAD_MANAGEMENT_STATUS) && !_allowCustomValues, + "Workload Management Status is already defined, probably you run '--init-compiler' twice"); + + if (!pipelineOptionsOp.lookupSymbol(VPU::WORKLOAD_MANAGEMENT_STATUS)) { + vpux::VPU::setWorkloadManagementStatus(moduleOp, workloadManagementStatus); } addConstraint(optionsBuilder, pipelineOptionsOp, VPU::USE_DEDICATED_FIFO_PER_SHAVE_ENGINE, _enableSwFifoPerShave, @@ -136,7 +151,7 @@ void SetupNpuConstraintPass::safeRunOnModule() { auto supportsSwFifoPerShave = VPU::getConstraint(moduleOp, VPU::USE_DEDICATED_FIFO_PER_SHAVE_ENGINE); _log.info("Support for FIFO per each SHAVE engine enabled: {0}", supportsSwFifoPerShave); - auto useWlmBarrierConfig = _workloadManagementEnable; + auto useWlmBarrierConfig = workloadManagementStatus == 
VPU::WorkloadManagementStatus::ENABLED; if (wlmRollback.hasValue() && wlmRollback.getValue() == true) { // Using non-WLM values might result in slightly worse inference latency but is // safer in case compilation with WLM enabled fails @@ -169,7 +184,8 @@ void SetupNpuConstraintPass::safeRunOnModule() { }(); VPUX_THROW_UNLESS(numShvExecutorsPerTile == 1 || numShvExecutorsPerTile == 2, - "Unsupported number of SHAVE executors"); + "Unsupported number of SHAVE executors '{0}'", numShvExecutorsPerTile); + auto maxActKernelRange = vpux::VPU::getDefaultTaskListCount(VPU::TaskType::ActKernelRange, arch) / numShvExecutorsPerTile; auto maxActKernelInvocation = @@ -183,7 +199,7 @@ void SetupNpuConstraintPass::safeRunOnModule() { _allowCustomValues); addConstraint(optionsBuilder, pipelineOptionsOp, VPU::METADATA_MAX_KERNEL_RANGE_COUNT, maxActKernelRange, _allowCustomValues); - if (!isArchVPUX3XXX(arch)) { + if (!vpux::config::isArchVPUX3XXX(arch)) { auto maxMediaCount = vpux::VPU::getDefaultTaskListCount(VPU::TaskType::M2I, arch); addConstraint(optionsBuilder, pipelineOptionsOp, VPU::METADATA_MAX_MEDIA_COUNT, maxMediaCount, _allowCustomValues); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_pipeline_options.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_pipeline_options.cpp index fc848214da..81fd867264 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_pipeline_options.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_pipeline_options.cpp @@ -62,7 +62,7 @@ mlir::LogicalResult SetupPipelineOptionsPass::initializeOptions( } void SetupPipelineOptionsPass::initializeFromOptions() { - auto archStr = VPU::symbolizeEnum(archOpt.getValue()); + auto archStr = config::symbolizeEnum(archOpt.getValue()); VPUX_THROW_UNLESS(archStr.has_value(), "Unknown VPU architecture : '{0}'", archOpt.getValue()); const auto _arch = archStr.value(); @@ -73,7 +73,7 @@ void SetupPipelineOptionsPass::initializeFromOptions() { 
// Register the default PPE factory singleton const auto& ppeVersion = ppeVersionOpt.getValue(); if (ppeVersion == "Auto") { - if (_arch == VPU::ArchKind::NPU37XX || _arch == VPU::ArchKind::NPU40XX) { + if (_arch == config::ArchKind::NPU37XX || _arch == config::ArchKind::NPU40XX) { VPU::PpeVersionConfig::setFactory(); _log.info("Auto target PPE version set to: 'IntPPE'"); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_tiling_constraint.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_tiling_constraint.cpp index 7c7828ff72..bec98f147b 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_tiling_constraint.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_tiling_constraint.cpp @@ -8,11 +8,10 @@ #include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp" #include "vpux/compiler/dialect/config/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/utils/core/error.hpp" -#include - namespace vpux::VPU { #define GEN_PASS_DECL_SETUPTILINGCONSTRAINT #define GEN_PASS_DEF_SETUPTILINGCONSTRAINT @@ -90,7 +89,8 @@ void SetupTilingConstraintPass::safeRunOnModule() { optionsBuilder = mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - auto largeFilterRatio = vpux::VPU::getFragmentationAvoidRatioPipeliningLargeWeights(VPU::getArch(getOperation())); + auto largeFilterRatio = + vpux::VPU::getFragmentationAvoidRatioPipeliningLargeWeights(config::getArch(getOperation())); addConstant(optionsBuilder, pipelineOptionsOp, VPU::FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS, largeFilterRatio, _allowCustomValues); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_weights_table_reuse_mode.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_weights_table_reuse_mode.cpp deleted 
file mode 100644 index 06f06dea2a..0000000000 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/setup_weights_table_reuse_mode.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// -// Copyright (C) 2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/weights_table_reuse_utils.hpp" - -namespace vpux::VPU { -#define GEN_PASS_DECL_SETUPWEIGHTSTABLEREUSEMODE -#define GEN_PASS_DEF_SETUPWEIGHTSTABLEREUSEMODE -#include "vpux/compiler/dialect/VPU/passes.hpp.inc" -} // namespace vpux::VPU - -using namespace vpux; - -namespace { - -// -// SetupWeightsTableReuseMode -// - -class SetupWeightsTableReuseModePass final : - public VPU::impl::SetupWeightsTableReuseModeBase { -public: - SetupWeightsTableReuseModePass() = default; - SetupWeightsTableReuseModePass(const VPU::InitCompilerOptions& initCompilerOptions, Logger log) - : _weightsTableReuseMode(weightsTableReuseMode) { - Base::initLogger(log, Base::getArgumentName()); - Base::copyOptionValuesFrom(initCompilerOptions); - - initializeFromOptions(); - } - -private: - mlir::LogicalResult initializeOptions( - StringRef options, llvm::function_ref errorHandler) final; - void safeRunOnModule() final; - -private: - // Initialize fields from pass options - void initializeFromOptions(); - -private: - vpux::WeightsTableReuseMode _weightsTableReuseMode = vpux::WeightsTableReuseMode::DISABLED; - bool _allowCustomValues = false; -}; - -void addOption(mlir::OpBuilder optionsBuilder, config::PipelineOptionsOp pipelineOptionsOp, mlir::StringRef optionName, - size_t optionValue, bool allowCustomValues) { - auto hasPipelineOption = pipelineOptionsOp.lookupSymbol(optionName) != nullptr; - 
VPUX_THROW_WHEN(!allowCustomValues && hasPipelineOption, "{0} defined, probably you run '--init-compiler' twice", - optionName); - - if (hasPipelineOption) { - return; - } - auto* ctx = optionsBuilder.getContext(); - const auto constraintAttr = mlir::StringAttr::get(ctx, optionName); - optionsBuilder.create(optionsBuilder.getUnknownLoc(), constraintAttr, - mlir::IntegerAttr::get(getUInt64Type(ctx), optionValue)); -} - -mlir::LogicalResult SetupWeightsTableReuseModePass::initializeOptions( - StringRef options, llvm::function_ref errorHandler) { - if (mlir::failed(Base::initializeOptions(options, errorHandler))) { - return mlir::failure(); - } - - initializeFromOptions(); - - return mlir::success(); -} - -void SetupWeightsTableReuseModePass::initializeFromOptions() { - _log.trace("Overloading the default value {0} of the '_weightsTableReuseMode' field to the value {1} " - "of the pass option 'weightsTableReuseMode' generated by MLIR", - stringifyEnum(_weightsTableReuseMode), stringifyEnum(weightsTableReuseMode)); - _weightsTableReuseMode = weightsTableReuseMode; - - if (allowCustomValues.hasValue()) { - _allowCustomValues = allowCustomValues.getValue(); - } -} - -void SetupWeightsTableReuseModePass::safeRunOnModule() { - auto moduleOp = getModuleOp(getOperation()); - auto optionsBuilder = mlir::OpBuilder::atBlockBegin(moduleOp.getBody()); - auto pipelineOptionsOp = VPU::getPipelineOptionsOp(getContext(), moduleOp); - optionsBuilder = - mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front(), optionsBuilder.getListener()); - - addOption(optionsBuilder, pipelineOptionsOp, VPU::WEIGHTS_TABLE_REUSE_MODE, - static_cast(_weightsTableReuseMode), _allowCustomValues); -} - -} // namespace - -// -// createSetupWeightsTableReuseModePass -// - -std::unique_ptr vpux::VPU::createSetupWeightsTableReuseModePass() { - return std::make_unique(); -} - -std::unique_ptr vpux::VPU::createSetupWeightsTableReuseModePass( - const VPU::InitCompilerOptions& initCompilerOptions, 
Logger log) { - return std::make_unique(initCompilerOptions, log); -} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/sparsify_weights.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sparsify_weights.cpp index 414f15cf7a..847e60a62b 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/sparsify_weights.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sparsify_weights.cpp @@ -4,17 +4,16 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" -#include "vpux/compiler/utils/loop.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/sparsity.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" @@ -116,7 +115,7 @@ void SparsifyWeightsPass::safeRunOnFunc() { // with weights sparsity both weights and sparsity map will need to be aligned // increasing fragmentation likelihood with small aligned constants // avoid increasing size more than 4X - minWeightsSize = getAddressAlignmentForSwizzling(vpux::SWIZZLING_KEY_5, VPU::getArch(func)) / 4; + minWeightsSize = getAddressAlignmentForSwizzling(vpux::SWIZZLING_KEY_5, config::getArch(func)) / 4; // experimental number for small ops which do not suffer from fragmentation smallOpThreshold = 2560; @@ -376,6 +375,12 @@ void SparsifyWeightsPass::safeRunOnFunc() { // createSparsifyWeightsPass // +std::unique_ptr vpux::VPU::createSparsifyWeightsPass(Logger 
log) { + return std::make_unique(VPU::WeightsSparsityHeuristic::RATIO, /*manualThreshold=*/std::nullopt, + /*largeConstThreshold=*/(200_MB).to().count(), + /*computeOpThreshold=*/350, /*enableWeightSwizzling=*/true, log); +} + std::unique_ptr vpux::VPU::createSparsifyWeightsPass(VPU::WeightsSparsityHeuristic heuristic, std::optional manualThreshold, int64_t largeConstThreshold, diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_nce_ops_onto_workloads.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_nce_ops_onto_workloads.cpp index 0c04335a45..cbe54c43b8 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_nce_ops_onto_workloads.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_nce_ops_onto_workloads.cpp @@ -6,19 +6,14 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" -#include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/utils/core/enums.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include @@ -39,7 +34,7 @@ namespace { class NCEWorkloadSplitRewrite final : public mlir::OpInterfaceRewritePattern { public: - NCEWorkloadSplitRewrite(mlir::MLIRContext* ctx, int64_t numDPU, VPU::ArchKind arch, + NCEWorkloadSplitRewrite(mlir::MLIRContext* ctx, int64_t 
numDPU, config::ArchKind arch, std::shared_ptr costModel, Logger log) : mlir::OpInterfaceRewritePattern(ctx), _numDPU(numDPU), @@ -53,7 +48,7 @@ class NCEWorkloadSplitRewrite final : public mlir::OpInterfaceRewritePattern _costModel; Logger _log; }; @@ -69,7 +64,7 @@ mlir::LogicalResult NCEWorkloadSplitRewrite::matchAndRewrite(VPU::NCEOpInterface class NCEWorkloadSplitPreSplitRewrite final : public mlir::OpInterfaceRewritePattern { public: - NCEWorkloadSplitPreSplitRewrite(mlir::MLIRContext* ctx, int64_t numDPU, int64_t numTiles, VPU::ArchKind arch, + NCEWorkloadSplitPreSplitRewrite(mlir::MLIRContext* ctx, int64_t numDPU, int64_t numTiles, config::ArchKind arch, std::shared_ptr layerCostModel, std::shared_ptr costModel, Logger log) : mlir::OpInterfaceRewritePattern(ctx), @@ -91,7 +86,7 @@ class NCEWorkloadSplitPreSplitRewrite final : public mlir::OpInterfaceRewritePat int64_t _numDPU; int64_t _numTiles; - VPU::ArchKind _arch; + config::ArchKind _arch; std::shared_ptr _layerCostModel; std::shared_ptr _costModel; @@ -179,7 +174,7 @@ void SplitNCEOpsOntoWorkloadsPass::safeRunOnFunc() { auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto nceCluster = IE::getTileExecutor(module); VPUX_THROW_UNLESS(nceCluster != nullptr, "Failed to get NCE_Cluster information"); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_se_ops.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_se_ops.cpp index 362fa41420..439a8ec40d 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_se_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/split_se_ops.cpp @@ -11,11 +11,12 @@ #include "vpux/compiler/dialect/VPU/IR/se_attributes.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include 
"vpux/compiler/dialect/VPU/utils/nce_interpolate_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -33,7 +34,7 @@ namespace { bool doesFitIntoCMX(mlir::Operation* op, NDTypeInterface inputType, NDTypeInterface outputType, int64_t seTableH, int64_t seTableW) { - auto arch = VPU::getArch(op); + auto arch = config::getArch(op); auto sparsityConstraint = VPU::getSparsityConstraint(arch); const auto inShape = inputType.getShape(); const auto inputC = inShape[Dims4D::Act::C]; diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/strategy_manager_pass.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/strategy_manager_pass.cpp index 7fd068c3f6..46d794e793 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/strategy_manager_pass.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/strategy_manager_pass.cpp @@ -17,6 +17,9 @@ #include "vpux/compiler/dialect/VPU/utils/strategy_manager/operation_strategies.hpp" #include "vpux/compiler/dialect/VPU/utils/strategy_manager/strategy_opt_alg.hpp" #include "vpux/compiler/dialect/VPU/utils/strategy_manager/strategy_state_provider.hpp" +#include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include @@ -48,7 +51,7 @@ class StrategyManagerImplPass final : public VPU::impl::StrategyManagerImplBase< SmallVector> getOperationOptions(mlir::Operation* operation, SiblingOpsAnalysis& siblingsAnalysis, size_t numTiles); - SmallVector getAvailiableStrategies(ArchKind arch) const; + SmallVector getAvailiableStrategies(config::ArchKind arch) const; bool checkDefaultStrategy(MultiClusterStrategy strategy) const; void 
fillInOptions(TilingOptions& options) const; bool mcTilingNeeded() const; @@ -214,7 +217,7 @@ SmallVector> StrategyManagerImplPass::getOpera return strategies; } -SmallVector StrategyManagerImplPass::getAvailiableStrategies(ArchKind arch) const { +SmallVector StrategyManagerImplPass::getAvailiableStrategies(config::ArchKind arch) const { auto mcListGetter = createMCStrategyGetter(arch, _numTiles); SmallVector strategies; @@ -228,7 +231,7 @@ void StrategyManagerImplPass::safeRunOnFunc() { auto siblingsAnalysis = getAnalysis(); _costModel = std::make_shared(func); _numTiles = IE::getTileExecutor(module).getCount(); - _archStrategies = getAvailiableStrategies(VPU::getArch(module)); + _archStrategies = getAvailiableStrategies(config::getArch(module)); // calculate cost for all possible strategies // assign strategy with min cost diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_data_prefetch_reserve_mem.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_data_prefetch_reserve_mem.cpp new file mode 100644 index 0000000000..ce25119a51 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_data_prefetch_reserve_mem.cpp @@ -0,0 +1,83 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" + +namespace vpux::VPU { +#define GEN_PASS_DECL_SWKERNELDATAPREFETCHRESERVEMEM +#define GEN_PASS_DEF_SWKERNELDATAPREFETCHRESERVEMEM +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; + +namespace { + +// +// SWKernelDataPrefetchReserveMemPass +// + +class SWKernelDataPrefetchReserveMemPass final : + public VPU::impl::SWKernelDataPrefetchReserveMemBase { +public: + explicit SWKernelDataPrefetchReserveMemPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnModule() final; +}; + +bool checkSWKernelOp(mlir::ModuleOp& func) { + bool hasSWKernelOp = false; + func->walk([&](VPU::SWOpInterface) { + hasSWKernelOp = true; + return; + }); + + return hasSWKernelOp; +} + +void SWKernelDataPrefetchReserveMemPass::safeRunOnModule() { + auto module = getOperation(); + auto* ctx = module->getContext(); + + auto hasSWKernelOp = checkSWKernelOp(module); + if (!hasSWKernelOp) { + return; + } + + auto maxPrefetchDataSize = VPUIP::getMaximalSWKernelPrefetchDataSize(module); + auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); + + int64_t reservedMemTotalSize = 0; + for (auto& resMem : IE::getReservedMemoryResources(module, memSpaceAttr)) { + reservedMemTotalSize += resMem.getByteSize(); + } + + // Enlarge the original reserved memory range when total reserved memory is not safe for SW Kernel data + // prefetching + if (reservedMemTotalSize < maxPrefetchDataSize) { + _log.trace("Enlarge the original reserved memory range for SW Kernel prefetching - size: '{0}'", + maxPrefetchDataSize - reservedMemTotalSize); + 
IE::setSWKernelPrefetchingReservedMemory(module, memSpaceAttr, maxPrefetchDataSize - reservedMemTotalSize); + } +} + +} // namespace + +// +// createSWKernelDataPrefetchReserveMemPass +// + +std::unique_ptr vpux::VPU::createSWKernelDataPrefetchReserveMemPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.cpp new file mode 100644 index 0000000000..86e85b8072 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.cpp @@ -0,0 +1,69 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" + +namespace vpux::VPU { +#define GEN_PASS_DECL_SWKERNELINSTRUCTIONPREFETCHRESERVEMEMFORDUMMYKERNELS +#define GEN_PASS_DEF_SWKERNELINSTRUCTIONPREFETCHRESERVEMEMFORDUMMYKERNELS +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; + +namespace { + +// +// SWKernelInstructionPrefetchReserveMemForDummyKernels +// +class SWKernelInstructionPrefetchReserveMemForDummyKernels final : + public VPU::impl::SWKernelInstructionPrefetchReserveMemForDummyKernelsBase< + SWKernelInstructionPrefetchReserveMemForDummyKernels> { +public: + explicit SWKernelInstructionPrefetchReserveMemForDummyKernels(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnModule() final; +}; + +bool checkSWKernelOp(mlir::ModuleOp& module) { + bool hasSWKernelOp = false; + 
module->walk([&](VPU::SWOpInterface) { + hasSWKernelOp = true; + return; + }); + + return hasSWKernelOp; +} + +void SWKernelInstructionPrefetchReserveMemForDummyKernels::safeRunOnModule() { + auto module = getOperation(); + auto hasSWKernelOp = checkSWKernelOp(module); + if (!hasSWKernelOp) { + return; + } + + auto* ctx = module->getContext(); + auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); + IE::setDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr, + vpux::VPUIP::MAX_SW_KERNEL_DUMMY_KERNELS_DATA_SIZE); +} + +} // namespace + +// +// createSWKernelInstructionPrefetchReserveMemForDummyKernelsPass +// +std::unique_ptr vpux::VPU::createSWKernelInstructionPrefetchReserveMemForDummyKernelsPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_gather.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_gather.cpp index e64827a9c9..af6fcd8593 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_gather.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_gather.cpp @@ -11,6 +11,7 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/gather_dma_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" @@ -50,7 +51,7 @@ mlir::LogicalResult TileGatherElement::matchAndRewrite(VPU::GatherOp origOp, mli const auto inputShape = getShape(origOp.getInput()); const auto outputShape = getShape(origOp.getOutput()); const auto outputType = mlir::cast(origOp.getOutput().getType()); - const auto arch = VPU::getArch(origOp); + const auto arch = config::getArch(origOp); Shape nTilesOnDim(outputShape.size(), 1); DimArr tileDimOrder; @@ -125,7 +126,7 @@ 
mlir::LogicalResult TileGatherIndices::matchAndRewrite(VPU::GatherOp origOp, mli const auto indicesType = mlir::cast(origOp.getIndices().getType()); const auto indicesShape = indicesType.getShape(); const auto indicesRank = origOp.getIndicesRank().value_or(indicesShape.size()); - const auto arch = VPU::getArch(origOp); + const auto arch = config::getArch(origOp); Shape nTilesOnDim(outputShape.size(), 1); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_lstm_sequence.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_lstm_sequence.cpp index e95bdaacd5..db63e2f38b 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_lstm_sequence.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tile_lstm_sequence.cpp @@ -13,6 +13,7 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sw_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -53,7 +54,7 @@ class TileLSTMSequence final : public mlir::OpRewritePattern& bufferSizes, int64_t totalAvailableCMXSize, VPU::ArchKind archKind) const; + bool fitIntoCMX(SmallVector& bufferSizes, int64_t totalAvailableCMXSize, config::ArchKind archKind) const; mlir::FailureOr getNumSplits(VPU::LSTMSequenceOp op) const; void tileLSTMSequence(VPU::LSTMSequenceOp op, mlir::PatternRewriter& rewriter, int numSplits) const; @@ -75,11 +76,11 @@ bool TileLSTMSequence::fitIntoCMX(VPU::LSTMSequenceOp op) const { } const auto totalAvailableCMXSize = getTotalCMXSize(op).count(); - return fitIntoCMX(bufferSizes, totalAvailableCMXSize, getArch(op)); + return fitIntoCMX(bufferSizes, totalAvailableCMXSize, config::getArch(op)); } bool TileLSTMSequence::fitIntoCMX(SmallVector& bufferSizes, int64_t totalAvailableCMXSize, - 
VPU::ArchKind archKind) const { + config::ArchKind archKind) const { return vpux::VPU::calculateAlignedBuffersMemoryRequirement(archKind, bufferSizes).count() <= totalAvailableCMXSize; } @@ -137,7 +138,7 @@ mlir::FailureOr TileLSTMSequence::getNumSplits(VPU::LSTMSequenceOp op) cons } const auto totalAvailableCMXSize = getTotalCMXSize(op).count(); - const auto archKind = getArch(op); + const auto archKind = config::getArch(op); if (fitIntoCMX(bufferSizes, totalAvailableCMXSize, archKind)) { return 1; // numSplits diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tiling_strategy_assignment.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tiling_strategy_assignment.cpp index a8ebc3314c..08d2f3e415 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/tiling_strategy_assignment.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/tiling_strategy_assignment.cpp @@ -7,13 +7,13 @@ #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" #include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU { #define GEN_PASS_DECL_TILINGSTRATEGYASSIGNMENT @@ -101,9 +101,10 @@ std::vector TilingStrategyAssignmentPass::getStrate std::vector strategies{}; - // Temporarily not apply cost-based tiling strategy to NCE ops with INT4 weights based on VPUNN cost. 
- // This can be removed when VPUNN is upgraded to support INT4 data type, tracked in E#113316. - if (!_enableVpunnCostForTiling || !mlir::isa(op) || VPU::isNCEWithInt4Weights(op) || + // Cost based strategy assignment is not applied to NCEMatMulOp yet #E126102 + auto costModelUtils = VPU::getICostModelUtilsInterface(op->getContext()); + if (!_enableVpunnCostForTiling || !mlir::isa(op) || + (VPU::isNCEWithInt4Weights(op) && !costModelUtils->isNCEWithInt4WeightsSupported()) || mlir::isa(op)) { auto strategy = origOp.getTilingStrategy(tilingMode, _log); if (mlir::succeeded(strategy)) { @@ -120,7 +121,7 @@ void TilingStrategyAssignmentPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); auto layerCostModel = VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/merge_vertical_fusion_subgraphs.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/merge_vertical_fusion_subgraphs.cpp index 08fed7bf3b..d7d6036ada 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/merge_vertical_fusion_subgraphs.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/merge_vertical_fusion_subgraphs.cpp @@ -8,11 +8,8 @@ #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_scheduling_factory.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp" -#include 
"vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_scheduling_factory.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU { #define GEN_PASS_DECL_MERGEVFSUBGRAPHS @@ -77,7 +74,7 @@ void MergeVfSubgraphsPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); auto layerCostModel = VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/scf/scf_vertical_fusion.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/scf/scf_vertical_fusion.cpp new file mode 100644 index 0000000000..f9eef0b19c --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/scf/scf_vertical_fusion.cpp @@ -0,0 +1,83 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_context.hpp" + +#include +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/Iterators.h" + +namespace vpux::VPU { +#define GEN_PASS_DECL_SCFVERTICALFUSION +#define GEN_PASS_DEF_SCFVERTICALFUSION +#include "vpux/compiler/dialect/VPU/passes.hpp.inc" +} // namespace vpux::VPU + +using namespace vpux; +using namespace VPU; + +namespace { + +// +// SCFVerticalFusionPass +// + +class SCFVerticalFusionPass final : public VPU::impl::SCFVerticalFusionBase { +public: + explicit SCFVerticalFusionPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +// +// safeRunOnFunc +// + +void SCFVerticalFusionPass::safeRunOnFunc() { + auto& ctx = getContext(); + auto func = getOperation(); + mlir::OpBuilder builder(&ctx); + mlir::IRRewriter irBuilder(builder); + + llvm::SetVector fusedOps; + + func->walk([&](mlir::TilingInterface operation) { + auto* op = operation.getOperation(); + + if (fusedOps.contains(op)) { + _log.nest().trace("Operation has already been fused"); + return; + } + + if (!op->hasAttr(tilingStrategy)) { + _log.nest().trace("No tiling strategy or it has already been applied."); + return; + } + + auto tilingContext = VPU::createTilingContext(op, /* enableSCFTiling = */ true); + auto fused = tilingContext.applyVerticalFusion(irBuilder, _log); + + if (!mlir::failed(fused)) { + fusedOps.insert(fused.value().begin(), fused.value().end()); + } + }); +} + +} // namespace + +// +// createSCFVerticalFusionPass +// + +std::unique_ptr VPU::createSCFVerticalFusionPass(Logger log) { + return std::make_unique(log); +} diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_outlining.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_outlining.cpp index 71536fa02f..6dd1f7c1e7 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_outlining.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_outlining.cpp @@ -65,7 +65,10 @@ void VerticalFusionOutliner::buildFuncOps(mlir::ModuleOp moduleOp, ArrayRef(op); }); - return vfNum == 1; + const auto clusteredOpNum = llvm::count_if(slice.operations, [](auto* op) { + return mlir::isa_and_nonnull(op); + }); + return vfNum == 1 && clusteredOpNum == 0; }; for (const auto& [targetIdx, slices] : outlinedTargets | indexed) { const auto& slice = slices.front(); diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_tiling.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_tiling.cpp index deefc2daf5..cecd9eed3c 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/vertical_fusion_tiling.cpp @@ -6,12 +6,12 @@ #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_scheduling_factory.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_scheduling_factory.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" + +#include 
"vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vf_tiling_rewriter.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vf_tiling_rewriter.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/dialect.hpp" #include #include @@ -29,363 +29,6 @@ using namespace VPU; namespace { -// -// VerticalFusionTilingRewriter -// - -typedef std::function TilingFunction; - -template -class VerticalFusionTilingRewriter final : public mlir::OpRewritePattern { -public: - VerticalFusionTilingRewriter(mlir::MLIRContext* ctx, bool enableVerticalFusionPipelining, - const std::unique_ptr& costFunction, Logger log) - : mlir::OpRewritePattern(ctx), - _enableVerticalFusionPipelining(enableVerticalFusionPipelining), - _vpunnCostFunction(costFunction), - _log(log) { - } - - mlir::LogicalResult matchAndRewrite(VPU::VerticalFusionOp origOp, mlir::PatternRewriter& rewriter) const final; - -private: - void adjustInputShape(mlir::PatternRewriter& rewriter, mlir::Operation* operation, InputTiling& inputTiling, - mlir::IRMapping& mapper, TilingStorage& tilingStorage, - const TilingOperationStorage::UPtr& opStorage, int64_t tilingIndex, Dim axis) const; - void processOffset(mlir::Value operand, const TilingOperationStorage::UPtr& opStorage, TileInfo& originalTiling, - int64_t tilingIndex, Dim axis, ShapeRef expectedShape) const; - bool processBlockArgument(mlir::BlockArgument blockArg, TilingStorage& tilingStorage, TileInfo& originalTiling, - int64_t tilingIndex, Dim axis) const; - void applyLinearTiling(const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, - SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure) const; - void applyPipelinedTiling(const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, - SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure, - const TilingOperationStorage::UPtr& storage) const; - - bool 
_enableVerticalFusionPipelining; - const std::unique_ptr& _vpunnCostFunction; - Logger _log; -}; - -template -bool VerticalFusionTilingRewriter::processBlockArgument( - mlir::BlockArgument blockArg, TilingStorage& tilingStorage, TileInfo& originalTiling, int64_t tilingIndex, - Dim axis) const { - auto& offset = originalTiling.offsets[axis]; - const auto storageInfo = tilingStorage.get(blockArg.getArgNumber(), tilingIndex); - VPUX_THROW_WHEN(!storageInfo.has_value(), "Tiling info for argument {0} with index {1} not found", blockArg, - tilingIndex); - - auto tileInfo = storageInfo.value(); - VPUX_THROW_UNLESS(static_cast(axis.ind()) < tileInfo.shape.size(), "Got invalid tiling shape size {0}", - tileInfo.shape.size()); - - const auto inputOffset = tileInfo.offsets[axis]; - const auto inputDimShape = tileInfo.shape[axis]; - const auto origDimSize = originalTiling.shape[axis]; - - _log.trace("Input Offset {0}, shape {1} ==> offset: {2}, shape: {3} ", inputOffset, inputDimShape, offset, - origDimSize); - - if (offset >= inputOffset && (inputOffset + inputDimShape) >= (offset + origDimSize)) { - offset -= inputOffset; - return true; - } - - _log.trace("invalid offsets: Input Offset {0}, shape {1} ==> offset: {2}, shape: {3} ", inputOffset, inputDimShape, - offset, origDimSize); - - return false; -} - -template -void VerticalFusionTilingRewriter::processOffset( - mlir::Value operand, const TilingOperationStorage::UPtr& opStorage, TileInfo& originalTiling, - int64_t tilingIndex, Dim axis, ShapeRef expectedShape) const { - auto& offset = originalTiling.offsets[axis]; - if (offset == 0) { - return; - } - - auto operandOp = operand.getDefiningOp(); - if (operandOp != nullptr) { - auto inputOutputTiling = opStorage->get(operandOp, tilingIndex); - VPUX_THROW_UNLESS(inputOutputTiling.has_value(), "Couldn't find tiling info at {0}", operandOp->getLoc()); - const auto inputOutputTilingPair = inputOutputTiling.value(); - auto& outTile = inputOutputTilingPair.second; - offset -= 
outTile.offsets[axis]; - return; - } - - offset = expectedShape[axis] - originalTiling.shape[axis]; -} - -/* - This function slice to original tile shape in case bigger tile size was chosen - during backpropagation process. - In this case adjust shapes to original one by slicing -*/ -template -void VerticalFusionTilingRewriter::adjustInputShape( - mlir::PatternRewriter& rewriter, mlir::Operation* operation, InputTiling& inputTiling, mlir::IRMapping& mapper, - TilingStorage& tilingStorage, const TilingOperationStorage::UPtr& opStorage, int64_t tilingIndex, - Dim axis) const { - VPUX_THROW_WHEN(inputTiling.tiles.size() < operation->getOperands().size(), - "Number of operands {0} is more than number of operand tiles {1}", operation->getOperands().size(), - inputTiling.tiles.size()); - for (auto op : operation->getOperands() | indexed) { - auto operand = op.value(); - auto opIndex = op.index(); - - auto expectedOp = mapper.lookupOrNull(operand); - if (expectedOp == nullptr) { - continue; - } - - auto originalTiling = inputTiling.tiles[opIndex]; - auto expectedShape = getShape(expectedOp); - auto expectedOpSize = expectedShape.totalSize(); - const auto originalOpSize = originalTiling.shape.totalSize(); - if (expectedOpSize == originalOpSize) { - continue; - } - - // - // For below pattern, the Eltwise3 may be tiled before the Eltwise2. - // Then the Operand has been mapped to the new "SliceOp1" instead of "Eltwise1". - // While tiling "Eltwise2", it throw exception of "expectedOpSize < originalOpSize". - // Need to update this branch operand for this case. 
- // - // VF tilingStrategy: [1, 1, 1, 4] - // | | - // Eltwise1: 1x64x72x128 Conv: 1x64x72x128 - // | X | - // Eltwise2: 1x64x72x128 Eltwise3: 1x64x72x128 - // | | - // Conv: 1x64x72x128 | - // | | - // Conv: 1x64x72x128 | - // \ / - // Eltwise4: 1x64x72x128 - // | - // - // tiling into: - // - // | | - // Eltwise1: 1x64x72x36 Conv: 1x64x72x36 - // | X | - // | / SliceOp1 SliceOp2 - // | / \ | - // Eltwise2: 1x64x72x36 Eltwise3: 1x64x72x32 - // | | - // Conv: 1x64x72x34 | - // | | - // Conv: 1x64x72x32 | - // \ / - // Eltwise4: 1x64x72x32 - // | - if (expectedOpSize < originalOpSize) { - if (auto insertSliceOp = mlir::dyn_cast(expectedOp.getDefiningOp())) { - expectedOp = insertSliceOp.getInputs().front(); - expectedShape = getShape(expectedOp); - expectedOpSize = expectedShape.totalSize(); - } - } - - VPUX_THROW_WHEN( - expectedOpSize < originalOpSize, - "Original shape size for operand {0} is bigger than current one. Current size {1}, original size {2}", - operand, expectedOpSize, originalOpSize); - - VPUX_THROW_WHEN(expectedShape.size() != originalTiling.shape.size(), - "Expected shape {0} and original one {1} must have same rank", expectedShape, - originalTiling.shape); - - // correct offset of operations based on offsets of block argument - // In case the output of previous operation is bigger than expected - // which might happen when bigger tile was chosen for same block argument - // slice operation is needed after the output with correct offsets - // calculated based on tiling information of current operation and previous one - _log.trace("op {0}, Offset before {1}, shape {2}", operation->getLoc(), originalTiling.offsets, - originalTiling.shape); - - mlir::Value opSlice; - const auto valName = printToString("input {0}", opIndex); - auto blockArg = mlir::dyn_cast(operand); - if (blockArg != nullptr) { - if (!processBlockArgument(blockArg, tilingStorage, originalTiling, tilingIndex, axis)) { - auto sliceOp = 
mlir::dyn_cast_or_null(expectedOp.getDefiningOp()); - VPUX_THROW_WHEN(sliceOp == nullptr || sliceOp.getSource() == operand, - "Can't get the operand from Slice"); - - auto inputOutputTiling = opStorage->get(operation, tilingIndex); - VPUX_THROW_UNLESS(inputOutputTiling.has_value(), "Couldn't find tiling info at {0}", - operation->getLoc()); - - const auto inputTiling = inputOutputTiling.value().first.tiles[blockArg.getArgNumber()]; - opSlice = makeTile(rewriter, operation->getLoc(), sliceOp.getSource(), inputTiling, valName); - } else { - opSlice = makeTile(rewriter, operation->getLoc(), expectedOp, originalTiling, valName); - } - } else { - processOffset(operand, opStorage, originalTiling, tilingIndex, axis, expectedShape); - if (auto sliceOp = mlir::dyn_cast_or_null(expectedOp.getDefiningOp())) { - // correct offsets - auto sliceOffset = parseIntArrayAttr(sliceOp.getStaticOffsets()); - VPUX_THROW_UNLESS(originalTiling.offsets[axis] >= sliceOffset[axis.ind()], - "Slice offset {0} is bigger than original one {1}", sliceOffset[axis.ind()], - originalTiling.offsets[axis]); - originalTiling.offsets[axis] = originalTiling.offsets[axis] - sliceOffset[axis.ind()]; - } - opSlice = makeTile(rewriter, operation->getLoc(), expectedOp, originalTiling, valName); - } - - _log.trace("Offset after {0}, shape {1} expectedOp {2}", originalTiling.offsets, originalTiling.shape, - expectedOp); - - mapper.map(operand, opSlice); - } -} - -template -void VerticalFusionTilingRewriter::applyLinearTiling( - const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, - SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure) const { - auto operations = config.getVFOperations(); - - for (auto index : irange(numTiles)) { - mlir::Value currentResult; - Shape currentTile; - for (auto* op : operations) { - tilingProcedure(index, op, currentResult, currentTile); - } - - resultTileVals.push_back(currentResult); - resultTileOffsets.push_back(currentTile); - } -} - 
-template -void VerticalFusionTilingRewriter::applyPipelinedTiling( - const int64_t numTiles, VFConfigType& config, SmallVector& resultTileVals, - SmallVector& resultTileOffsets, const TilingFunction& tilingProcedure, - const TilingOperationStorage::UPtr& storage) const { - auto scheduling = config.getSubgraph().getScenario(); - VPUX_THROW_WHEN(!scheduling.has_value(), "Cannot get scheduling scenario from VF {0}", config.getSubgraph()); - - VFSchedulingFactoryType costFactory(/*prefetching=*/true); - auto scenario = costFactory.createVFScenario(scheduling.value(), _log); - - if (auto pipelinedScenario = std::dynamic_pointer_cast>(scenario)) { - auto pipelining = pipelinedScenario->getPipelining(config, numTiles, storage, _vpunnCostFunction); - - auto timeline = pipelining.getTimeLine(); - - if (!timeline.empty()) { - mlir::Value currentResult; - Shape currentTile; - for (auto& [index, operation] : pipelining.getTimeLine()) { - // currentResult and currentTiles keep result from previous call tilingProcedure - tilingProcedure(index, operation, currentResult, currentTile); - - if (llvm::find(config.getOutputs(), operation) != config.getOutputs().end()) { - resultTileVals.push_back(currentResult); - resultTileOffsets.push_back(currentTile); - } - } - return; - } - } - applyLinearTiling(numTiles, config, resultTileVals, resultTileOffsets, tilingProcedure); -} - -template -mlir::LogicalResult VerticalFusionTilingRewriter::matchAndRewrite( - VPU::VerticalFusionOp vfOp, mlir::PatternRewriter& rewriter) const { - const auto tilingStrategy = parseIntArrayAttr(mlir::cast(vfOp.getTilingStrategy())); - - const auto numTiledAxis = llvm::count_if(tilingStrategy, [](auto num) { - return num > 1; - }); - - VPUX_THROW_WHEN(numTiledAxis != 1, "VF tiling is supported only for one axis"); - - auto maxTiledLen = std::max_element(tilingStrategy.begin(), tilingStrategy.end()); - - if (maxTiledLen == tilingStrategy.end()) { - return mlir::failure(); - } - - VPUX_THROW_WHEN(*maxTiledLen <= 
1, "There is no tiling for VF"); - - auto operationStorage = std::make_unique(); - auto tilingStorage = restoreTilingRegions(vfOp, _log, operationStorage); - - VFConfigType vfConfig(vfOp, _enableVerticalFusionPipelining); - - SmallVector resultTileVals; - resultTileVals.reserve(*maxTiledLen); - SmallVector resultTileOffsets; - DenseMap mappers; - - auto dim = Dim(std::distance(tilingStrategy.begin(), maxTiledLen)); - - const auto tilingProcedure = [&](int64_t index, mlir::Operation* op, mlir::Value& currentResult, - Shape& currentTile) { - auto& mapper = mappers[index]; - for (auto operand : op->getOperands()) { - if (auto blockArg = mlir::dyn_cast(operand)) { - const auto valName = printToString("ba_input {0}", index); - auto origInput = vfOp.getOperand(blockArg.getArgNumber()); - auto tileInfo = tilingStorage.get(blockArg.getArgNumber(), index); - - VPUX_THROW_WHEN(!tileInfo.has_value(), "Couldn't find tile information for argument {0} and tile {1}", - blockArg.getArgNumber(), index); - auto operandTile = VPU::makeTile(rewriter, op->getLoc(), origInput, tileInfo.value(), valName); - - mapper.map(operand, operandTile); - } - } - - auto inputTiling = operationStorage->get(op, index); - - VPUX_THROW_WHEN(!inputTiling.has_value(), "Couldn't find tile information for operation {0} and tile {1}", *op, - index); - - const auto inputTilingPair = inputTiling.value(); - auto inputTilingInfo = inputTilingPair.first; - adjustInputShape(rewriter, op, inputTilingInfo, mapper, tilingStorage, operationStorage, index, dim); - - auto* copiedOp = rewriter.clone(*op, mapper); - currentResult = copiedOp->getResult(0); - - currentTile = inputTilingPair.second.offsets; - const auto baseResType = mlir::cast(op->getResult(0).getType()); - if (auto tiledBuilderOp = mlir::dyn_cast(copiedOp)) { - tiledBuilderOp.adjustAttrs(inputTilingInfo, inputTilingPair.second); - } else if (auto tiledViewOp = mlir::dyn_cast(copiedOp)) { - tiledViewOp.adjustAttrs(inputTilingInfo, inputTilingPair.second, 
baseResType.getShape()); - } - const auto tiledResType = - baseResType.extractDenseTile(inputTilingPair.second.offsets, inputTilingPair.second.shape); - - currentResult.setType(tiledResType); - mapper.map(op->getResult(0), currentResult); - }; - - if (vfConfig.isPipelined()) { - applyPipelinedTiling(*maxTiledLen, vfConfig, resultTileVals, resultTileOffsets, tilingProcedure, - operationStorage); - } else { - applyLinearTiling(*maxTiledLen, vfConfig, resultTileVals, resultTileOffsets, tilingProcedure); - } - - rewriter.replaceOpWithNewOp(vfOp, vfOp->getResult(0).getType(), mlir::ValueRange(resultTileVals), - ArrayRef(resultTileOffsets)); - - return mlir::success(); -} // namespace - // // VfTilingPass // @@ -430,7 +73,7 @@ void VfTilingPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); auto layerCostModel = VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log); @@ -439,6 +82,7 @@ void VfTilingPass::safeRunOnFunc() { mlir::ConversionTarget target(ctx); target.addIllegalOp(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); @@ -447,12 +91,13 @@ void VfTilingPass::safeRunOnFunc() { target.addLegalOp(); mlir::RewritePatternSet patterns(&ctx); + if (_workloadManagementMode <= WorkloadManagementMode::PWLM_V0_LCA) { - patterns.add>( - &ctx, _enableVerticalFusionPipelining, std::move(costFunction), _log); + patterns.add(&ctx, _enableVerticalFusionPipelining, costFunction, + _log); } else { - patterns.add>( - &ctx, _enableVerticalFusionPipelining, std::move(costFunction), _log); + patterns.add(&ctx, _enableVerticalFusionPipelining, costFunction, + _log); } if (mlir::failed(mlir::applyFullConversion(func, target, std::move(patterns)))) { diff --git 
a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/wrap_vpu_ops_in_vertical_fusion.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/wrap_vpu_ops_in_vertical_fusion.cpp index e7de12bc06..eb8efe6fef 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/wrap_vpu_ops_in_vertical_fusion.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/vertical_fusion/wrap_vpu_ops_in_vertical_fusion.cpp @@ -4,10 +4,12 @@ // #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.hpp" #include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include @@ -22,35 +24,6 @@ using namespace VPU; namespace { -void wrapIntoVFRegion(VPU::VerticalFusionOpInterface op, Logger log) { - if (op->getParentOfType() != nullptr) { - log.trace("[SKIP] The Operation already wrapped into VF region"); - return; - } - - const auto inputType = mlir::cast(op->getOperand(0).getType()); - const SmallVector one(inputType.getRank(), 1); - - auto tilingStrategyArray = op->hasAttr(tilingStrategy) ? 
mlir::cast(op->getAttr(tilingStrategy)) - : getIntArrayAttr(op->getContext(), one); - - const auto bodyBuilder = [op](mlir::OpBuilder& builder, mlir::Location loc, mlir::ValueRange newOperands) { - mlir::IRMapping mapper; - mapper.map(op->getOperands(), newOperands); - auto* newOp = builder.clone(*op, mapper); - newOp->removeAttr(tilingStrategy); - builder.create(loc, newOp->getResults()); - }; - - OpBuilderLogger builderLog(log.nest()); - mlir::OpBuilder builder(op, &builderLog); - - auto vfOp = builder.create(op->getLoc(), op->getResultTypes(), op->getOperands(), - bodyBuilder, tilingStrategyArray); - op->replaceAllUsesWith(vfOp); - op->erase(); -} - // // WrapVerticalFusionRegionPass // @@ -58,46 +31,49 @@ void wrapIntoVFRegion(VPU::VerticalFusionOpInterface op, Logger log) { class WrapVerticalFusionRegionPass final : public VPU::impl::WrapVerticalFusionRegionBase { public: - explicit WrapVerticalFusionRegionPass(Logger log) { + explicit WrapVerticalFusionRegionPass(const WorkloadManagementMode workloadManagementMode, Logger log) + : _workloadManagementMode(workloadManagementMode) { Base::initLogger(log, Base::getArgumentName()); } + mlir::LogicalResult initialize(mlir::MLIRContext* ctx) final; + private: void safeRunOnFunc() final; + + WorkloadManagementMode _workloadManagementMode = WorkloadManagementMode::PWLM_V0_LCA; }; +mlir::LogicalResult WrapVerticalFusionRegionPass::initialize(mlir::MLIRContext* ctx) { + if (mlir::failed(Base::initialize(ctx))) { + return mlir::failure(); + } + + if (workloadManagementModeOpt.hasValue()) { + _workloadManagementMode = workloadManagementModeOpt.getValue(); + } + return mlir::success(); +} + // // safeRunOnModule // void WrapVerticalFusionRegionPass::safeRunOnFunc() { - const auto callback = [&](VPU::VerticalFusionOpInterface op) { - if (mlir::isa(op->getParentOp())) { - _log.trace("Skip for operation '{0}' at '{1}' which is wrapped in other op", op->getName(), op->getLoc()); - return; - } - - if (!op.isVFSupported()) { - 
_log.trace("Skip for operation '{0}' at '{1}' which doesn't support VF", op->getName(), op->getLoc()); - return; - } - - if (op->hasAttr(tilingStrategy)) { - const auto tilingShape = - Shape(parseIntArrayAttr(mlir::cast(op->getAttr(tilingStrategy)))); - auto tilingDimCount = getNonOneDim(tilingShape).size(); - if (tilingDimCount > 1) { - _log.trace("Skip for operation '{0}' at '{1}' because VF doesn't support multi-dim tiling", - op->getName(), op->getLoc()); - return; - } - } - - _log.trace("Process Layer Operation '{0}' at '{1}'", op->getName(), op->getLoc()); - wrapIntoVFRegion(op, _log.nest()); - }; - - getOperation().walk(callback); + auto& ctx = getContext(); + + mlir::RewritePatternSet patterns(&ctx); + + if (_workloadManagementMode <= WorkloadManagementMode::PWLM_V0_LCA) { + patterns.add(&ctx, _log); + } else { + patterns.add(&ctx, _log); + } + + if (mlir::failed(mlir::applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } } } // namespace @@ -106,6 +82,7 @@ void WrapVerticalFusionRegionPass::safeRunOnFunc() { // createWrapVerticalFusionRegion // -std::unique_ptr VPU::createWrapVerticalFusionRegionPass(Logger log) { - return std::make_unique(log); +std::unique_ptr VPU::createWrapVerticalFusionRegionPass(const WorkloadManagementMode workloadManagementMode, + Logger log) { + return std::make_unique(workloadManagementMode, log); } diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/passes/wrap_ops_in_sparsify_desparsify.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/passes/wrap_ops_in_sparsify_desparsify.cpp index daf37a1af6..13a59e0da8 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/passes/wrap_ops_in_sparsify_desparsify.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/passes/wrap_ops_in_sparsify_desparsify.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include 
"vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU { #define GEN_PASS_DECL_WRAPOPSINSPARSIFYDESPARSIFYPAIRS @@ -88,7 +89,7 @@ void WrapOpsInSparsifyDesparsifyPairsPass::safeRunOnFunc() { return; } - auto arch = VPU::getArch(func); + auto arch = config::getArch(func); auto constraint = VPU::getSparsityConstraint(arch); const auto outputWrapper = [&](mlir::Operation* producerOp, mlir::Location loc) { diff --git a/src/vpux_compiler/src/dialect/VPU/transforms/pipelines.cpp b/src/vpux_compiler/src/dialect/VPU/transforms/pipelines.cpp index 633bf58f2f..480aff6a06 100644 --- a/src/vpux_compiler/src/dialect/VPU/transforms/pipelines.cpp +++ b/src/vpux_compiler/src/dialect/VPU/transforms/pipelines.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -72,17 +73,11 @@ void vpux::VPU::buildInitCompilerPipeline(mlir::OpPassManager& pm, const VPU::In pm.addPass(VPU::createInitResourcesPass(options, log)); pm.addPass(VPU::createSetupPipelineOptionsPass(options, log)); + pm.addPass(VPU::createSetTargetIndependentPassOptionsPass(options, log)); + pm.addPass(VPU::createSetupMaxKernelSizePass(options, log)); pm.addPass(VPU::createSetupNpuConstraintPass(options, log)); pm.addPass(VPU::createSetupTilingConstraintPass(options, log)); - pm.addPass(VPU::createSetupChannelsAutoPaddingPass(options, log)); - pm.addPass(VPU::createSetupIsReduceSupportedPass(options, log)); - pm.addPass(VPU::createSetupEnableFP16CompressedConvPass(options, log)); - pm.addPass(VPU::createSetupEnableVPUNNPreSplitPass(options, log)); - pm.addPass(VPU::createSetupWeightsTableReuseModePass(options, log)); - pm.addPass(VPU::createSetupEnableSEPtrsOperationsPass(options, log)); - pm.addPass(VPU::createSetupEnableAdaptiveStrippingPass(options, log)); - 
pm.addPass(VPU::createSetupEnableExtraStaticShapeOpsPass(options, log)); } // @@ -164,10 +159,15 @@ void vpux::VPU::buildTilingPipeline(mlir::OpPassManager& pm, const VPU::TilingOp } pm.addPass(VPU::createEfficientIROrderPass(log)); if (options.enableVerticalFusion) { - VPU::buildVFPipeline(pm, options, log); + if (!options.enableSCFTiling) { + VPU::buildVFPipeline(pm, options, log); + } else { + pm.addPass(VPU::createSCFVerticalFusionPass(log)); + pm.addPass(mlir::createCanonicalizerPass(grc)); + } } - if (options.enableOutputPipelining) { + if (!options.enableSCFTiling && options.enableOutputPipelining) { pm.addPass(VPU::createOutputPipelineTilingPass(options.enablePrefetchTiling, log)); // manual strategy debug configuration pm.addPass(VPU::createManualStrategyUtilsPass( @@ -186,7 +186,7 @@ void vpux::VPU::buildTilingPipeline(mlir::OpPassManager& pm, const VPU::TilingOp // void vpux::VPU::buildVFPipeline(mlir::OpPassManager& pm, const VPU::TilingOptions& options, Logger log) { - pm.addPass(VPU::createWrapVerticalFusionRegionPass(log)); + pm.addPass(VPU::createWrapVerticalFusionRegionPass(options.workloadManagementMode, log)); pm.addPass(VPU::createMoveViewOpsToVerticalFusionPass(options.workloadManagementMode, log)); pm.addPass(VPU::createMergeVfSubgraphsPass(options.enableVerticalFusionPipelining, options.enablePrefetchTiling, options.workloadManagementMode, log)); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/adaptive_stripping_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/adaptive_stripping_utils.cpp index 6bf558c0d7..e05e48d841 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/adaptive_stripping_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/adaptive_stripping_utils.cpp @@ -14,29 +14,6 @@ using namespace vpux; -bool hasAdaptiveStrippingOption(mlir::ModuleOp module, StringRef option) { - auto pipelineOptionOp = module.lookupSymbol(VPU::PIPELINE_OPTIONS); - if (pipelineOptionOp == nullptr) { - auto logger = vpux::Logger::global(); - 
logger.trace("Failed to find PipelineOptions to fetch adaptive stripping option"); - return false; - } - - auto attrValue = pipelineOptionOp.lookupSymbol(option); - if (attrValue == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find config.OptionOp to fetch adaptive stripping option"); - return false; - } - auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); - if (boolAttr == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to cast config.OptionOp to BoolAttr"); - return false; - } - return boolAttr.getValue(); -} - bool VPU::hasEnableAdaptiveStripping(mlir::ModuleOp module) { - return hasAdaptiveStrippingOption(module, ENABLE_ADAPTIVE_STRIPPING); + return VPU::tryGetBoolPassOption(module, ENABLE_ADAPTIVE_STRIPPING).value_or(false); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/auto_padding_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/auto_padding_utils.cpp index d74581209d..fb68747514 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/auto_padding_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/auto_padding_utils.cpp @@ -15,39 +15,17 @@ using namespace vpux; -bool hasAutoPaddingOption(mlir::ModuleOp module, StringRef paddingMode) { - auto pipelineOptionOp = module.lookupSymbol(VPU::PIPELINE_OPTIONS); - if (pipelineOptionOp == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find PipelineOptions to fetch auto padding mode"); - return false; - } - - auto attrValue = pipelineOptionOp.lookupSymbol(paddingMode); - if (attrValue == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find config.OptionOp to fetch auto padding mode"); - return false; - } - auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); - if (boolAttr == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to cast config.OptionOp to BoolAttr"); - return false; - } - return boolAttr.getValue(); -} - bool 
VPU::hasAutoPadding(mlir::ModuleOp module) { - return hasAutoPaddingOption(module, AUTO_PADDING_IDU) || hasAutoPaddingOption(module, AUTO_PADDING_ODU); + return VPU::tryGetBoolPassOption(module, AUTO_PADDING_IDU).value_or(false) || + VPU::tryGetBoolPassOption(module, AUTO_PADDING_ODU).value_or(false); } bool VPU::hasAutoPaddingIDU(mlir::ModuleOp module) { - return hasAutoPaddingOption(module, AUTO_PADDING_IDU); + return VPU::tryGetBoolPassOption(module, AUTO_PADDING_IDU).value_or(false); } bool VPU::hasAutoPaddingODU(mlir::ModuleOp module) { - return hasAutoPaddingOption(module, AUTO_PADDING_ODU); + return VPU::tryGetBoolPassOption(module, AUTO_PADDING_ODU).value_or(false); } bool VPU::areChannelsCompatibleWithIDUAutoPad(int64_t inputChannels, int64_t elemTypeBitWidth) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/clustered_op_interface_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/clustered_op_interface_utils.cpp index b7e5779a65..c0089667be 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/clustered_op_interface_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/clustered_op_interface_utils.cpp @@ -5,11 +5,14 @@ #include "vpux/compiler/dialect/VPU/utils/clustered_op_interface_utils.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -48,6 +51,13 @@ bool VPU::isOperationSplitOverHeightCompatible(mlir::Operation* op, const vpux:: auto siblingsOpsAnalysis = SiblingOpsAnalysis(op); auto offset = ShapeRef(outputTile.offsets); if 
(!offset.empty()) { + // Tiling has impact if: + // - For NCE ops: the height axis is actually being tiled (i.e., more than 1 tile along H) + // - For non-NCE ops: always assume tiling has impact + auto tilingHasImpact = mlir::isa(op) ? (outputTile.axis[Dims4D::Act::H] > 1) : true; + if (!tilingHasImpact) { + return isSOHCompatible; + } const auto numClusters = vpux::VPU::getOptimalNumClusters(clusteredOp, outputShape, clusteredOp.getMultiClusterStrategy().value()); { @@ -110,7 +120,7 @@ bool VPU::isOperationSplitOverWidthCompatible(mlir::Operation* op, ShapeRef outp const auto numTiles = getNumTiles(op); const auto minimumOutputWidthForSOW = numTiles; - const auto arch = VPU::getArch(clusteredOp); + const auto arch = config::getArch(clusteredOp); if (outputShape == ShapeRef()) { outputShape = getShape(clusteredOp->getResult(0)); } @@ -125,7 +135,7 @@ bool VPU::isOperationSplitOverWidthCompatible(mlir::Operation* op, ShapeRef outp }; auto isSOWCompatible = widthCompatibleCheck(outputShape); - if (arch >= VPU::ArchKind::NPU40XX) { + if (arch >= config::ArchKind::NPU40XX) { // For NPU40XX+, W segmented output needs to have explicit halo regions defined. // Thus the applicability of SOW on the current operation is tightly dependent // if the consumer operations can be SOW themselves. @@ -315,7 +325,7 @@ bool VPU::doesLayerFitIntoCMX(mlir::Operation* op, VPU::MultiClusterStrategy str auto totalAvailableCMXSize = reservedMem.count() == 0 ? 
getTotalCMXSize(op).count() : getTotalCMXFragmentationAwareSize(op).count(); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(op), buffersSize).count() + + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(op), buffersSize).count() + reservedMem.count() <= totalAvailableCMXSize; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/compressed_convolution_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/compressed_convolution_utils.cpp index ecb9bb94e7..42a926d4be 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/compressed_convolution_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/compressed_convolution_utils.cpp @@ -11,5 +11,5 @@ using namespace vpux; bool VPU::hasFP16CompressedConv(mlir::Operation* op) { - return VPU::getConstraint(op, FP16_COMPRESSED_CONV); + return VPU::getConstraint(op, FP16_COMPRESSED_CONV); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/const_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/const_utils.cpp index ec2b27d8fd..085e302767 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/const_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/const_utils.cpp @@ -4,20 +4,20 @@ // #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include +#include #include "vpux/compiler/core/attributes/dims_order.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/hw_settings.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/utils/permute_utils.hpp" - -#include +#include namespace vpux { 
namespace VPU { @@ -352,10 +352,10 @@ mlir::Value alignConvWeightsTensor(mlir::OpBuilder& builder, mlir::Location loc, return alignedWeightsOp.getOutput(); } -Byte calculateAlignedBuffersMemoryRequirement(VPU::ArchKind arch, SmallVector& bufferSizes) { +Byte calculateAlignedBuffersMemoryRequirement(config::ArchKind arch, SmallVector& bufferSizes) { Byte offsetAlignment = Byte(vpux::DEFAULT_CMX_ALIGNMENT); Byte sizeAlignment = Byte(1); - if (arch == VPU::ArchKind::NPU37XX || arch == VPU::ArchKind::NPU40XX) { + if (arch == config::ArchKind::NPU37XX || arch == config::ArchKind::NPU40XX) { offsetAlignment = Byte(getAddressAlignmentForSwizzling(SWIZZLING_KEY_5, arch)); sizeAlignment = Byte(vpux::getSizeAlignmentForSwizzling(arch)); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/conv_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/conv_utils.cpp index e3d95d3c16..7d480348b3 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/conv_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/conv_utils.cpp @@ -4,14 +4,14 @@ // #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/utils/core/error.hpp" using namespace vpux; using namespace VPU; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/cost_model.cpp b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/cost_model.cpp index 7673da1846..ac78a4f92d 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/cost_model.cpp +++ 
b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/cost_model.cpp @@ -5,13 +5,14 @@ #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPU/utils/ppe_version_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/workload_split_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -19,7 +20,7 @@ using namespace vpux; bool vpux::VPU::hasVPUNNPreSplit(mlir::Operation* op) { - return VPU::getConstraint(op, VPU::VPUNN_PRE_SPLIT); + return VPU::getConstraint(op, VPU::VPUNN_PRE_SPLIT); } ///@brief Validate vpunn cost. 
If cost is not the defined error code then return it @@ -139,11 +140,11 @@ float vpux::VPU::getWeightsSparsityRatio(mlir::Value weights) { return weightsSparsityRatio; } -VPUNN::VPUDevice vpux::VPU::getVPUDeviceType(VPU::ArchKind archKind) { +VPUNN::VPUDevice vpux::VPU::getVPUDeviceType(config::ArchKind archKind) { switch (archKind) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUNN::VPUDevice::VPU_2_7; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPUNN::VPUDevice::VPU_4_0; default: VPUX_THROW("Unsupported VPU arch type: '{0}'", archKind); @@ -184,6 +185,13 @@ std::optional vpux::VPU::getVPUNNElementType(mlir::Type type) { } else if (type.isUnsignedInteger(CHAR_BIT * sizeof(int8_t))) { return VPUNN::DataType::UINT8; } else if (auto qType = mlir::dyn_cast(type)) { + auto storageType = qType.getStorageType(); + if (storageType.isFloat8E5M2()) { + return VPUNN::DataType::BF8; + } else if (storageType.isFloat8E4M3FN()) { + return VPUNN::DataType::HF8; + } + if (qType.getStorageTypeIntegralWidth() == 8) { return qType.isSigned() ? 
VPUNN::DataType::INT8 : VPUNN::DataType::UINT8; } else if (qType.getStorageTypeIntegralWidth() == 4) { @@ -283,8 +291,8 @@ VPUNN::ExecutionMode vpux::VPU::getExecutionMode(VPU::MPEMode mpeMode) { * SOK_NO_BROADCAST is only utilized when the VPUNN cost is invalid for SOK to avoid performance regressions */ inline VPUNN::VPUTilingStrategy getSOKLayerStrategy(vpux::VPU::DistributionMode distributionMode, - vpux::VPU::ArchKind arch) { - if (distributionMode == vpux::VPU::DistributionMode::SEGMENTED && arch > vpux::VPU::ArchKind::NPU40XX) { + vpux::config::ArchKind arch) { + if (distributionMode == vpux::VPU::DistributionMode::SEGMENTED && arch > vpux::config::ArchKind::NPU40XX) { return VPUNN::VPUTilingStrategy::SOK_NO_BROADCAST; } return VPUNN::VPUTilingStrategy::SOK; @@ -298,7 +306,7 @@ inline VPUNN::VPUTilingStrategy getSOKLayerStrategy(vpux::VPU::DistributionMode * @param distributionMode the tensor distribution mode */ VPUNN::VPULayerStrategy vpux::VPU::getVPULayerStrategy(VPU::MultiClusterStrategy strategy, size_t nDPUs, size_t nTiles, - ArchKind arch, size_t nSHVs, bool prefetching, + config::ArchKind arch, size_t nSHVs, bool prefetching, DistributionMode distributionMode, mlir::Operation* op) { VPUNN::VPULayerStrategy VPUNNStrategy; VPUNNStrategy.nDPUs = static_cast(nDPUs); @@ -669,7 +677,7 @@ VPUNN::DPUWorkload vpux::VPU::getDPUWorkload(const VPUIP::WorkloadCostParams& ti auto getISIStrategy = [&](VPU::MultiClusterStrategy layerStrategy) { if (layerStrategy == VPU::MultiClusterStrategy::HKSwitch) { - if (tileParams.arch >= VPU::ArchKind::NPU40XX) { + if (tileParams.arch >= config::ArchKind::NPU40XX) { layerStrategy = VPU::MultiClusterStrategy::SplitOverHeightOverlapped; } else { layerStrategy = VPU::MultiClusterStrategy::SplitOverHeight; @@ -716,8 +724,8 @@ VPUNN::DPUWorkload vpux::VPU::getDPUWorkload(const VPUIP::WorkloadCostParams& ti return vpunnDPUWorkload; } -VPUIP::WorkloadCostParams vpux::VPU::getWorkloadCostParam(VPU::NCEOpInterface nceOp, 
VPU::ArchKind arch, int64_t numDPU, - int64_t numTiles) { +VPUIP::WorkloadCostParams vpux::VPU::getWorkloadCostParam(VPU::NCEOpInterface nceOp, config::ArchKind arch, + int64_t numDPU, int64_t numTiles) { const auto inputType = mlir::cast(nceOp->getOperand(0).getType()); const auto outputType = mlir::cast(nceOp->getResult(0).getType()); const auto inElemType = inputType.getElementType(); @@ -870,7 +878,7 @@ VPUIP::WorkloadCostParams vpux::VPU::getWorkloadCostParam(VPU::NCEOpInterface nc } vpux::VPU::LayerCostModelAnalysis::LayerCostModelAnalysis(mlir::ModuleOp module) { - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); _layerCostModel = VPU::CostModelConfig::createLayerCostModel(arch); } @@ -887,7 +895,7 @@ void vpux::VPU::LayerCostModelAnalysis::invalidate() { } std::shared_ptr vpux::VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel( - std::optional> analysis, VPU::ArchKind arch, + std::optional> analysis, config::ArchKind arch, Logger log) { if (analysis.has_value()) { log.trace("Load preserved layer cost model"); @@ -898,7 +906,7 @@ std::shared_ptr vpux::VPU::LayerCostModelAnalysis::get } vpux::VPU::CostModelAnalysis::CostModelAnalysis(mlir::ModuleOp module) { - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); _costModel = VPU::CostModelConfig::createCostModel(arch); } @@ -915,7 +923,8 @@ void vpux::VPU::CostModelAnalysis::invalidate() { } std::shared_ptr vpux::VPU::CostModelAnalysis::getOrCreateCostModel( - std::optional> analysis, VPU::ArchKind arch, Logger log) { + std::optional> analysis, config::ArchKind arch, + Logger log) { if (analysis.has_value()) { log.trace("Load preserved cost model"); return analysis.value().get().getVPUNNCostModel(); @@ -923,3 +932,12 @@ std::shared_ptr vpux::VPU::CostModelAnalysis::getOrCreateCo log.warning("Create new cost model instance"); return VPU::CostModelConfig::createCostModel(arch); } + +vpux::VPU::ICostModelUtilsInterface* 
vpux::VPU::getICostModelUtilsInterface(mlir::MLIRContext* ctx) { + auto* dialect = ctx->getOrLoadDialect(); + assert(dialect != nullptr && "VPU Dialect must be present in the context"); + + auto iface = dialect->getRegisteredInterface(); + assert(iface != nullptr && "The requested interface must be registered in the context"); + return iface; +} diff --git a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/factories/cost_model_config.cpp b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/factories/cost_model_config.cpp index fcbfde43ec..26579460c0 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/factories/cost_model_config.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/factories/cost_model_config.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/NPU37XX/dialect/VPU/utils/cost_model_factory.hpp" #include "vpux/compiler/NPU40XX/dialect/VPU/utils/cost_model_factory.hpp" +using namespace vpux::config; namespace vpux::VPU { std::map>& CostModelConfig::_getFactories() { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/layer_vpunn_cost.cpp b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/layer_vpunn_cost.cpp index 8ea2077bbc..718ad602dd 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/cost_model/layer_vpunn_cost.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/cost_model/layer_vpunn_cost.cpp @@ -4,13 +4,16 @@ // #include "vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp" -#include #include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/multi_cluster_strategy_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/sparsity.hpp" +#include + using namespace vpux; using namespace VPU; @@ -40,11 +43,11 @@ StrategyCost correctSwOpCost(VPU::SWOpInterface swOp, 
ArrayRef(memPermute.getInput().getType()); auto outputType = mlir::cast(memPermute.getOutput().getType()); - if (VPUIP::satisfiesOptimizedMemPermute(VPU::getArch(swOp.getOperation()), inputType, outputType)) { + if (VPUIP::satisfiesOptimizedMemPermute(config::getArch(swOp.getOperation()), inputType, outputType)) { VPUX_THROW_WHEN(tiledInputTypes.empty(), "Cannot get tiled input"); auto tiledInputType = tiledInputTypes.front(); auto tiledOutputType = tiledInputType.changeDimsOrder(outputType.getDimsOrder()); - if (!VPUIP::satisfiesOptimizedMemPermute(VPU::getArch(swOp.getOperation()), tiledInputType, + if (!VPUIP::satisfiesOptimizedMemPermute(config::getArch(swOp.getOperation()), tiledInputType, tiledOutputType)) { cost *= SW_COST_CORRECTION_FACTOR_FOR_MEM_PERMUTE; } @@ -84,7 +87,7 @@ LayerVPUNNCost::LayerVPUNNCost(mlir::func::FuncOp func, std::shared_ptrgetParentOfType(); - _arch = VPU::getArch(module); + _arch = config::getArch(module); auto tileOp = IE::getTileExecutor(module); auto dpuExec = tileOp.getSubExecutor(VPU::ExecutorKind::DPU); @@ -99,7 +102,7 @@ LayerVPUNNCost::LayerVPUNNCost(mlir::func::FuncOp func, std::shared_ptrgetDPUPreloadedCacheCounter().reset(); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/distributed_tensor_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/distributed_tensor_utils.cpp index dd450ac747..0207739009 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/distributed_tensor_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/distributed_tensor_utils.cpp @@ -4,10 +4,9 @@ // #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" -#include -#include #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" @@ -18,12 +17,17 @@ #include 
"vpux/compiler/dialect/VPU/utils/overlap_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sw_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/dilated_utils.hpp" +#include "vpux/utils/core/numeric.hpp" #include "vpux/utils/core/range.hpp" +#include +#include + using namespace vpux; using namespace VPU; @@ -323,7 +327,7 @@ bool vpux::VPU::isSOCSegmentedNCEOp(mlir::Operation* op) { return mlir::isa(op) || (mlir::isa(op) && isSOCSegmentedOp(parentOp)) || - (getArch(op) > VPU::ArchKind::NPU40XX && mlir::isa(op)); + (config::getArch(op) > config::ArchKind::NPU40XX && mlir::isa(op)); } bool vpux::VPU::inputProducersCompatible(mlir::Operation* op, mlir::DenseSet handledUsers) { @@ -380,7 +384,7 @@ bool vpux::VPU::isSegmentedInputCompatible(mlir::Operation* op, mlir::DenseSet VPU::ArchKind::NPU40XX && mlir::isa(op)) { + if (config::getArch(op) > config::ArchKind::NPU40XX && mlir::isa(op)) { return true; } @@ -418,7 +422,7 @@ bool vpux::VPU::isSegmentedInputCompatible(mlir::Operation* op, mlir::DenseSet(user)) { continue; } @@ -518,7 +522,7 @@ bool vpux::VPU::isSOKSegmentedOutputCompatible(mlir::Operation* op) { // force SEG -> DWConv -> SEG or SEG|DUP -> DWConv -> SEG|DUP to avoid accuracy issue if (mlir::isa(op)) { - if (VPU::getArch(op) >= VPU::ArchKind::NPU40XX) { + if (config::getArch(op) >= config::ArchKind::NPU40XX) { auto dstOrder = mlir::cast(op->getResult(0).getType()).getDimsOrder(); // Here we have two cases to choose SOC as default for Depthwise ops: // 1. 
The output consumer is compatible with SEGMENTED mode @@ -644,10 +648,10 @@ SmallVector vpux::VPU::getActivationTensorNumTiles(VPU::ClusteredOpInte } } -bool vpux::VPU::isDWOpAndNeedsAlign(ArchKind arch, VPUIP::NCETaskType nceTaskType) { +bool vpux::VPU::isDWOpAndNeedsAlign(config::ArchKind arch, VPUIP::NCETaskType nceTaskType) { bool isDWOp = nceTaskType == VPUIP::NCETaskType::DWCONV || nceTaskType == VPUIP::NCETaskType::MAXPOOL || nceTaskType == VPUIP::NCETaskType::AVEPOOL; - return (arch == VPU::ArchKind::NPU37XX) && isDWOp; + return (arch == config::ArchKind::NPU37XX) && isDWOp; } bool vpux::VPU::isEltwiseOpAndNeedsAlign(VPU::ClusteredOpInterface clusteredOp) { @@ -972,14 +976,14 @@ std::optional> vpux::VPU::getActivationTensorAlignment(VPU: } else if (strategy == VPU::MultiClusterStrategy::SplitOverHeight || strategy == VPU::MultiClusterStrategy::HKSwitch) { auto operation = clusteredOp.getOperation(); - auto arch = getArch(operation); + auto arch = config::getArch(operation); - if (arch >= VPU::ArchKind::NPU40XX) { + if (arch >= config::ArchKind::NPU40XX) { return std::nullopt; } if (mlir::isa(operation) || - ((arch == VPU::ArchKind::NPU37XX) && + ((arch == config::ArchKind::NPU37XX) && mlir::isa(operation)) || isEltwiseOpAndNeedsAlign(clusteredOp)) { @@ -1223,88 +1227,91 @@ DistributionMode vpux::VPU::getActivationTensorDistributionMode(VPU::ClusteredOp strategy != VPU::MultiClusterStrategy::SplitOverHeight && strategy != VPU::MultiClusterStrategy::HKSwitch) { return false; } - auto op = clusteredOp.getOperation(); - // Note: disable concat as it is a complex topic - // As concatOp has a special cmx-concat pattern check, thus the spilling may still exist even to assign - // DUPLICATED + // As concatOp has a special cmx-concat pattern check, thus the spilling may still + // exist even to assign DUPLICATED if (mlir::isa(op)) { return false; } - - // For NCECompressConvolutionOp, the activation (without expansion) must have a channel size of - // 
VPU_COMPRESSED_INPUT_CHANNEL_NUM (4). Otherwise, duplicated inputs with workload offsets cannot correctly - // access the activation data for each cluster, leading to accuracy issues + // For NCECompressConvolutionOp, the activation (without expansion) must have a + // channel size of VPU_COMPRESSED_INPUT_CHANNEL_NUM (4). Otherwise, duplicated + // inputs with workload offsets cannot correctly access the activation data for + // each cluster, leading to accuracy issues if (auto compressConv = mlir::dyn_cast(op)) { auto origChannelVal = static_cast(std::log2(compressConv.getCmSpPattern() + 1)); if (origChannelVal != VPU::NCEInvariant::VPU_COMPRESSED_INPUT_CHANNEL_NUM) { return false; } } - - // For sw ops, current solution is dependent on workload offsets adjust so not support sw ops - // Todo: refer to ticket E#118242: use per cluster unrolling to solve it + // For sw ops, current solution is dependent on workload offsets adjust so not + // support sw ops Todo: refer to ticket E#118242: use per cluster unrolling to + // solve it if (mlir::isa(op)) { return false; } - - llvm::SmallVector eltwiseInputsCompatible = {false, false}; + auto eltwiseOp = mlir::dyn_cast(op); + if (eltwiseOp != nullptr && eltwiseOp.getIsInplace().value_or(false)) { + // E135492: Accuracy issue with duplicated input for SOH-like inplace + // eltwise + // TODO remove this check after the issue is fixed + Logger::global().trace("Select OVERLAPPED mode for the activation of SOH-like strategies for ELTWISE op"); + return false; + } + llvm::SmallVector areAllOperandsDuplicateLikeMode; for (auto operand : op->getOperands() | indexed) { + if (mlir::isa_and_nonnull(operand.value())) { + areAllOperandsDuplicateLikeMode.push_back(false); + continue; + } auto producerOp = operand.value().getDefiningOp(); // Skip cast ops - while (auto producerCastOp = mlir::dyn_cast_or_null(producerOp)) { - if (VPU::hasRestrictedTilingDim(producerCastOp)) { - break; + while (producerOp && 
mlir::isa_and_nonnull(producerOp)) { + if (auto producerCastOp = mlir::dyn_cast(producerOp)) { + if (VPU::hasRestrictedTilingDim(producerCastOp)) { + break; + } } if (hasMultiBranches(producerOp)) { break; } producerOp = producerOp->getOperand(0).getDefiningOp(); } - if (producerOp == nullptr) { - return false; + if (!producerOp || mlir::isa_and_nonnull(producerOp)) { + continue; } - - if (mlir::isa(producerOp) || (!mlir::isa(producerOp))) { - return false; + if (mlir::isa(producerOp) || !mlir::isa(producerOp)) { + areAllOperandsDuplicateLikeMode.push_back(false); + continue; } - if (mlir::isa(producerOp)) { - auto clusteredProducer = mlir::cast(producerOp); - const auto producerStrategy = clusteredProducer.getMultiClusterStrategy(); - if (!producerStrategy.has_value()) { - return false; - } + auto clusteredProducer = mlir::cast(producerOp); + const auto producerStrategy = clusteredProducer.getMultiClusterStrategy(); + if (!producerStrategy.has_value()) { + areAllOperandsDuplicateLikeMode.push_back(false); + } else { auto mode = VPU::getOutputTensorDistributionMode(clusteredProducer, producerStrategy.value(), nullptr); - if (!VPU::bitEnumContainsAny(mode, DistributionMode::DUPLICATED) && - !VPU::bitEnumContainsAny(mode, DistributionMode::MULTICASTED)) { - return false; + if (VPU::bitEnumContainsAny(mode, DistributionMode::DUPLICATED) || + VPU::bitEnumContainsAny(mode, DistributionMode::MULTICASTED)) { + areAllOperandsDuplicateLikeMode.push_back(true); + } else { + areAllOperandsDuplicateLikeMode.push_back(false); } } - auto eltwiseOp = mlir::dyn_cast(op); - if (eltwiseOp == nullptr) { - Logger::global().trace("Select DUPLICATED mode for the activation of SOH-like strategys"); - return true; - } - if (eltwiseOp.getIsInplace().value_or(false)) { - // E135492: Accuracy issue with duplicated input for SOH-like inplace eltwise - // TODO remove this check after the issue is fixed - Logger::global().trace( - "Select SEGMENTED mode for the activation of SOH-like strategies 
for ELTWISE op"); - return false; - } - - eltwiseInputsCompatible[operand.index()] = true; } - - if (std::all_of(eltwiseInputsCompatible.begin(), eltwiseInputsCompatible.end(), [](auto val) { + // For eltwise Op, all inputs need to have distribution mode compatible with DUPLICATED mode + if (mlir::isa(op)) { + return std::all_of(areAllOperandsDuplicateLikeMode.begin(), areAllOperandsDuplicateLikeMode.end(), + [](auto val) { + return val; + }); + } + if (std::any_of(areAllOperandsDuplicateLikeMode.begin(), areAllOperandsDuplicateLikeMode.end(), [](auto val) { return val; })) { - Logger::global().trace("Select DUPLICATED mode for the activation of SOH-like strategys"); + Logger::global().trace("Select DUPLICATED mode for the activation of SOH-like strategies"); return true; } - return false; }; @@ -1316,7 +1323,7 @@ DistributionMode vpux::VPU::getActivationTensorDistributionMode(VPU::ClusteredOp strategy == VPU::MultiClusterStrategy::HKSwitch) { // TODO: be more explicit ahead of time wrt MultiClusterStrategy for 40XX. // E#71926 to track this. - if (VPU::isArchVPUX3XXX(VPU::getArch(clusteredOp))) { + if (config::isArchVPUX3XXX(config::getArch(clusteredOp))) { return DistributionMode::SEGMENTED; } return isDuplicatedModeForSOHLikeStrategy() ? DistributionMode::DUPLICATED : DistributionMode::OVERLAPPED; @@ -1338,6 +1345,21 @@ DistributionMode vpux::VPU::getActivationTensorDistributionMode(VPU::ClusteredOp } } +DistributionMode vpux::VPU::getActivationTensorDistributionMode(VPU::GatherDMAOp op, VPU::MultiClusterStrategy strategy, + mlir::Value operand) { + const auto isIndicesTensor = operand == op.getIndices(); + + switch (strategy) { + case VPU::MultiClusterStrategy::SplitOverWidth: + case VPU::MultiClusterStrategy::SplitOverHeight: + return isIndicesTensor ? 
DistributionMode::DUPLICATED : DistributionMode::SEGMENTED; + default: + VPUX_THROW("{0} is an invalid multi-cluster strategy, unable to determine the distribution mode for the " + "activation tensor", + strategy); + } +} + DistributionMode vpux::VPU::getWeightsTensorDistributionMode(VPU::MultiClusterStrategy strategy) { if (strategy == VPU::MultiClusterStrategy::SplitOverHeightOverlapped || strategy == VPU::MultiClusterStrategy::SplitOverHeight || strategy == VPU::MultiClusterStrategy::Clustering || @@ -1362,7 +1384,8 @@ DistributionMode vpux::VPU::getOutputTensorDistributionMode(VPU::ClusteredOpInte strategy == VPU::MultiClusterStrategy::SplitOverWidth) { // TODO: be more explicit ahead of time wrt MultiClusterStrategy for 40XX. // E#71926 to track this. - if (VPU::isArchVPUX3XXX(VPU::getArch(clusteredOp)) || mlir::isa(clusteredOp.getOperation())) { + if (config::isArchVPUX3XXX(config::getArch(clusteredOp)) || + mlir::isa(clusteredOp.getOperation())) { return DistributionMode::SEGMENTED; } return DistributionMode::OVERLAPPED; @@ -1428,8 +1451,8 @@ int64_t vpux::VPU::getSOHPerClusterHeightAlignment(int64_t inputWidth, bool isIn } int64_t vpux::VPU::getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, - VPU::ArchKind arch) { - if (!VPU::isArchVPUX3XXX(arch)) { + config::ArchKind arch) { + if (!config::isArchVPUX3XXX(arch)) { return 1; } @@ -1453,7 +1476,7 @@ int64_t vpux::VPU::getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t nu } bool vpux::VPU::isSOHSupportedByDPU(vpux::NDTypeInterface inputType, ShapeRef inputShape, int64_t numClusters, bool, - ArchKind arch) { + config::ArchKind arch) { // Layers with 5D input shapes does not support SOH if (inputShape.size() == DimsGroups5D::Act::numDims) { return false; @@ -1475,7 +1498,7 @@ bool vpux::VPU::isSOHSupportedByDPU(vpux::NDTypeInterface inputType, ShapeRef in // On VPUX40XX, SOH doesn't have the rules above // Actually the input tile shapes are completely 
back-inferred by output tile shapes which are following // uniformDistributedSegments method - if (arch >= VPU::ArchKind::NPU40XX) { + if (arch >= config::ArchKind::NPU40XX) { return true; } @@ -1500,7 +1523,7 @@ bool vpux::VPU::isSOHSupportedByDPU(vpux::NDTypeInterface inputType, ShapeRef in bool vpux::VPU::isSOGSupportedByDPU([[maybe_unused]] vpux::NDTypeInterface inputType, [[maybe_unused]] ShapeRef inputShape, [[maybe_unused]] int64_t numClusters, - [[maybe_unused]] bool DWTypeOp, [[maybe_unused]] ArchKind arch) { + [[maybe_unused]] bool DWTypeOp, [[maybe_unused]] config::ArchKind arch) { return true; } @@ -1687,6 +1710,10 @@ VPU::DistributedTensorType vpux::VPU::createDistributedTensorType( numClusters, alignment, uniformDistributedSegments, overlapParams.getKernel(), padAttr, overlapParams.getStride()); }) + .Case([&](VPU::GatherDMAOp gatherDMAOp) { + return createDistributedTensorType(gatherDMAOp, inputType, distributionMode, numTiles, numClusters, + alignment, uniformDistributedSegments); + }) .Default([clusteredOp](mlir::Operation*) -> DistributedTensorType { VPUX_THROW("unsupported operation for createDistributedTensorType: {0}", clusteredOp); }); @@ -1849,6 +1876,23 @@ DistributedTensorType vpux::VPU::createDistributedTensorType( VPU::Padding::getClassFromAttr(pad), stride))); } +DistributedTensorType vpux::VPU::createDistributedTensorType( + VPU::GatherDMAOp gatherDMAOp, vpux::NDTypeInterface inputType, DistributionMode distributionMode, + ArrayRef numTiles, int64_t optimalNumberOfClusters, ArrayRef alignment, + const bool uniformDistributedSegments) { + auto* ctx = gatherDMAOp->getContext(); + const auto memSpace = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(MemoryKind::CMX_NN)); + + const auto order = mlir::AffineMapAttr::get(inputType.getDimsOrder().toAffineMap(ctx)); + auto elemType = inputType.getElementType(); + + return DistributedTensorType::get( + ctx, inputType.getShape().raw(), elemType, order, memSpace, + 
VPU::DistributionInfo::getAttrFromClass( + ctx, createDistributionInfo(gatherDMAOp, distributionMode, numTiles, optimalNumberOfClusters, + alignment, uniformDistributedSegments))); +} + vpux::VPU::CopyOp vpux::VPU::createDistributedCopyIn(mlir::PatternRewriter& rewriter, VPU::ClusteredOpInterface clusteredOp, mlir::Value input, vpux::NDTypeInterface inputTensorDistributedTensorType) { @@ -1990,13 +2034,16 @@ VPU::DistributedTypeInterface vpux::VPU::getDistributedActivationTypeFromOp( VPU::ClusteredOpInterface clusteredOp, mlir::Value operand, vpux::NDTypeInterface inputType, int64_t numClusters, VPU::MultiClusterStrategy customStrategy, ArrayRef customAlignment, vpux::NDTypeInterface tiledOutputType, const vpux::TileInfo& tileInfo) { - auto activationTensorDistributionMode = getActivationTensorDistributionMode(clusteredOp, customStrategy); - auto activationTensorNumTiles = getActivationTensorNumTiles(clusteredOp, numClusters, customStrategy, inputType); + DistributionMode activationTensorDistributionMode; + SmallVector activationTensorNumTiles; if (mlir::isa(clusteredOp.getOperation())) { activationTensorDistributionMode = getSWInputTensorDistributionMode(clusteredOp, customStrategy, operand, inputType); activationTensorNumTiles = getSWInputTensorNumTiles(clusteredOp, numClusters, customStrategy, operand, inputType); + } else { + activationTensorDistributionMode = getActivationTensorDistributionMode(clusteredOp, customStrategy); + activationTensorNumTiles = getActivationTensorNumTiles(clusteredOp, numClusters, customStrategy, inputType); } auto actualOutputType = tiledOutputType != nullptr @@ -2717,6 +2764,24 @@ VPU::DistributionInfo vpux::VPU::createDistributionInfo( return {}; } +VPU::DistributionInfo vpux::VPU::createDistributionInfo(VPU::GatherDMAOp gatherDMAOp, DistributionMode distributionMode, + ArrayRef numTiles, int64_t optimalNumberOfClusters, + ArrayRef alignment, bool uniformDistributedSegments) { + VPUX_THROW_UNLESS(mlir::isa_and_nonnull(gatherDMAOp), 
"Op {0} is not a view like op", + gatherDMAOp->getName()); + + if (distributionMode == DistributionMode::DUPLICATED) { + return DistributionInfo(distributionMode, {}, {}, {}, {}, optimalNumberOfClusters, alignment, + uniformDistributedSegments, {}, {}, {}, {}, {}); + } else if (VPU::bitEnumContainsAny(distributionMode, VPU::DistributionMode::SEGMENTED)) { + return DistributionInfo(distributionMode, numTiles, {}, {}, {}, optimalNumberOfClusters, alignment, + uniformDistributedSegments, {}, {}, {}, {}, {}); + } + VPUX_THROW("Unsupported distribution mode {0} for op {1}", VPU::stringifyDistributionMode(distributionMode), + gatherDMAOp); + return {}; +} + TensorDistributionMap vpux::VPU::getActivationDistributionAttrFromOp( VPU::ClusteredOpInterface clusteredOp, mlir::Value operand, vpux::NDTypeInterface inputType, int64_t numClusters, SiblingOpsAnalysis& siblingsAnalysis, vpux::NDTypeInterface tiledOutputType, @@ -2739,6 +2804,9 @@ TensorDistributionMap vpux::VPU::getActivationDistributionAttrFromOp( getSWInputTensorDistributionMode(clusteredOp, customStrategy, operand, inputType); activationTensorNumTiles = getSWInputTensorNumTiles(clusteredOp, numClusters, customStrategy, operand, inputType); + } else if (auto gatherOp = mlir::dyn_cast_or_null(clusteredOp.getOperation())) { + activationTensorDistributionMode = getActivationTensorDistributionMode(gatherOp, customStrategy, operand); + activationTensorNumTiles = getActivationTensorNumTiles(clusteredOp, numClusters, customStrategy, inputType); } else { activationTensorDistributionMode = getActivationTensorDistributionMode(clusteredOp, customStrategy); activationTensorNumTiles = getActivationTensorNumTiles(clusteredOp, numClusters, customStrategy, inputType); @@ -2975,6 +3043,10 @@ VPU::DistributionInfo vpux::VPU::createDistributionInfo(VPU::ClusteredOpInterfac alignment, uniformDistributedSegments, overlapParams.getKernel(), overlapParams.getPads(), overlapParams.getStride()); }) + .Case([&](VPU::GatherDMAOp gatherOp) { 
+ return createDistributionInfo(gatherOp, distributionMode, numTiles, numClusters, alignment, + uniformDistributedSegments); + }) .Default([clusteredOp](mlir::Operation*) -> DistributionInfo { VPUX_THROW("unsupported operation for createDistributedTensor: {0}", clusteredOp); }); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/eltwise_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/eltwise_utils.cpp index 3e95c9ca64..bbcd82575a 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/eltwise_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/eltwise_utils.cpp @@ -4,12 +4,11 @@ // #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/utils/core/numeric.hpp" using namespace vpux; using namespace VPU; @@ -50,7 +49,7 @@ bool vpux::VPU::isNCEEltwiseSupported(mlir::Operation* op, vpux::NDTypeInterface return false; } - auto arch = getArch(op); + auto arch = config::getArch(op); if (checkChannelAlignment) { auto iface = mlir::dyn_cast(op); auto outputAlignment = iface != nullptr ? 
iface.getOutputChannelAlignment() diff --git a/src/vpux_compiler/src/dialect/VPU/utils/explicit_distribution_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/explicit_distribution_utils.cpp index a86b6629d9..3a746dc963 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/explicit_distribution_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/explicit_distribution_utils.cpp @@ -4,13 +4,15 @@ // #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sw_utils.hpp" #include "vpux/utils/core/error.hpp" +#include "vpux/utils/core/numeric.hpp" #include "vpux/utils/core/range.hpp" +#include + using namespace vpux; VPU::OverlapDistributionParams VPU::getExplicitOverlapParamsForSWOpInput(VPU::SWOpInterface swOp, ShapeRef outShape, diff --git a/src/vpux_compiler/src/dialect/VPU/utils/gather_dma_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/gather_dma_utils.cpp index f501658a27..05fe93f209 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/gather_dma_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/gather_dma_utils.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPU/utils/gather_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux::VPU { @@ -18,7 +19,7 @@ bool isLegalConvertToGatherDMA(VPU::GatherOp op, bool isElementTile, bool isIndi const auto outputType = mlir::cast(op.getOutput().getType()); const auto indicesType = mlir::cast(op.getIndices().getType()); const auto inputType = mlir::cast(op.getInput().getType()); - auto arch = VPU::getArch(op); + auto arch = config::getArch(op); if (!op.getAxisValue().has_value()) { return false; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/generate_tiling.cpp b/src/vpux_compiler/src/dialect/VPU/utils/generate_tiling.cpp index 200979af46..f38feb6d46 100644 --- 
a/src/vpux_compiler/src/dialect/VPU/utils/generate_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/generate_tiling.cpp @@ -10,17 +10,20 @@ #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" #include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/tiling_constraint_utils.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/dilated_utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/numeric.hpp" @@ -486,7 +489,7 @@ std::optional> getTilingMode(mlir::Operation* op, bo } std::optional> getWorkLoadInformationForNCEWithSparseOutput( - VPU::ArchKind arch, ArrayRef perClusterShapes, ArrayRef supportedChannels) { + config::ArchKind arch, ArrayRef perClusterShapes, ArrayRef supportedChannels) { auto getWorkloadNum = [&](int64_t channelSupported) { size_t wlMaxNumPerCluster = 0; size_t wlNumInTotal = 0; @@ -607,8 +610,8 @@ bool doesNCEOpChannelSatisfyWorkload(mlir::Operation* nceOp, const TileInfo& out return false; } } - const auto workloadInformation = - getWorkLoadInformationForNCEWithSparseOutput(getArch(nceOp), perClusterShapes, supportedChannels); + const auto 
workloadInformation = getWorkLoadInformationForNCEWithSparseOutput( + config::getArch(nceOp), perClusterShapes, supportedChannels); if (!workloadInformation.has_value()) { return false; } @@ -717,9 +720,8 @@ std::vector getHwLayerTilingStrategiesWithCost(mlir::Operation VPUX_THROW_WHEN(nceOp == nullptr, "Operation '{0}' doesn't implement NCEop Interface", op->getName()); const auto tileDimOrder = getTileDimOrder(op, tilingMode, log); - // Temporarily not apply cost-based tiling strategy to NCE ops with INT4 weights based on VPUNN cost. - // This can be removed when VPUNN is upgraded to support INT4 data type, tracked in E#113316. - if (VPU::isNCEWithInt4Weights(op)) { + auto costModelUtils = VPU::getICostModelUtilsInterface(op->getContext()); + if (VPU::isNCEWithInt4Weights(op) && !costModelUtils->isNCEWithInt4WeightsSupported()) { auto strategy = getHWLayerTilingStrategyWithTileDimOrder(op, tilingMode, tileDimOrder, log); if (mlir::failed(strategy)) { return {}; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/layer_post_ops_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/layer_post_ops_utils.cpp index 22727c25b1..62683e4b7b 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/layer_post_ops_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/layer_post_ops_utils.cpp @@ -4,6 +4,9 @@ // #include "vpux/compiler/dialect/VPU/utils/layer_post_ops_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/utils/layout_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/layout_utils.cpp index 8ba9a3d29b..cd9e067ae2 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/layout_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/layout_utils.cpp @@ -4,10 +4,15 @@ // #include 
"vpux/compiler/dialect/VPU/utils/layout_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/const_attributes.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/shave_controls_dpu.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" using namespace vpux; @@ -479,7 +484,7 @@ void vpux::VPU::inferLSTMSequenceLayoutInfo(mlir::Operation* op, IE::LayerLayout info.setInput(ind++, DimsOrder::NWHC); // weights } // recurrenceWeights order. It depends by lower-level implementation. - auto useDpu = VPU::getShaveControlsDpu(VPU::getArch(op)); + auto useDpu = VPU::getShaveControlsDpu(config::getArch(op)); useDpu = useDpu ? VPU::LSTMSequenceOp::isSupported(lstmSequenceOp, useDpu) : useDpu; auto recurrenceWeightsOrder = useDpu ? 
DimsOrder::NCHW : DimsOrder::NWHC; info.setInput(ind++, recurrenceWeightsOrder); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/m2i_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/m2i_utils.cpp index 7091572450..21c4194f10 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/m2i_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/m2i_utils.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPU/utils/m2i_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/utils/core/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/max_kernel_size_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/max_kernel_size_utils.cpp index f064a08e74..b59ba4797f 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/max_kernel_size_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/max_kernel_size_utils.cpp @@ -26,5 +26,5 @@ bool VPU::hasMaxKernelSize(mlir::Operation* op) { } int64_t VPU::getMaxKernelSize(mlir::Operation* op) { - return VPU::getConstraint(op, VPU::MAX_KERNEL_SIZE); + return VPU::getConstraint(op, VPU::MAX_KERNEL_SIZE); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/nce_invariant.cpp b/src/vpux_compiler/src/dialect/VPU/utils/nce_invariant.cpp index 6cb8e54cd5..8125698c01 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/nce_invariant.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/nce_invariant.cpp @@ -4,19 +4,26 @@ // #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp" - -#include #include "vpux/compiler/core/attributes/dims_order.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" +#include + using namespace vpux; // @@ -226,7 +233,7 @@ mlir::FailureOr> vpux::VPU::NCEInvariant::getWeightsTableSize( // bool vpux::VPU::NCEInvariant::checkLayouts(mlir::TypeRange operandTypes, mlir::TypeRange resultTypes, - const VPU::ArchKind& arch, const unsigned numInputOperands, LogCb logCb) { + const config::ArchKind& arch, const unsigned numInputOperands, LogCb logCb) { VPUX_UNUSED(resultTypes); VPUX_UNUSED(arch); @@ -242,8 +249,8 @@ bool vpux::VPU::NCEInvariant::checkLayouts(mlir::TypeRange operandTypes, mlir::T return true; } -bool vpux::VPU::NCEInvariant::isEltwiseMultiplySubtractSupported(const VPU::ArchKind arch) { - return arch > VPU::ArchKind::NPU40XX; +bool vpux::VPU::NCEInvariant::isEltwiseMultiplySubtractSupported(const config::ArchKind arch) { + return arch > config::ArchKind::NPU40XX; } mlir::LogicalResult vpux::VPU::NCEInvariant::isSupported(mlir::Operation* op, Logger) { @@ -271,7 +278,7 @@ mlir::LogicalResult vpux::VPU::NCEInvariant::isSupported(mlir::Operation* op, Lo }) // #E157147: Do not set layout for NCE multiply. It will be enabled once it is optimal. 
.Case([&](auto origOp) { - const auto arch = getArch(origOp); + const auto arch = config::getArch(origOp); if (!isEltwiseMultiplySubtractSupported(arch)) { return false; } @@ -314,9 +321,10 @@ mlir::LogicalResult vpux::VPU::NCEInvariant::isSupported(mlir::Operation* op, Lo })); } -bool vpux::VPU::NCEInvariant::doesWorkloadSupportSmallKernelOpt([[maybe_unused]] VPU::ArchKind arch, const int64_t KX, - const int64_t SX, ArrayRef workloadOutSz, - bool isFp16Input, [[maybe_unused]] const int64_t KY, +bool vpux::VPU::NCEInvariant::doesWorkloadSupportSmallKernelOpt([[maybe_unused]] config::ArchKind arch, + const int64_t KX, const int64_t SX, + ArrayRef workloadOutSz, bool isFp16Input, + [[maybe_unused]] const int64_t KY, [[maybe_unused]] const int64_t padLeft) { // L1Opt can be enabled when kernelX = 3 and strideX = 1 if (KX != 3 || SX != 1) { @@ -328,9 +336,9 @@ bool vpux::VPU::NCEInvariant::doesWorkloadSupportSmallKernelOpt([[maybe_unused]] : workloadOutSz[Dims4D::Act::C.ind()] % VPU_CHANNEL_SIZE_FOR_L1OPT16 == 0; } -bool vpux::VPU::NCEInvariant::isSmallKernelOptimizationSupported(const VPU::ArchKind arch, mlir::Operation* op) { +bool vpux::VPU::NCEInvariant::isSmallKernelOptimizationSupported(const config::ArchKind arch, mlir::Operation* op) { // TODO: E#96201, attach concrete implementation of NCEOpInterface depending on the type of device - if (arch == VPU::ArchKind::NPU37XX) { + if (arch == config::ArchKind::NPU37XX) { return false; } if (!mlir::isa(op)) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/nce_reduce_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/nce_reduce_utils.cpp index fa80efac77..8291306e0a 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/nce_reduce_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/nce_reduce_utils.cpp @@ -4,18 +4,14 @@ // #include "vpux/compiler/dialect/VPU/utils/nce_reduce_utils.hpp" -#include "llvm/ADT/TypeSwitch.h" -#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" 
-#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/config/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/attributes.hpp" #include "vpux/utils/core/error.hpp" +#include + using namespace vpux; bool vpux::VPU::isNCEReduceSupported(mlir::Operation* op, LogCb logCb) { @@ -36,7 +32,7 @@ bool vpux::VPU::isNCEReduceSupported(mlir::Operation* op, LogCb logCb) { } bool vpux::VPU::isReduceOpSupportedOnNCE(mlir::Operation* op) { - return VPU::getConstraint(op, REDUCE_SUPPORTED); + return VPU::getConstraint(op, REDUCE_SUPPORTED); } VPUIP::NCETaskType vpux::VPU::configureNCEReduceTaskType(VPU::NCEReduceOp origOp) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/nce_sparsity.cpp b/src/vpux_compiler/src/dialect/VPU/utils/nce_sparsity.cpp index 61427f9037..c4c9a7a9a8 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/nce_sparsity.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/nce_sparsity.cpp @@ -4,15 +4,15 @@ // #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp" - #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/VPU/IR/types.hpp" +#include "vpux/compiler/dialect/VPU/transforms/factories/nce_sparsity_converters.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes_properties_conversion.hpp" #include "vpux/compiler/utils/types.hpp" - #include "vpux/utils/core/enums.hpp" #include "vpux/utils/core/numeric.hpp" @@ -123,7 +123,6 @@ int32_t 
vpux::VPU::NCESparsity::getWeightPtrStep(mlir::Value weights) { std::vector vpux::VPU::NCESparsity::getExpandedWeightsTable(ArrayRef weightsTableVector, int64_t OC) { using vpux::VPU::NCEInvariant::VPU_CHANNEL_ALIGNMENT; - using vpux::VPU::NCEInvariant::WEIGHT_TABLE_NUM_ELEMENTS_PER_OC; auto expandedWTVec = weightsTableVector.vec(); for (auto oc = OC; oc < VPU_CHANNEL_ALIGNMENT; oc++) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/overlap_distribution_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/overlap_distribution_utils.cpp index 5846d015a8..9bafee9e98 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/overlap_distribution_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/overlap_distribution_utils.cpp @@ -7,7 +7,10 @@ #include "vpux/compiler/core/tiling.hpp" #include "vpux/compiler/dialect/VPU/IR/native_attributes/distribution_info.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" + +#include #include @@ -476,8 +479,8 @@ OverlapDistributionParams vpux::VPU::getActivationOverlappedParams(VPU::Clustere // For 37XX, we do not set input workloads explicitly and therefore // OVERLAPPED should only represent the current op's input needs w/o // the sibling requirements - auto archKind = getArch(clusteredOp.getOperation()); - if (archKind == VPU::ArchKind::NPU37XX) { + auto archKind = config::getArch(clusteredOp.getOperation()); + if (archKind == config::ArchKind::NPU37XX) { return localOverlappedParams; } @@ -658,7 +661,7 @@ std::set vpux::VPU::getSiblingOps(mlir::Operation* op } bool vpux::VPU::outputOverlappedParamsIsHaloSupported(mlir::Operation* op) { - return getArch(op) >= VPU::ArchKind::NPU40XX; + return config::getArch(op) >= config::ArchKind::NPU40XX; } OverlapDistributionParams 
vpux::VPU::getActivationOverlappedParams(VPU::ClusteredOpInterface clusteredOp, @@ -731,8 +734,8 @@ OverlapDistributionParams vpux::VPU::getOutputOverlappedParams(VPU::ClusteredOpI vpux::NDTypeInterface outputType, ArrayRef activationTensorNumTiles) { SmallVector consumerSubgraph; - auto archKind = getArch(clusteredOp.getOperation()); - const auto equalComputeAndMemoryView = archKind <= VPU::ArchKind::NPU37XX; + auto archKind = config::getArch(clusteredOp.getOperation()); + const auto equalComputeAndMemoryView = archKind <= config::ArchKind::NPU37XX; if (auto eltwise = mlir::dyn_cast(clusteredOp.getOperation())) { if (eltwise.getIsInplace().value_or(false)) { @@ -864,7 +867,7 @@ OverlapDistributionParams vpux::VPU::getOutputOverlappedParams(VPU::ClusteredOpI OverlapDistributionParams vpux::VPU::getOutputOverlappedParamsNoHalo(VPU::ClusteredOpInterface clusteredOp, ArrayRef outputTensorNumTiles) { - auto archKind = getArch(clusteredOp.getOperation()); + auto archKind = config::getArch(clusteredOp.getOperation()); VPUX_THROW_WHEN(!mlir::isa(clusteredOp.getOperation()), "Arch {0} does not support output OVERLAPPED distribution for op = {1}", archKind, clusteredOp); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/performance_metrics.cpp b/src/vpux_compiler/src/dialect/VPU/utils/performance_metrics.cpp index e7375fda44..69639552f3 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/performance_metrics.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/performance_metrics.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPU/utils/performance_metrics.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/frequency_table.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/utils/profiling/parser/freq.hpp" @@ -62,11 +63,11 @@ SmallVector> getBWTicks(mlir::ModuleOp module) { } // Get corresponding dpu freq (MHz) from vpunn to parse inferenceTimebyDPUCycle - const auto arch = 
VPU::getArch(module); - size_t dpuBaseFreq = VPU::getDpuFrequency(arch, VPU::getRevisionID(module)); + const auto arch = config::getArch(module); + size_t dpuBaseFreq = VPU::getDpuFrequency(arch, config::getRevisionID(module)); // Convert inference ticks by getProfClk - auto profClk = arch >= VPU::ArchKind::NPU40XX ? profiling::ProfClk40XX::PROF_CLK_DEFAULT_VALUE_MHZ - : profiling::ProfClk37XX::PROF_CLK_DEFAULT_VALUE_MHZ; + auto profClk = arch >= config::ArchKind::NPU40XX ? profiling::ProfClk40XX::PROF_CLK_DEFAULT_VALUE_MHZ + : profiling::ProfClk37XX::PROF_CLK_DEFAULT_VALUE_MHZ; auto freqTable = VPU::getFrequencyTable(arch); auto freqBase = freqTable().base; auto freqStep = freqTable().step; @@ -88,11 +89,11 @@ SmallVector> getBWTicks(mlir::ModuleOp module) { double getActivityFactor(VPU::ExecutorKind execKind, mlir::ModuleOp module, IE::ComputeResourceOpInterface res) { // 0.5 is a recommanded default value for AF by VPUNN team double activityFactor = 0.5; - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); if (execKind == VPU::ExecutorKind::NCE || execKind == VPU::ExecutorKind::SHAVE_NN) { switch (arch) { - case VPU::ArchKind::NPU37XX: - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU37XX: + case config::ArchKind::NPU40XX: // Here we must get AF from NCE res (a TileResourceOp) as the AF attribute is attached to tile op if (execKind == VPU::ExecutorKind::NCE) { auto NCERes = mlir::cast(res.getOperation()); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/ppe_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/ppe_utils.cpp index d402036a10..f19ca7d89d 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/ppe_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/ppe_utils.cpp @@ -4,9 +4,9 @@ // #include "vpux/compiler/dialect/VPU/utils/ppe_utils.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" 
#include "vpux/compiler/dialect/VPU/IR/ops.hpp" - #include "vpux/compiler/utils/quantization.hpp" #include "vpux/utils/core/error.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/utils/reduce_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/reduce_utils.cpp index 1b7bb487bc..92a01e5de9 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/reduce_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/reduce_utils.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/utils/core/array_ref.hpp" @@ -51,7 +52,8 @@ bool fitIntoCMXReduce(mlir::Operation* operation, llvm::ArrayRef(origInput.getType()); + + auto staticNewShape = mlir::getConstantIntValues(inputTileInfo.shape); + if (origType.getShape().isStatic() && staticNewShape.has_value() && + llvm::equal(origType.getShape().raw(), staticNewShape.value())) { + return origInput; + } + + SmallVector defaultStrides(inputTileInfo.offsets.size(), builder.getIndexAttr(1)); + + auto extractTile = builder.create( + appendLoc(loc, "extractSlice"), origInput, inputTileInfo.offsets, inputTileInfo.shape, defaultStrides); + + auto newShape = getShape(extractTile.getResult()); + auto newType = origType.changeShape(ShapeRef(newShape)); + if (auto boundedType = mlir::dyn_cast(newType)) { + newType = boundedType.changeBounds(inputTileInfo.bounds); + } + + // by default output type loses NPU-specific attributes so we have to set it manually + extractTile->getResult(0).setType(newType); + + return extractTile; +} + +mlir::Type vpux::VPU::extractResultType(mlir::Type origType, SCFShapeRef newShape, BoundsRef bounds) { + auto ndTensorType = mlir::cast(origType); + auto origElemType = ndTensorType.getElementType(); + + VPUX_THROW_WHEN(mlir::isa(origElemType), + "Per axis 
quantized types are not supported in scf"); + + const auto tensorDesc = + vpux::getTensorAttr(origElemType.getContext(), ndTensorType.getDimsOrder(), ndTensorType.getMemSpace(), + mlir::isa(origType) ? bounds : Bounds{}); + + SmallVector dynamicDims; // unused cause for shape static dims are enough + SmallVector staticDims; + mlir::dispatchIndexOpFoldResults(newShape, dynamicDims, staticDims); + return mlir::RankedTensorType::get(staticDims, origElemType, tensorDesc); +} + SCFTileInfo vpux::VPU::getWeightsTableSCFTile(mlir::Type origWeightsTableType, mlir::OpBuilder& builder, const SCFTileInfo& outputTile) { auto origWeightsTableShape = mlir::cast(origWeightsTableType).getShape(); @@ -38,19 +87,41 @@ SCFTileInfo vpux::VPU::getWeightsTableSCFTile(mlir::Type origWeightsTableType, m return weightsTableTile; } -mlir::Range vpux::VPU::solutionForOutputRange(mlir::Location loc, mlir::OpBuilder& builder, - const SCFTileInfo& outputTile, Dim dim, const int64_t kernel, - const int64_t stride, const std::pair& origPadding) { +std::pair, std::optional> vpux::VPU::solutionForOutputRange( + mlir::Location loc, mlir::OpBuilder& builder, const SCFTileInfo& outputTile, Dim dim, const int64_t kernel, + const int64_t stride, const int64_t origInputSize, const std::pair& origPadding, + mlir::OpFoldResult& padBefore, mlir::OpFoldResult& padAfter) { auto zero = builder.getIndexAttr(0); auto one = builder.getIndexAttr(1); mlir::Range inputRange = {zero, zero, one}; mlir::Range outputRange = {outputTile.offsets[dim.ind()], outputTile.shape[dim.ind()], one}; - mlir::AffineExpr s0, d0; - bindDims(builder.getContext(), d0); - bindSymbols(builder.getContext(), s0); + // define dimensions (d0, d1, ...) as variables which are represented by loop dim identifier + // and symbols (s0, s1, ...) 
which are either known constants or known attributes of operation + mlir::AffineExpr s0, s1, d0, d1; + bindDims(builder.getContext(), d0, d1); + bindSymbols(builder.getContext(), s0, s1); + + mlir::AffineExpr sizeExpr = (d0 - 1) * stride + kernel - origPadding.first; + auto sizeMap = mlir::AffineMap::get(1, 0, {sizeExpr}, builder.getContext()); + + std::optional dimBound; + if (!outputTile.bounds.raw().empty()) { + auto outputTileBound = builder.getIntegerAttr(builder.getIndexType(), outputTile.bounds[dim]); + SmallVector resultsAttrs; + if (sizeMap.constantFold({outputTileBound}, resultsAttrs).succeeded()) { + if (auto result = mlir::dyn_cast(resultsAttrs.front())) { + dimBound = result.getInt() - origPadding.second; + } + } + } + + if (mlir::isConstantIntValue(outputTile.axis[dim.ind()], 1)) { + return {std::nullopt, dimBound}; + } const auto hasPadBefore = origPadding.first != 0; + const auto hasPadAfter = origPadding.second != 0; // input offset is based on output tile offset and operation's parameters // current calculation is @@ -60,14 +131,70 @@ mlir::Range vpux::VPU::solutionForOutputRange(mlir::Location loc, mlir::OpBuilde if (!hasPadBefore && stride == 1) { inputRange.offset = outputRange.offset; } else { - auto offsetMap = mlir::AffineMap::get(1, 1, {d0 * stride - origPadding.first, s0}, builder.getContext()); + mlir::AffineExpr offsetExpr = d0 * stride - origPadding.first; + auto offsetMap = mlir::AffineMap::get(1, 1, {offsetExpr, s0}, builder.getContext()); inputRange.offset = mlir::affine::makeComposedFoldedAffineMax(builder, appendLoc(loc, "inputOffset"), offsetMap, {outputRange.offset, zero}); + + auto minDiffMap = mlir::AffineMap::get(1, 2, {s0 - offsetExpr, s1}, builder.getContext()); + auto minDiffValue = mlir::affine::makeComposedFoldedAffineMin(builder, appendLoc(loc, "minDiff"), minDiffMap, + {outputRange.offset, zero, zero}); + auto padBeforeMap = mlir::AffineMap::get(0, 2, {s0, s1}, builder.getContext()); + padBefore = 
mlir::affine::makeComposedFoldedAffineMax(builder, appendLoc(loc, "paddingBefore"), padBeforeMap, + {minDiffValue, builder.getIndexAttr(origPadding.first)}); } - auto sizeMap = mlir::AffineMap::get(1, 0, {(d0 - 1) * stride + kernel - origPadding.first}, builder.getContext()); inputRange.size = mlir::affine::makeComposedFoldedAffineApply(builder, appendLoc(loc, "inputSize"), sizeMap, {outputRange.size}); + if (hasPadAfter) { + auto minDiffMap = mlir::AffineMap::get(2, 2, {d1 + sizeExpr - s0, s1}, builder.getContext()); + auto minDiffValue = mlir::affine::makeComposedFoldedAffineMin( + builder, appendLoc(loc, "minDiff"), minDiffMap, + {outputRange.offset, inputRange.offset, builder.getIndexAttr(origInputSize), zero}); + auto padAfterMap = mlir::AffineMap::get(0, 2, {s0, s1}, builder.getContext()); + padAfter = mlir::affine::makeComposedFoldedAffineMax(builder, appendLoc(loc, "paddingAfter"), padAfterMap, + {minDiffValue, builder.getIndexAttr(origPadding.second)}); + } + + return {inputRange, dimBound}; +} + +bool vpux::VPU::checkFusion(mlir::OpOperand& consumer, mlir::OpResult producerCandidate) { + // TODO E-172888 rewrite unified code for checking compatibility with current VF + + if (!mlir::isa(producerCandidate.getOwner())) { + return false; + } + + if (VPU::isPureViewOp(producerCandidate.getOwner()) || VPU::isPureViewOp(consumer.getOwner())) { + return true; + } + + const auto hasMCStategy = [](mlir::Operation* operation) { + auto clusterOp = mlir::dyn_cast(operation); + return clusterOp != nullptr && clusterOp.getMultiClusterStrategy().has_value(); + }; + + auto consumerHasStrategy = hasMCStategy(consumer.getOwner()); + auto producerHasStrategy = hasMCStategy(producerCandidate.getOwner()); + + if (!consumerHasStrategy && !producerHasStrategy) { + return true; + } + + if (consumerHasStrategy ^ producerHasStrategy) { + return false; + } + + auto producerClusterOp = mlir::cast(producerCandidate.getOwner()); + auto consumerClusterOp = 
mlir::cast(consumer.getOwner()); + + VPU::SiblingOpsAnalysis siblingAnalisys(consumer.getOwner()); + + auto consumerDistrType = mlir::cast( + consumerClusterOp.getDistributedTypeForOpOperand(consumer, false, siblingAnalisys)); + auto producerDistrType = mlir::cast(producerClusterOp.getDistributedTypeForOpResult( + producerCandidate, producerClusterOp.getMultiClusterStrategy().value(), siblingAnalisys, false)); - return inputRange; + return areDistributionAttrsCompatible(producerDistrType, consumerDistrType, true).succeeded(); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/se_padding_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/se_padding_utils.cpp index 594866f304..dae893d88c 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/se_padding_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/se_padding_utils.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/utils/se_padding_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/utils/core/numeric.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/se_roll_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/se_roll_utils.cpp index 58c57ca1de..e48a2cae52 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/se_roll_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/se_roll_utils.cpp @@ -3,14 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" #include "vpux/compiler/dialect/IE/utils/roll_utils.hpp" - #include "vpux/compiler/dialect/VPU/IR/se_attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/conv_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/se_roll_utils.hpp" - -#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" using 
namespace vpux; using namespace VPU; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/sep_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/sep_utils.cpp index b20a26c0e4..a9a2ecdf10 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/sep_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/sep_utils.cpp @@ -14,33 +14,10 @@ using namespace vpux; -bool hasSEOption(mlir::ModuleOp module, StringRef seOption) { - auto pipelineOptionOp = module.lookupSymbol(VPU::PIPELINE_OPTIONS); - if (pipelineOptionOp == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find PipelineOptions to fetch SEP option"); - return false; - } - - auto attrValue = pipelineOptionOp.lookupSymbol(seOption); - if (attrValue == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find config.OptionOp to fetch SEP option"); - return false; - } - auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); - if (boolAttr == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to cast config.OptionOp to BoolAttr"); - return false; - } - return boolAttr.getValue(); -} - bool VPU::hasEnableSEPtrsOperations(mlir::ModuleOp module) { - return hasSEOption(module, ENABLE_SE_PTRS_OPERATIONS); + return VPU::tryGetBoolPassOption(module, ENABLE_SE_PTRS_OPERATIONS).value_or(false); } bool VPU::hasEnableExperimentalSEPtrsOperations(mlir::ModuleOp module) { - return hasSEOption(module, ENABLE_EXPERIMENTAL_SE_PTRS_OPERATIONS); + return VPU::tryGetBoolPassOption(module, ENABLE_EXPERIMENTAL_SE_PTRS_OPERATIONS).value_or(false); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/setup_pipeline_options_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/setup_pipeline_options_utils.cpp index 1f038bd7ed..b2154dad9a 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/setup_pipeline_options_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/setup_pipeline_options_utils.cpp @@ -25,3 +25,26 @@ config::PipelineOptionsOp 
VPU::getPipelineOptionsOp(mlir::MLIRContext& ctx, mlir return pipelineOptionsOp; } + +std::optional VPU::tryGetBoolPassOption(mlir::ModuleOp module, StringRef attrName) { + auto pipelineOptionOp = module.lookupSymbol(VPU::PIPELINE_OPTIONS); + auto logger = vpux::Logger::global(); + + if (pipelineOptionOp == nullptr) { + logger.trace("Failed to find PipelineOptions to fetch '{0}' option", attrName); + return {}; + } + + auto attrValue = pipelineOptionOp.lookupSymbol(attrName); + if (attrValue == nullptr) { + logger.trace("Failed to find config.OptionOp to fetch '{0}' option", attrName); + return {}; + } + + auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); + if (boolAttr == nullptr) { + logger.trace("Failed to cast config.OptionOp to BoolAttr for '{0}'", attrName); + return {}; + } + return boolAttr.getValue(); +} diff --git a/src/vpux_compiler/src/dialect/VPU/utils/sparsity_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/sparsity_utils.cpp index 46a88fa934..3ba488b7ce 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/sparsity_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/sparsity_utils.cpp @@ -5,9 +5,10 @@ #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" - +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/utils/core/error.hpp" @@ -158,8 +159,8 @@ VPU::SparsityRemovalFlag VPU::shouldRemoveOutputSparsity(mlir::Operation* op) { return SparsityRemovalFlag::CatchAllFail; } -bool VPU::isSEOnlyWithoutSMSupported(VPU::ArchKind arch) { - return arch != VPU::ArchKind::NPU37XX && arch != VPU::ArchKind::NPU40XX; +bool 
VPU::isSEOnlyWithoutSMSupported(config::ArchKind arch) { + return arch != config::ArchKind::NPU37XX && arch != config::ArchKind::NPU40XX; } mlir::Type VPU::getEffectiveSparseOutputType(mlir::Type sparseType) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/sprlut_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/sprlut_utils.cpp index a4cba7d112..3c8a0ad8c6 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/sprlut_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/sprlut_utils.cpp @@ -11,5 +11,5 @@ using namespace vpux; bool VPU::isSprLUTEnabled(mlir::Operation* op) { - return VPU::getConstraint(op, VPU::SPRLUT_ENABLED); + return VPU::getConstraint(op, VPU::SPRLUT_ENABLED); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/static_shape_op_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/static_shape_op_utils.cpp index b4851af562..451acf16f9 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/static_shape_op_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/static_shape_op_utils.cpp @@ -14,29 +14,6 @@ using namespace vpux; -bool hasEnableExtraStaticShapeOpOption(mlir::ModuleOp module, StringRef option) { - auto pipelineOptionOp = module.lookupSymbol(VPU::PIPELINE_OPTIONS); - if (pipelineOptionOp == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find PipelineOptions to fetch extra shape bound option"); - return false; - } - - auto attrValue = pipelineOptionOp.lookupSymbol(option); - if (attrValue == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to find config.OptionOp to fetch extra shape bound option"); - return false; - } - auto boolAttr = mlir::dyn_cast(attrValue.getOptionValue()); - if (boolAttr == nullptr) { - auto logger = vpux::Logger::global(); - logger.trace("Failed to cast config.OptionOp to BoolAttr"); - return false; - } - return boolAttr.getValue(); -} - bool VPU::hasEnableExtraStaticShapeOps(mlir::ModuleOp module) { - return hasEnableExtraStaticShapeOpOption(module, 
ENABLE_EXTRA_STATIC_SHAPE_OPS); + return VPU::tryGetBoolPassOption(module, ENABLE_EXTRA_STATIC_SHAPE_OPS).value_or(false); } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/multi_cluster_strategy_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/multi_cluster_strategy_utils.cpp index 7f186e6b42..143234b539 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/multi_cluster_strategy_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/multi_cluster_strategy_utils.cpp @@ -6,7 +6,7 @@ #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" @@ -15,9 +15,9 @@ #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/op_tiling_cache.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/hash.hpp" #include "vpux/compiler/utils/sparsity.hpp" #include "vpux/utils/core/numeric.hpp" @@ -25,8 +25,6 @@ #include #include -#include - using namespace vpux; using namespace VPU; @@ -118,8 +116,8 @@ mlir::Value getInputFromClusteredOp(VPU::ClusteredOpInterface clusteredOp, mlir: bool useSOKWhenMVNSide(VPU::ClusteredOpInterface clusteredOp) { // This check is a W/A for NPU40 or lower arch, where VPUNN cost is not accurate or not available (error code) // For newer platforms, a more accurate VPUNN cost model is provided so that this check should be dropped - auto arch = 
VPU::getArch(clusteredOp.getOperation()); - if (arch > VPU::ArchKind::NPU40XX) { + auto arch = config::getArch(clusteredOp.getOperation()); + if (arch > config::ArchKind::NPU40XX) { return false; } @@ -278,13 +276,13 @@ LayerCostModel::LayerCostModel(mlir::func::FuncOp func, bool enablePrefetchTilin _numTiles = tileOp.getCount(); _numDPUs = dpuExec.getCount(); _NCEThroughput = getNCEThroughput(); - _DMABandwidth = getDMABandwidth(VPU::getArch(tileOp), VPU::getRevisionID(module)); + _DMABandwidth = getDMABandwidth(config::getArch(tileOp), config::getRevisionID(module)); if (auto shaveActExec = tileOp.getSubExecutor(ExecutorKind::SHAVE_ACT)) { _numShaveActs = shaveActExec.getCount(); } } _numDMAPorts = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); - _arch = VPU::getArch(module); + _arch = config::getArch(module); _vpuDeviceType = VPU::getVPUDeviceType(_arch); _layerCostModel = VPU::CostModelConfig::createLayerCostModel(_arch); } @@ -305,13 +303,13 @@ LayerCostModel::LayerCostModel(mlir::func::FuncOp func, bool enablePrefetchTilin _numTiles = tileOp.getCount(); _numDPUs = dpuExec.getCount(); _NCEThroughput = getNCEThroughput(); - _DMABandwidth = getDMABandwidth(VPU::getArch(tileOp), VPU::getRevisionID(module)); + _DMABandwidth = getDMABandwidth(config::getArch(tileOp), config::getRevisionID(module)); if (auto shaveActExec = tileOp.getSubExecutor(ExecutorKind::SHAVE_ACT)) { _numShaveActs = shaveActExec.getCount(); } } _numDMAPorts = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); - _arch = VPU::getArch(module); + _arch = config::getArch(module); _vpuDeviceType = VPU::getVPUDeviceType(_arch); } @@ -445,7 +443,7 @@ double LayerCostModel::getDMACostOfType(vpux::NDTypeInterface srcType, SpillingT auto srcMode = distributedSrcType != nullptr ? 
distributedSrcType.getDistribution().getMode().getValue() : VPU::DistributionMode::NONE; - if (_arch == VPU::ArchKind::NPU37XX) { + if (_arch == config::ArchKind::NPU37XX) { return static_cast(getDMACost(srcType, _vpuDeviceType, _layerCostModel->get_TheoreticalDMA_cost_model_shared(), _numDMAPorts)); } @@ -475,7 +473,7 @@ double LayerCostModel::getSpillingWriteCost(vpux::NDTypeInterface srcTensorType) double LayerCostModel::getDMACostOfType(vpux::NDTypeInterface srcType, const VPU::DistributionInfo& distribution, SpillingType spillingType) const { - if (_arch == VPU::ArchKind::NPU37XX) { + if (_arch == config::ArchKind::NPU37XX) { TensorDistributionMap distributionMap; distributionMap.insert(std::make_pair(srcType, distribution)); auto distributedType = getDistributedTypeFromDistributionMap(srcType, distributionMap); @@ -910,6 +908,9 @@ double LayerCostModel::getLayerCost(VPU::ClusteredOpInterface clusteredOp, VPU:: } else if (mlir::isa(clusteredOp.getOperation())) { // Concat has no computation cost return 0.0; + } else if (mlir::isa(clusteredOp.getOperation())) { + // GatherDMAOp has no computation cost + return 0.0; } else { VPUX_THROW("Unsupported op type {0} at {1}", clusteredOp->getName(), clusteredOp->getLoc()); } @@ -960,8 +961,8 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( // Types for each tile std::vector>> tilesTypes; - _log.trace("Start calculating VPUNN layer cost for Op {0} {1} with strategy {2} {3}", nceOp.getLoc(), - nceOp->getName(), strategy, outTiles[0].axis); + _log.trace("[Cost Analysis] {0} [{1}] Get VPUNN cost for {2} {3}", nceOp.getLoc(), nceOp->getName(), strategy, + outTiles[0].axis); const auto costParams = VPU::getWorkloadCostParam(nceOp, _arch, _numDPUs, _numTiles); @@ -974,6 +975,7 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( } SmallVector vpunnLayerDPUCosts; + SmallVector vpunnOriginalLayerDPUCosts; const auto vpunnStrategy = 
VPU::getVPULayerStrategy(strategy, _numDPUs, _numTiles, _arch, 1, true, distributionMode, nceOp); @@ -990,6 +992,7 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( return {COST_MAX, COST_MAX}; } _log.trace("VPUNN DPU layer costs {0}", vpunnLayerDPUCosts); + vpunnOriginalLayerDPUCosts = vpunnLayerDPUCosts; const auto getSpillingReadCost = [&](NDTypeInterface srcType, const TensorDistributionMap& distributions) -> uint32_t { @@ -1006,6 +1009,8 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( const auto outOrder = DimsOrder::fromValue(nceOp->getResult(0)); const auto inputTiledOnLowestDim = isTiledOnLowestDim(outTiles[0], inOrder); const auto outputTiledOnLowestDim = isTiledOnLowestDim(outTiles[0], outOrder); + bool correctedStrideActDMACost = false; + bool correctedStrideOutputDMACost = false; // When weights are tiled over channel, the activation input has to be copied with a strided DMA. The cost of a // strided DMA is not accurate in VPUNN. It should be addressed by NNDMA cost model. 
Without accurate strided DMA @@ -1016,32 +1021,33 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( [&](ArrayRef>> tilesTypes, const std::function>)>& tileTypeGetter, - SmallVector& dmaCost, bool isStridedDMA) { - if (!isStridedDMA) { - return; - } - VPUX_THROW_WHEN(dmaCost.size() != tilesTypes.size(), "DMA costs size mismatches with tiled types"); - for (auto tileId : irange(tilesTypes.size())) { - auto currentTileType = tileTypeGetter(tilesTypes[tileId]); - const auto dimOrder = currentTileType.getDimsOrder(); - const auto lowestDim = dimOrder.dimAt(dimOrder.numDims() - 1); - const Bit elemSize = currentTileType.getElemTypeSize(); - if (auto sparseTensorType = mlir::dyn_cast(currentTileType)) { - currentTileType = mlir::cast(sparseTensorType.getData()); - } - Bit continuousBitsOnLowestDim; - if (auto distributedType = mlir::dyn_cast(currentTileType)) { - continuousBitsOnLowestDim = distributedType.getLargestCompactShape()[lowestDim] * elemSize; - } else { - continuousBitsOnLowestDim = currentTileType.getShape()[lowestDim] * elemSize; - } - auto curStrideDMACorrectionThreshold = getStrideDMACorrectionThresholdByArch(getArch(nceOp)); - if (continuousBitsOnLowestDim.count() < curStrideDMACorrectionThreshold) { - auto factor = curStrideDMACorrectionThreshold / continuousBitsOnLowestDim.count(); - dmaCost[tileId] = checked_cast(std::floor(factor * dmaCost[tileId])); - } - } - }; + SmallVector& dmaCost, bool isStridedDMA) -> bool { + if (!isStridedDMA) { + return false; + } + VPUX_THROW_WHEN(dmaCost.size() != tilesTypes.size(), "DMA costs size mismatches with tiled types"); + for (auto tileId : irange(tilesTypes.size())) { + auto currentTileType = tileTypeGetter(tilesTypes[tileId]); + const auto dimOrder = currentTileType.getDimsOrder(); + const auto lowestDim = dimOrder.dimAt(dimOrder.numDims() - 1); + const Bit elemSize = currentTileType.getElemTypeSize(); + if (auto sparseTensorType = mlir::dyn_cast(currentTileType)) { + 
currentTileType = mlir::cast(sparseTensorType.getData()); + } + Bit continuousBitsOnLowestDim; + if (auto distributedType = mlir::dyn_cast(currentTileType)) { + continuousBitsOnLowestDim = distributedType.getLargestCompactShape()[lowestDim] * elemSize; + } else { + continuousBitsOnLowestDim = currentTileType.getShape()[lowestDim] * elemSize; + } + auto curStrideDMACorrectionThreshold = getStrideDMACorrectionThresholdByArch(config::getArch(nceOp)); + if (continuousBitsOnLowestDim.count() < curStrideDMACorrectionThreshold) { + auto factor = curStrideDMACorrectionThreshold / continuousBitsOnLowestDim.count(); + dmaCost[tileId] = checked_cast(std::floor(factor * dmaCost[tileId])); + } + } + return true; + }; auto activationTileTypeGetter = [](ArrayRef> tilesType) { auto [srcType, distributionMap] = tilesType.front(); return getDistributedTypeFromDistributionMap(srcType, distributionMap); @@ -1074,8 +1080,8 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( _enablePrefetchTiling); cost += weightsCost; costWithPrefetching += weightsCostWithPrefetching; - _log.trace("cost + weights DMA cost {0} = {1}", weightsCost, cost); - _log.trace("costWithPrefetching + weightsWithPrefetching DMA cost {0} = {1}", weightsCostWithPrefetching, + _log.trace("Include Weights DMA cost {0}, Full layer cost now {1}", weightsCost, cost); + _log.trace("Include Weights Prefetch DMA cost {0}, Full Prefetch layer cost now {1}", weightsCostWithPrefetching, costWithPrefetching); auto getParentOp = [&]() -> mlir::Operation* { @@ -1087,44 +1093,55 @@ HwLayerTilingStrategyCosts LayerCostModel::getDPUandDMATimeCostWithCustomTiling( }; // Add activation DMA costs // Subgraph optimization will calculate input spilling cost instead of activation dma cost + SmallVector vpunnLayerActCosts; if (!isUnderSubgraphOpt() || getParentOp() == nullptr) { - auto vpunnLayerActCosts = - getPerTileActivationDMACosts(nceOp, tilesTypes, getSpillingReadCost, strategy, _numTiles); + 
vpunnLayerActCosts = getPerTileActivationDMACosts(nceOp, tilesTypes, getSpillingReadCost, strategy, _numTiles); // TODO: Ticket E#135490, remove this after stride DMA cost is accurate - correctStrideDMACost(tilesTypes, activationTileTypeGetter, vpunnLayerActCosts, inputTiledOnLowestDim); + correctedStrideActDMACost = + correctStrideDMACost(tilesTypes, activationTileTypeGetter, vpunnLayerActCosts, inputTiledOnLowestDim); _log.trace("VPUNN activation DMA costs {0}", vpunnLayerActCosts); _log.trace("vpunnLayerDPUCosts {0}", vpunnLayerDPUCosts); const auto actCost = getActivationDMACostForNCEOp(nceOp, outTiles, vpunnLayerDPUCosts, vpunnLayerActCosts, _enablePrefetchTiling, _log); - _log.trace(" VPUNN accumulated activation DMA cost {0} with prefetchTiling option {1}", actCost, - _enablePrefetchTiling); cost += actCost; costWithPrefetching += actCost; - _log.trace("cost + activation DMA cost {0} = {1}", actCost, cost); - _log.trace("costWithPrefetching + activation DMA cost {0} = {1}", actCost, costWithPrefetching); + _log.trace("Include Activation DMA cost {0}, Full layer cost now {1}", actCost, cost); + _log.trace("Include Activation DMA cost {0}, Prefetched full layer cost now {1}", actCost, costWithPrefetching); } // Add output spilling cost // for non clusteredOp, must be ops that requires tiling VPUX_THROW_WHEN(clusteredOp == nullptr, "NCE op at '{0}' is not a clustered op", nceOp.getLoc()); - + SmallVector vpunnLayerOutputCosts; if (!clusteredOp.doesLayerFitIntoCMX(strategy, _siblingsOpsAnalysis, /*reservedMem=*/Byte(0))) { // Consider output spilling pipelining with the next tile's DPU // Might be inaccurate when the DPU time is smaller than the sum of DMA time (input + weights + output) - auto vpunnLayerOutputCosts = getPerTileOutputDMACosts(nceOp, tilesTypes, getSpillingWriteCost); + vpunnLayerOutputCosts = getPerTileOutputDMACosts(nceOp, tilesTypes, getSpillingWriteCost); // TODO: Ticket E#135490, remove this after stride DMA cost is accurate - 
correctStrideDMACost(tilesTypes, outputTileTypeGetter, vpunnLayerOutputCosts, outputTiledOnLowestDim); + correctedStrideOutputDMACost = + correctStrideDMACost(tilesTypes, outputTileTypeGetter, vpunnLayerOutputCosts, outputTiledOnLowestDim); _log.trace("VPUNN output DMA costs {0}", vpunnLayerOutputCosts); _log.trace("vpunnLayerDPUCosts {0}", vpunnLayerDPUCosts); const auto outCost = getOutputDMACostForNCEOp(nceOp, outTiles, vpunnLayerDPUCosts, vpunnLayerOutputCosts, _enablePrefetchTiling, _log); - _log.trace(" VPUNN accumulated output DMA cost {0} with prefetchTiling option {1}", outCost, - _enablePrefetchTiling); cost += outCost; costWithPrefetching += outCost; - _log.trace("cost + output DMA cost {0} = {1}", outCost, cost); - _log.trace("costWithPrefetching + output DMA cost {0} = {1}", outCost, costWithPrefetching); + _log.trace("Include Output DMA cost {0}, Full layer cost now {1}", outCost, cost); + _log.trace("Include Output DMA cost {0}, Prefetched full layer cost now {1}", outCost, costWithPrefetching); + } + + _log.nest(2).trace("[Cost Analysis] Full DPU costs {0}", vpunnOriginalLayerDPUCosts); + _log.nest(2).trace("[Cost Analysis] Weights costs {0}", vpunnLayerWeightsCosts); + _log.nest(2).trace("[Cost Analysis] Activation costs {0}", vpunnLayerActCosts); + _log.nest(2).trace("[Cost Analysis] Output costs {0}", vpunnLayerOutputCosts); + _log.nest(2).trace("[Cost Analysis] Non-Pipelined DPU cost remainder {0}", vpunnLayerDPUCosts); + if (correctedStrideActDMACost || correctedStrideOutputDMACost) { + _log.nest(4).trace( + "[Cost Analysis] Note: DMA costs have been corrected for stride DMA. 
Activation {0}, Output {1}", + correctedStrideActDMACost, correctedStrideOutputDMACost); } + _log.trace("[Cost Analysis] Final Full Cost with No Prefetch: {0}", cost); + _log.trace("[Cost Analysis] Final Full Cost with Prefetching: {0}", costWithPrefetching); return {cost, costWithPrefetching}; } @@ -1876,7 +1893,7 @@ SmallVector vpux::VPU::getDPUCostForNCEOp(VPU::NCEOpInterface nceOp, V const std::shared_ptr& vpunnCostModel, Logger log) { // E#160175 Apply workaround only for VPUX4XXX architecture - if (costParams.arch == VPU::ArchKind::NPU40XX && VPU::isNCEWithSEPActivation(nceOp.getOperation())) { + if (costParams.arch == config::ArchKind::NPU40XX && VPU::isNCEWithSEPActivation(nceOp.getOperation())) { return SmallVector(outTiles.size(), 1); } @@ -1984,8 +2001,8 @@ SmallVector vpux::VPU::getDPUCostForNCEOp(VPU::NCEOpInterface nceOp, V } auto clusteredOp = mlir::cast(nceOp.getOperation()); - const auto arch = VPU::getArch(clusteredOp); - if (mlir::isa(nceOp.getOperation()) && arch == VPU::ArchKind::NPU40XX) { + const auto arch = config::getArch(clusteredOp); + if (mlir::isa(nceOp.getOperation()) && arch == config::ArchKind::NPU40XX) { if (mcStrategy == VPU::MultiClusterStrategy::SplitOverKernel && outTiles.size() > 1) { auto nTiles = vpunnStrategy.nTiles; auto outShape = mlir::cast(clusteredOp.getOperation()->getResult(0).getType()) @@ -2000,7 +2017,7 @@ SmallVector vpux::VPU::getDPUCostForNCEOp(VPU::NCEOpInterface nceOp, V } if (mlir::isa(nceOp.getOperation())) { auto nTiles = vpunnStrategy.nTiles; - if ((mcStrategy == VPU::MultiClusterStrategy::SplitOverKernel) && (!VPU::isArchVPUX3XXX(arch))) { + if ((mcStrategy == VPU::MultiClusterStrategy::SplitOverKernel) && (!config::isArchVPUX3XXX(arch))) { auto modeIn = VPU::getActivationTensorDistributionMode(clusteredOp, mcStrategy); auto modeOut = VPU::getOutputTensorDistributionMode(clusteredOp, mcStrategy, nullptr); @@ -2041,7 +2058,7 @@ SmallVector vpux::VPU::getDPUCostForNCEOp(VPU::NCEOpInterface nceOp, V 
mlir::dyn_cast(nceOp->getResult(0).getType())) && (mcStrategy == VPU::MultiClusterStrategy::SplitOverKernel)) { // op with SEP activation should not use this ratio - if (!VPU::isNCEWithSEPActivation(nceOp.getOperation()) && !VPU::isArchVPUX3XXX(arch)) { + if (!VPU::isNCEWithSEPActivation(nceOp.getOperation()) && !config::isArchVPUX3XXX(arch)) { // The VPUNN cost of ACT-SPARSITY is inaccurate // Multiply a ratio to correct the cost // Track [E#117195] @@ -2164,10 +2181,10 @@ std::pair vpux::VPU::getWeightsDMACostForNCEOp(VPU::NCEOpInt auto tiles = outTiles.empty() ? OutputTiling({TileInfo(outShape)}) : outTiles; auto tilingStrategy = tiles.front().axis; const auto isWeightsSharedNestedTiling = isWeightsFirstNestedTiling(nceOp.getOperation(), tilingStrategy); - log.trace("Weights DMA Cost found nested tiling: {0}", isWeightsSharedNestedTiling); SmallVector filteredDMACosts; SmallVector filteredDPUCosts; if (isWeightsSharedNestedTiling) { + log.trace("[Cost Analysis] Assumption: Weights First nested tiling"); // Unroll channel first // weights are partially shared. Every tile_H * tile_W weights are shared const auto temporalSize = tilingStrategy[Dims4D::Act::C]; @@ -2189,11 +2206,11 @@ std::pair vpux::VPU::getWeightsDMACostForNCEOp(VPU::NCEOpInt enablePrefetchTiling ? 
tilingInfoOp != nullptr && tilingInfoOp.isSupportedTiling(tiles, vpux::TilingMode::PIPELINING, log) : false; - log.trace(" weights DMA cost option: isDMAOverlappedWithDPU {1}", isDMAOverlappedWithDPU); uint32_t totalDMACost = 0; if (isDMAOverlappedWithDPU) { // Weights DMA from second tile on will be overlapped with DPU of previous tile + log.trace("[Cost Analysis] Assumption: Weights DMA will pipeline with DPU"); totalDMACost += getPrefetchDMACostOverlappsWithPreviousDPU(filteredDPUCosts, ArrayRef(filteredDMACosts), isWeightsDMASplitOnEachTile); } else { @@ -2252,6 +2269,7 @@ uint32_t vpux::VPU::getActivationDMACostForNCEOp(VPU::NCEOpInterface nceOp, cons if (isDMAOverlappedWithDPU) { // Act DMA from second tile on will be overlapped with DPU of previous tile + log.trace("[Cost Analysis] Assumption: Activation DMA will pipeline with DPU"); totalDMACost += getPrefetchDMACostOverlappsWithPreviousDPU(filteredDPUCosts, ArrayRef(filteredDMACosts), isActDMASplitOnEachTile); } else { @@ -2297,6 +2315,7 @@ uint32_t vpux::VPU::getOutputDMACostForNCEOp(VPU::NCEOpInterface nceOp, const Ou if (isDMAOverlappedWithDPU) { // Output DMA expect for the last tile will be overlapped with DPU of the next tile + log.trace("[Cost Analysis] Assumption: Output DMA will pipeline with DPU"); totalDMACost += getOutputDMACostOverlappsWithNextDPU(layerDPUCosts, layerDMACosts, true); } else { totalDMACost += std::accumulate(layerDMACosts.begin(), layerDMACosts.end(), 0U); @@ -2318,8 +2337,8 @@ bool vpux::VPU::hasLayerWithMultipleInputs(mlir::Operation* op) { } bool vpux::VPU::isSingleBatchRequired(mlir::Operation* op) { - return !mlir::isa(op); + return !mlir::isa(op); } bool vpux::VPU::setSOKForRuntimeDequantConvolution(VPU::NCEOpInterface nceOp, LayerCostModel& costModel) { @@ -2388,6 +2407,6 @@ bool vpux::VPU::alignStrategyWithParentRuntimeDequant(VPU::ClusteredOpInterface return setSOKForRuntimeDequantConvolution(nceOp, costModel); } -double 
vpux::VPU::getStrideDMACorrectionThresholdByArch([[maybe_unused]] VPU::ArchKind arch) { +double vpux::VPU::getStrideDMACorrectionThresholdByArch([[maybe_unused]] config::ArchKind arch) { return strideDMACorrectionThresholdInBitsV1; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/sparsity_strategy.cpp b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/sparsity_strategy.cpp index fe05644ed9..66e9fa61b8 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/sparsity_strategy.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/sparsity_strategy.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPU/utils/strategy_manager/sparsity_strategy.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/utils/core/numeric.hpp" diff --git a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/strategy_manager.cpp b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/strategy_manager.cpp index 2378680684..57b22df156 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/strategy_manager.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/strategy_manager.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include @@ -205,8 +206,8 @@ void StrategyManager::assignMultiClusterStrategy(bool enableMultiClusterForSWLay const auto outputType = mlir::cast(memPermuteOp.getOutput().getType()); auto module = getModuleOp(memPermuteOp.getOperation()); const auto dmaPortNum = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); - if (VPUIP::isBeneficialForUsingPermuteDMA(getArch(memPermuteOp.getOperation()), 
inputType, - outputType, memPerm, dmaPortNum, _log)) { + if (VPUIP::isBeneficialForUsingPermuteDMA(config::getArch(memPermuteOp.getOperation()), + inputType, outputType, memPerm, dmaPortNum, _log)) { _log.trace("Operation {0} is mapped to permute DMA, do not assign strategy", origOp); return; } @@ -372,6 +373,10 @@ void StrategyManager::assignMultiClusterStrategy(bool enableMultiClusterForSWLay setLayerStrategy(VPU::MultiClusterStrategy::SplitOverGroup, origOp.getOperation()); } }) + .Case([&](GatherDMAOp /*origOp*/) { + // Multiclustering is not yet enabled for Gather DMA op + return; + }) .Default([&](mlir::Operation* unknownOp) -> void { _log.trace("Operation '{0}' at '{1}' does not support multi cluster", unknownOp->getName(), unknownOp->getLoc()); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/subgraph_optimizer.cpp b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/subgraph_optimizer.cpp index 93fec65fce..0e01033a19 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/subgraph_optimizer.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/strategy_manager/subgraph_optimizer.cpp @@ -1,15 +1,16 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/dialect/VPU/utils/strategy_manager/subgraph_optimizer.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" -#include "vpux/compiler/dialect/VPU/utils/strategy_manager/strategy_manager.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/strings.hpp" #include #include "mlir/IR/Iterators.h" @@ -286,6 +287,9 @@ double SubgraphOptimizer::getInputSpillingCostToMultiClusterLayer(VPU::Clustered } return inputSpillingCost; }) + .Case([&](GatherDMAOp origOp) { + return getInputSpillingCostToMultiClusterLayer(origOp, origOp.getIndices(), strategy, config); + }) .Default([&](mlir::Operation* origOp) { VPUX_THROW("Roll back strategy for op {0} at {1} is not supported", origOp->getName(), origOp->getLoc()); @@ -663,7 +667,7 @@ bool SubgraphOptimizer::hasLongTermSpilling(VPU::ClusteredOpInterface origOp, VP SmallVector buffersSize; buffersSize.push_back(reservedMem); - reservedMem = VPU::calculateAlignedBuffersMemoryRequirement(getArch(parentOp), buffersSize); + reservedMem = VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(parentOp), buffersSize); auto middleOpList = _shortcutsMap.at(user).second; SmallVector middleOps{&middleOpList[0], &middleOpList[middleOpList.size() - 1]}; @@ -695,6 +699,12 @@ bool SubgraphOptimizer::hasLongTermSpilling(VPU::ClusteredOpInterface origOp, VP return concat.fitIntoCMX(outputType, reservedMem); } else if (auto swOp = mlir::dyn_cast(clusteredOp.getOperation())) { return swOpFitsInCMX(swOp); + } else if (auto gatherDMA = mlir::dyn_cast(clusteredOp.getOperation())) { + const auto indicesType = + 
mlir::cast(gatherDMA.getIndices().getType()); + const auto outputType = + mlir::cast(gatherDMA->getResult(0).getType()); + return gatherDMA.fitIntoCMX(indicesType, outputType, reservedMem); } else { VPUX_THROW("Operation '{0}' at '{1}' has no MC strategy", clusteredOp->getName(), clusteredOp->getLoc()); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/sw_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/sw_utils.cpp index 4a07aaa721..4013d39333 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/sw_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/sw_utils.cpp @@ -647,6 +647,39 @@ SmallVector vpux::VPU::getSWInputTensorNumTiles(VPU::ClusteredOpInterfa return getActivationTensorNumTiles(clusteredOp, numClustersAvailableForCompilation, strategy); } +SmallVector vpux::VPU::getSWInputTensorNumTiles(VPU::MemPermuteOp mempermuteOp, + int64_t numClustersAvailableForCompilation, + VPU::MultiClusterStrategy strategy) { + SmallVector outputTensorNumTiles; + if (strategy == VPU::MultiClusterStrategy::Clustering) { + outputTensorNumTiles = {1, 1, 1, 1}; + } else if (strategy == VPU::MultiClusterStrategy::SplitOverKernel) { + outputTensorNumTiles = {1, numClustersAvailableForCompilation, 1, 1}; + } else if (strategy == VPU::MultiClusterStrategy::SplitOverHeight) { + outputTensorNumTiles = {1, 1, numClustersAvailableForCompilation, 1}; + } else { + VPUX_THROW("{0} is an invalid multi-cluster strategy, unable to determine the number of tiles for mempermute ", + strategy); + } + + // back infer for input tensor num tiles before permutation + // outLogicDim -(map)-> outMemDim -(reversed Perm)-> inMemDim -(map)-> inLogicDim + const auto memPerm = mempermuteOp.getMemPerm(); + const auto perm = DimsOrder::fromAffineMap(memPerm); + const auto inType = mlir::cast(mempermuteOp.getInput().getType()); + const auto outType = mlir::cast(mempermuteOp.getOutput().getType()); + const auto inOrder = inType.getDimsOrder(); + const auto outOrder = outType.getDimsOrder(); + SmallVector 
inputTensorNumTiles(outputTensorNumTiles.size(), 1); + for (size_t outLogicInd = 0; outLogicInd < outputTensorNumTiles.size(); ++outLogicInd) { + const auto outMemDim = outOrder.dimAt(outLogicInd); + const auto inMemDim = perm.dimAt(outMemDim.ind()); + const auto inLogicDim = inOrder.dimAt(inMemDim.ind()); + inputTensorNumTiles[inLogicDim.ind()] = outputTensorNumTiles[outLogicInd]; + } + return inputTensorNumTiles; +} + SmallVector vpux::VPU::getSWInputTensorNumTiles(VPU::InterpolateOp interpolateOp, int64_t numClustersAvailableForCompilation, VPU::MultiClusterStrategy strategy, mlir::Value operand) { @@ -1269,6 +1302,9 @@ SmallVector vpux::VPU::getSWInputTensorNumTiles(VPU::ClusteredOpInterfa .Case([&](VPU::MatMulOp op) { return getSWInputTensorNumTiles(op, numClustersAvailableForCompilation, strategy); }) + .Case([&](VPU::MemPermuteOp op) { + return getSWInputTensorNumTiles(op, numClustersAvailableForCompilation, strategy); + }) .Case([&](VPU::LSTMGatesOp lstmGatesOp) { return getSWInputTensorNumTiles(lstmGatesOp, numClustersAvailableForCompilation, strategy); }) diff --git a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.cpp b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.cpp index 9decd4c787..cce36c8944 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.cpp @@ -3,13 +3,22 @@ // SPDX-License-Identifier: Apache-2.0 // +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + #include "vpux/compiler/dialect/VPU/utils/tiling_algorithm/scf_tiling/scf_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/scf/scf_utils.hpp" - +#include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp" +#include 
"vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" +#include "mlir/IR/Dominance.h" using namespace vpux; @@ -22,7 +31,7 @@ SmallVector vpux::VPU::staticTileSizeComputation(mlir::OpBui return {}; } - auto tilingDims = getNonOneDim(strategy); + auto tilingDims = getTilingOrderedDims(operation, strategy); std::unordered_map sizes; for (auto& tile : tiles.value()) { @@ -34,20 +43,18 @@ SmallVector vpux::VPU::staticTileSizeComputation(mlir::OpBui SmallVector tileSizes; tileSizes.reserve(tilingDims.size()); - const auto tileSizeCondition = [&](auto& sizePair) -> mlir::OpFoldResult { - return builder.getIndexAttr(sizePair.second); + const auto tileSizeCondition = [&](auto index) -> mlir::OpFoldResult { + return builder.getIndexAttr(sizes[tilingDims[index]]); }; - llvm::transform(sizes, std::back_inserter(tileSizes), tileSizeCondition); + llvm::transform(llvm::seq(0, tilingDims.size()), std::back_inserter(tileSizes), tileSizeCondition); return tileSizes; } SmallVector vpux::VPU::dynamicTileSizeComputation(mlir::OpBuilder& builder, mlir::Operation* operation, ShapeRef strategy) { - // E-162801 extend to multi axes tiling - auto tilingDims = getNonOneDim(strategy); - VPUX_THROW_WHEN(tilingDims.size() != 1, "Unsupported tiling strategy for dynamic shapes"); + auto tilingDims = getTilingOrderedDims(operation, strategy); auto outputType = mlir::cast(operation->getResult(0).getType()); if (auto boundedType = mlir::dyn_cast(outputType)) { @@ -56,27 +63,32 @@ SmallVector vpux::VPU::dynamicTileSizeComputation(mlir::OpBu } auto outputShape = outputType.getShape(); - const auto tileDim = tilingDims.front(); - VPUX_THROW_WHEN(!outputType.isDynamicDim(tileDim.ind()), "Tiled axis {0} must be dynamic", tileDim); + SmallVector 
tileSizes; + tileSizes.reserve(tilingDims.size()); - auto loc = operation->getLoc(); + for (auto tileDim : tilingDims) { + VPUX_THROW_WHEN(!outputType.isDynamicDim(tileDim.ind()), "Tiled axis {0} must be dynamic", tileDim); - auto shapeValue = getDimValue(builder, operation, tileDim.ind()); + auto loc = operation->getLoc(); - auto optAlignment = vpux::getAlignment(operation, strategy, Shape(outputShape)); - const auto divisor = strategy[tileDim]; - const auto alignment = optAlignment.has_value() ? optAlignment.value()[tileDim.ind()] : 1; + auto shapeValue = getDimValue(builder, operation, tileDim.ind()); - mlir::OpFoldResult tileSize; - mlir::AffineExpr d0; - bindDims(builder.getContext(), d0); - auto tileSizeMap = mlir::AffineMap::get(1, 0, {(d0.ceilDiv(divisor) + alignment - 1).floorDiv(alignment)}, - builder.getContext()); - tileSize = - mlir::affine::makeComposedFoldedAffineApply(builder, appendLoc(loc, "tileSize"), tileSizeMap, {shapeValue}); + auto optAlignment = vpux::getAlignment(operation, strategy, Shape(outputShape)); + const auto divisor = strategy[tileDim]; + const auto alignment = optAlignment.has_value() ? 
optAlignment.value()[tileDim.ind()] : 1; - return {tileSize}; + mlir::OpFoldResult tileSize; + mlir::AffineExpr d0; + bindDims(builder.getContext(), d0); + auto tileSizeMap = mlir::AffineMap::get(1, 0, {(d0.ceilDiv(divisor) + alignment - 1).floorDiv(alignment)}, + builder.getContext()); + tileSize = mlir::affine::makeComposedFoldedAffineApply(builder, appendLoc(loc, "tileSize"), tileSizeMap, + {shapeValue}); + tileSizes.emplace_back(tileSize); + } + + return tileSizes; } mlir::LogicalResult vpux::VPU::applySCFTiling(mlir::Operation* operation, mlir::RewriterBase& builder) { @@ -123,11 +135,349 @@ mlir::LogicalResult vpux::VPU::applySCFTiling(mlir::Operation* operation, mlir:: auto argIndex = blockArg.getArgNumber() - forOp.getNumInductionVars(); forOp.getInitArgs()[argIndex].setType(operand.getType()); } + } else { + // outer loop has no insertSlice op, modify init args by setting order to the last one + forOp.getInitArgs().back().setType(operand.getType()); } }); } }); builder.replaceOp(operation, tilingResult->replacements); + return mlir::success(); } + +// copied from llvm, the logic is adjusted before llvm 20 update +static std::tuple> getUntiledProducerFromSliceSource( + mlir::OpOperand* source, ArrayRef loops) { + std::optional destinationIterArg; + auto loopIt = loops.rbegin(); + while (auto iterArg = mlir::dyn_cast(source->get())) { + auto loop = *loopIt; + if (iterArg.getOwner()->getParentOp() != loop) { + break; + } + source = loop.getTiedLoopInit(iterArg); + loopIt++; + } + if (loopIt == loops.rend()) { + destinationIterArg = source; + } + return {mlir::dyn_cast(source->get()), destinationIterArg}; +} + +/// Implementation of tile consumer and fuse producer greedily. +mlir::FailureOr tileConsumerAndFuseProducers( + mlir::RewriterBase& rewriter, mlir::TilingInterface consumer, const mlir::scf::SCFTileAndFuseOptions& options) { + // This transformation is only valid for ops that return values (i.e. 
not + // valid to use with operations that have memref operands). + if (!consumer->getNumResults()) { + return rewriter.notifyMatchFailure(consumer, "invalid pattern for op with no results"); + } + + // 1. First tile the consumer. + mlir::SetVector fusedProducers, tiledAndFusedOps; + llvm::SmallDenseMap origProducerToLoopResultNum; + + auto tilingResult = tileUsingSCF(rewriter, consumer, options.tilingOptions); + + if (failed(tilingResult)) { + return rewriter.notifyMatchFailure(consumer, "failed to tile consumer"); + } + for (auto* tiledOp : tilingResult->tiledOps) { + tiledAndFusedOps.insert(tiledOp); + } + + // If there are no loops generated, fusion is immaterial. + auto& loops = tilingResult->loops; + if (loops.empty()) { + DenseMap replacements; + for (auto [origVal, replacement] : llvm::zip_equal(consumer->getResults(), tilingResult->replacements)) { + replacements[origVal] = replacement; + } + return mlir::scf::SCFTileAndFuseResult{std::move(fusedProducers), std::move(tiledAndFusedOps), loops, + replacements}; + } + + // To keep track of replacements for now just record the map from the original + // untiled value to the result number of the for loop. Since the loop gets + // potentially replaced during fusion, keeping the value directly wont work. 
+ DenseMap origValToResultNumber; + for (auto [index, result] : llvm::enumerate(consumer->getResults())) { + origValToResultNumber[result] = index; + } + + std::function&)> addCandidateSlices = + [&](mlir::Operation* fusedOp, std::deque& candidates) { + for (mlir::Value operand : fusedOp->getOperands()) { + if (auto sliceOp = operand.getDefiningOp()) { + if (candidates.empty() || llvm::find(candidates, sliceOp) == candidates.end()) { + candidates.push_back(sliceOp); + } + } else if (auto padOp = mlir::dyn_cast(operand.getDefiningOp())) { + addCandidateSlices(padOp, candidates); + } + } + }; + + std::deque candidates; + addCandidateSlices(tiledAndFusedOps.back(), candidates); + mlir::OpBuilder::InsertionGuard g(rewriter); + while (!candidates.empty()) { + // Traverse the slices in BFS fashion. + mlir::tensor::ExtractSliceOp candidateSliceOp = candidates.front(); + candidates.pop_front(); + + // Find the original producer of the slice. + auto [fusableProducer, destinationInitArg] = + getUntiledProducerFromSliceSource(&candidateSliceOp.getSourceMutable(), loops); + if (!fusableProducer) { + continue; + } + + auto [fuseSlice, yieldReplacement] = + options.fusionControlFn(candidateSliceOp, fusableProducer, destinationInitArg.has_value()); + if (!fuseSlice) { + continue; + } + + // The operands of the fused producer might themselved be slices of + // values produced by operations that implement the `TilingInterface`. + // Add these operations to the worklist. 
+ auto fusedResult = mlir::scf::tileAndFuseProducerOfSlice(rewriter, candidateSliceOp, loops); + if (!fusedResult) { + continue; + } + + if (mlir::Operation* tiledAndFusedOp = fusedResult->tiledAndFusedProducer.getDefiningOp()) { + fusedProducers.insert(fusedResult->origProducer.getDefiningOp()); + tiledAndFusedOps.insert(tiledAndFusedOp); + addCandidateSlices(tiledAndFusedOp, candidates); + } + } + + DenseMap replacements; + for (auto [origVal, resultNumber] : origValToResultNumber) { + replacements[origVal] = loops.front()->getResult(resultNumber); + } + + return mlir::scf::SCFTileAndFuseResult{std::move(fusedProducers), std::move(tiledAndFusedOps), loops, replacements}; +} + +llvm::SetVector collectTiledAndFusedOps(mlir::Operation* op) { + SmallVector worklist; + llvm::SetVector producers; + worklist.push_back(op); + producers.insert(op); + while (!worklist.empty()) { + mlir::Operation* current = worklist.pop_back_val(); + for (mlir::OpOperand& operand : current->getOpOperands()) { + mlir::Operation* producer = operand.get().getDefiningOp(); + const auto checkProducersUsers = [&](auto* user) { + return !producers.contains(user); + }; + if (!mlir::isa_and_nonnull(producer) || producers.contains(producer) || + !vpux::VPU::checkFusion(operand, producer->getOpResult(0)) || + llvm::any_of(producer->getUsers(), checkProducersUsers)) { + continue; + } + worklist.push_back(producer); + producers.insert(producer); + } + } + return producers; +} + +mlir::FailureOr> vpux::VPU::applySCFVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, + Logger log) { + if (!operation->hasAttr(tilingStrategy)) { + return mlir::failure(); + } + + auto tilingInterfaceOp = mlir::cast(operation); + + const auto strategy = + Shape(parseIntArrayAttr(mlir::cast(operation->getAttr(tilingStrategy)))); + mlir::scf::SCFTilingOptions tilingOptions; + VPU::SiblingOpsAnalysis siblingAnalisys(operation); + + // calculate tile size based on VF restrictions + auto allOpsToFuse = 
collectTiledAndFusedOps(operation); + + if (allOpsToFuse.size() == 1) { + return mlir::failure(); + } + + VF::v2::VFConfig config(allOpsToFuse); + + // calculate tile size for VF: + // 1. allOpsToFuse contains operations to build VF + // 2. get all allowed dimensions for these operations to tile + // 3. choose the dimension which tiles the largest dimension + // 4. get optimal tiling number for vertical fusion + // 5. calculate tile size based on computed tiling number + const auto vfTileSizeComputationFn = [&](mlir::OpBuilder& builder, + mlir::Operation* operation) -> SmallVector { + DimArr allowedDims = getAllowedDims(allOpsToFuse.getArrayRef(), log); + if (allowedDims.empty()) { + return {}; + } + + auto outputs = config.getOutputs(); + if (outputs.empty()) { + return {}; + } + + auto* lastOp = outputs.back(); + auto outputType = mlir::cast(lastOp->getResult(0).getType()); + Shape outputShape = outputType.getShape().toValues(); + if (auto clusterOp = mlir::dyn_cast(lastOp)) { + if (clusterOp.getMultiClusterStrategy().has_value()) { + outputType = clusterOp.getDistributedTypeForOpResult( + lastOp->getResult(0), clusterOp.getMultiClusterStrategy().value(), siblingAnalisys, false); + + auto distribution = VPU::DistributionInfo::getClassFromAttr( + mlir::cast(outputType).getDistribution()); + if (distribution.getMemoryShapes().empty()) { + auto optMemoryShapes = VPU::getPerClusterMemoryShapes(outputShape, distribution); + if (optMemoryShapes.has_value()) { + outputShape = Shape(optMemoryShapes.value().front()); + } + } else { + outputShape = Shape(distribution.getMemoryShapes().front()); + } + } + } + + const auto compareDims = [&](auto dimLeft, auto dimRight) { + if (outputShape[dimLeft] < 0) { + return false; + } + + if (outputShape[dimRight] < 0) { + return true; + } + + return outputShape[dimLeft] < outputShape[dimRight]; + }; + auto maxDim = std::max_element(allowedDims.begin(), allowedDims.end(), compareDims); + + if (maxDim == allowedDims.end()) { + return 
{}; + } + + const auto getMinTiles = [&](auto dim, const VPU::VF::v2::VFSplit&) { + const auto getDimValue = [&dim](auto* oper) -> int64_t { + return oper->hasAttr(tilingStrategy) ? Shape(parseIntArrayAttr(mlir::cast( + oper->getAttr(tilingStrategy))))[dim] + : 1; + }; + + std::set minTilesSet; + llvm::copy(config.getVFOperations() | transformed(getDimValue), + std::inserter(minTilesSet, minTilesSet.end())); + return *minTilesSet.rbegin(); + }; + const auto getMaxTiles = [&](auto dim, const VPU::VF::v2::VFSplit&) { + return getTilingLimit(dim, config.getVFOperations().getArrayRef()); + }; + + auto bestVFCase = VPU::VF::v2::getVFCaseWithTiling(config, *maxDim, {}, getMinTiles, getMaxTiles, log, + VPU::VF::v2::getSchedulingScenarios(config, log)); + + if (!bestVFCase.isInitialized()) { + return {}; + } + + auto strategy = Shape(parseIntArrayAttr(bestVFCase.getTiling())); + + if (outputType.getShape().isStatic()) { + return staticTileSizeComputation(builder, operation, strategy, getShape(operation->getResult(0))); + } + + return dynamicTileSizeComputation(builder, operation, strategy); + }; + + tilingOptions.setTileSizeComputationFunction(vfTileSizeComputationFn); + + mlir::scf::SCFTileAndFuseOptions tilingAndFuseOptions; + tilingAndFuseOptions.setTilingOptions(std::move(tilingOptions)); + + mlir::scf::SCFTileAndFuseOptions::ControlFnTy controlFn = [&](mlir::tensor::ExtractSliceOp, + mlir::OpResult originalProducer, bool) { + return std::make_tuple(allOpsToFuse.contains(originalProducer.getOwner()), false); + }; + tilingAndFuseOptions.setFusionControlFn(std::move(controlFn)); + + builder.setInsertionPoint(operation); + auto tiledResults = tileConsumerAndFuseProducers(builder, tilingInterfaceOp, tilingAndFuseOptions); + + if (mlir::failed(tiledResults) || tiledResults->replacements.empty() || tiledResults->loops.empty() || + tiledResults->fusedProducers.empty()) { + return mlir::failure(); + } + + // propagate result type with order and bounds attributes to operations 
+ // created in SCF functions. + for (auto result : operation->getResults()) { + tiledResults->replacements[result].setType(result.getType()); + + // in case the shape is dynamic, reifyResultShapes functions may add tensor.dim operations + // to the parent of the function. in case the parent is fused to the loop + // and original operation is supposed to be removed from the IR, such users should be reassigned + // to the inputs of VF + if (mlir::cast(result.getType()).getShape().isDynamic()) { + for (auto operand : operation->getOperands()) { + auto* parentOp = operand.getDefiningOp(); + if (tiledResults->fusedProducers.contains(parentOp) && !parentOp->hasOneUse()) { + for (auto& use : llvm::make_early_inc_range(parentOp->getUses())) { + if (use.getOwner() == operation) { + continue; + } + if (auto dimTensor = mlir::dyn_cast(use.getOwner())) { + dimTensor.setOperand(use.getOperandNumber(), config.getInputs().front()->getOperand(0)); + } + } + } + } + } + } + + // E-162999 rewrite to update order attribute for output types more elegantly + llvm::for_each(tiledResults->loops, [&](mlir::LoopLikeOpInterface loopOperation) { + auto loop = mlir::cast(loopOperation); + + auto* terminator = loop.getBody()->getTerminator(); + if (terminator != nullptr) { + llvm::for_each(terminator->getOperands(), [&](mlir::Value operand) { + operand.setType(loop.getResult(0).getType()); + + if (auto insertSlice = mlir::dyn_cast_or_null(operand.getDefiningOp())) { + insertSlice.getDestMutable().get().setType(loop.getResult(0).getType()); + if (auto blockArg = mlir::dyn_cast_or_null(insertSlice.getDest())) { + auto argIndex = blockArg.getArgNumber() - loop.getNumInductionVars(); + loop.getInitArgs()[argIndex].setType(operand.getType()); + } + } + }); + } + }); + + for (mlir::OpResult res : operation->getResults()) { + if (auto replacement = tiledResults->replacements.lookup(res)) { + builder.replaceAllUsesWith(res, replacement); + } + } + + if (operation->use_empty()) { + 
builder.eraseOp(operation); + } + + return to_small_vector(tiledResults->fusedProducers); +} + +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic pop +#endif diff --git a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_context.cpp b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_context.cpp index 2a6c8e8bf5..267b2f81a2 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_context.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_context.cpp @@ -26,7 +26,14 @@ mlir::LogicalResult TilingContext::applyTiling(mlir::RewriterBase& builder, Logg return _tilingAlgorithm->applyTiling(_operation, builder, log); } -bool isSCFSupported(mlir::Operation* operation, ShapeRef tilingStrategy) { +mlir::FailureOr> TilingContext::applyVerticalFusion(mlir::RewriterBase& builder, + Logger log) { + VPUX_THROW_WHEN(_tilingAlgorithm == nullptr, "Tiling algorithm is not specified"); + + return _tilingAlgorithm->applyVerticalFusion(_operation, builder, log); +} + +bool isSCFSupported(mlir::Operation* operation) { // E-162801 extend for operations with > 1 output if (operation->getNumResults() > 1) { return false; @@ -38,20 +45,19 @@ bool isSCFSupported(mlir::Operation* operation, ShapeRef tilingStrategy) { return false; } - // E-162801 extend to multi axes tiling - auto tilingDims = getNonOneDim(tilingStrategy); - if (tilingDims.size() != 1) { + // E-172335 add sparse tensors support + if (mlir::isa(operation->getOperand(0).getType())) { return false; } return true; } -TilingContext vpux::VPU::createTilingContext(mlir::Operation* operation, ShapeRef strategy, bool enableSCFTiling) { +TilingContext vpux::VPU::createTilingContext(mlir::Operation* operation, bool enableSCFTiling) { TilingContext context(operation); std::unique_ptr algorithm; - if (enableSCFTiling && mlir::isa(operation) && isSCFSupported(operation, strategy)) { + if (enableSCFTiling && mlir::isa(operation) && 
isSCFSupported(operation)) { algorithm = std::make_unique(); } else { algorithm = std::make_unique(); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.cpp b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.cpp index 4af6b68f8a..90ebcd4ec6 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/utils/tiling_algorithm/tiling_general_algorithm.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" using namespace vpux; using namespace VPU; @@ -30,3 +31,10 @@ mlir::LogicalResult TilingGeneralAlgorithm::applyTiling(mlir::Operation* operati operation->removeAttr(tilingStrategy); return VPU::applyTileStrategy(tilingBuilder, tiles.value(), builder, log); } + +mlir::FailureOr> TilingGeneralAlgorithm::applyVerticalFusion(mlir::Operation*, + mlir::RewriterBase&, + Logger) { + // TODO E-172818 move VF general algorithm here + return mlir::failure(); +} diff --git a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.cpp b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.cpp index 6232af442a..332cb661e2 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/tiling_algorithm/tiling_scf_algorithm.cpp @@ -13,3 +13,9 @@ mlir::LogicalResult TilingSCFAlgorithm::applyTiling(mlir::Operation* operation, Logger /*log*/) { return VPU::applySCFTiling(operation, builder); } + +mlir::FailureOr> TilingSCFAlgorithm::applyVerticalFusion(mlir::Operation* operation, + mlir::RewriterBase& builder, + Logger log) { + return VPU::applySCFVerticalFusion(operation, builder, 
log); +} diff --git a/src/vpux_compiler/src/dialect/VPU/utils/tiling_constraint_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/tiling_constraint_utils.cpp index 2f707aac98..72ccf3f2f5 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/tiling_constraint_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/tiling_constraint_utils.cpp @@ -14,12 +14,12 @@ using namespace vpux; constexpr double NPU37XX_FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS = 0.45; -const std::unordered_map fragmentationDefaultRatioForPipelinlingMap = { - {VPU::ArchKind::NPU37XX, NPU37XX_FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS}, - {VPU::ArchKind::NPU40XX, NPU37XX_FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS}, +const std::unordered_map fragmentationDefaultRatioForPipelinlingMap = { + {config::ArchKind::NPU37XX, NPU37XX_FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS}, + {config::ArchKind::NPU40XX, NPU37XX_FRAGMENTATION_AVOID_RATIO_PIPELINING_LARGE_WEIGHTS}, }; -double VPU::getFragmentationAvoidRatioPipeliningLargeWeights(VPU::ArchKind archKind) { +double VPU::getFragmentationAvoidRatioPipeliningLargeWeights(config::ArchKind archKind) { auto iter = fragmentationDefaultRatioForPipelinlingMap.find(archKind); VPUX_THROW_WHEN(iter == fragmentationDefaultRatioForPipelinlingMap.end(), "getFragmentationAvoidRatioPipeliningLargeWeights: Unsupported arch {0}", archKind); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/type_infer.cpp b/src/vpux_compiler/src/dialect/VPU/utils/type_infer.cpp index 490a0957fd..277125f6bd 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/type_infer.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/type_infer.cpp @@ -4,11 +4,12 @@ // #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" - #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/permute_infer.hpp" #include "vpux/compiler/dialect/IE/utils/reduce_infer.hpp" #include "vpux/compiler/dialect/IE/utils/type_padding.hpp" 
+#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/utils/quantization.hpp" namespace vpux { namespace VPU { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.cpp index ef0237cd2e..927fc14442 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.cpp @@ -5,20 +5,13 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/merge_vf_region_base_rewriter.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_case.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" -#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/rewriter.hpp" #include #include @@ -67,7 +60,7 @@ bool isSingleTileOpCompatibleWithDistributedOutput(VPU::TilingBuilderOpInterface operationNDTypes.push_back(mlir::cast(type).getTotalAllocSize()); } const auto totalAvailableCMXSize = 
getTotalCMXSize(op.getOperation()); - return vpux::VPU::calculateAlignedBuffersMemoryRequirement(getArch(op.getOperation()), operationNDTypes) > + return vpux::VPU::calculateAlignedBuffersMemoryRequirement(config::getArch(op.getOperation()), operationNDTypes) > totalAvailableCMXSize; } @@ -313,7 +306,6 @@ bool MergeVFRegionBaseRewriter::alignMCTiling(VPU::VerticalFusionOp } auto inferredCurrInDistType = inferInputDistributedType(prevOutDistType, currInputViewLikeOps); - if (areDistributionAttrsCompatible(inferredCurrInDistType, actualCurrInDistType, true).failed()) { return false; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.cpp index 4ab27b1bea..787f5e934b 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.cpp @@ -5,24 +5,81 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/merge_vf_region_rewriter.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_scheduling_factory.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include 
"vpux/compiler/utils/rewriter.hpp" #include #include #include namespace vpux::VPU::VF::v1 { + +std::optional findOptimalTilingStrategyInRange(const MergeVFRegionRewriter::IVFSchedulingPtr& scheduling, + const Dim dim, int64_t minNTiles, int64_t& maxNTiles, + std::unique_ptr& axisIncrement, + ArrayRef origTilingArray, + TilingOperationStorage::UPtr& minStorage, + TilingOperationStorage::UPtr& maxStorage, VFConfig& config, + Logger log) { + std::optional result = std::nullopt; + const auto origMaxTile = maxNTiles; + auto nextValueFromMin = minNTiles; + axisIncrement->increasedValue(nextValueFromMin, maxNTiles); + SmallVector tilingMaxStrategy(origTilingArray.begin(), origTilingArray.end()); + SmallVector tilingArray(origTilingArray.begin(), origTilingArray.end()); + + while (minNTiles < maxNTiles) { + auto currentNTiles = axisIncrement->getMiddleValue(minNTiles, maxNTiles); + + if (maxNTiles == nextValueFromMin) { + result = maxNTiles; + if (maxNTiles == origMaxTile) { + minStorage.reset(maxStorage.release()); + } + break; + } + + if (currentNTiles == minNTiles) { + return std::nullopt; + } + + tilingMaxStrategy[dim.ind()] = maxNTiles; + tilingArray[dim.ind()] = currentNTiles; + + auto opStorage = std::make_unique(); + auto getValidTilingStrategy = getMinimalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, + tilingMaxStrategy, dim, opStorage, log); + if (mlir::failed(getValidTilingStrategy)) { + return std::nullopt; + } + + tilingArray = getValidTilingStrategy.value(); + currentNTiles = tilingArray[dim.ind()]; + result = currentNTiles; + + if (currentNTiles == maxNTiles) { + break; + } + + if (scheduling->validate(config, opStorage)) { + maxNTiles = currentNTiles; + minStorage.reset(opStorage.release()); + } else { + minNTiles = currentNTiles; + } + + nextValueFromMin = minNTiles; + axisIncrement->increasedValue(nextValueFromMin, maxNTiles); + } + return result; +}; + std::optional MergeVFRegionRewriter::getOptimalTilingStrategy( const 
IVFSchedulingPtr& scheduling, const Dim dim, const int64_t minTiles, int64_t& maxTiles, TilingOperationStorage::UPtr& minStorage, TilingOperationStorage::UPtr& maxStorage, VFConfig& config) const { @@ -40,7 +97,7 @@ std::optional MergeVFRegionRewriter::getOptimalTilingStrategy( if (minTiles == maxTiles) { if (minStorage == nullptr) { minStorage = std::make_unique(); - auto tilingRegions = VPU::calculateTilingRegions(config.getSubgraph(), tilingArray, _log, minStorage); + auto tilingRegions = calculateTilingRegions(config.getSubgraph(), tilingArray, _log, minStorage); if (mlir::failed(tilingRegions)) { minStorage.reset(); @@ -59,8 +116,8 @@ std::optional MergeVFRegionRewriter::getOptimalTilingStrategy( if (minStorage == nullptr) { minStorage = std::make_unique(); - auto getValidStrategy = VPU::getMinimalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, - tilingMaxStrategy, dim, minStorage, _log); + auto getValidStrategy = getMinimalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, + tilingMaxStrategy, dim, minStorage, _log); if (mlir::failed(getValidStrategy)) { minStorage.reset(); @@ -76,76 +133,50 @@ std::optional MergeVFRegionRewriter::getOptimalTilingStrategy( return result; } + auto axisIncrement = getVFAxisIncrement(dim); + VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", dim); + if (maxStorage == nullptr) { - maxStorage = std::make_unique(); - auto getValidStrategy = VPU::getMaximalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, - tilingMaxStrategy, dim, maxStorage, _log); + maxStorage = std::make_unique(config.getOperationsForTiling(), maxNTiles); + // When maxNTiles is too large, to avoid spending too much time on calculating, try to check if the cube root + // of the max tile is valid or not. 
+ mlir::FailureOr> getValidStrategy = mlir::failure(); + auto cbrtMaxTile = getCbrtMaxTileCandidate(minNTiles, maxNTiles, axisIncrement); + if (cbrtMaxTile.has_value()) { + auto tilingCbrtMaxStrategy = tilingMaxStrategy; + tilingCbrtMaxStrategy[dim.ind()] = cbrtMaxTile.value(); + getValidStrategy = getMaximalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, + tilingCbrtMaxStrategy, dim, maxStorage, _log); + + auto useCbrtMaxTileStrategy = mlir::succeeded(getValidStrategy) && scheduling->validate(config, maxStorage); + if (useCbrtMaxTileStrategy) { + maxNTiles = getValidStrategy.value()[dim.ind()]; + result = findOptimalTilingStrategyInRange(scheduling, dim, minNTiles, maxNTiles, axisIncrement, + tilingArray, minStorage, maxStorage, config, _log); + maxStorage.reset(); + return result; + } + } + maxStorage.reset(); + getValidStrategy = getMaximalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, tilingMaxStrategy, + dim, maxStorage, _log); if (mlir::failed(getValidStrategy)) { maxStorage.reset(); return std::nullopt; } + maxTiles = tilingMaxStrategy[dim.ind()]; tilingMaxStrategy = getValidStrategy.value(); maxNTiles = tilingMaxStrategy[dim.ind()]; - maxTiles = tilingMaxStrategy[dim.ind()]; } if (!scheduling->validate(config, maxStorage)) { return std::nullopt; } - auto axisIncrement = getVFAxisIncrement(dim); - VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", dim); - - auto nextValueFromMin = minNTiles; - axisIncrement->increasedValue(nextValueFromMin, maxNTiles); - - while (minNTiles < maxNTiles) { - auto currentNTiles = axisIncrement->getMiddleValue(minNTiles, maxNTiles); - - if (maxNTiles == nextValueFromMin) { - result = maxNTiles; - if (maxNTiles == maxTiles) { - minStorage.reset(maxStorage.release()); - } - break; - } - - if (currentNTiles == minNTiles) { - return std::nullopt; - } - - tilingMaxStrategy[dim.ind()] = maxNTiles; - tilingArray[dim.ind()] = currentNTiles; - - auto opStorage = 
std::make_unique(); - auto getValidTilingStrategy = VPU::getMinimalValidTilingStrategyFromRange( - config.getSubgraph(), tilingArray, tilingMaxStrategy, dim, opStorage, _log); - if (mlir::failed(getValidTilingStrategy)) { - return std::nullopt; - } - - tilingArray = getValidTilingStrategy.value(); - currentNTiles = tilingArray[dim.ind()]; - result = currentNTiles; - - if (currentNTiles == maxNTiles) { - break; - } - - if (scheduling->validate(config, opStorage)) { - maxNTiles = currentNTiles; - minStorage.reset(opStorage.release()); - } else { - minNTiles = currentNTiles; - } - - nextValueFromMin = minNTiles; - axisIncrement->increasedValue(nextValueFromMin, maxNTiles); - } - - return result; + return findOptimalTilingStrategyInRange(scheduling, dim, minNTiles, maxNTiles, axisIncrement, tilingArray, + minStorage, maxStorage, config, _log); } StrategyCost MergeVFRegionRewriter::extractVFCost(VFConfig& vfConfig) const { @@ -201,7 +232,7 @@ StrategyCost MergeVFRegionRewriter::extractVFCost(VFConfig& vfConfig) const { return false; } - const auto arch = VPU::getArch(operation); + const auto arch = config::getArch(operation); if (!VPU::spillingCopyOpsCanBeOverlapped(arch)) { return false; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.cpp index 2c49f65f9b..32f19405de 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_utils.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/vertical_fusion_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" namespace vpux::VPU::VF::v1 { bool isCmxOperation(mlir::Operation* operation, 
const bool checkTilingType) { @@ -52,4 +53,79 @@ bool isCmxOperation(mlir::Operation* operation, const bool checkTilingType) { return !isSpatialTiling(tiling); } +// get a valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in CMX +// otherwise, return the valid strategy that is close to the lower or upper boundary according to closeToUpperLimit +// parameter +mlir::FailureOr> getValidTilingStrategyFromRange( + VerticalFusionOp subgraph, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + bool closeToUpperLimit, Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { + SmallVector validTilingStrategy = + closeToUpperLimit ? to_small_vector(upperTilingStrategy) : to_small_vector(lowerTilingStrategy); + + auto notBeyondBoundary = [](int64_t value, int64_t lowerLimit, int64_t upperLimit, bool closeToUpperLimit) { + return closeToUpperLimit ? 
value >= lowerLimit : value <= upperLimit; + }; + + auto axisIncrement = VPU::getVFAxisIncrement(tilingAxis); + VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", tilingAxis); + + while (notBeyondBoundary(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()], + upperTilingStrategy[tilingAxis.ind()], closeToUpperLimit)) { + auto curOpStorage = std::make_unique(); + auto tilingRegions = calculateTilingRegions(subgraph, validTilingStrategy, log, curOpStorage); + if (!mlir::failed(tilingRegions)) { + // a valid strategy is found + opStorage.reset(curOpStorage.release()); + return validTilingStrategy; + } + + auto currentValue = validTilingStrategy[tilingAxis.ind()]; + + if (closeToUpperLimit) { + axisIncrement->decreasedValue(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()]); + } else { + axisIncrement->increasedValue(validTilingStrategy[tilingAxis.ind()], upperTilingStrategy[tilingAxis.ind()]); + } + + if (currentValue == validTilingStrategy[tilingAxis.ind()]) { + return mlir::failure(); + } + } + + // no valid strategy can be found + return mlir::failure(); +} + +// get a maximal valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in CMX +mlir::FailureOr> getMaximalValidTilingStrategyFromRange( + VerticalFusionOp subgraph, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { + return getValidTilingStrategyFromRange(subgraph, lowerTilingStrategy, upperTilingStrategy, true, tilingAxis, + opStorage, log); +} + +// get a minimal valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in 
CMX +mlir::FailureOr> getMinimalValidTilingStrategyFromRange( + VerticalFusionOp subgraph, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { + return getValidTilingStrategyFromRange(subgraph, lowerTilingStrategy, upperTilingStrategy, false, tilingAxis, + opStorage, log); +} + +// return the cube root of the max tile +std::optional getCbrtMaxTileCandidate(int64_t minTile, int64_t maxTile, + std::unique_ptr& axisIncrement) { + auto cbrtMaxTile = static_cast(std::floor(std::cbrt(maxTile))); + if (cbrtMaxTile > minTile && axisIncrement->getMiddleValue(minTile, cbrtMaxTile) > minTile) { + return cbrtMaxTile; + } + return std::nullopt; +} + } // namespace vpux::VPU::VF::v1 diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.cpp new file mode 100644 index 0000000000..61fde81fbb --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.cpp @@ -0,0 +1,35 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v1/wrap_vf_rewriter.hpp" +#include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" + +namespace vpux::VPU::VF::v1 { + +bool WrapVFRewriter::opNeedsTobeWrapped(VPU::VerticalFusionOpInterface op) const { + if (mlir::isa(op->getParentOp())) { + _log.trace("operation '{0}' at '{1}' is already wrapped in VF op", op->getName(), op->getLoc()); + return false; + } + + if (!op.isVFSupported()) { + _log.trace("Operation '{0}' at '{1}' doesn't support VF", op->getName(), op->getLoc()); + return false; + } + + if (op->hasAttr(tilingStrategy)) { + const auto tilingSize = parseIntArrayAttr(mlir::cast(op->getAttr(tilingStrategy))); + const auto tilingDimCount = llvm::count_if(tilingSize, [](auto value) { + return value > 1; + }); + if (tilingDimCount > 1) { + _log.trace("Operation '{0}' at '{1}' can not be wraped in VF since multi-dim tiling is not supported", + op->getName(), op->getLoc()); + return false; + } + } + return true; +} +} // namespace vpux::VPU::VF::v1 diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.cpp index 32d0513547..8bb71d6cb6 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.cpp @@ -5,17 +5,18 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/merge_vf_region_rewriter.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" #include 
"vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/core/numeric.hpp" #include #include @@ -66,136 +67,46 @@ bool hasSpill(mlir::Operation* parentOp, mlir::Operation* currentOp, mlir::Value inputTileAxisIsSameAsMultiClusterStrategy(currentOp, currentOpOperand); } -/* - As soon as we don't have logic right now for excluding operations or break subgraph - check in advance that all users or previous block will be merged to current one -*/ -std::optional MergeVFRegionRewriter::getOptimalTilingStrategy( - const IVFSchedulingPtr& scheduling, const Dim dim, const int64_t minTiles, int64_t& maxTiles, - TilingOperationStorage::UPtr& minStorage, TilingOperationStorage::UPtr& maxStorage, VFConfig& config) const { - if (minTiles > maxTiles || maxTiles == 1) { - return std::nullopt; +bool tileOnSameDims(const VFSplit& curVFSplit, const VFSplit& preVFSplit, const int64_t linkNumber, + VFConfig& currentConfig, std::unordered_map& opDimMap) { + if (curVFSplit.size() != preVFSplit.size()) { + return false; } - - auto minNTiles = minTiles; - auto maxNTiles = maxTiles; - - std::optional result; - auto outType = mlir::cast(config.getSubgraph()->getResult(0).getType()); - auto tilingArray = SmallVector(outType.getRank(), 1); - tilingArray[dim.ind()] = minNTiles; - if (minTiles == maxTiles) { - if (minStorage == nullptr) { - minStorage = std::make_unique(); - auto tilingRegions = 
VPU::calculateTilingRegions(config.getSubgraph(), tilingArray, _log, minStorage); - - if (mlir::failed(tilingRegions)) { - minStorage.reset(); - return std::nullopt; - } - } - - if (scheduling->validate(config, minStorage)) { - result = minTiles; + for (const auto& item : curVFSplit) { + auto curTilingDim = item.first; + auto curInputAxesResult = VPU::backInferVFTilingDim(currentConfig, curTilingDim, opDimMap); + if (mlir::failed(curInputAxesResult)) { + return false; } - return result; - } - - auto tilingMaxStrategy = SmallVector(outType.getRank(), 1); - tilingMaxStrategy[dim.ind()] = maxNTiles; - - if (minStorage == nullptr) { - minStorage = std::make_unique(); - auto getValidStrategy = VPU::getMinimalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, - tilingMaxStrategy, dim, minStorage, _log); - - if (mlir::failed(getValidStrategy)) { - minStorage.reset(); - return std::nullopt; + auto curInputAxes = curInputAxesResult.value(); + const auto isTiledOnPreVF = preVFSplit.find(curInputAxes[linkNumber]) != preVFSplit.end(); + if (!isTiledOnPreVF) { + return false; } - - tilingArray = getValidStrategy.value(); - minNTiles = tilingArray[dim.ind()]; - } - - if (scheduling->validate(config, minStorage)) { - result = minNTiles; - return result; } + return true; +} - if (maxStorage == nullptr) { - maxStorage = std::make_unique(); - auto getValidStrategy = VPU::getMaximalValidTilingStrategyFromRange(config.getSubgraph(), tilingArray, - tilingMaxStrategy, dim, maxStorage, _log); +SmallVector getSplitFromDimArr(DimArrRef dimsToCheck, DimArrRef allowedDims, VFConfig& vfConfig) { + SmallVector splits; + for (auto dim : dimsToCheck) { + VFSplit singleSplit = {{dim, std::nullopt}}; + splits.emplace_back(singleSplit); - if (mlir::failed(getValidStrategy)) { - maxStorage.reset(); - return std::nullopt; + if (dim.ind() <= Dims4D::Act::C.ind()) { + // Only enable 2D tiling for H and W + continue; } - - tilingMaxStrategy = getValidStrategy.value(); - maxNTiles = 
tilingMaxStrategy[dim.ind()]; - maxTiles = tilingMaxStrategy[dim.ind()]; - } - - if (!scheduling->validate(config, maxStorage)) { - return std::nullopt; - } - - auto axisIncrement = getVFAxisIncrement(dim); - VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", dim); - - auto nextValueFromMin = minNTiles; - axisIncrement->increasedValue(nextValueFromMin, maxNTiles); - - while (minNTiles < maxNTiles) { - auto currentNTiles = axisIncrement->getMiddleValue(minNTiles, maxNTiles); - - if (maxNTiles == nextValueFromMin) { - result = maxNTiles; - if (maxNTiles == maxTiles) { - minStorage.reset(maxStorage.release()); + for (auto otherDim : allowedDims) { + if (dim.ind() > otherDim.ind() && otherDim.ind() > Dims4D::Act::C.ind()) { + VFSplit doubleSplit = { + {otherDim, getTilingLimit(otherDim, vfConfig.getVFOperations().getArrayRef(), true)}, + {dim, std::nullopt}}; + splits.emplace_back(doubleSplit); } - break; - } - - if (currentNTiles == minNTiles) { - minStorage.reset(); - return std::nullopt; - } - - tilingMaxStrategy[dim.ind()] = maxNTiles; - tilingArray[dim.ind()] = currentNTiles; - - auto opStorage = std::make_unique(); - auto getValidTilingStrategy = VPU::getMinimalValidTilingStrategyFromRange( - config.getSubgraph(), tilingArray, tilingMaxStrategy, dim, opStorage, _log); - if (mlir::failed(getValidTilingStrategy)) { - minStorage.reset(); - return std::nullopt; - } - - tilingArray = getValidTilingStrategy.value(); - currentNTiles = tilingArray[dim.ind()]; - result = currentNTiles; - - if (currentNTiles == maxNTiles) { - minStorage.reset(opStorage.release()); - break; - } - - if (scheduling->validate(config, opStorage)) { - maxNTiles = currentNTiles; - minStorage.reset(opStorage.release()); - } else { - minNTiles = currentNTiles; } - - nextValueFromMin = minNTiles; - axisIncrement->increasedValue(nextValueFromMin, maxNTiles); } - - return result; + return splits; } StrategyCost MergeVFRegionRewriter::extractVFCost(VFConfig& 
vfConfig) const { @@ -269,7 +180,7 @@ StrategyCost MergeVFRegionRewriter::extractVFCost(VFConfig& vfConfig) const { return false; } - const auto arch = VPU::getArch(operation); + const auto arch = config::getArch(operation); if (!VPU::spillingCopyOpsCanBeOverlapped(arch)) { return false; } @@ -445,8 +356,8 @@ StrategyCost MergeVFRegionRewriter::extractVFCost(VFConfig& vfConfig) const { return cost; } - auto vfCase = VFCase(vfConfig, dim.value()); - vfCase.setTilingNumber(tilingDims[dim.value().ind()]); + auto vfSplit = getVFTilingSplit(tilingDims); + auto vfCase = VFCase(vfConfig, vfSplit); auto scenario = detectScenario(vfConfig); @@ -528,30 +439,6 @@ bool MergeVFRegionRewriter::canSkipMergeVF(VFConfig& vfConfig, bool opsNeedTilin return !opsNeedTiling && !vfConfig.isPipelined(); } -std::deque MergeVFRegionRewriter::getVFSchedulingChecks( - VFConfig& config) const { - std::deque vfChecks; - VFSchedulingFactory vfFactory(_enablePrefetchTiling); - - auto minimalCheck = vfFactory.createVFScenario(VFScenario::MINIMAL, _log); - - if (config.isPipelined()) { - auto pipeliningChecks = vfFactory.createVFScenario(VFScenario::VF_PIPELINING, _log); - minimalCheck->addNext(std::move(pipeliningChecks)); - } - - auto prefetchingCheck = vfFactory.createVFScenario(VFScenario::LASTOP_PREFETCHING, _log); - auto weightsCheck = vfFactory.createVFScenario(VFScenario::WEIGHTS_PREFETCHING, _log); - auto fullPrefetching = vfFactory.createVFScenario(VFScenario::FULL_PREFETCHING, _log); - weightsCheck->addNext(std::move(fullPrefetching)); - prefetchingCheck->addNext(std::move(weightsCheck)); - minimalCheck->addNext(std::move(prefetchingCheck)); - - vfChecks.emplace_back(std::move(minimalCheck)); - - return vfChecks; -} - MergeVFRegionRewriter::IVFSchedulingPtr MergeVFRegionRewriter::detectScenario(VFConfig& vfConfig) const { VFSchedulingFactory costFactory(_enablePrefetchTiling); auto scenarioKind = vfConfig.getSubgraph().getScenario().has_value() ? 
vfConfig.getSubgraph().getScenario().value() @@ -571,12 +458,8 @@ std::optional MergeVFRegionRewriter::findVFTiling(VPU::VerticalFusionOp VFConfig currentConfig(currentOp, _enableVerticalFusionPipelining); VFConfig prevConfig(prevOp, _enableVerticalFusionPipelining); - auto curAxis = getVFTilingDim(currentTiling, currentConfig.getVFOperations()); - auto prevAxis = getVFTilingDim(prevTiling, prevConfig.getVFOperations()); - - if (mlir::failed(curAxis) || mlir::failed(prevAxis)) { - return std::nullopt; - } + auto curVFSplit = getVFTilingSplit(currentTiling); + auto preVFSplit = getVFTilingSplit(prevTiling); bool curHasTiling = hasTiling(currentTiling); bool prevHasTiling = hasTiling(prevTiling); @@ -630,107 +513,97 @@ std::optional MergeVFRegionRewriter::findVFTiling(VPU::VerticalFusionOp return false; }; - auto vfSchedulingChecks = getVFSchedulingChecks(vfConfig); - - VPU::VFSubgraphUserSetter setter(currentOp, mergedOp); - - auto getVFCaseWithTiling = [&](const Dim curDim, const Dim prevDim) { - auto maxTiles = getTilingLimit(curDim, vfConfig.getVFOperations()); - auto minTiles = std::max(currentTiling[curDim.ind()], prevTiling[prevDim.ind()]); - - VFCase mergedCase(vfConfig, curDim); - - auto schedulingChecks = vfSchedulingChecks; - - TilingOperationStorage::UPtr maxStorage = nullptr; - TilingOperationStorage::UPtr minStorage = nullptr; - - while (!schedulingChecks.empty()) { - auto currentCheck = schedulingChecks.front(); - schedulingChecks.pop_front(); - auto numTiles = getOptimalTilingStrategy(currentCheck, curDim, minTiles, maxTiles, minStorage, maxStorage, - vfConfig); - - if (numTiles.has_value()) { - mergedCase.setTilingNumber(numTiles.value()); - mergedCase.setScheduling(currentCheck); + auto vfSchedulingChecks = getSchedulingScenarios(vfConfig, _log); + const auto linkNumber = getLinkNumber(currentOp, prevOp); - if (currentCheck->nextChecks().empty()) { - mergedCase.setTilingStorage(std::move(minStorage)); - return mergedCase; - } - for (const auto& 
check : currentCheck->nextChecks() | reversed) { - schedulingChecks.push_front(check); - } - minTiles = numTiles.value(); - } + const auto getMinimalNumber = [&](auto dim, const VFSplit& split) -> int64_t { + if (split.size() == 1) { + // 1D tiling + auto curInputAxes = backInferVFTilingDim(currentConfig, dim, opDimMap); + return std::max(currentTiling[dim.ind()], prevTiling[curInputAxes.value()[linkNumber].ind()]); + } else { + // 2D tiling + return MINIMUM_LENGTH_TILING; } - - return mergedCase; }; - const auto linkNumber = getLinkNumber(currentOp, prevOp); - std::optional checkedDim; - if (curHasTiling && prevHasTiling) { - auto curInputAxesResult = backInferVFTilingDim(currentConfig, curAxis.value(), opDimMap); - if (mlir::succeeded(curInputAxesResult)) { - auto curInputAxes = curInputAxesResult.value(); - if (curInputAxes[linkNumber] == prevAxis.value() && !isRegionRestrictedDim(opDimMap)) { - auto areAllAligned = llvm::all_of(vfConfig.getOperationsForTiling(), [](auto* operation) { - return mlir::isa(operation); - }); - if (prevAxis.value() != Dims4D::Act::C || !areAllAligned) { - // try to use current axis, otherwise try to find other axis - auto mergedCase = getVFCaseWithTiling(curAxis.value(), prevAxis.value()); - checkedDim = curAxis.value(); - if (mergedCase.isInitialized()) { - return mergedCase; - } - } - } + const auto getMaximalNumber = [&](auto dim, const VFSplit& split) -> int64_t { + auto maxTiles = getTilingLimit(dim, vfConfig.getVFOperations().getArrayRef()); + if (split.size() > 1) { + // 2D tiling + auto otherDimSum = getVFTilesLen(split); + maxTiles = divUp(maxTiles, otherDimSum); } - } + return maxTiles; + }; - DimArr allowedDims = getAllowedDims(vfConfig.getVFOperations(), _log); + VPU::VFSubgraphUserSetter setter(currentOp, mergedOp); + + DimArr allowedDims = getAllowedDims(vfConfig.getVFOperations().getArrayRef(), _log); if (allowedDims.empty()) { return std::nullopt; } - StrategyCost bestCost = std::numeric_limits::max(); - std::optional 
mergedCase = std::nullopt; - for (auto dim : allowedDims) { - // in order not to check twice dim which has been handled unsuccessfully - if (checkedDim.has_value() && checkedDim.value() == dim) { - continue; - } - // E.g., prevTiling [1, 3, 1, 1] -> permuteCast -> currentTiling [1, 1, 2, 1] - // Thus we need dim backinfer to get correct axis to compare - // As Vf inputs may be more than one, we need backinfer dim for each of them and use correct one - auto curInputDimsResult = backInferVFTilingDim(currentConfig, dim, opDimMap); - if (mlir::failed(curInputDimsResult)) { - continue; + DimArr dimsToCheck; + if (tileOnSameDims(curVFSplit, preVFSplit, linkNumber, currentConfig, opDimMap)) { + // If the current and previous VF splits are on the same dimensions, we can try to check the common dimensions + // first + for (auto& item : curVFSplit) { + dimsToCheck.push_back(item.first); } + } else { + // Otherwise, we check all allowed dimensions + dimsToCheck = allowedDims; + } - if (isRegionRestrictedDim(opDimMap)) { - continue; - } + auto getVFCaseFromSplits = [&](ArrayRef splits) -> std::optional { + StrategyCost bestCost = std::numeric_limits::max(); + std::optional mergedCase = std::nullopt; + for (auto split : splits) { + auto dim = split.rbegin()->first; + if (mlir::failed(backInferVFTilingDim(currentConfig, dim, opDimMap))) { + continue; + } + + if (isRegionRestrictedDim(opDimMap)) { + continue; + } - auto curInputDims = curInputDimsResult.value(); - auto currentVFCase = getVFCaseWithTiling(dim, curInputDims[linkNumber]); + auto currentVFCase = VPU::VF::v2::getVFCaseWithTiling(vfConfig, dim, split, getMinimalNumber, + getMaximalNumber, _log, vfSchedulingChecks); - // calculate optimal number of tiles for that dim - if (!currentVFCase.isInitialized()) { - continue; - } + // calculate optimal number of tiles for that dim + if (!currentVFCase.isInitialized()) { + continue; + } - // get vpunncost - StrategyCost cost = currentVFCase.getCost(_vpunnCostFunction, 
_log.nest()); - // compare cost, choose best strategy - if (cost < bestCost) { - bestCost = cost; - mergedCase = std::move(currentVFCase); + // get vpunncost + StrategyCost cost = currentVFCase.getCost(_vpunnCostFunction, _log.nest()); + // compare cost, choose best strategy + if (cost < bestCost) { + bestCost = cost; + mergedCase = std::move(currentVFCase); + } } + return mergedCase; + }; + + auto splits = getSplitFromDimArr(dimsToCheck, allowedDims, vfConfig); + auto mergedCase = getVFCaseFromSplits(splits); + if (mergedCase.has_value()) { + return mergedCase; } + + // If no valid case found, try to check dims that are not in dimsToCheck. For example, if the current VF and + // previous VF has tiled on same dimensions W. Then the allowedDims will only contains dimW instead. If merge on + // dimW is not optimal, the compiler can still have the change to merge on other supported dimensions like dimH, + // dimH&dimW, etc. + DimArr restAllowedDims; + llvm::copy_if(allowedDims, std::back_inserter(restAllowedDims), [&](const Dim& dim) { + return llvm::find(dimsToCheck, dim) == dimsToCheck.end(); + }); + auto splitsWithLowPriority = getSplitFromDimArr(restAllowedDims, allowedDims, vfConfig); + mergedCase = getVFCaseFromSplits(splitsWithLowPriority); return mergedCase; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/pipelining_vf_scheduling.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/pipelining_vf_scheduling.cpp index 8a5d5f950b..c262f912e2 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/pipelining_vf_scheduling.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/pipelining_vf_scheduling.cpp @@ -6,11 +6,10 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/pipelining_vf_scheduling.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_pipeline_container.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" +#include 
"vpux/utils/core/string_ref.hpp" static constexpr double PIPELINING_AVAILABLE_RATIO = 0.95; -constexpr StringLiteral isInPlace = "is_inplace"; // inplace attribute name - struct OpIndexWithCost { size_t tileIdx; size_t opIdx; diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.cpp index b8214a4187..94d7694fae 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_case.cpp @@ -22,20 +22,18 @@ vpux::VPU::MultiClusterStrategy getStrategy(mlir::Operation* operation) { namespace vpux::VPU::VF::v2 { -VFCase::VFCase(VFCase::VFConfigType& config, Dim axis): _config(config), _axis(axis) { +VFCase::VFCase(VFCase::VFConfigType& config, const VFSplit& split): _config(config), _split(split) { } VFCase::~VFCase() { } VFCase::VFCase(VFCase&& vfCase): _config(vfCase._config) { - _axis = vfCase._axis; + _split = std::move(vfCase._split); _cachedCost = vfCase._cachedCost; _vfScheduling = std::move(vfCase._vfScheduling); _vfTilingStorage = std::move(vfCase._vfTilingStorage); - _tilingNumber = vfCase._tilingNumber; - vfCase._tilingNumber = 1; vfCase._vfScheduling = nullptr; vfCase._vfTilingStorage = nullptr; } @@ -48,11 +46,9 @@ VFCase& VFCase::operator=(VFCase&& other) { std::swap(_config, other._config); std::swap(_vfScheduling, other._vfScheduling); _vfTilingStorage = std::move(other._vfTilingStorage); - _axis = other._axis; - _tilingNumber = other._tilingNumber; + _split = std::move(other._split); std::swap(_cachedCost, other._cachedCost); - other._tilingNumber = 1; other._vfScheduling = nullptr; _vfTilingStorage = nullptr; @@ -60,10 +56,9 @@ VFCase& VFCase::operator=(VFCase&& other) { } VFCase::VFCase(const VFCase& vfCase): _config(vfCase._config) { - _axis = vfCase._axis; + _split = vfCase._split; _cachedCost = vfCase._cachedCost; 
_vfScheduling = vfCase._vfScheduling; - _tilingNumber = vfCase._tilingNumber; _vfTilingStorage = nullptr; } @@ -73,20 +68,19 @@ VFCase& VFCase::operator=(const VFCase& other) { } _config = other._config; - _axis = other._axis; + _split = other._split; _cachedCost = other._cachedCost; _vfScheduling = other._vfScheduling; - _tilingNumber = other._tilingNumber; _vfTilingStorage = nullptr; return *this; } -void VFCase::setTilingNumber(int64_t number) { - if (_tilingNumber != number) { +void VFCase::setTilingNumber(Dim dim, int64_t number) { + if (_split[dim].value_or(1) != number) { clearCache(); } - _tilingNumber = number; + _split[dim] = number; } void VFCase::setScheduling(std::shared_ptr> vfScheduling) { @@ -114,12 +108,17 @@ StrategyCost VFCase::getCost(const std::unique_ptr& costFun if (_vfTilingStorage == nullptr) { _vfTilingStorage = std::make_unique(); auto tilingDims = parseIntArrayAttr(getTiling()); - auto tilingStorage = calculateTilingRegions(_config.getSubgraph(), tilingDims, log, _vfTilingStorage); + auto tilingStorage = calculateTilingRegions(_config, tilingDims, log, _vfTilingStorage); VPUX_THROW_WHEN(mlir::failed(tilingStorage), "Cannot get tiling regions for {0} and {1} tiles", _config.getSubgraph(), tilingDims); } - - _cachedCost = _vfScheduling->getCost(_config, _tilingNumber, _vfTilingStorage, costFunction); + VPUX_THROW_WHEN(llvm::any_of(_split, + [](const auto& item) { + return !item.second.has_value(); + }), + "Cannot get cost for VF {0} without fixed tiling number", _config.getSubgraph().getLoc()); + auto tileLen = getVFTilesLen(_split); + _cachedCost = _vfScheduling->getCost(_config, tileLen, _vfTilingStorage, costFunction); log.trace("Merged VF {0} cost {1}", _config.getSubgraph().getLoc(), _cachedCost.value()); addCMXWriteSpills(costFunction, log); log.trace("Merged VF {0} cost with spill write {1}", _config.getSubgraph().getLoc(), _cachedCost.value()); @@ -134,16 +133,12 @@ VFCase::VFConfigType& VFCase::getConfig() { return _config; } 
-mlir::ArrayAttr VFCase::getTiling() const { - auto outType = mlir::cast(_config.getSubgraph()->getResult(0).getType()); - auto tilingArray = SmallVector(outType.getRank(), 1); - tilingArray[_axis.ind()] = _tilingNumber; - - return getIntArrayAttr(_config.getSubgraph().getContext(), tilingArray); -} - -int64_t VFCase::getTilingNumber() const { - return _tilingNumber; +mlir::ArrayAttr VFCase::getTiling() { + auto outputs = _config.getOutputs(); + VPUX_THROW_WHEN(outputs.empty(), "No outputs for VF"); + auto outType = mlir::cast(outputs.front()->getResult(0).getType()); + auto tilingArray = restoreTilingBySplit(outType.getRank(), _split); + return getIntArrayAttr(outputs.front()->getContext(), tilingArray); } void VFCase::approveScheduling() { @@ -154,7 +149,12 @@ void VFCase::approveScheduling() { } bool VFCase::isInitialized() { - return _vfScheduling != nullptr && _tilingNumber > 1; + return _vfScheduling != nullptr && + llvm::all_of(_split, + [](const auto& item) { + return item.second.has_value(); + }) && + getVFTilesLen(_split) > 1; } void VFCase::addCMXReadSpills(const std::unique_ptr& costFunction, Logger log) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.cpp index afdd27352d..61aeae4857 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" using namespace vpux; @@ -21,6 +22,11 @@ VFConfig::VFConfig(VPU::VerticalFusionOp vfOp, bool enableVFPipelining /*true*/, _isVFPipelineCandidate = _isPipelineEnabled && 
isVFPipelinePattern(); } +VFConfig::VFConfig(const llvm::SetVector& operations): _subgraph(nullptr), _isPipelineEnabled(true) { + _vfOps = std::move(operations); + _isVFPipelineCandidate = isVFPipelinePattern(); +} + bool VFConfig::isVFPipelinePattern() { // if we have operations with both executors const auto filterNCE = [](mlir::Operation* op) { @@ -46,13 +52,21 @@ bool VFConfig::isVFPipelinePattern() { llvm::all_of(checkedOperations, filterSWKernels)); } -const SmallVector& VFConfig::getVFOperations() { +void VFConfig::validateConfig() { + VPUX_THROW_WHEN(_vfOps.empty() && _subgraph == nullptr, + "Vertical fusion config should be enabled by wrapped operation or list of operations"); +} + +const llvm::SetVector& VFConfig::getVFOperations() { + validateConfig(); if (_vfOps.empty()) { const auto getOpPointer = [](auto& op) -> mlir::Operation* { return &op; }; - llvm::copy(_subgraph.getBody()->without_terminator() | transformed(getOpPointer), std::back_inserter(_vfOps)); + auto operations = _subgraph.getBody()->without_terminator() | transformed(getOpPointer); + _vfOps.insert(operations.begin(), operations.end()); } + return _vfOps; } @@ -63,7 +77,9 @@ SmallVector VFConfig::getOperationsForTiling() { } void VFConfig::invalidatePointers() { - _vfOps.clear(); + if (_subgraph != nullptr) { + _vfOps.clear(); + } _largestOp = nullptr; _inputOps.clear(); _outputOps.clear(); @@ -76,7 +92,7 @@ VPU::VerticalFusionOp VFConfig::getSubgraph() const { mlir::Operation* VFConfig::getLargestOp() { if (_largestOp == nullptr) { - auto operations = _subgraph.getBody()->without_terminator(); + auto operations = getVFOperations(); const auto sumTypes = [&](const Byte& sum, mlir::Value value) { return sum + mlir::cast(value.getType()).getTotalAllocSize(); @@ -85,29 +101,35 @@ mlir::Operation* VFConfig::getLargestOp() { const auto getAllocationSize = [&](auto valueList) -> Byte { return std::accumulate(valueList.begin(), valueList.end(), Byte(0), sumTypes); }; + const auto 
getTotalAllocationSize = [&](auto& operation) { + if (operation->hasAttr(isInPlace)) { + return getAllocationSize(operation->getOperands()); + } + return getAllocationSize(operation->getOperands()) + getAllocationSize(operation->getResults()); + }; auto largestOperation = std::max_element(operations.begin(), operations.end(), [&](auto& op1, auto& op2) { - return getAllocationSize(op1.getOperands()) + getAllocationSize(op1.getResults()) < - getAllocationSize(op2.getOperands()) + getAllocationSize(op2.getResults()); + return getTotalAllocationSize(op1) < getTotalAllocationSize(op2); }); if (largestOperation == operations.end()) { return nullptr; } - _largestOp = &(*largestOperation); + _largestOp = *largestOperation; } return _largestOp; } const SmallVector& VFConfig::getInputs() { if (_inputOps.empty()) { - const auto allOperandsInputs = [](auto* current) -> bool { - return llvm::all_of(current->getOperands(), [](mlir::Value operand) { - return mlir::dyn_cast(operand) != nullptr; + auto operations = getVFOperations(); + const auto allOperandsInputs = [&](auto* current) -> bool { + return llvm::all_of(current->getOperands(), [&](mlir::Value operand) { + return mlir::dyn_cast(operand) != nullptr || + !_vfOps.contains(operand.getDefiningOp()); }); }; - auto operations = getVFOperations(); for (auto* operation : operations) { if (!mlir::isa(operation)) { continue; @@ -116,8 +138,12 @@ const SmallVector& VFConfig::getInputs() { if (!allOperandsInputs(operation)) { bool notInput = false; for (auto operand : operation->getOperands()) { - if (!mlir::isa(operand)) { + if (!mlir::isa(operand) && + !mlir::isa(operand.getDefiningOp())) { auto* parent = operand.getDefiningOp(); + if (!_vfOps.contains(parent)) { + break; + } while (parent != nullptr) { if (mlir::isa(parent)) { notInput = true; @@ -139,10 +165,20 @@ const SmallVector& VFConfig::getInputs() { const SmallVector& VFConfig::getOutputs() { if (_outputOps.empty()) { - _outputOps = 
to_small_vector(_subgraph.getBody()->getTerminator()->getOperands() | - transformed([](auto operand) -> mlir::Operation* { - return operand.getDefiningOp(); - })); + if (_subgraph != nullptr) { + _outputOps = to_small_vector(_subgraph.getBody()->getTerminator()->getOperands() | + transformed([](auto operand) -> mlir::Operation* { + return operand.getDefiningOp(); + })); + } else { + auto operations = getVFOperations(); + const auto hasNoUserInVF = [this](auto* operation) { + return llvm::none_of(operation->getUsers(), [&](auto* user) { + return _vfOps.contains(user); + }); + }; + _outputOps = to_small_vector(operations | filtered(hasNoUserInVF)); + } } return _outputOps; } @@ -152,8 +188,8 @@ bool VFConfig::isPipelined() const { } SmallVector VFConfig::getOperationTypes(mlir::Operation* operation) { - VPUX_THROW_WHEN(llvm::find(getVFOperations(), operation) == _vfOps.end(), "Cannot find operation {0} in VF {1}", - *operation, _subgraph); + getVFOperations(); + VPUX_THROW_WHEN(!_vfOps.contains(operation), "Cannot find operation {0} in VF", *operation); auto origShape = Shape(getShape(operation->getResult(0))); if (_tilesCache.find(operation) == _tilesCache.end()) { diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.cpp index 8efbcb9c60..604530b543 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_config.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_axis_increment.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" namespace vpux::VPU::VF::v2 { @@ 
-62,6 +63,85 @@ bool isCmxOperation(mlir::Operation* operation, const bool checkTilingType) { return !isSpatialTiling(tiling); } +mlir::FailureOr calculateTilingRegions(VFConfig& config, ArrayRef tilingStrategy, Logger log, + const TilingOperationStorage::UPtr& opStorage) { + auto outputOp = config.getSubgraph() != nullptr ? config.getSubgraph() : config.getOutputs().back(); + const auto outputShape = getShape(outputOp->getResult(0)); + const auto strategy = Shape(tilingStrategy); + + const auto tiles = fillDividedTiles(outputOp, strategy, outputShape); + if (mlir::failed(tiles)) { + return mlir::failure(); + } + + return calculateTilingRegions(config.getOutputs().back(), tiles.value(), log, opStorage, config.getVFOperations()); +} + +// get a valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in CMX +// otherwise, return the valid strategy that is close to the lower or upper boundary according to closeToUpperLimit +// parameter +mlir::FailureOr> getValidTilingStrategyFromRange( + VFConfig& config, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, + bool closeToUpperLimit, Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { + SmallVector validTilingStrategy = + closeToUpperLimit ? to_small_vector(upperTilingStrategy) : to_small_vector(lowerTilingStrategy); + + auto notBeyondBoundary = [](int64_t value, int64_t lowerLimit, int64_t upperLimit, bool closeToUpperLimit) { + return closeToUpperLimit ? 
value >= lowerLimit : value <= upperLimit; + }; + + auto axisIncrement = VPU::getVFAxisIncrement(tilingAxis); + VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", tilingAxis); + + while (notBeyondBoundary(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()], + upperTilingStrategy[tilingAxis.ind()], closeToUpperLimit)) { + auto curOpStorage = std::make_unique(); + auto tilingRegions = calculateTilingRegions(config, validTilingStrategy, log, curOpStorage); + if (!mlir::failed(tilingRegions)) { + // a valid strategy is found + opStorage.reset(curOpStorage.release()); + return validTilingStrategy; + } + + auto currentValue = validTilingStrategy[tilingAxis.ind()]; + + if (closeToUpperLimit) { + axisIncrement->decreasedValue(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()]); + } else { + axisIncrement->increasedValue(validTilingStrategy[tilingAxis.ind()], upperTilingStrategy[tilingAxis.ind()]); + } + + if (currentValue == validTilingStrategy[tilingAxis.ind()]) { + return mlir::failure(); + } + } + + // no valid strategy can be found + return mlir::failure(); +} + +// get a maximal valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in CMX +mlir::FailureOr> getMaximalValidTilingStrategyFromRange( + VFConfig& config, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, Dim tilingAxis, + TilingOperationStorage::UPtr& opStorage, Logger log) { + return getValidTilingStrategyFromRange(config, lowerTilingStrategy, upperTilingStrategy, true, tilingAxis, + opStorage, log); +} + +// get a minimal valid tiling strategy for VF block between the given range of tiling strategy +// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations +// can't fit in CMX 
+mlir::FailureOr> getMinimalValidTilingStrategyFromRange( + VFConfig& config, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, Dim tilingAxis, + TilingOperationStorage::UPtr& opStorage, Logger log) { + return getValidTilingStrategyFromRange(config, lowerTilingStrategy, upperTilingStrategy, false, tilingAxis, + opStorage, log); +} + bool hasBeforeDDRUsers(mlir::Operation* prevOp, mlir::Operation* nextOp) { // check if previous operation has more than 1 users apart from nextOp // and all of them are in DDR @@ -125,4 +205,45 @@ bool inputTileAxisIsSameAsMultiClusterStrategy(mlir::Operation* op, mlir::Value return isDataTiledOnSameAxisWithMCStrategy(distributedType, tilingDim); } +SmallVector restoreTilingBySplit(int64_t rank, const VFSplit& split) { + SmallVector tilingStrategy(rank, 1); + for (auto& [dim, dimValue] : split) { + if (dimValue.has_value()) { + tilingStrategy[dim.ind()] = dimValue.value(); + } + } + + return tilingStrategy; +} + +VFSplit getVFTilingSplit(ArrayRef tilingStrategy) { + VFSplit vfSplit; + + for (auto value : tilingStrategy | indexed) { + if (value.value() > 1) { + vfSplit[Dim(value.index())] = value.value(); + } + } + + return vfSplit; +} + +int64_t getVFTilesLen(const VFSplit& vfSplit) { + SmallVector splitValues; + splitValues.reserve(vfSplit.size()); + llvm::transform(vfSplit, std::back_inserter(splitValues), [](auto& kv) { + return kv.second.value_or(1); + }); + + return std::accumulate(splitValues.begin(), splitValues.end(), 1, std::multiplies()); +} + +// return the cube root of the max tile +std::optional getCbrtMaxTileCandidate(int64_t minTile, int64_t maxTile) { + auto cbrtMaxTile = static_cast(std::floor(std::cbrt(maxTile))); + if (cbrtMaxTile > minTile) { + return cbrtMaxTile; + } + return std::nullopt; +} } // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.cpp new file 
mode 100644 index 0000000000..85e5ab8d79 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.cpp @@ -0,0 +1,23 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/wrap_vf_rewriter.hpp" +#include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" + +namespace vpux::VPU::VF::v2 { + +bool WrapVFRewriter::opNeedsTobeWrapped(VPU::VerticalFusionOpInterface op) const { + if (mlir::isa(op->getParentOp())) { + _log.trace("operation '{0}' at '{1}' is already wrapped in VF op", op->getName(), op->getLoc()); + return false; + } + + if (!op.isVFSupported()) { + _log.trace("Operation '{0}' at '{1}' doesn't support VF", op->getName(), op->getLoc()); + return false; + } + return true; +} +} // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.cpp new file mode 100644 index 0000000000..4f290dddd2 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.cpp @@ -0,0 +1,243 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vertical_fusion_algorithm.hpp" +#include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_scheduling_factory.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/v2/vertical_fusion_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" + +namespace vpux::VPU::VF::v2 { + +std::optional findOptimalTilingStrategyInRange( + const std::shared_ptr>& scheduling, const Dim dim, int64_t minNTiles, + int64_t& maxNTiles, std::unique_ptr& axisIncrement, ArrayRef origTilingArray, + TilingOperationStorage::UPtr& minStorage, TilingOperationStorage::UPtr& maxStorage, VFConfig& config, + Logger log) { + std::optional result = std::nullopt; + const auto origMaxTile = maxNTiles; + auto nextValueFromMin = minNTiles; + axisIncrement->increasedValue(nextValueFromMin, maxNTiles); + SmallVector tilingMaxStrategy(origTilingArray.begin(), origTilingArray.end()); + SmallVector tilingArray(origTilingArray.begin(), origTilingArray.end()); + + while (minNTiles < maxNTiles) { + auto currentNTiles = axisIncrement->getMiddleValue(minNTiles, maxNTiles); + + if (maxNTiles == nextValueFromMin) { + result = maxNTiles; + if (maxNTiles == origMaxTile) { + minStorage.reset(maxStorage.release()); + } + break; + } + + if (currentNTiles == minNTiles) { + minStorage.reset(); + return std::nullopt; + } + + tilingMaxStrategy[dim.ind()] = maxNTiles; + tilingArray[dim.ind()] = currentNTiles; + + auto opStorage = std::make_unique(); + auto getValidTilingStrategy = + getMinimalValidTilingStrategyFromRange(config, tilingArray, tilingMaxStrategy, dim, opStorage, log); + if (mlir::failed(getValidTilingStrategy)) { + minStorage.reset(); + return std::nullopt; + } + + tilingArray = getValidTilingStrategy.value(); + currentNTiles = tilingArray[dim.ind()]; + result = currentNTiles; 
+ + if (currentNTiles == maxNTiles) { + minStorage.reset(opStorage.release()); + break; + } + + if (scheduling->validate(config, opStorage)) { + maxNTiles = currentNTiles; + minStorage.reset(opStorage.release()); + } else { + minNTiles = currentNTiles; + } + + nextValueFromMin = minNTiles; + axisIncrement->increasedValue(nextValueFromMin, maxNTiles); + } + return result; +}; + +std::deque>> getSchedulingScenarios(VFCase::VFConfigType& config, + Logger log) { + std::deque>> vfChecks; + VFSchedulingFactory vfFactory(true); + + auto minimalCheck = vfFactory.createVFScenario(VFScenario::MINIMAL, log); + + if (config.isPipelined()) { + auto pipeliningChecks = vfFactory.createVFScenario(VFScenario::VF_PIPELINING, log); + minimalCheck->addNext(std::move(pipeliningChecks)); + } + + auto prefetchingCheck = vfFactory.createVFScenario(VFScenario::LASTOP_PREFETCHING, log); + auto weightsCheck = vfFactory.createVFScenario(VFScenario::WEIGHTS_PREFETCHING, log); + auto fullPrefetching = vfFactory.createVFScenario(VFScenario::FULL_PREFETCHING, log); + weightsCheck->addNext(std::move(fullPrefetching)); + prefetchingCheck->addNext(std::move(weightsCheck)); + minimalCheck->addNext(std::move(prefetchingCheck)); + + vfChecks.emplace_back(std::move(minimalCheck)); + + return vfChecks; +} + +std::optional getOptimalTilingStrategy(const std::shared_ptr>& scheduling, + const Dim dim, const VFSplit& split, const int64_t minTiles, + int64_t& maxTiles, TilingOperationStorage::UPtr& minStorage, + TilingOperationStorage::UPtr& maxStorage, VFCase::VFConfigType& config, + Logger log) { + if (minTiles > maxTiles || maxTiles == 1) { + return std::nullopt; + } + + auto minNTiles = minTiles; + auto maxNTiles = maxTiles; + + std::optional result; + auto outType = mlir::cast(config.getOutputs().back()->getResult(0).getType()); + auto tilingArray = restoreTilingBySplit(outType.getRank(), split); + tilingArray[dim.ind()] = minNTiles; + if (minTiles == maxTiles) { + if (minStorage == nullptr) { + 
minStorage = std::make_unique(); + auto tilingRegions = calculateTilingRegions(config, tilingArray, log, minStorage); + + if (mlir::failed(tilingRegions)) { + minStorage.reset(); + return std::nullopt; + } + } + + if (scheduling->validate(config, minStorage)) { + result = minTiles; + } + return result; + } + + auto tilingMaxStrategy = restoreTilingBySplit(outType.getRank(), split); + tilingMaxStrategy[dim.ind()] = maxNTiles; + + if (minStorage == nullptr) { + minStorage = std::make_unique(); + auto getValidStrategy = + getMinimalValidTilingStrategyFromRange(config, tilingArray, tilingMaxStrategy, dim, minStorage, log); + + if (mlir::failed(getValidStrategy)) { + minStorage.reset(); + return std::nullopt; + } + + tilingArray = getValidStrategy.value(); + minNTiles = tilingArray[dim.ind()]; + } + + if (scheduling->validate(config, minStorage)) { + result = minNTiles; + return result; + } + + auto axisIncrement = getVFAxisIncrement(dim); + VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", dim); + + if (maxStorage == nullptr) { + maxStorage = std::make_unique(); + // When maxNTiles is too large, to avoid spending too much time on calculating, try to check if the cube root + // of the max tile is valid or not. 
+ auto cbrtMaxTile = getCbrtMaxTileCandidate(minNTiles, maxNTiles); + mlir::FailureOr> getValidStrategy = mlir::failure(); + if (cbrtMaxTile.has_value()) { + auto tilingCbrtMaxStrategy = tilingMaxStrategy; + tilingCbrtMaxStrategy[dim.ind()] = cbrtMaxTile.value(); + getValidStrategy = getMaximalValidTilingStrategyFromRange(config, tilingArray, tilingCbrtMaxStrategy, dim, + maxStorage, log); + + auto useCbrtMaxTileStrategy = mlir::succeeded(getValidStrategy) && scheduling->validate(config, maxStorage); + if (useCbrtMaxTileStrategy) { + maxNTiles = getValidStrategy.value()[dim.ind()]; + result = findOptimalTilingStrategyInRange(scheduling, dim, minNTiles, maxNTiles, axisIncrement, + tilingArray, minStorage, maxStorage, config, log); + maxStorage.reset(); + return result; + } + maxStorage.reset(); + } + + getValidStrategy = + getMaximalValidTilingStrategyFromRange(config, tilingArray, tilingMaxStrategy, dim, maxStorage, log); + if (mlir::failed(getValidStrategy)) { + maxStorage.reset(); + return std::nullopt; + } + maxTiles = tilingMaxStrategy[dim.ind()]; + tilingMaxStrategy = getValidStrategy.value(); + maxNTiles = tilingMaxStrategy[dim.ind()]; + } + + if (!scheduling->validate(config, maxStorage)) { + return std::nullopt; + } + return findOptimalTilingStrategyInRange(scheduling, dim, minNTiles, maxNTiles, axisIncrement, tilingArray, + minStorage, maxStorage, config, log); +} + +VPU::VF::v2::VFCase getVFCaseWithTiling( + VPU::VF::v2::VFConfig& config, Dim dim, const VPU::VF::v2::VFSplit& split, + const std::function& minNumCalc, + const std::function& maxNumCalc, Logger log, + const std::deque>>& vfSchedulingChecks) { + auto minTiles = minNumCalc(dim, split); + auto maxTiles = maxNumCalc(dim, split); + auto mergedCase = VPU::VF::v2::VFCase(config, split); + + if (maxTiles < 0) { + mergedCase.setTilingNumber(dim, minTiles); + mergedCase.setScheduling(vfSchedulingChecks.front()); + return mergedCase; + } + + auto schedulingChecks = vfSchedulingChecks; + + 
TilingOperationStorage::UPtr maxStorage = nullptr; + TilingOperationStorage::UPtr minStorage = nullptr; + + while (!schedulingChecks.empty()) { + auto currentCheck = schedulingChecks.front(); + schedulingChecks.pop_front(); + auto numTiles = getOptimalTilingStrategy(currentCheck, dim, split, minTiles, maxTiles, minStorage, maxStorage, + config, log); + + if (numTiles.has_value()) { + mergedCase.setTilingNumber(dim, numTiles.value()); + mergedCase.setScheduling(currentCheck); + + if (currentCheck->nextChecks().empty()) { + mergedCase.setTilingStorage(std::move(minStorage)); + return mergedCase; + } + for (const auto& check : currentCheck->nextChecks() | reversed) { + schedulingChecks.push_front(check); + } + minTiles = numTiles.value(); + } + } + mergedCase.setTilingStorage(std::move(minStorage)); + + return mergedCase; +} + +} // namespace vpux::VPU::VF::v2 diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.cpp index b2d48fbc5b..3245146a13 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/vertical_fusion_utils.cpp @@ -15,6 +15,7 @@ #include "vpux/compiler/dialect/VPU/utils/vertical_fusion/vf_axis_increment.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/dma.hpp" +#include "vpux/utils/core/numeric.hpp" #include #include @@ -56,94 +57,112 @@ mlir::FailureOr vpux::VPU::calculateTilingRegions(VPU::VerticalFu return calculateTilingRegions(lastOp, tiles, log, opStorage); } +mlir::FailureOr vpux::VPU::calculateTilingRegions(VPU::VerticalFusionOp vfOp, + ArrayRef tilingStrategy, Logger log, + const TilingOperationStorage::UPtr& opStorage) { + const auto outputShape = getShape(vfOp->getResult(0)); + const auto strategy = Shape(tilingStrategy); + + const auto tiles = fillDividedTiles(vfOp, strategy, 
outputShape); + if (mlir::failed(tiles)) { + return mlir::failure(); + } + + return calculateTilingRegions(vfOp, tiles.value(), log, opStorage); +} + mlir::FailureOr vpux::VPU::calculateTilingRegions(mlir::Operation* operation, const OutputTiling& tiles, Logger log, const TilingOperationStorage::UPtr& opStorage, - std::optional numTile) { + const llvm::SetVector& fusedOps) { TilingStorage storage; + // Work queue of (operation, tile, tileNumber) + using WorkItem = std::tuple; + std::queue workQueue; + + // Initialize the queue with the starting operation and its tiles for (const auto& item : tiles | indexed) { - auto tile = item.value(); + auto& tile = item.value(); + const auto tileNumber = item.index(); + workQueue.push(std::make_tuple(operation, tile, tileNumber)); + } + + // Process all operations in the queue + while (!workQueue.empty()) { + auto workItem = workQueue.front(); + auto& currentOp = std::get<0>(workItem); + auto& tile = std::get<1>(workItem); + auto& tileNumber = std::get<2>(workItem); + workQueue.pop(); auto inputTiling = TilingInfo(ArrayRef({tile})); try { - if (auto tilingBuilderOp = mlir::dyn_cast(operation)) { + if (auto tilingBuilderOp = mlir::dyn_cast(currentOp)) { inputTiling = tilingBuilderOp.backInferTileInfo(tile, log); if (opStorage != nullptr && !inputTiling.tiles.empty()) { - auto allValues = opStorage->gatherValue(operation); - const auto sameTile = [&](auto& tiling) { + auto& allValues = opStorage->gatherValue(currentOp); + const auto sameTile = [&](auto& item) { + auto& tiling = item.second; return tiling.second.shape == tile.shape && tiling.first.tiles[0].shape == inputTiling.tiles[0].shape; }; if (llvm::none_of(allValues, sameTile)) { - if (auto tilingInfoOp = mlir::dyn_cast(operation)) { - if (!isMultiClusterCompatibleForTiling(operation, {tile}, log) || + if (auto tilingInfoOp = mlir::dyn_cast(currentOp)) { + if (!isMultiClusterCompatibleForTiling(currentOp, {tile}, log) || !tilingInfoOp.isSupportedTiling({tile}, 
TilingMode::ISOLATED, log)) { return mlir::failure(); } } } } - } else if (auto tilingViewLikeOp = mlir::dyn_cast(operation)) { + } else if (auto tilingViewLikeOp = mlir::dyn_cast(currentOp)) { if (!tilingViewLikeOp.isSupportedOutTile(tile)) { return mlir::failure(); } inputTiling = tilingViewLikeOp.backInferTileInfo(tile, log); } else { - VPUX_THROW("Unsupported operation type {0} for VF", operation->getName()); + VPUX_THROW("Unsupported operation type {0} for VF", currentOp->getName()); } } catch (Exception&) { return mlir::failure(); } - const auto tileNumber = numTile.value_or(item.index()); - + // Store the tiling info for the current operation if (opStorage != nullptr) { - opStorage->insert(operation, tileNumber, std::make_pair(inputTiling, tile)); - log.trace("TileInfo inserted for operation {0} tile {1}, {2}", *operation, tileNumber, tile); + opStorage->insert(currentOp, tileNumber, std::make_pair(inputTiling, tile)); + log.trace("TileInfo inserted for operation at loc {0} tile {1}, {2}", currentOp->getLoc(), tileNumber, + tile); } - for (const auto& op : operation->getOperands() | indexed) { + // Process each operand of the current operation + for (const auto& op : currentOp->getOperands() | indexed) { const auto operand = op.value(); const auto indexOp = op.index(); if (auto arg = mlir::dyn_cast(operand)) { + // Store block argument info storage.insert(arg.getArgNumber(), tileNumber, inputTiling.tiles[indexOp]); log.trace("TileInfo inserted for argument {0} tile {1}, {2}", arg.getArgNumber(), tileNumber, inputTiling.tiles[indexOp]); continue; } - auto& oneTile = inputTiling.tiles[indexOp]; - auto inputTile = TileInfo(oneTile.shape, oneTile.offsets, tile.axis, tile.isCompletedTile); - auto innerStorage = calculateTilingRegions(operand.getDefiningOp(), {std::move(inputTile)}, log, opStorage, - numTile.value_or(item.index())); - if (mlir::failed(innerStorage)) { - return mlir::failure(); + if (!fusedOps.empty() && !fusedOps.contains(operand.getDefiningOp())) 
{ + continue; } - storage.merge(innerStorage.value()); + // Create the tile for the operand and add it to the work queue + auto& oneTile = inputTiling.tiles[indexOp]; + auto inputTile = TileInfo(oneTile.shape, oneTile.offsets, tile.axis, tile.isCompletedTile); + workQueue.push(std::make_tuple(operand.getDefiningOp(), inputTile, tileNumber)); } } return storage; } -mlir::FailureOr vpux::VPU::calculateTilingRegions(VPU::VerticalFusionOp vfOp, - ArrayRef tilingStrategy, Logger log, - const TilingOperationStorage::UPtr& opStorage) { - const auto outputShape = getShape(vfOp->getResult(0)); - const auto strategy = Shape(tilingStrategy); - - const auto tiles = fillDividedTiles(vfOp, strategy, outputShape); - if (mlir::failed(tiles)) { - return mlir::failure(); - } - - return calculateTilingRegions(vfOp, tiles.value(), log, opStorage); -} - -int64_t vpux::VPU::getTilingLimit(Dim axis, ArrayRef operations) { +int64_t vpux::VPU::getTilingLimit(Dim axis, ArrayRef operations, bool tilingOnHW) { SmallVector axisLengthsOfNonChannelAlignedOps; SmallVector axisLengthsOfChannelAlignedOps; auto hasChannelAxis = axis == Dims4D::Act::C; @@ -158,7 +177,11 @@ int64_t vpux::VPU::getTilingLimit(Dim axis, ArrayRef operation auto limit = getMaxNumTiles(curOp)[curAxis.ind()]; if (curAxis.ind() >= Dims4D::Act::getSpatialDim(0).ind()) { - limit /= MINIMUM_LENGTH_TILING; + if (tilingOnHW) { + limit = divUp(limit, (MINIMUM_LENGTH_TILING * MINIMUM_LENGTH_TILING)); + } else { + limit = limit / MINIMUM_LENGTH_TILING; + } } limit = std::min(limit, VPU::NCEInvariant::VPU_DIMENSION_LIMIT / MINIMUM_LENGTH_TILING); if (mlir::isa(curOp)) { @@ -191,71 +214,6 @@ int64_t vpux::VPU::getTilingLimit(Dim axis, ArrayRef operation return axisIncrement->getLimitValue(axisLengthsOfChannelAlignedOps, axisLengthsOfNonChannelAlignedOps); } -// get a valid tiling strategy for VF block between the given range of tiling strategy -// it returns mlir::failure() if all tiling strategies in this range can't be supported by 
all operations or operations -// can't fit in CMX -// otherwise, return the valid strategy that is close to the lower or upper boundary according to closeToUpperLimit -// parameter -mlir::FailureOr> getValidTilingStrategyFromRange( - VPU::VerticalFusionOp op, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, - bool closeToUpperLimit, Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { - SmallVector validTilingStrategy = - closeToUpperLimit ? to_small_vector(upperTilingStrategy) : to_small_vector(lowerTilingStrategy); - - auto notBeyondBoundary = [](int64_t value, int64_t lowerLimit, int64_t upperLimit, bool closeToUpperLimit) { - return closeToUpperLimit ? value >= lowerLimit : value <= upperLimit; - }; - - auto axisIncrement = getVFAxisIncrement(tilingAxis); - VPUX_THROW_WHEN(axisIncrement == nullptr, "Cannot get functions to get values for axis {0}", tilingAxis); - - while (notBeyondBoundary(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()], - upperTilingStrategy[tilingAxis.ind()], closeToUpperLimit)) { - auto curOpStorage = std::make_unique(); - auto tilingRegions = calculateTilingRegions(op, validTilingStrategy, log, curOpStorage); - if (!mlir::failed(tilingRegions)) { - // a valid strategy is found - opStorage.reset(curOpStorage.release()); - return validTilingStrategy; - } - - auto currentValue = validTilingStrategy[tilingAxis.ind()]; - - if (closeToUpperLimit) { - axisIncrement->decreasedValue(validTilingStrategy[tilingAxis.ind()], lowerTilingStrategy[tilingAxis.ind()]); - } else { - axisIncrement->increasedValue(validTilingStrategy[tilingAxis.ind()], upperTilingStrategy[tilingAxis.ind()]); - } - - if (currentValue == validTilingStrategy[tilingAxis.ind()]) { - return mlir::failure(); - } - } - - // no valid strategy can be found - return mlir::failure(); -} - -// get a maximal valid tiling strategy for VF block between the given range of tiling strategy -// it returns mlir::failure() if all tiling 
strategies in this range can't be supported by all operations or operations -// can't fit in CMX -mlir::FailureOr> vpux::VPU::getMaximalValidTilingStrategyFromRange( - VPU::VerticalFusionOp op, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, - Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { - return getValidTilingStrategyFromRange(op, lowerTilingStrategy, upperTilingStrategy, true, tilingAxis, opStorage, - log); -} - -// get a minimal valid tiling strategy for VF block between the given range of tiling strategy -// it returns mlir::failure() if all tiling strategies in this range can't be supported by all operations or operations -// can't fit in CMX -mlir::FailureOr> vpux::VPU::getMinimalValidTilingStrategyFromRange( - VPU::VerticalFusionOp op, ArrayRef lowerTilingStrategy, ArrayRef upperTilingStrategy, - Dim tilingAxis, TilingOperationStorage::UPtr& opStorage, Logger log) { - return getValidTilingStrategyFromRange(op, lowerTilingStrategy, upperTilingStrategy, false, tilingAxis, opStorage, - log); -} - std::optional vpux::VPU::getVFTilingDim(ArrayRef tilingStrategy) { auto maxTiledLen = std::max_element(tilingStrategy.begin(), tilingStrategy.end()); if (maxTiledLen != tilingStrategy.end() && *maxTiledLen != 1) { @@ -613,7 +571,7 @@ bool vpux::VPU::isPrevOperationEarlyScheduled(mlir::Operation* prevOp, mlir::Ope return false; } -bool vpux::VPU::spillingCopyOpsCanBeOverlapped(VPU::ArchKind arch) { +bool vpux::VPU::spillingCopyOpsCanBeOverlapped(config::ArchKind arch) { return getDMAChannelsWithIndependentLinkAgents(arch) != SmallVector{VPUIP::DmaChannelType::NOT_SPECIFIED}; } diff --git a/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.cpp b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.cpp new file mode 100644 index 0000000000..025bffd750 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.cpp @@ -0,0 +1,37 @@ +// +// 
Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/vertical_fusion/wrap_vf_base_rewriter.hpp" +#include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" + +namespace vpux::VPU::VF { + +mlir::LogicalResult WrapVFRewriterBase::matchAndRewrite(VPU::VerticalFusionOpInterface origOp, + mlir::PatternRewriter& rewriter) const { + if (!opNeedsTobeWrapped(origOp)) { + _log.trace("Operation '{0}' at '{1}' does not need to be wrapped", origOp->getName(), origOp->getLoc()); + return mlir::failure(); + } + wrapIntoVFRegion(origOp, rewriter); + return mlir::success(); +} + +void WrapVFRewriterBase::wrapIntoVFRegion(VPU::VerticalFusionOpInterface op, mlir::PatternRewriter& rewriter) const { + const auto inputType = mlir::cast(op->getOperand(0).getType()); + const SmallVector one(inputType.getRank(), 1); + + auto tilingStrategyArray = op->hasAttr(tilingStrategy) ? mlir::cast(op->getAttr(tilingStrategy)) + : getIntArrayAttr(op->getContext(), one); + const auto bodyBuilder = [op](mlir::OpBuilder& builder, mlir::Location loc, mlir::ValueRange newOperands) { + mlir::IRMapping mapper; + mapper.map(op->getOperands(), newOperands); + auto* newOp = builder.clone(*op, mapper); + newOp->removeAttr(tilingStrategy); + builder.create(loc, newOp->getResults()); + }; + rewriter.replaceOpWithNewOp(op, op->getResultTypes(), op->getOperands(), bodyBuilder, + tilingStrategyArray); +} +} // namespace vpux::VPU::VF diff --git a/src/vpux_compiler/src/dialect/VPU/utils/weights_separation.cpp b/src/vpux_compiler/src/dialect/VPU/utils/weights_separation.cpp index 1a248f5ed4..bebc467640 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/weights_separation.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/weights_separation.cpp @@ -5,17 +5,21 @@ #include "vpux/compiler/dialect/VPU/utils/weights_separation.hpp" #include "vpux/compiler/core/force_link_macros.hpp" - +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" 
+#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/IE/utils/reshape_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/attr_interfaces.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/utils/transformations.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" +#include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" @@ -110,28 +114,26 @@ bool hasOnlyViewLikeTransformations(const Const::ContentAttr& contentAttr) { namespace conversions { // forward declarations bool isSupportedTransformation(vpux::NDTypeInterface inType, Const::TransformAttrInterface t); } +} // namespace -bool shouldProcessThisConstant(Const::DeclareOp constOp) { - // preserve splats in @main - they (should be) cheap to work with. - const auto contentAttr = constOp.getContentAttr(); - +bool isTrivialForWeightsSeparation(Const::DeclareOp constOp) { // E#151098: this should be handled the same way as view-like-only // transformations. 
+ const auto contentAttr = constOp.getContentAttr(); if (contentAttr.getTransformations().empty()) { - return false; + return true; } + return contentAttr.isSplat() || ViewLikeUtils::hasOnlyViewLikeTransformations(contentAttr); +} + +bool isSuitableForWeightsSeparation(Const::DeclareOp constOp) { // ignore all non-OV constants if (!Const::isOpenVINOConstant(constOp)) { return false; } - // splat values should be quick enough to process in main() - if (contentAttr.isSplat()) { - return false; - } - - if (ViewLikeUtils::hasOnlyViewLikeTransformations(contentAttr)) { + if (isTrivialForWeightsSeparation(constOp)) { return false; } @@ -154,6 +156,7 @@ bool shouldProcessThisConstant(Const::DeclareOp constOp) { return hasOnlySupportedTransformations(constOp); } +namespace { /// Utilities related to converting const transformations to IR operations. namespace conversions { /// Returns a QuantizeCast, optionally wrapped into Convert ops to ensure @@ -516,14 +519,6 @@ mlir::Value createMatchingIeOperation(mlir::OpBuilder& builder, mlir::Location l .Case([&](Const::TransposeAttr transpose) { return builder.create(loc, input, /*order=*/nullptr, transpose.getOrder()); }) - .Case([&](Const::SparsifyAttr sparsify) -> mlir::Value { - // it's fine if sparsity is disabled - if (!sparsify.getCompressOutputType().getValue()) { - return input; - } - - return nullptr; - }) .Default([](Const::TransformAttrInterface) { return nullptr; }); @@ -549,8 +544,8 @@ bool isSupportedTransformation(vpux::NDTypeInterface inType, Const::TransformAtt return mlir::isa(t); + Const::ScalarMultInverseAttr, Const::SubViewAttr, Const::TransposeAttr, Const::AffineReshapeAttr>( + t); } /// Returns a VPU operation for the given constant transformation. 
@@ -879,7 +874,7 @@ bool operator<(const TransformationsSplit& x, const TransformationsSplit& y) { std::vector collectMoveWorthyConstants(const Logger& log, mlir::func::FuncOp mainFunc) { std::vector ops; mainFunc.walk([&](Const::DeclareOp constOp) { - if (!shouldProcessThisConstant(constOp)) { + if (!isSuitableForWeightsSeparation(constOp)) { log.trace("Constant is NOT used in init schedule: {0}", constOp); return; } @@ -980,11 +975,18 @@ std::vector> sliceAccordingToMemoryLimit(const return constantsForInits; } +mlir::StringRef getResourceId(mlir::DenseResourceElementsAttr attr) { + auto fullKey = getResourceName(attr); + + return fullKey.drop_front(mlir::StringRef(Const::IMPORTED_WEIGHT_PREFIX).size()); +} + std::string ConstArg::getUniqueName() const { - const auto name = getResourceName(content); - assert(!name.empty() && "Weights separation only works with dense_resource<>"); + const auto id = getResourceId(content); + + assert(!id.empty() && "Weights separation only works with dense_resource<>"); const size_t hash = Const::ContentAttr::getTransformationHash(transformations); - return formatv("out_{0}_hash_{1}", name, hash).str(); + return formatv("{0}{1}_hash_{2}", INIT_OUTPUT_PREFIX, id, hash).str(); } // We want to cache the results of mapping a list of transformations to operations to avoid the call of a diff --git a/src/vpux_compiler/src/dialect/VPU/utils/wlm_constraint_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/wlm_constraint_utils.cpp index 26baa3a5c1..7b4a83f60f 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/wlm_constraint_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/wlm_constraint_utils.cpp @@ -25,7 +25,7 @@ constexpr uint32_t NPU37XX_VARIANT_COUNT = 256; constexpr uint32_t NPU37XX_KERNEL_RANGE_COUNT = 32; constexpr uint32_t NPU37XX_KERNEL_INVO_COUNT = 64; struct TaskListKey { - VPU::ArchKind archKind; + config::ArchKind archKind; VPU::TaskType taskType; bool operator==(const TaskListKey& other) const { return (archKind == 
other.archKind && taskType == other.taskType); @@ -34,7 +34,7 @@ struct TaskListKey { struct TaskListKeyHash { std::size_t operator()(const TaskListKey& key) const noexcept { auto hashTask = std::hash{}(key.taskType); - auto hashArch = std::hash{}(key.archKind); + auto hashArch = std::hash{}(key.archKind); // make sure the hash function is good enough for minimizing collision occurrence (same output // for different key values) return hashTask ^ (hashArch << 3); @@ -42,20 +42,20 @@ struct TaskListKeyHash { }; const std::unordered_map taskListsDefaultCapacityMap = { - {{VPU::ArchKind::NPU37XX, VPU::TaskType::DPUInvariant}, NPU37XX_INVARIANT_COUNT}, - {{VPU::ArchKind::NPU37XX, VPU::TaskType::DPUVariant}, NPU37XX_VARIANT_COUNT}, - {{VPU::ArchKind::NPU37XX, VPU::TaskType::ActKernelInvocation}, NPU37XX_KERNEL_INVO_COUNT}, - {{VPU::ArchKind::NPU37XX, VPU::TaskType::ActKernelRange}, NPU37XX_KERNEL_RANGE_COUNT}, - {{VPU::ArchKind::NPU37XX, VPU::TaskType::DMA}, NPU37XX_DMA_TASK_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::DPUInvariant}, NPU_DEFAULT_INVARIANT_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::DPUVariant}, NPU_DEFAULT_VARIANT_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::ActKernelInvocation}, NPU_DEFAULT_KERNEL_INVO_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::ActKernelRange}, NPU_DEFAULT_KERNEL_RANGE_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::M2I}, NPU_DEFAULT_MEDIA_COUNT}, - {{VPU::ArchKind::NPU40XX, VPU::TaskType::DMA}, NPU_DEFAULT_DMA_TASK_COUNT}, + {{config::ArchKind::NPU37XX, VPU::TaskType::DPUInvariant}, NPU37XX_INVARIANT_COUNT}, + {{config::ArchKind::NPU37XX, VPU::TaskType::DPUVariant}, NPU37XX_VARIANT_COUNT}, + {{config::ArchKind::NPU37XX, VPU::TaskType::ActKernelInvocation}, NPU37XX_KERNEL_INVO_COUNT}, + {{config::ArchKind::NPU37XX, VPU::TaskType::ActKernelRange}, NPU37XX_KERNEL_RANGE_COUNT}, + {{config::ArchKind::NPU37XX, VPU::TaskType::DMA}, NPU37XX_DMA_TASK_COUNT}, + {{config::ArchKind::NPU40XX, 
VPU::TaskType::DPUInvariant}, NPU_DEFAULT_INVARIANT_COUNT}, + {{config::ArchKind::NPU40XX, VPU::TaskType::DPUVariant}, NPU_DEFAULT_VARIANT_COUNT}, + {{config::ArchKind::NPU40XX, VPU::TaskType::ActKernelInvocation}, NPU_DEFAULT_KERNEL_INVO_COUNT}, + {{config::ArchKind::NPU40XX, VPU::TaskType::ActKernelRange}, NPU_DEFAULT_KERNEL_RANGE_COUNT}, + {{config::ArchKind::NPU40XX, VPU::TaskType::M2I}, NPU_DEFAULT_MEDIA_COUNT}, + {{config::ArchKind::NPU40XX, VPU::TaskType::DMA}, NPU_DEFAULT_DMA_TASK_COUNT}, }; -uint32_t VPU::getDefaultTaskListCount(VPU::TaskType taskType, VPU::ArchKind archKind) { +uint32_t VPU::getDefaultTaskListCount(VPU::TaskType taskType, config::ArchKind archKind) { auto taskListCapacityIter = taskListsDefaultCapacityMap.find({archKind, taskType}); VPUX_THROW_WHEN(taskListCapacityIter == taskListsDefaultCapacityMap.end(), "getDefaultTaskListCount: Unknown task type {0} for arch {1}", taskType, archKind); diff --git a/src/vpux_compiler/src/dialect/VPU/utils/workload_management_status_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/workload_management_status_utils.cpp new file mode 100644 index 0000000000..0529186f80 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPU/utils/workload_management_status_utils.cpp @@ -0,0 +1,47 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" +#include "vpux/compiler/dialect/config/IR/ops.hpp" +#include "vpux/compiler/utils/options.hpp" +#include "vpux/compiler/utils/types.hpp" +#include "vpux/utils/core/error.hpp" + +using namespace vpux; +using namespace vpux::VPU; + +WorkloadManagementStatus vpux::VPU::getWorkloadManagementStatus(mlir::ModuleOp moduleOp) { + auto pipelineOptionOp = moduleOp.lookupSymbol(PIPELINE_OPTIONS); + VPUX_THROW_WHEN(pipelineOptionOp == nullptr, "Failed to find PipelineOptions to fetch workload management status"); + + auto wlmStatusConfigOp = pipelineOptionOp.lookupSymbol(WORKLOAD_MANAGEMENT_STATUS); + VPUX_THROW_WHEN(wlmStatusConfigOp == nullptr, "Failed to find config.OptionOp to fetch workload management status"); + + auto wlmStatusString = mlir::dyn_cast(wlmStatusConfigOp.getOptionValue()); + VPUX_THROW_WHEN(wlmStatusString == nullptr, "{0} config.OptionOp is expected to be a string, got {1}", + WORKLOAD_MANAGEMENT_STATUS, wlmStatusConfigOp); + + auto wlmStatus = symbolizeWorkloadManagementStatus(wlmStatusString.getValue()); + VPUX_THROW_WHEN(!wlmStatus.has_value(), "Failed to symbolize workload management status from string '{0}'", + wlmStatusString.getValue()); + + return wlmStatus.value(); +} + +void vpux::VPU::setWorkloadManagementStatus(mlir::ModuleOp moduleOp, WorkloadManagementStatus value) { + auto context = moduleOp.getContext(); + auto pipelineOptionsOp = VPU::getPipelineOptionsOp(*context, moduleOp); + const auto attrName = mlir::StringAttr::get(context, WORKLOAD_MANAGEMENT_STATUS); + auto attrValue = mlir::StringAttr::get(context, stringifyEnum(value)); + + if (auto wlmStatusConfigOp = pipelineOptionsOp.lookupSymbol(attrName)) { + wlmStatusConfigOp.setOptionValueAttr(attrValue); + } else { + auto 
optionsBuilder = mlir::OpBuilder::atBlockBegin(&pipelineOptionsOp.getOptions().front()); + optionsBuilder.create(optionsBuilder.getUnknownLoc(), attrName, attrValue); + } +} diff --git a/src/vpux_compiler/src/dialect/VPU/utils/workload_split_utils.cpp b/src/vpux_compiler/src/dialect/VPU/utils/workload_split_utils.cpp index 0e146de2ab..488b9be905 100644 --- a/src/vpux_compiler/src/dialect/VPU/utils/workload_split_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPU/utils/workload_split_utils.cpp @@ -29,7 +29,7 @@ void addSubTensorOffset(TileInfo& tileInfo, ShapeRef tensorOffset) { int64_t computeSplitCost(const VPUIP::WorkloadSplit& split, const VPUIP::WorkloadCostParams& params, VPUNN::VPUCostModel& costModel, bool isAutopadODUEnabled, LogCb logCb) { - VPUX_THROW_WHEN(params.arch < VPU::ArchKind::NPU37XX, "Unexpected architecture {0}", params.arch); + VPUX_THROW_WHEN(params.arch < config::ArchKind::NPU37XX, "Unexpected architecture {0}", params.arch); std::vector workloadCost; workloadCost.reserve(split.size()); @@ -251,7 +251,7 @@ void splitOntoWorkloads(mlir::OpBuilder& builder, VPU::NCEOpInterface origOp, VP costParams.outputShape = outputSubTensorShapes[clusterId]; costParams.numTiles = distributionAttr.getNumClusters().getInt(); // #E129156 once with the update of VPUNN to provide MPE mode explicitly - if (costParams.arch != VPU::ArchKind::NPU40XX && + if (costParams.arch != config::ArchKind::NPU40XX && mlir::isa(origOp)) { mpeMode = origOp.getMpeMode(nullptr, nullptr, outputSubTensorShapes[clusterId]); } @@ -292,7 +292,7 @@ SmallVector getSupportedWorkloadSplitDim(VPU::NCEOpInterface nceOp, vpux:: } mlir::LogicalResult vpux::VPU::genericNCEWorkloadSplit(VPU::NCEOpInterface nceOp, mlir::PatternRewriter& rewriter, - VPU::ArchKind arch, int64_t numDPU, + config::ArchKind arch, int64_t numDPU, std::shared_ptr costModel, Logger log) { const auto mpeMode = getNCEHeuristicMPEMode(nceOp); auto params = VPU::getWorkloadCostParam(nceOp, arch, numDPU); diff --git 
a/src/vpux_compiler/src/dialect/VPUASM/ops.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops.cpp index 6fb03389ed..a15bfcdce7 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/dialect/VPUASM/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/bootstrap.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/bootstrap.cpp index aa0d6abf64..ffae091978 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/bootstrap.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/bootstrap.cpp @@ -15,14 +15,14 @@ using namespace vpux; void vpux::VPUASM::BootstrapOp::serialize(elf::writer::BinaryDataSection& binaryDataSection) { uint32_t barId = getBarrierId(); auto ptrCharTmp = reinterpret_cast(&barId); - binaryDataSection.appendData(ptrCharTmp, getBinarySize(VPU::ArchKind::UNKNOWN)); + binaryDataSection.appendData(ptrCharTmp, getBinarySize(config::ArchKind::UNKNOWN)); } -size_t vpux::VPUASM::BootstrapOp::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::BootstrapOp::getBinarySize(config::ArchKind) { return sizeof(uint32_t); } -size_t vpux::VPUASM::BootstrapOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::BootstrapOp::getAlignmentRequirements(config::ArchKind) { return alignof(uint32_t); } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/const_buffer.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/const_buffer.cpp index 3af299d657..11fa4fd6a2 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/const_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/const_buffer.cpp @@ -15,18 +15,18 @@ using namespace vpux; void VPUASM::ConstBufferOp::serialize(elf::writer::BinaryDataSection& binDataSection) { auto cnt = 
getProperties().getContent().fold(); auto ptr = binDataSection.getCurrentWriteAddr() + getMemoryOffset(); - const auto size = getBinarySize(VPU::ArchKind::UNKNOWN); + const auto size = getBinarySize(config::ArchKind::UNKNOWN); MutableArrayRef inBlobView(reinterpret_cast(ptr), reinterpret_cast(ptr) + size); cnt.copyTo(inBlobView); } -size_t VPUASM::ConstBufferOp::getBinarySize(VPU::ArchKind) { +size_t VPUASM::ConstBufferOp::getBinarySize(config::ArchKind) { auto content = getProperties().getContent(); VPUX_THROW_WHEN(content == nullptr, "This content is already deleted!"); return content.getType().getTotalAllocSize().count(); } -size_t VPUASM::ConstBufferOp::getAlignmentRequirements(VPU::ArchKind) { +size_t VPUASM::ConstBufferOp::getAlignmentRequirements(config::ArchKind) { // TODO: E#59169 measure if weights alignment has any impact on performance. return ELF::VPUX_DEFAULT_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_buffer.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_buffer.cpp index c7d39c2b31..d24152a0f2 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_buffer.cpp @@ -12,12 +12,12 @@ using namespace vpux; // DeclareBufferOp // -size_t VPUASM::DeclareBufferOp::getBinarySize(VPU::ArchKind) { +size_t VPUASM::DeclareBufferOp::getBinarySize(config::ArchKind) { const auto type = mlir::cast(getBufferType().getMemref()); return ELF::getOpBinarySize(type); } -size_t VPUASM::DeclareBufferOp::getAlignmentRequirements(VPU::ArchKind) { +size_t VPUASM::DeclareBufferOp::getAlignmentRequirements(config::ArchKind) { // DeclareBuffers are addressed by the mem-schedulers, so can't override anything return ELF::VPUX_NO_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_data.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_data.cpp index 963767b0dc..e09c61c686 100644 --- 
a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_data.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_data.cpp @@ -20,12 +20,12 @@ void vpux::VPUASM::DeclareKernelDataOp::serialize(elf::writer::BinaryDataSection binDataSection.appendData(data.data(), data.size()); } -size_t vpux::VPUASM::DeclareKernelDataOp::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::DeclareKernelDataOp::getBinarySize(config::ArchKind) { return vpux::ELF::getKernelELF(getOperation(), getKernelPath(), {".data", ".arg.data"}).size(); } // The .data sections for the sw layers must be 1kB aligned as an ActShave requirement -size_t vpux::VPUASM::DeclareKernelDataOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::DeclareKernelDataOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_SHAVE_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_text.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_text.cpp index 34045a3a1c..a155c92b28 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_text.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_kernel_text.cpp @@ -19,12 +19,12 @@ void vpux::VPUASM::DeclareKernelTextOp::serialize(elf::writer::BinaryDataSection binDataSection.appendData(text.data(), text.size()); } -size_t vpux::VPUASM::DeclareKernelTextOp::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::DeclareKernelTextOp::getBinarySize(config::ArchKind) { return vpux::ELF::getKernelELF(getOperation(), getKernelPath(), {".text"}).size(); } // The .text sections for the sw layers must be 1kB aligned as an ActShave requirement -size_t vpux::VPUASM::DeclareKernelTextOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::DeclareKernelTextOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_SHAVE_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_task_buffer_op.cpp 
b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_task_buffer_op.cpp index 62c3637db5..2da956b1bc 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/declare_task_buffer_op.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/declare_task_buffer_op.cpp @@ -14,7 +14,7 @@ using namespace vpux; // DeclareTaskBufferOp // -size_t VPUASM::DeclareTaskBufferOp::getBinarySize([[maybe_unused]] VPU::ArchKind arch) { +size_t VPUASM::DeclareTaskBufferOp::getBinarySize([[maybe_unused]] config::ArchKind arch) { switch (getTaskType()) { case VPURegMapped::TaskType::DMA: return sizeof(npu40xx::nn_public::VpuDMATask); @@ -33,7 +33,7 @@ size_t VPUASM::DeclareTaskBufferOp::getBinarySize([[maybe_unused]] VPU::ArchKind } } -size_t VPUASM::DeclareTaskBufferOp::getAlignmentRequirements(VPU::ArchKind) { +size_t VPUASM::DeclareTaskBufferOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_NO_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/dma.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/dma.cpp index 64a867d31a..81fc5db42e 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/dma.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUASM/utils.hpp" #include "vpux/compiler/utils/ELF/utils.hpp" +#include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/dpu.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/dpu.cpp index 75e4032094..c577cb40f9 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/dpu.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/dpu.cpp @@ -37,11 +37,11 @@ bool vpux::VPUASM::DPUVariantOp::hasMemoryFootprint() { return true; } -size_t vpux::VPUASM::DPUInvariantOp_37XX::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::DPUInvariantOp_37XX::getBinarySize(config::ArchKind) { return sizeof(npu40xx::nn_public::VpuDPUInvariant); } -size_t 
vpux::VPUASM::DPUInvariantOp_37XX::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::DPUInvariantOp_37XX::getAlignmentRequirements(config::ArchKind) { return alignof(npu40xx::nn_public::VpuDPUInvariant); } @@ -57,11 +57,11 @@ bool vpux::VPUASM::DPUInvariantOp_37XX::hasMemoryFootprint() { return true; } -size_t vpux::VPUASM::DPUVariantOp_37XX::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::DPUVariantOp_37XX::getBinarySize(config::ArchKind) { return sizeof(npu40xx::nn_public::VpuDPUVariant); } -size_t vpux::VPUASM::DPUVariantOp_37XX::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::DPUVariantOp_37XX::getAlignmentRequirements(config::ArchKind) { return alignof(npu40xx::nn_public::VpuDPUVariant); } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/kernel_params.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/kernel_params.cpp index 22d4d9a431..d624263350 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/kernel_params.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/kernel_params.cpp @@ -114,7 +114,7 @@ void vpux::VPUASM::KernelParamsOp::serializeCached(elf::writer::BinaryDataSectio return; } -size_t vpux::VPUASM::KernelParamsOp::getBinarySizeCached(ELF::SymbolReferenceMap& symRefMap, VPU::ArchKind) { +size_t vpux::VPUASM::KernelParamsOp::getBinarySizeCached(ELF::SymbolReferenceMap& symRefMap, config::ArchKind) { auto actualParamsSize = getParamsStructSize(); if (getIsJitCompiled()) { return actualParamsSize; @@ -157,7 +157,7 @@ size_t vpux::VPUASM::KernelParamsOp::getParamsStructSize() { } // The parameter structs for the sw layers must be 64Byte aligned as an ActShave requirement -size_t vpux::VPUASM::KernelParamsOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::KernelParamsOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_DEFAULT_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/mapped_inference.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/mapped_inference.cpp index 
546ad001fb..46d82f9a16 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/mapped_inference.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/mapped_inference.cpp @@ -31,11 +31,11 @@ bool vpux::VPUASM::MappedInferenceOp::hasMemoryFootprint() { // MappedInferenceOp_37XX // -size_t vpux::VPUASM::MappedInferenceOp_37XX::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::MappedInferenceOp_37XX::getBinarySize(config::ArchKind) { return sizeof(npu40xx::nn_public::VpuMappedInference); } -size_t vpux::VPUASM::MappedInferenceOp_37XX::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::MappedInferenceOp_37XX::getAlignmentRequirements(config::ArchKind) { return alignof(npu40xx::nn_public::VpuMappedInference); } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/network_metadata.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/network_metadata.cpp index 304ab8b480..ba5f2c538a 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/network_metadata.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/network_metadata.cpp @@ -28,7 +28,7 @@ void vpux::VPUASM::NetworkMetadataOp::serialize(elf::writer::BinaryDataSection #include #include "vpux/compiler/dialect/VPUASM/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/ELF/utils.hpp" #include "vpux_headers/platform.hpp" @@ -14,23 +15,23 @@ using namespace vpux; void vpux::VPUASM::PlatformInfoOp::serialize(elf::writer::BinaryDataSection& binDataSection) { elf::platform::PlatformInfo platformInfo; - platformInfo.mArchKind = ELF::mapVpuArchKindToElfArchKind(VPU::getArch(getOperation())); + platformInfo.mArchKind = ELF::mapVpuArchKindToElfArchKind(config::getArch(getOperation())); auto serializedPlatformInfo = elf::platform::PlatformInfoSerialization::serialize(platformInfo); binDataSection.appendData(&serializedPlatformInfo[0], serializedPlatformInfo.size()); } -size_t vpux::VPUASM::PlatformInfoOp::getBinarySize(VPU::ArchKind) { +size_t 
vpux::VPUASM::PlatformInfoOp::getBinarySize(config::ArchKind) { // calculate size based on serialized form, instead of just sizeof(PlatformInfo) // serialization uses metadata that also gets stored in the blob and must be accounted for // also for non-POD types (e.g. have vector as member) account for all data to be serialized // (data owned by vector, instead of just pointer) elf::platform::PlatformInfo platformInfo; - platformInfo.mArchKind = ELF::mapVpuArchKindToElfArchKind(VPU::getArch(getOperation())); + platformInfo.mArchKind = ELF::mapVpuArchKindToElfArchKind(config::getArch(getOperation())); return elf::platform::PlatformInfoSerialization::serialize(platformInfo).size(); } -size_t vpux::VPUASM::PlatformInfoOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::PlatformInfoOp::getAlignmentRequirements(config::ArchKind) { return alignof(elf::platform::PlatformInfo); } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/profiling_metadata.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/profiling_metadata.cpp index d3a50447b6..839e1617cc 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/ops/profiling_metadata.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/profiling_metadata.cpp @@ -14,12 +14,12 @@ void vpux::VPUASM::ProfilingMetadataOp::serialize(elf::writer::BinaryDataSection binDataSection.appendData(reinterpret_cast(buf.data()), buf.size()); } -size_t vpux::VPUASM::ProfilingMetadataOp::getBinarySize(VPU::ArchKind) { +size_t vpux::VPUASM::ProfilingMetadataOp::getBinarySize(config::ArchKind) { auto values = getMetadata().getValues(); return values.size(); } -size_t vpux::VPUASM::ProfilingMetadataOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::ProfilingMetadataOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_NO_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/ops/shave_stack_frames.cpp b/src/vpux_compiler/src/dialect/VPUASM/ops/shave_stack_frames.cpp index 19972ba963..7b2da67fba 100644 --- 
a/src/vpux_compiler/src/dialect/VPUASM/ops/shave_stack_frames.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/ops/shave_stack_frames.cpp @@ -12,11 +12,11 @@ using namespace vpux; // ShaveStackFrameOp // -size_t vpux::VPUASM::ShaveStackFrameOp::getBinarySizeCached(ELF::SymbolReferenceMap&, VPU::ArchKind) { +size_t vpux::VPUASM::ShaveStackFrameOp::getBinarySizeCached(ELF::SymbolReferenceMap&, config::ArchKind) { return getStackSize(); } -size_t vpux::VPUASM::ShaveStackFrameOp::getAlignmentRequirements(VPU::ArchKind) { +size_t vpux::VPUASM::ShaveStackFrameOp::getAlignmentRequirements(config::ArchKind) { return ELF::VPUX_DEFAULT_ALIGNMENT; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/passses/add_profiling_section.cpp b/src/vpux_compiler/src/dialect/VPUASM/passses/add_profiling_section.cpp index 7b7cce1d00..986dba8dae 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/passses/add_profiling_section.cpp +++ b/src/vpux_compiler/src/dialect/VPUASM/passses/add_profiling_section.cpp @@ -3,10 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/dialect/VPUASM/dialect.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUASM/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/passes.hpp" @@ -34,7 +36,7 @@ void AddProfilingSection::safeRunOnModule() { net::NetworkInfoOp netInfo; mlir::func::FuncOp netFunc; net::NetworkInfoOp::getFromModule(moduleOp, netInfo, netFunc); - const auto arch = VPU::getArch(moduleOp); + const auto arch = config::getArch(moduleOp); if (netInfo.getProfilingOutputsInfo().empty()) { return; } diff --git a/src/vpux_compiler/src/dialect/VPUASM/utils.cpp b/src/vpux_compiler/src/dialect/VPUASM/utils.cpp index c3a1a23ddb..ab8ee89c37 100644 --- a/src/vpux_compiler/src/dialect/VPUASM/utils.cpp +++ 
b/src/vpux_compiler/src/dialect/VPUASM/utils.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUASM/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.hpp" @@ -96,7 +97,7 @@ SparsityMap getSparsityMapBuffTileMask(VPUASM::NNDMAOp dmaOp, ELF::SymbolReferen if (auto buffer = mlir::dyn_cast_if_present(sparsityMapBufferRef)) { sparsityMap.tileSelectMaskForBuffer = getTileSelectMaskForBuffer(buffer); - sparsityMap.size = buffer.getBinarySize(VPU::getArch(dmaOp)); + sparsityMap.size = buffer.getBinarySize(config::getArch(dmaOp)); } } return sparsityMap; diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/dialect.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/dialect.cpp index e5b25d7900..039a45b0d3 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/dialect.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/dialect.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/types.hpp" @@ -17,6 +16,7 @@ #include "vpux/compiler/dialect/core/IR/dialect.hpp" #include "vpux/compiler/dialect/core/IR/unified_func_inliner_interface.hpp" #include "vpux/compiler/dialect/net/IR/dialect.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/dialect_interfaces.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/dialect_interfaces.cpp index 82b5bfc5f9..7d635b6c3c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/dialect_interfaces.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/dialect_interfaces.cpp @@ -5,6 +5,8 @@ #include "vpux/compiler/dialect/VPUIP/IR/dialect_interfaces.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include 
"vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/const/ops.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops.cpp index e48fd497ad..a3b5185f0c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops.cpp @@ -4,8 +4,10 @@ // #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" @@ -14,26 +16,21 @@ #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect_interfaces.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" -#include "vpux/compiler/dialect/core/IR/attributes.hpp" #include "vpux/compiler/dialect/core/IR/dialect.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" - #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/asm.hpp" #include "vpux/utils/core/numeric.hpp" +#include #include #include #include -#include -#include - using namespace vpux; namespace { @@ -281,6 +278,10 @@ bool 
isSupportedIsolatedTilingEltwise(mlir::Operation* origOp, const OutputTilin const auto input1Type = mlir::cast(origOp->getOperand(0).getType()); const auto input2Type = mlir::cast(origOp->getOperand(1).getType()); const auto outputType = mlir::cast(origOp->getResult(0).getType()); + const auto isValidTile = [](auto dim) { + return dim > 1; + }; + return llvm::all_of(tiles, [&](const TileInfo& tile) { const auto input1TileType = input1Type.extractDenseTile(tile.offsets, tile.shape); const auto input2TileType = input2Type.extractDenseTile(tile.offsets, tile.shape); @@ -300,13 +301,28 @@ bool isSupportedIsolatedTilingEltwise(mlir::Operation* origOp, const OutputTilin clusteredOp, outputTileType.getShape(), mlir::cast(clusteredOp->getAttr(VPU::multiClusterStrategy)) .getValue()); + auto input1DistrType = VPU::getDistributedActivationTypeFromOp( + clusteredOp, origOp->getOperand(0), input1TileType, numClusters, outputTileType, tile); + auto input2DistrType = input1DistrType; + if (input1TileType.getShape() != input2TileType.getShape()) { + input2DistrType = VPU::getDistributedActivationTypeFromOp( + clusteredOp, origOp->getOperand(1), input2TileType, numClusters, outputTileType, tile); + } + + const auto multiClusterStrategy = clusteredOp.getMultiClusterStrategy().value(); + const auto tensorNumTiles = + getOutputTensorNumTiles(clusteredOp, numClusters, multiClusterStrategy, outputTileType); + const auto tensorDistributionMode = + getOutputTensorDistributionMode(clusteredOp, multiClusterStrategy, outputTileType); + + if ((VPU::bitEnumContainsAny(tensorDistributionMode, VPU::DistributionMode::SEGMENTED) || + VPU::bitEnumContainsAny(tensorDistributionMode, VPU::DistributionMode::OVERLAPPED)) && + llvm::count_if(tensorNumTiles, isValidTile) != 1) { + return false; + } return mlir::succeeded(VPU::NCEEltwiseOp::verifyEltwiseCMX( - origOp->getLoc(), module, isInplace, - VPU::getDistributedActivationTypeFromOp(clusteredOp, origOp->getOperand(0), input1TileType, - numClusters, 
outputTileType, tile), - VPU::getDistributedActivationTypeFromOp(clusteredOp, origOp->getOperand(1), input2TileType, - numClusters, outputTileType, tile), + origOp->getLoc(), module, isInplace, input1DistrType, input2DistrType, VPU::getDistributedOutputTypeFromOp(clusteredOp, outputTileType, numClusters, {input1TileType, input2TileType}))); } @@ -487,38 +503,104 @@ bool isSupportedIsolatedTilingGRUSequenceLastPart(VPU::GRUSequenceLastPartOp op, }); } +SmallVector getAllOperandsGatherDMAOp(VPU::GatherDMAOp origOp, const TileInfo& outputTile, + Logger log) { + vpux::OutputTiling inputTiles{outputTile}; + if (auto tilingBuilderInterface = mlir::dyn_cast(origOp.getOperation())) { + inputTiles = tilingBuilderInterface.backInferTileInfo(outputTile, log).tiles; + } + + VPUX_THROW_UNLESS(inputTiles.size() == origOp->getOperands().size(), + "Unexpected inputTile size '{0}' and Op operands size '{1}'", inputTiles.size(), + origOp->getOperands().size()); + + mlir::SmallVector inputTileTypes; + const auto inputType = mlir::cast(origOp.getIndices().getType()); + inputTileTypes.push_back(inputType.extractDenseTile(inputTiles[1].offsets, inputTiles[1].shape)); + + auto valueTypes = inputTileTypes; + mlir::SmallVector outputTileTypes; + const auto outputType = mlir::cast(origOp.getOutput().getType()); + const auto outputTileType = outputType.extractDenseTile(outputTile.offsets, outputTile.shape); + outputTileTypes.push_back(outputTileType); + valueTypes.push_back(outputTileType); + + if (!origOp->hasAttr(VPU::multiClusterStrategy)) { + return valueTypes; + } + + auto clusteredOp = mlir::dyn_cast(origOp.getOperation()); + VPUX_THROW_WHEN(clusteredOp == nullptr, "Op {0} has multiClusterStrategy but is not an ClusteredOp", + origOp->getLoc()); + auto numClusters = VPU::getOptimalNumClusters(clusteredOp, outputTileTypes[0].getShape(), + clusteredOp.getMultiClusterStrategy().value()); + + if (!llvm::all_of(outputTileTypes, [&](const vpux::NDTypeInterface& outputTileType) { + auto 
numClustersOfPerOutput = VPU::getOptimalNumClusters(clusteredOp, outputTileType.getShape(), + clusteredOp.getMultiClusterStrategy().value()); + return numClustersOfPerOutput == numClusters; + })) { + return SmallVector{}; + } + + SmallVector distributedTensorTypes; + auto inDistributedType = + getDistributedActivationTypeFromOp(clusteredOp, clusteredOp->getOperand(1), inputTileTypes[0], numClusters, + VPU::MultiClusterStrategy::Clustering, + /*customAlignment*/ ArrayRef{}, outputTileTypes[0], outputTile); + distributedTensorTypes.push_back(mlir::cast(inDistributedType)); + + for (const auto& outputTileType : outputTileTypes) { + auto outDistributedType = + VPU::getDistributedOutputTypeFromOp(clusteredOp, outputTileType, numClusters, inputTileTypes); + distributedTensorTypes.push_back(mlir::cast(outDistributedType)); + } + + return distributedTensorTypes; +} + bool isSupportedIsolatedTilingGatherDMA(VPU::GatherDMAOp op, const OutputTiling& tiles, Logger log) { const auto origOp = op.getOperation(); auto tilingOp = mlir::dyn_cast(origOp); VPUX_THROW_UNLESS(tilingOp != nullptr, "Not a tileable operation {0}", origOp->getName()); - const auto cmxAvailableBytes = vpux::VPU::getTotalCMXSize(origOp).to().count(); + if (!origOp->hasAttr(VPU::multiClusterStrategy)) { + const auto cmxAvailableBytes = vpux::VPU::getTotalCMXSize(origOp).to().count(); - const auto inputOutputTilesFitCMX = [&](const TileInfo& firstOutputTile) { - const auto computeRequiredMemory = [&](const auto& operand, const TileInfo& tilingInfo) { - const auto tensorType = mlir::cast(operand.getType()); - const auto denseTile = tensorType.extractDenseTile(tilingInfo.offsets, tilingInfo.shape); - return denseTile.getTotalAllocSize().count(); - }; + const auto inputOutputTilesFitCMX = [&](const TileInfo& firstOutputTile) { + const auto computeRequiredMemory = [&](const auto& operand, const TileInfo& tilingInfo) { + const auto tensorType = mlir::cast(operand.getType()); + const auto denseTile = 
tensorType.extractDenseTile(tilingInfo.offsets, tilingInfo.shape); + return denseTile.getTotalAllocSize().count(); + }; - const auto inputTilingInfo = tilingOp.backInferTileInfo(firstOutputTile, log); - const auto indicesMemorySize = computeRequiredMemory(op.getIndices(), inputTilingInfo.tiles[1]); + const auto inputTilingInfo = tilingOp.backInferTileInfo(firstOutputTile, log); + const auto indicesMemorySize = computeRequiredMemory(op.getIndices(), inputTilingInfo.tiles[1]); - const auto outputTiles = tilingOp.getOutputTiling(firstOutputTile, log); - const auto outputMemorySize = computeRequiredMemory(op.getOutput(), outputTiles[0]); - // For gather DMA only indices and output are copy to CMX. - const auto requiredCMX = indicesMemorySize + outputMemorySize; + const auto outputTiles = tilingOp.getOutputTiling(firstOutputTile, log); + const auto outputMemorySize = computeRequiredMemory(op.getOutput(), outputTiles[0]); + // For gather DMA only indices and output are copy to CMX. + const auto requiredCMX = indicesMemorySize + outputMemorySize; - if (requiredCMX > cmxAvailableBytes) { - log.trace("Op '{0}' doesn't fit into CMX: required {1}, available {2}", origOp->getLoc(), requiredCMX, - cmxAvailableBytes); - return false; - } + if (requiredCMX > cmxAvailableBytes) { + log.trace("Op '{0}' doesn't fit into CMX: required {1}, available {2}", origOp->getLoc(), requiredCMX, + cmxAvailableBytes); + return false; + } - return true; - }; + return true; + }; - return llvm::all_of(tiles, inputOutputTilesFitCMX); + return llvm::all_of(tiles, inputOutputTilesFitCMX); + } + + return llvm::all_of(tiles, [&](const TileInfo& outputTile) { + SmallVector operands = getAllOperandsGatherDMAOp(op, outputTile, log); + if (operands.empty()) { + return false; + } + return op.fitIntoCMX(operands, Byte(0)); + }); } bool isSupportedIsolatedTilingGeneric(mlir::Operation* origOp, const OutputTiling& firstOutputTiles, Logger log) { @@ -782,18 +864,29 @@ bool 
isSupportedPrefetchTilingConvBased(ConcreteOp origOp, const OutputTiling& t return checkPrefetchMem(origOp.getOperation(), tiles, log); }; - // neutral tiling check - if (tileDims.empty() && tilingMode == vpux::TilingMode::PREFETCHING) { - return isMemPrefetchable(); + // Neutral tiling check + if (tileDims.empty()) { + if (tilingMode == vpux::TilingMode::PREFETCHING) { + return isMemPrefetchable(); + } + return false; } - // Prefetch tiling is only triggered when the isolated tiling is not nested - if (tileDims.size() != 1) { + if (!isMemPrefetchable()) { return false; } - auto tileDim = tileDims[0]; - return VPU::isDivisibleTile(origOp.getOperation(), tileAxis, tileDim) && isMemPrefetchable() && - !isLastTileBiggest(origOp.getOperation(), tileAxis, outputShape, tileDim); + + for (auto tileDim : tileDims) { + if (!VPU::isDivisibleTile(origOp.getOperation(), tileAxis, tileDim)) { + return false; + } + + if (isLastTileBiggest(origOp.getOperation(), tileAxis, outputShape, tileDim)) { + return false; + } + } + + return true; } bool isSupportedPrefetchTiling(VPU::NCEConvolutionOp origOp, const OutputTiling& tiles, Logger log, diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/copy.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/copy.cpp index eaecbab3fa..a4262f7ce7 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/copy.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/copy.cpp @@ -6,6 +6,8 @@ #include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -14,7 +16,7 @@ size_t vpux::VPUIP::CopyOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dma.cpp index 
b192e91b27..48ac9c1f78 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dma.cpp @@ -3,23 +3,25 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/transforms/factories/gather_dma_constants.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/compression_utils.hpp" #include "vpux/compiler/utils/dma_limits.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/checked_cast.hpp" -#include "vpux/utils/core/error.hpp" + +#include using namespace vpux; namespace { -mlir::LogicalResult verifyTensorSize(VPU::ArchKind arch, mlir::Location loc, mlir::Value tensor) { +mlir::LogicalResult verifyTensorSize(config::ArchKind arch, mlir::Location loc, mlir::Value tensor) { const auto size = static_cast(getCompactSize(tensor)); auto maxTransferSize = VPUIP::DMA::getEngineLimits(arch).getTransferLimits().max(); @@ -165,7 +167,7 @@ mlir::LogicalResult vpux::VPUIP::NNDMAOp::verify() { } } - return verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()); + return verifyTensorSize(config::getArch(getOperation()), loc, getInput()); } size_t vpux::VPUIP::NNDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -173,7 +175,7 @@ size_t vpux::VPUIP::NNDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -187,7 +189,7 @@ void vpux::VPUIP::PermuteDMAOp::build(mlir::OpBuilder& builder, mlir::OperationS build(builder, state, input, output_buff, /*port=*/nullptr, /*is_out_of_order=*/nullptr, /*is_critical=*/nullptr, mem_perm, dma_descriptor, /* dma_hwp_id= */ nullptr, - /* profilingMetadata */ nullptr); + /* 
profilingMetadata */ nullptr, /*internalDataFlow=*/nullptr); } void vpux::VPUIP::PermuteDMAOp::build(mlir::OpBuilder& builder, mlir::OperationState& state, mlir::Value input, @@ -195,11 +197,12 @@ void vpux::VPUIP::PermuteDMAOp::build(mlir::OpBuilder& builder, mlir::OperationS VPUIP::DMADescriptorAttr dma_descriptor, mlir::IntegerAttr port) { build(builder, state, input, output_buff, /*port=*/port, /*is_out_of_order=*/nullptr, - /*is_critical=*/nullptr, mem_perm, dma_descriptor, nullptr, /* profilingMetadata */ nullptr); + /*is_critical=*/nullptr, mem_perm, dma_descriptor, nullptr, /* profilingMetadata= */ nullptr, + /*internalDataFlow=*/nullptr); } mlir::LogicalResult vpux::VPUIP::PermuteDMAOp::verify() { - return verifyTensorSize(VPU::getArch(getOperation()), getLoc(), getInput()); + return verifyTensorSize(config::getArch(getOperation()), getLoc(), getInput()); } size_t vpux::VPUIP::PermuteDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -209,7 +212,7 @@ size_t vpux::VPUIP::PermuteDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -240,15 +243,15 @@ void vpux::VPUIP::GatherDMAOp::build(mlir::OpBuilder& builder, mlir::OperationSt mlir::LogicalResult vpux::VPUIP::GatherDMAOp::verify() { auto loc = getLoc(); - auto arch = VPU::getArch(getOperation()); + auto arch = config::getArch(getOperation()); // Skip checks if architecture is unknown, enables LIT tests. 
- if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } // TODO: E#-86281 move to 40xx - if (arch < VPU::ArchKind::NPU40XX) { + if (arch < config::ArchKind::NPU40XX) { return errorAt(loc, "Operation {0} is only supported for NPU40XX+ but got {1}.", getOperationName(), arch); } @@ -269,7 +272,7 @@ mlir::LogicalResult vpux::VPUIP::GatherDMAOp::verify() { DMA_MAX_INDICES_LIST_LENGTH_ARCH_BASED); } - return verifyTensorSize(VPU::getArch(getOperation()), loc, getOutput()); + return verifyTensorSize(config::getArch(getOperation()), loc, getOutput()); } size_t vpux::VPUIP::GatherDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -279,7 +282,7 @@ size_t vpux::VPUIP::GatherDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -304,10 +307,10 @@ void vpux::VPUIP::ConvertDMAOp::build(mlir::OpBuilder& builder, mlir::OperationS mlir::LogicalResult vpux::VPUIP::ConvertDMAOp::verify() { auto loc = getLoc(); - auto arch = VPU::getArch(getOperation()); + auto arch = config::getArch(getOperation()); // Skip checks if architecture is unknown since all of them depend on the architecture used - if (arch == VPU::ArchKind::UNKNOWN) { + if (arch == config::ArchKind::UNKNOWN) { return mlir::success(); } @@ -316,7 +319,7 @@ mlir::LogicalResult vpux::VPUIP::ConvertDMAOp::verify() { auto inputType = mlir::cast(getInput().getType()); const auto inputElementType = inputType.getElementType(); - if ((arch < VPU::ArchKind::NPU40XX) || !inputElementType.isF32() || + if ((arch < config::ArchKind::NPU40XX) || !inputElementType.isF32() || (!outputElementType.isF16() && !outputElementType.isBF16())) { return errorAt(loc, "Operation {0} is only supported for NPU40XX+ arch for F32 to F16/BF16 " @@ -326,7 +329,7 @@ mlir::LogicalResult vpux::VPUIP::ConvertDMAOp::verify() { getOperationName(), arch, inputElementType, outputElementType); } - return 
verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()); + return verifyTensorSize(config::getArch(getOperation()), loc, getInput()); } size_t vpux::VPUIP::ConvertDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -336,7 +339,7 @@ size_t vpux::VPUIP::ConvertDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -422,7 +425,7 @@ mlir::LogicalResult vpux::VPUIP::DecompressDMAOp::verify() { } if (mlir::failed(verifyInOutElementType(loc, getInput(), getOutput())) || - mlir::failed(verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()))) { + mlir::failed(verifyTensorSize(config::getArch(getOperation()), loc, getInput()))) { return mlir::failure(); } @@ -441,7 +444,7 @@ size_t vpux::VPUIP::DecompressDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -492,7 +495,7 @@ mlir::LogicalResult vpux::VPUIP::CompressDMAOp::verify() { } if (mlir::failed(verifyInOutElementType(loc, getInput(), getOutput())) || - mlir::failed(verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()))) { + mlir::failed(verifyTensorSize(config::getArch(getOperation()), loc, getInput()))) { return mlir::failure(); } @@ -511,7 +514,7 @@ size_t vpux::VPUIP::CompressDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -545,7 +548,7 @@ mlir::LogicalResult vpux::VPUIP::DepthToSpaceDMAOp::verify() { return errorAt(loc, "block size for D2S should be > 0; actual {0}", blockSize); } - return verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()); + return verifyTensorSize(config::getArch(getOperation()), loc, getInput()); } size_t vpux::VPUIP::DepthToSpaceDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -555,7 +558,7 @@ size_t vpux::VPUIP::DepthToSpaceDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } 
@@ -587,7 +590,7 @@ mlir::LogicalResult vpux::VPUIP::SpaceToDepthDMAOp::verify() { return errorAt(loc, "block size for S2D should be > 0; actual {0}", blockSize); } - return verifyTensorSize(VPU::getArch(getOperation()), loc, getInput()); + return verifyTensorSize(config::getArch(getOperation()), loc, getInput()); } size_t vpux::VPUIP::SpaceToDepthDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -597,7 +600,7 @@ size_t vpux::VPUIP::SpaceToDepthDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -630,7 +633,7 @@ mlir::LogicalResult vpux::VPUIP::ExpandDMAOp::verify() { // It should be tiled with several sub ExpandDMA that will be done at Unroll Pass. // Descriptor is generated at Unroll pass so using Descriptor as a flag to check the tensor size. if (getDmaDescriptor().has_value()) { - return verifyTensorSize(VPU::getArch(getOperation()), getLoc(), getInput()); + return verifyTensorSize(config::getArch(getOperation()), getLoc(), getInput()); } return mlir::success(); @@ -643,7 +646,7 @@ size_t vpux::VPUIP::ExpandDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -669,7 +672,7 @@ void vpux::VPUIP::PerAxisTileDMAOp::build(mlir::OpBuilder& odsBuilder, mlir::Ope } mlir::LogicalResult vpux::VPUIP::PerAxisTileDMAOp::verify() { - return verifyTensorSize(VPU::getArch(getOperation()), getLoc(), getInput()); + return verifyTensorSize(config::getArch(getOperation()), getLoc(), getInput()); } size_t vpux::VPUIP::PerAxisTileDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { @@ -679,7 +682,7 @@ size_t vpux::VPUIP::PerAxisTileDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -722,7 +725,7 @@ mlir::LogicalResult vpux::VPUIP::UpsamplingDMAOp::verify() { // It should be tiled with several sub UpsamplingDMA that will be done at Unroll Pass. 
// Descriptor is generated at Unroll pass so using Descriptor as a flag to check the tensor size. if (getDmaDescriptor().has_value()) { - return verifyTensorSize(VPU::getArch(getOperation()), getLoc(), getInput()); + return verifyTensorSize(config::getArch(getOperation()), getLoc(), getInput()); } return mlir::success(); @@ -735,7 +738,7 @@ size_t vpux::VPUIP::UpsamplingDMAOp::getOperationCycleCost(std::shared_ptr(getDMACost(getInput(), getOutput(), arch, costModel, numDMAPorts)); } @@ -761,7 +764,7 @@ mlir::LogicalResult vpux::VPUIP::ReadOnlyDMAOp::verify() { if (!getResult().use_empty()) { return errorAt(loc, "ReadOnlyDMAOp result should have no users, but it does."); } - return verifyTensorSize(VPU::getArch(getOperation()), getLoc(), getInput()); + return verifyTensorSize(config::getArch(getOperation()), getLoc(), getInput()); } size_t vpux::VPUIP::ReadOnlyDMAOp::getOperationCycleCost(std::shared_ptr&) { @@ -791,7 +794,73 @@ mlir::LogicalResult vpux::VPUIP::BarProgDMAOp::verify() { size_t vpux::VPUIP::BarProgDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { auto module = getOperation()->getParentOfType(); + // TODO: E#97004 Expose API to get arch from cost model + const auto arch = config::getArch(module); + return checked_cast(getDMACost(getInput(), getOutput(), arch, costModel)); +} + +// +// EnqueueDMAOp +// + +mlir::LogicalResult vpux::VPUIP::EnqueueDMAOp::verify() { + auto loc = getLoc(); + auto enqueueDmaAttr = getEnqueueDmaAttr(); + + auto startTaskIdx = enqueueDmaAttr.getStartTaskIdx().getValue().getSExtValue(); + auto endTaskIdx = enqueueDmaAttr.getEndTaskIdx().getValue().getSExtValue(); + + if (endTaskIdx < startTaskIdx) { + return errorAt(loc, "endTaskIdx: {0} - must be greater than or equal to startTaskIdx: {1}", endTaskIdx, + startTaskIdx); + } + + if (enqueueDmaAttr.getTargetExecutorKindAttr().getValue() == VPU::ExecutorKind::DMA_NN) { + return errorAt(loc, "DMA tasks cannot be enqueued by enqueue DMA"); + } + + if (auto parentModule = 
getOperation()->getParentOfType()) { + if (auto tileExecutorOp = IE::getTileExecutor(parentModule)) { + const auto tilesCount = tileExecutorOp.getCount(); + int64_t tileIdx = enqueueDmaAttr.getTileIdx().getValue().getSExtValue(); + if (tileIdx < 0 || tileIdx >= tilesCount) { + return errorAt(loc, "tileIdx: {0} - must be in range [0, {1}]", tileIdx, tilesCount); + } + } + } + + return mlir::success(); +} + +size_t vpux::VPUIP::EnqueueDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { + auto module = getOperation()->getParentOfType(); + + // TODO: E#97004 Expose API to get arch from cost model + const auto arch = config::getArch(module); + return checked_cast(getDMACost(getInput(), getOutput(), arch, costModel)); +} + +// +// FetchDMAOp +// + +mlir::LogicalResult vpux::VPUIP::FetchDMAOp::verify() { + auto parentModule = getOperation()->getParentOfType(); + const auto tilesCount = IE::getTileExecutor(parentModule).getCount(); + auto loc = getLoc(); + + int64_t tileIdx = getFetchDma().getTileIdx().getValue().getSExtValue(); + if (tileIdx < 0 || tileIdx >= tilesCount) { + return errorAt(loc, "tileIdx: {0} must be in range [0, {1}]", tileIdx, tilesCount); + } + + return mlir::success(); +} + +size_t vpux::VPUIP::FetchDMAOp::getOperationCycleCost(std::shared_ptr& costModel) { + auto module = getOperation()->getParentOfType(); + // TODO: Expose API to get arch from cost model - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); return checked_cast(getDMACost(getInput(), getOutput(), arch, costModel)); } diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dpu_task.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dpu_task.cpp index 2a07e31bc3..eaf3b8a6c3 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dpu_task.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/dpu_task.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -54,7 +55,7 @@ size_t vpux::VPUIP::DPUTaskOp::getOperationCycleCost(std::shared_ptrgetParentOfType(); // TODO: Expose API to get arch from cost model - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); vpux::Logger log = Logger::global(); return checked_cast(getDPUTaskOpCost(dpuTaskOp, costModel, arch, log)); diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/nce_cluster_task.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/nce_cluster_task.cpp index 8a5db55068..71d1a5cc66 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/nce_cluster_task.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/nce_cluster_task.cpp @@ -16,7 +16,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -315,7 +315,7 @@ size_t vpux::VPUIP::NCEClusterTaskOp::getOperationCycleCost(std::shared_ptrgetParentOfType(); // TODO: Expose API to get arch from cost model - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); auto tileOp = IE::getTileExecutor(module); VPUX_THROW_WHEN(tileOp == nullptr, "Couldn't get TileExecutor for module"); @@ -358,15 +358,16 @@ mlir::LogicalResult verifyInOutOrder(mlir::Operation* op, const std::string& opN return mlir::success(); } -mlir::LogicalResult verifyNCEConv(VPUIP::NCEClusterTaskOp op, VPU::ArchKind arch) { +mlir::LogicalResult verifyNCEConv(VPUIP::NCEClusterTaskOp op, config::ArchKind arch) { VPUX_THROW_UNLESS(op.getTaskType() == VPUIP::NCETaskType::CONV, "Expected task type '{0}', but got '{1}'", VPUIP::NCETaskType::CONV, op.getTaskType()); if (op.getWeights() == nullptr) { return errorAt(op, "weights is required for NCETaskType : 
'{0}'", op.getTaskType()); } - if (op.getWeightTable() == nullptr) { - return errorAt(op, "weight_table is required for NCETaskType : '{0}'", op.getTaskType()); + if ((op.getWeightTableDataPtr() || op.getWeightTableSpPtr() || op.getWeightTableScale() || + op.getWeightTableBias() || op.getWeightZeroPoints())) { + return errorAt(op, "Only weight_table can be populated for NCETaskType : '{0}'", op.getTaskType()); } else if (op.getWeightTableDataPtr() && op.getWeightZeroPoints()) { return errorAt(op, "weight_table data pointers and zero points only are mutually exclusive for NCETaskType : '{0}'", @@ -439,7 +440,7 @@ mlir::LogicalResult verifyNCEConv(VPUIP::NCEClusterTaskOp op, VPU::ArchKind arch const auto batch = outputShape[Dims4D::Act::N]; if (batch != vpux::VPU::NCEInvariant::SUPPORTED_BATCH_SIZE) { - if (arch < VPU::ArchKind::NPU37XX) { + if (arch < config::ArchKind::NPU37XX) { return errorAt(op, "Got unsupported input batch '{0}' expected '{1}'", batch, vpux::VPU::NCEInvariant::SUPPORTED_BATCH_SIZE); } @@ -488,7 +489,7 @@ mlir::LogicalResult verifyNCEPool(VPUIP::NCEClusterTaskOp op) { return verifyInOutOrder(op, "Pooling"); } -mlir::LogicalResult verifyNCEEltwise(VPUIP::NCEClusterTaskOp op, VPU::ArchKind) { +mlir::LogicalResult verifyNCEEltwise(VPUIP::NCEClusterTaskOp op, config::ArchKind) { VPUX_THROW_UNLESS(op.getTaskType() == VPUIP::NCETaskType::ELTWISE, "Expected task type '{0}', but got '{1}'", VPUIP::NCETaskType::ELTWISE, op.getTaskType()); if (op.getKernelSizeAttr() != nullptr) { @@ -504,7 +505,7 @@ mlir::LogicalResult verifyNCEEltwise(VPUIP::NCEClusterTaskOp op, VPU::ArchKind) return mlir::success(); } -mlir::LogicalResult verifyNCEDWConv(VPUIP::NCEClusterTaskOp op, [[maybe_unused]] VPU::ArchKind arch) { +mlir::LogicalResult verifyNCEDWConv(VPUIP::NCEClusterTaskOp op, [[maybe_unused]] config::ArchKind arch) { VPUX_THROW_UNLESS(op.getTaskType() == VPUIP::NCETaskType::DWCONV, "Expected task type '{0}', but got '{1}'", VPUIP::NCETaskType::DWCONV, 
op.getTaskType()); @@ -602,7 +603,7 @@ mlir::LogicalResult vpux::VPUIP::DPUTaskOp::verify() { mlir::LogicalResult vpux::VPUIP::NCEClusterTaskOp::verify() { const auto op = getOperation(); auto module = op->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); if (getTaskType() == VPUIP::NCETaskType::CONV) { if (mlir::failed(verifyNCEConv(*this, arch))) { @@ -659,7 +660,7 @@ mlir::LogicalResult vpux::VPUIP::NCEClusterTaskOp::verify() { if (isInput4d) { const auto inputBatch = inputShape[Dims4D::Act::N]; if (inputBatch != vpux::VPU::NCEInvariant::SUPPORTED_BATCH_SIZE) { - if (arch < VPU::ArchKind::NPU37XX) { + if (arch < config::ArchKind::NPU37XX) { return errorAt(op, "Got unsupported input batch '{0}' expected '{1}'", inputBatch, vpux::VPU::NCEInvariant::SUPPORTED_BATCH_SIZE); } @@ -719,7 +720,7 @@ mlir::LogicalResult vpux::VPUIP::NCEClusterTaskOp::verify() { } } - if (arch >= VPU::ArchKind::NPU40XX) { + if (arch >= config::ArchKind::NPU40XX) { auto outputType = mlir::dyn_cast(getOutput().getType()); auto outputItiBuffs = getOutput_ITIBuff(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/permute_cast.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/permute_cast.cpp index c96315e0b7..691cac01ac 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/permute_cast.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/permute_cast.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/dialect/VPU/utils/type_infer.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/permute_utils.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/profiling_section.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/profiling_section.cpp index 34698fa905..bd261b3360 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/profiling_section.cpp +++ 
b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/profiling_section.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" +#include "vpux/compiler/utils/error.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/shape_cast.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/shape_cast.cpp index 573d2bf9a0..dda5545222 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/shape_cast.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/shape_cast.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/strides_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/reshape_utils.hpp" @@ -143,9 +144,9 @@ mlir::LogicalResult VPUIP::ShapeCastOp::inferReturnTypes(mlir::MLIRContext* ctx, if (mlir::failed(shapeCast.verify(loc))) { return mlir::failure(); } - const auto arch = VPU::getArch(mlir::isa(operands[0]) - ? operands[0].getParentRegion()->getParentOfType() - : operands[0].getDefiningOp()); + const auto arch = config::getArch(mlir::isa(operands[0]) + ? 
operands[0].getParentRegion()->getParentOfType() + : operands[0].getDefiningOp()); const auto inType = mlir::cast(shapeCast.getSource().getType()); const auto outShape = parseIntArrayAttr(shapeCast.getShape()); diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/storage_element_table.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/storage_element_table.cpp index ef45daea92..aa72625521 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/storage_element_table.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/storage_element_table.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/sw_kernel.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/sw_kernel.cpp index 73e6c92446..27031b8307 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/sw_kernel.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/sw_kernel.cpp @@ -6,23 +6,23 @@ #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/core/bounded_buffer.hpp" #include "vpux/compiler/core/cost_model_utils.hpp" -#include "vpux/compiler/dialect/IE/utils/roll_utils.hpp" -#include "vpux/compiler/dialect/VPU/transforms/factories/shave_controls_dpu.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/asm.hpp" #include "vpux/compiler/utils/error.hpp" -#include "vpux/compiler/utils/permute_utils.hpp" -#include "vpux/utils/core/range.hpp" #include #include #include #include #include + #include + 
#define REGION_YOLO_MAX_MASK_SIZE 9 // max mask size for region yolo op #define PROPOSAL_MAX_RATIO 3 // max ratio size for proposal op #define PROPOSAL_MAX_SCALE 6 // max scale size for proposal op @@ -36,19 +36,6 @@ using namespace vpux; using namespace mlir; namespace { -// special format of dims/order available only on kernel-FW side -int64_t computeReverseMemDim(mlir::Value tensorArg, int64_t dimIdx) { - const auto inOrder = DimsOrder::fromValue(tensorArg); - // Negative value means counting dimension from the end - if (dimIdx < 0) { - dimIdx += inOrder.numDims(); - } - MemDim md = inOrder.toMemDim(Dim(dimIdx)); - - const auto shape = getShape(tensorArg); - auto nDims = checked_cast(shape.size()); - return nDims - 1 - md.ind(); -} // permute int array attribute in the physical order static SmallVector permuteIntArrayAttr(DimsOrder inOrder, mlir::ArrayAttr arrayAttr) { @@ -78,92 +65,11 @@ static SmallVector getAxesArrayRevertAndOrderAware(mlir::Value tensorAr const auto axes = parseIntArrayAttr(arrayAttr); SmallVector revertedAxesArray(MAX_NUM_DIMS, 0); for (const auto srcInd : irange(arrayAttr.size())) { - revertedAxesArray[srcInd] = computeReverseMemDim(tensorArg, axes[srcInd]); + revertedAxesArray[srcInd] = VPUIP::computeReverseMemDim(tensorArg, axes[srcInd]); } return revertedAxesArray; } -uint64_t getFloatBits(vpux::type::float16 val) { - return static_cast(val.to_bits()); -} - -uint64_t getFloatBits(float val) { - uint32_t f32Bits = llvm::bit_cast(val); - return static_cast(f32Bits); -} - -template -void packAsFpIntoU64(const SmallVector& values, SmallVector& params) { - static constexpr uint32_t PACKED_VALUES_COUNT = sizeof(int64_t) / sizeof(OT); - static constexpr uint64_t bitWidth = sizeof(OT) * CHAR_BIT; - OT fltValue[PACKED_VALUES_COUNT]; - size_t packIdx = 0; - - auto pack = [](OT fltVals[PACKED_VALUES_COUNT]) -> uint64_t { - uint64_t ret = 0; - for (uint32_t i = 0; i < PACKED_VALUES_COUNT; i++) { - ret |= getFloatBits(fltVals[i]) << (bitWidth * i); 
- } - return ret; - }; - - for (const auto val : values) { - fltValue[packIdx++] = static_cast(val); - if (packIdx == PACKED_VALUES_COUNT) { - params.push_back(pack(fltValue)); - packIdx = 0; // reset pack index - } - } - - // Store trailing elements - if (packIdx) { - // Pad with zeros up to U64 alignment - while (packIdx < PACKED_VALUES_COUNT) { - fltValue[packIdx++] = 0; - } - params.push_back(pack(fltValue)); - } -} - -void getQuantParamsAttr(mlir::MLIRContext* ctx, mlir::Value qValue, mlir::Type pType, mlir::ArrayAttr& paramsAttr) { - SmallVector scales; - SmallVector zeroes; - int64_t quantDim = -1; - const auto qType = mlir::cast(qValue.getType()).getElementType(); - - if (mlir::isa(qType)) { - auto quantParams = mlir::cast(qType); - scales = {quantParams.getScale()}; - zeroes = {quantParams.getZeroPoint()}; - } else if (mlir::isa(qType)) { - auto quantParams = mlir::cast(qType); - quantDim = computeReverseMemDim(qValue, quantParams.getQuantizedDimension()); - scales = {quantParams.getScales().begin(), quantParams.getScales().end()}; - zeroes = {quantParams.getZeroPoints().begin(), quantParams.getZeroPoints().end()}; - } else { - VPUX_THROW("Unsupported quantized type {0}", qType); - } - - typedef decltype(scales)::value_type TS; - typedef decltype(zeroes)::value_type TZ; - - // Convert & pack float values into u64 words for serialization - llvm::SmallVector params; - params.push_back(quantDim); - params.push_back(scales.size()); - if (pType.isF16()) { - packAsFpIntoU64(scales, params); - packAsFpIntoU64(zeroes, params); - } else if (pType.isF32()) { - packAsFpIntoU64(scales, params); - packAsFpIntoU64(zeroes, params); - } else { - pType.dump(); - VPUX_THROW("Supported non-quantized type : f16/f32"); - } - paramsAttr = getIntArrayAttr(ctx, std::move(params)); -} - // Build a bit-mask to indicate present inputs/outputs // (for SW kernels with optional IOs) mlir::ArrayAttr optionalIoAttr(mlir::Operation* op) { @@ -1281,13 +1187,13 @@ VPUIP::KernelInfo 
SwKernelOp::getKernelInfo(mlir::Operation* origOp) { .Case([&](VPU::QuantizeOp op) { const auto iType = mlir::cast(op.getInput().getType()); mlir::ArrayAttr paramsAttr; - getQuantParamsAttr(ctx, op.getOutput(), iType.getElementType(), paramsAttr); + getQuantParamsAttr(op.getOutput(), iType.getElementType(), paramsAttr); return VPUIP::KernelInfo{SmallVector{paramsAttr}, {"quantize"}}; }) .Case([&](VPU::DequantizeOp op) { const auto oType = mlir::cast(op.getOutput().getType()); mlir::ArrayAttr paramsAttr; - getQuantParamsAttr(ctx, op.getInput(), oType.getElementType(), paramsAttr); + getQuantParamsAttr(op.getInput(), oType.getElementType(), paramsAttr); return VPUIP::KernelInfo{SmallVector{paramsAttr}, {"dequantize"}}; }) .Case([&](VPU::DynamicQuantizeOp) { @@ -2188,7 +2094,7 @@ size_t SwKernelOp::getOperationCycleCost(std::shared_ptr& c auto module = getOperation()->getParentOfType(); // TODO: Expose API to get arch from cost model - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); return checked_cast(calculateShaveActCycles(swKernelOp, costModel, arch)); } diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/workload_cast.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops/workload_cast.cpp deleted file mode 100644 index 87b61900b6..0000000000 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops/workload_cast.cpp +++ /dev/null @@ -1,16 +0,0 @@ -// -// Copyright (C) 2023-2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" - -using namespace vpux; - -// -// ViewLikeOpInterface -// - -mlir::Value VPUIP::WorkloadCastOp::getViewSource() { - return getInput(); -} diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/ops_interfaces.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/ops_interfaces.cpp index 40a074764d..d79778ee13 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/ops_interfaces.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/ops_interfaces.cpp @@ -4,19 +4,14 @@ // 
#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" -#include "vpux/compiler/dialect/VPURT/IR/types.hpp" -#include "vpux/compiler/core/attributes/stride_reqs.hpp" -#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/utils/core/format.hpp" - #include using namespace vpux; @@ -386,8 +381,8 @@ mlir::LogicalResult vpux::VPUIP::verifySameOperandsAndResultElementType(mlir::Op std::optional vpux::VPUIP::getChannelType(mlir::Operation* op) { // Configure DMA channel only for VPU4 for now - const auto arch = VPU::getArch(op); - if (arch < VPU::ArchKind::NPU40XX) { + const auto arch = config::getArch(op); + if (arch < config::ArchKind::NPU40XX) { return std::nullopt; } diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/types/bounded_buffer.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/types/bounded_buffer.cpp index f1f2c9a958..d76f2d9c24 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/types/bounded_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/types/bounded_buffer.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/utils/core/error.hpp" // // BoundedBuffer @@ -183,9 +184,6 @@ mlir::Type VPUIP::BoundedBufferType::parse(mlir::AsmParser& parser) { if (parser.parseType(data)) { return Type(); } - if (mlir::succeeded(parser.parseOptionalGreater())) { - return get(data, mlir::Type{}); - } if (parser.parseComma()) { return Type(); @@ -233,4 +231,53 @@ vpux::VPUIP::BoundedBufferType vpux::VPUIP::BoundedBufferType::cloneWith(std::op mlir::Attribute vpux::VPUIP::BoundedBufferType::getMemorySpace() const { return getMemSpace(); } + +// +// DistributedTypeInterface 
+// +bool vpux::VPUIP::BoundedBufferType::containsDistributedTypes() const { + // If the data is a distributed type, the dynamicShape will be as well + return mlir::isa(getData()); +} + +SmallVector vpux::VPUIP::BoundedBufferType::getDistributedTypes() const { + SmallVector distributedTypes; + if (mlir::isa(getData())) { + distributedTypes.push_back(getData()); + } + if (mlir::isa(getDynamicShape())) { + distributedTypes.push_back(getDynamicShape()); + } + return distributedTypes; +} + +NDTypeInterface vpux::VPUIP::BoundedBufferType::changeShapeForExplicitDistribution( + ShapeRef shape, VPU::DistributionInfoAttr distributedAttr) const { + return changeShapeElemTypeForExplicitDistribution(shape, getElementType(), distributedAttr); +} + +NDTypeInterface vpux::VPUIP::BoundedBufferType::changeShapeElemTypeForExplicitDistribution( + ShapeRef /*shape*/, mlir::Type /*elemType*/, VPU::DistributionInfoAttr /*distributedAttr*/) const { + VPUX_THROW("Not implemented"); + return nullptr; +} + +NDTypeInterface vpux::VPUIP::BoundedBufferType::changeTypeComponentsForExplicitDistribution( + const TypeComponents& /*typeComponents*/, VPU::DistributionInfoAttr /*distributedAttr*/) const { + VPUX_THROW("Not implemented"); + return nullptr; +} + +NDTypeInterface vpux::VPUIP::BoundedBufferType::extractDenseTileForExplicitDistribution( + vpux::ShapeRef /*tileOffsets*/, vpux::ShapeRef /*tileShape*/, + VPU::DistributionInfoAttr /*distributedAttr*/) const { + VPUX_THROW("Not implemented"); + return nullptr; +} + +NDTypeInterface vpux::VPUIP::BoundedBufferType::extractViewTileForExplicitDistribution( + vpux::ShapeRef, vpux::ShapeRef, vpux::ShapeRef, VPU::DistributionInfoAttr) const { + VPUX_THROW("Not implemented"); +} + } // namespace vpux diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/types/distributed_buffer.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/types/distributed_buffer.cpp index 75248494e5..4038b7db53 100644 --- 
a/src/vpux_compiler/src/dialect/VPUIP/IR/types/distributed_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/types/distributed_buffer.cpp @@ -6,18 +6,15 @@ #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" - -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/compiler/utils/types.hpp" - #include "vpux/utils/core/numeric.hpp" #include -#include - using namespace vpux; namespace { @@ -68,6 +65,45 @@ Byte getStridedAllocSize(const StridedShape& stridedTiledShape, ShapeRef strided } return Byte(totalBytes); } + +// Returns strides attribute only when the provided strides are not compact. +// This is aligned to what getMemRefType() does for mlir::MemRefType. +mlir::ArrayAttr getStridesAttr(mlir::MLIRContext* ctx, StridesRef strides, const DimsOrder& dimsOrder, Bit elemSize, + ShapeRef shape) { + if (strides.empty()) { + return nullptr; + } + const auto memStrides = dimsOrder.toMemoryOrder(strides); + const auto memShape = dimsOrder.toMemoryOrder(shape); + + const bool isCompact = StrideReqs::compact(shape.size()).checkStrides(memStrides, elemSize, memShape); + if (isCompact) { + return nullptr; + } + + // Have strides only if they are not compact + const auto elemStrides = to_small_vector(strides | transformed([&](Bit stride) { + return stride.count() / elemSize.count(); + })); + return getIntArrayAttr(ctx, elemStrides); +} + +// Returns either a vpux::MemRefAttr or an mlir::AffineMapAttr (order) depending +// on the specified input: when nothing except order is specified, only order is +// returned. This aligns the behaviour to getMemRefType() that creates a +// mlir::MemRefType. 
+mlir::MemRefLayoutAttrInterface getMemrefLayout(mlir::AffineMapAttr order, mlir::ArrayAttr optionalStrides, + mlir::IntegerAttr optionalAllocSize, + mlir::ArrayRef fields) { + const bool hwSpecificFieldsEmpty = + std::all_of(fields.begin(), fields.end(), [](const vpux::HwSpecificMemRefField& field) { + return field == nullptr; + }); + if (optionalStrides == nullptr && optionalAllocSize == nullptr && hwSpecificFieldsEmpty) { + return order; + } + return vpux::MemRefAttr::get(order, optionalStrides, optionalAllocSize, fields, order.getContext()); +} } // namespace // @@ -420,12 +456,12 @@ mlir::MemRefType VPUIP::DistributedBufferType::getCompactType() const { swizzlingSchemeAttr, VPUIP::getSparsityCompressionAttr(*this)); } +namespace { + // // Shape utils // -namespace { - Shape* getLargestShapeIt(SmallVector& shapes) { return std::max_element(shapes.begin(), shapes.end(), [](ShapeRef a, ShapeRef b) { return details::calcTotalShapeSize(a.raw()) < details::calcTotalShapeSize(b.raw()); @@ -442,6 +478,44 @@ StridedShape* getLargestStridedShapeIt(SmallVector& stridedShapes) }); } +// +// Helper to extract subview quantized type when input is already distributed over quant-axis +// +// e.g. full shape = 16x?x?x? 
+// full qElemType = !quant.uniform(inDistribution.getNumTiles()); + auto inputOffsets = vpux::parseIntArrayOfArrayAttr(inDistribution.getComputeOffsets()); + auto subviewShapes = vpux::parseIntArrayOfArrayAttr(subDistribution.getMemoryShapes()); + SmallVector offsets; + SmallVector sizes; + VPUX_THROW_UNLESS(tileOffsets[Dim(axis)] % numTiles[axis] == 0, + "Previous subview shapes are not identical in all tiles."); + + int64_t bias = tileOffsets[Dim(axis)] / numTiles[axis]; // bias within tile + for (auto ind : irange(inputOffsets.size())) { + offsets.push_back(inputOffsets[ind][axis] + bias); + sizes.push_back(subviewShapes[ind][axis]); + } + + return vpux::tileScalesAndZP(perAxisQType, offsets, sizes); +} + } // namespace // @brief Retrieve the array of compute shapes. @@ -592,8 +666,8 @@ NDTypeInterface VPUIP::DistributedBufferType::changeShapeElemTypeForExplicitDist const auto orderAttr = mlir::AffineMapAttr::get(newOrder.toAffineMap(ctx)); // If swizzlingKey is set get rid of strides settings if (memRefAttr.hwSpecificField()) { - layoutAttr = vpux::MemRefAttr::get(orderAttr, nullptr, - /*allocSize=*/nullptr, memRefAttr.hwSpecificFields(), ctx); + layoutAttr = getMemrefLayout(orderAttr, nullptr, + /*allocSize=*/nullptr, memRefAttr.hwSpecificFields()); } else { layoutAttr = orderAttr; } @@ -630,15 +704,11 @@ NDTypeInterface VPUIP::DistributedBufferType::changeTypeComponentsForExplicitDis VPUX_THROW_UNLESS(dimsOrder.numDims() == shape.size(), "Order '{0}' is incompatible with the shape '{1}'", dimsOrder, shape); - const auto elemSize = vpux::getElemTypeSize(elementType).count(); + const auto elemSize = vpux::getElemTypeSize(elementType); const auto order = mlir::AffineMapAttr::get(dimsOrder.toAffineMap(ctx)); - const auto newStrides = to_small_vector(strides | transformed([&](Bit stride) { - return stride.count() / elemSize; - })); - const auto newStridesAttr = getIntArrayAttr(ctx, newStrides); - + const auto newStridesAttr = getStridesAttr(ctx, strides, 
dimsOrder, elemSize, shape); auto hwSpecificFields = getHwSpecificFields(getLayout()); - const auto newDescAttr = vpux::MemRefAttr::get(order, newStridesAttr, /*allocSize=*/nullptr, hwSpecificFields, ctx); + const auto newDescAttr = getMemrefLayout(order, newStridesAttr, /*allocSize=*/nullptr, hwSpecificFields); auto newType = VPUIP::DistributedBufferType::get(ctx, shape.raw(), elementType, newDescAttr, memSpace, distributedAttr, getSparsityCompression()); @@ -683,13 +753,27 @@ NDTypeInterface VPUIP::DistributedBufferType::extractViewTileForExplicitDistribu } const auto ctx = getContext(); - const auto elemSize = getElemTypeSize().count(); + const auto elemSize = getElemTypeSize(); const auto order = mlir::AffineMapAttr::get(getDimsOrder().toAffineMap(ctx)); const auto memSpace = getMemSpace(); auto tileElemType = getElementType(); if (const auto perAxisQType = mlir::dyn_cast(tileElemType)) { - tileElemType = vpux::tileScalesAndZP(perAxisQType, tileShape, tileOffsets); + auto inMode = getDistribution().getMode().getValue(); + auto isMultiClusterAndMultiShaveOnAxis = false; + if (inMode == VPU::DistributionMode::SEGMENTED) { + auto axis = perAxisQType.getQuantizedDimension(); + auto inTiles = vpux::parseIntArrayAttr(getDistribution().getNumTiles()); + auto subTiles = vpux::parseIntArrayAttr(distributedAttr.getNumTiles()); + isMultiClusterAndMultiShaveOnAxis = (inTiles[axis] > 1) && (subTiles[axis] > 1); + } + + if (isMultiClusterAndMultiShaveOnAxis) { + tileElemType = + getQuantTypeForExplicitDistribution(perAxisQType, getDistribution(), distributedAttr, tileOffsets); + } else { + tileElemType = vpux::tileScalesAndZP(perAxisQType, tileShape, tileOffsets); + } } auto tileStrides = getStrides(); @@ -703,14 +787,9 @@ NDTypeInterface VPUIP::DistributedBufferType::extractViewTileForExplicitDistribu } } - const auto newStrides = to_small_vector(tileStrides | transformed([&](Bit stride) { - return stride.count() / elemSize; - })); - - const auto newStridesAttr = 
getIntArrayAttr(ctx, newStrides); + const auto newStridesAttr = getStridesAttr(ctx, tileStrides, getDimsOrder(), elemSize, tileShape); auto hwSpecificFields = getHwSpecificFields(getLayout()); - const auto newDescAttr = vpux::MemRefAttr::get(order, newStridesAttr, - /*allocSize=*/nullptr, hwSpecificFields, ctx); + const auto newDescAttr = getMemrefLayout(order, newStridesAttr, /*allocSize=*/nullptr, hwSpecificFields); const auto sparsityCompression = VPUIP::tileSparsityCompression(getSparsityCompression(), tileOffsets, tileShape); @@ -973,8 +1052,8 @@ NDTypeInterface VPUIP::DistributedBufferType::changeShapeElemType(ShapeRef shape const auto orderAttr = mlir::AffineMapAttr::get(newOrder.toAffineMap(ctx)); // If swizzlingKey is set get rid of strides settings if (memRefAttr.hwSpecificField()) { - layoutAttr = vpux::MemRefAttr::get(orderAttr, nullptr, - /*allocSize=*/nullptr, memRefAttr.hwSpecificFields(), ctx); + layoutAttr = getMemrefLayout(orderAttr, nullptr, + /*allocSize=*/nullptr, memRefAttr.hwSpecificFields()); } else { layoutAttr = orderAttr; } @@ -997,8 +1076,8 @@ NDTypeInterface VPUIP::DistributedBufferType::changeDimsOrder(DimsOrder order) c auto orderAttr = mlir::AffineMapAttr::get(order.toAffineMap(ctx)); if (auto memRefAttr = mlir::dyn_cast(getLayout())) { // Assume compact strides - layoutAttr = vpux::MemRefAttr::get(orderAttr, nullptr, - /*allocSize=*/nullptr, memRefAttr.hwSpecificFields(), ctx); + layoutAttr = getMemrefLayout(orderAttr, nullptr, + /*allocSize=*/nullptr, memRefAttr.hwSpecificFields()); } else { layoutAttr = orderAttr; } @@ -1013,15 +1092,12 @@ NDTypeInterface VPUIP::DistributedBufferType::changeMemSpace(IndexedSymbolAttr / NDTypeInterface VPUIP::DistributedBufferType::changeStrides(StridesRef strides) const { const auto ctx = getContext(); - const auto elemSize = getElemTypeSize().count(); + const auto elemSize = getElemTypeSize(); const auto order = mlir::AffineMapAttr::get(getDimsOrder().toAffineMap(ctx)); - const auto newStrides = 
to_small_vector(strides | transformed([&](Bit stride) { - return stride.count() / elemSize; - })); - const auto newStridesAttr = getIntArrayAttr(ctx, newStrides); + const auto newStridesAttr = getStridesAttr(ctx, strides, getDimsOrder(), elemSize, getShape()); auto hwSpecificFields = getHwSpecificFields(getLayout()); - const auto newDescAttr = vpux::MemRefAttr::get(order, newStridesAttr, - /*allocSize=*/nullptr, hwSpecificFields, ctx); + const auto newDescAttr = getMemrefLayout(order, newStridesAttr, + /*allocSize=*/nullptr, hwSpecificFields); return VPUIP::DistributedBufferType::get(ctx, getShape().raw(), getElementType(), newDescAttr, getMemSpace(), getDistribution(), getSparsityCompression()); } @@ -1042,16 +1118,13 @@ NDTypeInterface VPUIP::DistributedBufferType::changeTypeComponents(const vpux::T "Cannot change shape when having explicit per cluster shapes/offsets"); } - const auto elemSize = vpux::getElemTypeSize(elementType).count(); + const auto elemSize = vpux::getElemTypeSize(elementType); const auto order = mlir::AffineMapAttr::get(dimsOrder.toAffineMap(ctx)); - const auto newStrides = to_small_vector(strides | transformed([&](Bit stride) { - return stride.count() / elemSize; - })); - const auto newStridesAttr = getIntArrayAttr(ctx, newStrides); + const auto newStridesAttr = getStridesAttr(ctx, strides, dimsOrder, elemSize, shape); auto hwSpecificFields = getHwSpecificFields(getLayout()); - const auto newDescAttr = vpux::MemRefAttr::get(order, newStridesAttr, - /*allocSize=*/nullptr, hwSpecificFields, ctx); + const auto newDescAttr = getMemrefLayout(order, newStridesAttr, + /*allocSize=*/nullptr, hwSpecificFields); return VPUIP::DistributedBufferType::get(ctx, shape.raw(), elementType, newDescAttr, memSpace, distribution, getSparsityCompression()); @@ -1086,7 +1159,7 @@ NDTypeInterface VPUIP::DistributedBufferType::extractViewTile(vpux::ShapeRef til "Cannot get DistributedBufferType with new shape from old one when having explicit per cluster " 
"shapes/offsets"); - const auto elemSize = getElemTypeSize().count(); + const auto elemSize = getElemTypeSize(); const auto order = mlir::AffineMapAttr::get(getDimsOrder().toAffineMap(ctx)); const auto memSpace = getMemSpace(); @@ -1106,14 +1179,10 @@ NDTypeInterface VPUIP::DistributedBufferType::extractViewTile(vpux::ShapeRef til } } - const auto newStrides = to_small_vector(tileStrides | transformed([&](Bit stride) { - return stride.count() / elemSize; - })); - - const auto newStridesAttr = getIntArrayAttr(ctx, newStrides); + const auto newStridesAttr = getStridesAttr(ctx, tileStrides, getDimsOrder(), elemSize, tileShape); auto hwSpecificFields = getHwSpecificFields(getLayout()); - const auto newDescAttr = vpux::MemRefAttr::get(order, newStridesAttr, - /*allocSize=*/nullptr, hwSpecificFields, ctx); + const auto newDescAttr = getMemrefLayout(order, newStridesAttr, + /*allocSize=*/nullptr, hwSpecificFields); const auto sparsityCompression = VPUIP::tileSparsityCompression(getSparsityCompression(), tileOffsets, tileShape); diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/types/iti_buffer.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/types/iti_buffer.cpp index 188d249aef..f4ebf4d141 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/IR/types/iti_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/types/iti_buffer.cpp @@ -7,9 +7,8 @@ #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" - -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" - +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/compression_utils.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/IR/types/sparse_buffer.cpp b/src/vpux_compiler/src/dialect/VPUIP/IR/types/sparse_buffer.cpp index ced8a86b2c..fb611fd344 100644 --- 
a/src/vpux_compiler/src/dialect/VPUIP/IR/types/sparse_buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/IR/types/sparse_buffer.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/core/attributes/stride_reqs.hpp" -#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/interfaces/aligned_channels_ops.cpp b/src/vpux_compiler/src/dialect/VPUIP/interfaces/aligned_channels_ops.cpp index ff77be26a4..ff9a29e4e7 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/interfaces/aligned_channels_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/interfaces/aligned_channels_ops.cpp @@ -3,18 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/core/layers.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/error.hpp" +#include + using namespace vpux; namespace { diff --git a/src/vpux_compiler/src/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.cpp b/src/vpux_compiler/src/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.cpp new file mode 100644 index 0000000000..e9539a7901 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.cpp @@ -0,0 +1,90 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" + +#include + +using namespace vpux; + +mlir::LogicalResult VPUIP::LUTConverterBase::matchAndRewrite(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const { + _log.trace("[{0}] Got '{1}' at '{2}'", getDebugName(), nceClusterTask->getName(), nceClusterTask->getLoc()); + + const auto LUTConst = [&]() { + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(&_netFunc.getBody().front().front()); + return createLookupTableConst(nceClusterTask, rewriter); + }(); + + const auto copyDst = createCopyDestination(nceClusterTask, LUTConst, rewriter); + const auto lutNceInput = rewriter.create(nceClusterTask->getLoc(), LUTConst, copyDst).getOutput(); + + replaceWithConstInput(nceClusterTask, lutNceInput, rewriter); + + return mlir::success(); +} + +mlir::Value VPUIP::LUTConverterBase::createCopyDestination(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value LUTConst, + mlir::PatternRewriter& rewriter) const { + const auto input = nceClusterTask.getInput(); + const auto inputType = input.getType(); + + return llvm::TypeSwitch(inputType) + .Case([&](VPUIP::DistributedBufferType distributedBufferTypeIn) { + const auto distributedInfo = createDistributionInfoAttr(distributedBufferTypeIn, nceClusterTask); + const auto distributedBufferType = + createDistributedBufferType(distributedInfo, nceClusterTask, LUTConst); + + auto alignment = vpux::getIntAttr(nceClusterTask.getContext(), VPU::SPRLUT_ALIGNMENT_REQUIREMENT); + + auto allocDistributed = rewriter.create( + nceClusterTask.getLoc(), distributedBufferType, alignment, nullptr); + return 
allocDistributed->getResult(0); + }) + .Case([&](auto) { + const auto constOutType = mlir::dyn_cast(LUTConst.getType()); + VPUX_THROW_WHEN(constOutType == nullptr, + "{0}: sprLUT const output type is expected to be MemRefType, but got {1}", + getDebugName(), LUTConst.getType()); + const auto memSpaceCMX = vpux::IndexedSymbolAttr::get(nceClusterTask.getContext(), + stringifyEnum(VPU::MemoryKind::CMX_NN), 0); + const auto cmxMemType = mlir::MemRefType::get(constOutType.getShape(), constOutType.getElementType(), + constOutType.getLayout(), memSpaceCMX); + const auto allocOp = rewriter.create(nceClusterTask.getLoc(), cmxMemType); + return allocOp->getResult(0); + }) + .Default([&](mlir::Type inputType) { + VPUX_THROW("{0}: `{1}` is not supported as an input type", getDebugName(), inputType); + return mlir::Value{}; + }); +} + +VPU::DistributionInfoAttr VPUIP::LUTConverterBase::createDistributionInfoAttr( + VPUIP::DistributedBufferType inputDistribType, VPUIP::NCEClusterTaskOp nceClusterTask) const { + auto inputDistribInfo = inputDistribType.getDistribution(); + VPUX_THROW_WHEN(inputDistribInfo == nullptr, "{0}: inputDistribInfo == nullptr for the input type is not allowed", + getDebugName()); + const auto duplicatedDistrModeAttr = + VPU::DistributionModeAttr::get(nceClusterTask.getContext(), VPU::DistributionMode::DUPLICATED); + return VPU::DistributionInfoAttr::get(nceClusterTask.getContext(), duplicatedDistrModeAttr, nullptr, nullptr, + nullptr, nullptr, inputDistribInfo.getNumClusters(), nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr); +} + +VPUIP::DistributedBufferType VPUIP::LUTConverterBase::createDistributedBufferType( + VPU::DistributionInfoAttr distributedInfo, VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value LUTConst) const { + const auto memSpaceCMX = + vpux::IndexedSymbolAttr::get(nceClusterTask.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), 0); + const auto ndTypeInterface = mlir::cast(LUTConst.getType()); + return 
VPUIP::DistributedBufferType::get( + nceClusterTask.getContext(), ndTypeInterface.getShape().raw(), ndTypeInterface.getElementType(), + mlir::dyn_cast(LUTConst.getType()).getLayout(), memSpaceCMX, distributedInfo); +} diff --git a/src/vpux_compiler/src/dialect/VPUIP/interfaces/dma_descriptor_generator.cpp b/src/vpux_compiler/src/dialect/VPUIP/interfaces/dma_descriptor_generator.cpp index ac985e0a78..0b9ea3d7a4 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/interfaces/dma_descriptor_generator.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/interfaces/dma_descriptor_generator.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/interfaces/dma_descriptor_generator.hpp" +#include "vpux/compiler/utils/attributes.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUIP/interfaces/dpu_tiler.cpp b/src/vpux_compiler/src/dialect/VPUIP/interfaces/dpu_tiler.cpp index b52f87ab3d..b5ab18fb61 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/interfaces/dpu_tiler.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/interfaces/dpu_tiler.cpp @@ -4,14 +4,11 @@ // #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" - #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/utils/factors.hpp" - #include "vpux/utils/core/numeric.hpp" -#include #include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUIP/interfaces/nce_invariant.cpp b/src/vpux_compiler/src/dialect/VPUIP/interfaces/nce_invariant.cpp index c9b075e653..7157333caf 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/interfaces/nce_invariant.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/interfaces/nce_invariant.cpp @@ -4,25 +4,20 @@ // #include "vpux/compiler/dialect/VPUIP/interfaces/nce_invariant.hpp" - #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" 
+#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/auto_padding_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" -#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" -#include "vpux/compiler/utils/types.hpp" - -#include #include +#include using namespace vpux; @@ -526,8 +521,11 @@ SmallVector> getRequiredO return {curTileTypes[0], curTileTypes[1], curTileTypes[2], nextTileTypes[0], nextTileTypes[1]}; } - // TODO : Logic should be improved to handle tiling on 2 dimensions. - const auto isWeightPrefetch = curTile.axis[Dims4D::Act::C] > 1; + auto isWeightPrefetch = curTile.axis[Dims4D::Act::C] > 1; + if (isNestedTiling(tiling)) { + auto unrollSpatialFirst = isSpatialFirstNestedTiling(origOp, curTile.axis); + isWeightPrefetch = unrollSpatialFirst; + } return {curTileTypes[0], curTileTypes[1], curTileTypes[2], isWeightPrefetch ? 
nextTileTypes[1] : nextTileTypes[0]}; } @@ -602,9 +600,6 @@ mlir::LogicalResult verifyPipeliningCMXConvBased(ConcreteOp origOp, const Output if (tiling.size() <= 1) { return mlir::failure(); } - if (isNestedTiling(tiling)) { - return mlir::failure(); - } auto module = origOp->template getParentOfType(); const auto cmxSize = getCMXSizeForTiling(module); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.cpp index 3a2fe498d9..fb33ed5eaa 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.cpp @@ -10,9 +10,9 @@ using namespace vpux; namespace vpux::VPUIP { -std::unique_ptr createCaptureWorkpointStrategy(VPU::ArchKind arch) { +std::unique_ptr createCaptureWorkpointStrategy(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); default: return std::make_unique(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/profilling_info.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/profilling_info.cpp index 8e3f175590..c3fcd7f26e 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/profilling_info.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/profilling_info.cpp @@ -10,15 +10,15 @@ using namespace vpux; -VPUIP::TimestampTypeCb VPUIP::getTimestampTypeCb(VPU::ArchKind arch) { - if (arch == VPU::ArchKind::NPU37XX) { +VPUIP::TimestampTypeCb VPUIP::getTimestampTypeCb(config::ArchKind arch) { + if (arch == config::ArchKind::NPU37XX) { return VPUIP::arch37xx::getTimestampType; } VPUX_THROW("Unexpected architecture {0}", arch); } -VPUIP::SetWorkloadIdsCb VPUIP::setWorkloadsIdsCb(VPU::ArchKind arch) { - if (arch >= VPU::ArchKind::NPU37XX) { 
+VPUIP::SetWorkloadIdsCb VPUIP::setWorkloadsIdsCb(config::ArchKind arch) { + if (arch >= config::ArchKind::NPU37XX) { return VPUIP::arch37xx::setWorkloadIds; } VPUX_THROW("Unexpected architecture {0}", arch); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/split_cost_getter.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/split_cost_getter.cpp index db6cb63008..e9e99d8a65 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/split_cost_getter.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/factories/split_cost_getter.cpp @@ -10,8 +10,8 @@ using namespace vpux; -VPUIP::SplitCostCb VPUIP::getSplitCostCb(VPU::ArchKind arch) { - if (arch >= VPU::ArchKind::NPU37XX) { +VPUIP::SplitCostCb VPUIP::getSplitCostCb(config::ArchKind arch) { + if (arch >= config::ArchKind::NPU37XX) { return VPUIP::arch37xx::computeSplitCost; } VPUX_THROW("Unexpected architecture {0}", arch); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes.cpp index 66f591284c..bcfe3756f9 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" namespace vpux::VPUIP { diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_copy_between_swkernels_and_network_io.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_copy_between_swkernels_and_network_io.cpp index e88dbb8877..aca002cbd6 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_copy_between_swkernels_and_network_io.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_copy_between_swkernels_and_network_io.cpp @@ -5,11 +5,13 @@ #include 
"vpux/compiler/core/aliases_info.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include @@ -516,6 +518,11 @@ class AddCopyBetweenSWKernelsAndNetworkIOPass final : void AddCopyBetweenSWKernelsAndNetworkIOPass::safeRunOnModule() { auto moduleOp = getOperation(); + + if (moduleOp.getOps().empty()) { + return; + } + mlir::func::FuncOp mainFuncOp; net::NetworkInfoOp netInfo; net::NetworkInfoOp::getFromModule(moduleOp, netInfo, mainFuncOp); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 3b86926449..d6483bef38 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -12,6 +12,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/cache_utils.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/inference_execution_simulator.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -81,7 +82,6 @@ class AddSwKernelInstructionPrefetch final : mlir::Value bestUpdateBarrier); bool hasVPUSWModule(mlir::Operation* funcOp); - void setCachePrefetchReservedMemory(const mlir::ModuleOp module); size_t getOffsetReservedMem(const mlir::ModuleOp module); std::map kernelNameToSymbol; @@ -91,9 +91,7 @@ class AddSwKernelInstructionPrefetch final : static constexpr StringLiteral 
vpuTaskTypeAttrName{"VPU.task_type"}; static constexpr StringLiteral vpuKernelEntryAttrName{"VPU.kernel_entry"}; static constexpr size_t CACHE_LINE_SIZE = 64ul; - static constexpr int64_t MAX_CACHE_PREFETCH_RESERVED_MEM_SIZE = 1024; - size_t _dummyKernelIOMemOffset = 0; bool _minFreeCyclesHasValue = false; size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; @@ -106,29 +104,11 @@ bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) { return innerModule; } -void AddSwKernelInstructionPrefetch::setCachePrefetchReservedMemory(const mlir::ModuleOp module) { - auto* ctx = module->getContext(); - auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); - auto available = IE::getAvailableMemory(module, memSpaceAttr); - - // Insert a dummy reserved memory when there's no reserved memory - int64_t maxCachePrefetchReservedMemSize = MAX_CACHE_PREFETCH_RESERVED_MEM_SIZE; - IE::setSWKernelCachePrefetchingReservedMemory(module, memSpaceAttr, maxCachePrefetchReservedMemSize); - - // Put all reserved memory at the end of CMX - auto newReservedMemoryResources = IE::getReservedMemoryResources(module, memSpaceAttr); - size_t resMemOffset = available.getByteSize(); - for (auto& resMem : newReservedMemoryResources) { - auto currResMemSize = resMem.getByteSize(); - resMemOffset -= currResMemSize; - auto currResMemOffset = resMemOffset; - resMem.setOffsetAttr(getIntAttr(module->getContext(), currResMemOffset)); - } -} - size_t AddSwKernelInstructionPrefetch::getOffsetReservedMem(const mlir::ModuleOp module) { - auto cachePrefetchMem = IE::getSWKernelCachePrefetchingReservedMemory(module, VPU::MemoryKind::CMX_NN); + auto cachePrefetchMem = IE::getDummySwKernelsForInstructionPrefetchReservedMemory(module, VPU::MemoryKind::CMX_NN); auto offsetCachePrefetch = cachePrefetchMem.getOffset(); + VPUX_THROW_WHEN(!offsetCachePrefetch.has_value(), + 
"DummySwKernelsForInstructionPrefetchReservedMemory offset is not set!"); return offsetCachePrefetch.value(); } @@ -199,7 +179,8 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer cachePrefetchSwKernel->setAttr("kernelElfName", mlir::StringAttr::get(ctx, kernelName)); const mlir::SmallVector args = {}; - vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, buffersRange, buffersRange, args, _log.nest()); + vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, buffersRange, buffersRange, args, _log.nest(), + /*swKernelRunOp=*/nullptr); _log.trace("cachePrefetchSwKernel {0}", cachePrefetchSwKernel); return cachePrefetchSwKernel; } @@ -212,17 +193,15 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst mlir::OpBuilder builder(firstSwTask); auto moduleOp = firstSwTask->getParentOfType(); auto reservedMemOffset = getOffsetReservedMem(moduleOp); + auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset); auto kernelOp = kernelNameToOps[kernelName]; auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector& buffers) { if (auto bufOp = io.getDefiningOp()) { auto newType = mlir::cast(io.getType()).changeShape({1, 1, 1, 1}); - auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset + _dummyKernelIOMemOffset); auto newBuff = builder.create(appendLoc(bufOp->getLoc(), suffix), newType, bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(), offsetAttr, bufOp.getSwizzlingKeyAttr()); - _dummyKernelIOMemOffset += - mlir::cast(io.getType()).getElemTypeSize().to().count(); buffers.push_back(newBuff); return true; } @@ -257,7 +236,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst auto args = (kernelName == "convert") ? 
mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args, - _log.nest()); + _log.nest(), /*swKernelRunOp=*/nullptr); _log.trace("cachePrefetchSwKernel {0}", cachePrefetchSwKernel); return cachePrefetchSwKernel; @@ -437,7 +416,7 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto simLogger = vpux::Logger("InfSim", _log.level()); auto module = funcOp->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeCostModelAnalysis = getCachedParentAnalysis(module); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); CycleCostInfo cycleCostInfo(std::move(costModel), funcOp); @@ -467,7 +446,10 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); if (_useDummyKernelForInstructionPrefetch) { - setCachePrefetchReservedMemory(funcOp->getParentOfType()); + auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + auto dummyKernelResMem = IE::getDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr); + VPUX_THROW_WHEN(dummyKernelResMem == nullptr, + "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!"); } auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/adjust_input_data_for_explicit_se_table.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/adjust_input_data_for_explicit_se_table.cpp index e03cef0af3..020e6a72ed 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/adjust_input_data_for_explicit_se_table.cpp +++ 
b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/adjust_input_data_for_explicit_se_table.cpp @@ -4,7 +4,10 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_ADJUSTINPUTDATAFOREXPLICITSETABLE diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/async_regions_outlining.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/async_regions_outlining.cpp index 98ebeb0062..ee11f7a5fb 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/async_regions_outlining.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/async_regions_outlining.cpp @@ -16,6 +16,7 @@ #include "vpux/compiler/utils/allocate_buffers_for_net_results.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_view_ops_into_async_regions.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_view_ops_into_async_regions.cpp index 5cd0b8bb5e..a9b7b4dbfd 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_view_ops_into_async_regions.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_view_ops_into_async_regions.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_wait_result_to_async_block_args.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_wait_result_to_async_block_args.cpp index f195f36371..63c9dc7c0e 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_wait_result_to_async_block_args.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/move_wait_result_to_async_block_args.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/core/async_deps_info.hpp" #include "vpux/compiler/utils/analysis.hpp" +#include "vpux/utils/core/dense_map.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_MOVEWAITRESULTTOASYNCBLOCKARGS diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/wrap_into_async_regions.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/wrap_into_async_regions.cpp index bfa4bf1ebe..c8c9616de8 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/wrap_into_async_regions.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/async_scheduling/wrap_into_async_regions.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/range.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/batch_matmul_to_matmul.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/batch_matmul_to_matmul.cpp index 553329fdea..0e24dbe605 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/batch_matmul_to_matmul.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/batch_matmul_to_matmul.cpp @@ -5,6 +5,9 @@ #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include 
"vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_BATCHMATMULTOMATMUL diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/break_data_flow.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/break_data_flow.cpp index 8258dddf00..bcf8aa59f6 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/break_data_flow.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/break_data_flow.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" namespace vpux::VPUIP { diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/calculate_async_region_cycle_cost.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/calculate_async_region_cycle_cost.cpp index 16a11bcb6d..883cbcb66e 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/calculate_async_region_cycle_cost.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/calculate_async_region_cycle_cost.cpp @@ -7,7 +7,10 @@ #include "vpux/compiler/core/cycle_cost_info.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_CALCULATEASYNCREGIONCYCLECOST @@ -33,7 +36,7 @@ class CalculateAsyncRegionCycleCostPass final : void CalculateAsyncRegionCycleCostPass::safeRunOnFunc() { auto funcOp = getOperation(); auto module = funcOp->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto 
arch = config::getArch(module); auto maybeCostModelAnalysis = getCachedParentAnalysis(module); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); CycleCostInfo cycleCostInfo(std::move(costModel), funcOp); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/capture_workpoint.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/capture_workpoint.cpp index 0065374f1c..216c5cd58a 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/capture_workpoint.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/capture_workpoint.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUIP/transforms/factories/capture_workpoint_strategy_getter.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/utils/profiling/common.hpp" @@ -39,7 +40,7 @@ class CaptureWorkpointPass final : public VPUIP::impl::CaptureWorkpointBasegetContext(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto archSpecificStrategy = VPUIP::createCaptureWorkpointStrategy(arch); net::NetworkInfoOp netInfo; diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_weights_btc.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_weights_btc.cpp index 226880559f..ebc9407817 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_weights_btc.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compress_weights_btc.cpp @@ -9,8 +9,8 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/utils.hpp" 
#include "vpux/compiler/utils/codec_factory.hpp" #include "vpux/compiler/utils/compression_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -47,7 +47,8 @@ class CompressWeightsBTCPass final : public VPUIP::impl::CompressWeightsBTCBase< class NNDMAOpConverter final : public mlir::OpRewritePattern { public: - NNDMAOpConverter(mlir::MLIRContext* ctx, const ICodec::CompressionAlgorithm& algo, VPU::ArchKind arch, Logger log) + NNDMAOpConverter(mlir::MLIRContext* ctx, const ICodec::CompressionAlgorithm& algo, config::ArchKind arch, + Logger log) : mlir::OpRewritePattern(ctx), _log(log), _codec(vpux::makeCodec(algo, arch)) { } @@ -219,7 +220,7 @@ mlir::LogicalResult NNDMAOpConverter::matchAndRewrite(VPUIP::NNDMAOp origOp, mli void CompressWeightsBTCPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); const auto algo = ICodec::CompressionAlgorithm::BITCOMPACTOR_CODEC; _log.trace("VPUIP CompressWeightsBTCPass"); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compute_se_sizes.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compute_se_sizes.cpp index bd40e80337..d7624ae43a 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compute_se_sizes.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/compute_se_sizes.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/dialect/VPU/transforms/factories/sparsity_constraint.hpp" -#include "vpux/compiler/dialect/VPU/utils/sparsity_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include #include @@ -141,7 +143,7 @@ void ComputeSESizesPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); - auto arch = 
VPU::getArch(func); + auto arch = config::getArch(func); auto constraint = VPU::getSparsityConstraint(arch); // Set the storage element size attributes only for the input operand in case the sparse data is concatenated diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_VPUIP_copy_to_SW_copy.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_VPUIP_copy_to_SW_copy.cpp index 79cc707d3f..7348467bde 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_VPUIP_copy_to_SW_copy.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_VPUIP_copy_to_SW_copy.cpp @@ -4,9 +4,14 @@ // #include "vpux/compiler/core/aliases_info.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" namespace vpux::VPUIP { @@ -226,7 +231,7 @@ mlir::LogicalResult ConvertVPUIPCopyToSWCopy::matchAndRewrite(VPUIP::CopyOp orig } // Create SWKernelOp type Copy - VPUIP::createRuntimeKernelDefinition(module, _log.nest(), VPU::getArch(origOp)); + VPUIP::createRuntimeKernelDefinition(module, _log.nest(), config::getArch(origOp)); const int64_t tileIndex = 0; vpux::VPUIP::KernelInfo kernelInfo(SmallVector{inBitOffsets, outBitOffsets}, SmallString("copy"), @@ -237,7 +242,8 @@ mlir::LogicalResult ConvertVPUIPCopyToSWCopy::matchAndRewrite(VPUIP::CopyOp orig auto swKernelOp = rewriter.create(location, operandsBuff, outputsBuff, builtInFunction, getIntAttr(ctx, tileIndex)); - vpux::VPUIP::initSwKernel(swKernelOp, operandsBuff, outputsBuff, kernelInfo.args, _log.nest()); + vpux::VPUIP::initSwKernel(swKernelOp, operandsBuff, outputsBuff, 
kernelInfo.args, _log.nest(), + /*swKernelRunOp=*/nullptr); _log.trace("Replace origin op {0} with new outputs from SW Kernel Copy", location); rewriter.replaceOp(origOp, swKernelOp); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_allocations_to_declarations.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_allocations_to_declarations.cpp index 59102b6019..952059892d 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_allocations_to_declarations.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_allocations_to_declarations.cpp @@ -4,8 +4,12 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/dialect.hpp" #include @@ -46,6 +50,7 @@ void ConvertAllocationsToDeclarationsPass::safeRunOnFunc() { mlir::ConversionTarget target(ctx); target.addLegalDialect(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_async_ops_to_tasks.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_async_ops_to_tasks.cpp index 9b03d35e87..7bca65b7f4 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_async_ops_to_tasks.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_async_ops_to_tasks.cpp @@ -4,8 +4,11 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include 
"vpux/compiler/dialect/core/dialect.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include @@ -194,6 +197,7 @@ void ConvertAsyncOpsToTasksPass::safeRunOnFunc() { typeConverter.addSourceMaterialization(dummyConverter); mlir::ConversionTarget target(ctx); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_copy_to_DMA.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_copy_to_DMA.cpp index 5f040ac4f1..6565222c78 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_copy_to_DMA.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_copy_to_DMA.cpp @@ -3,11 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" - #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_CONVERTTRANSFEROPSTODMAS @@ -25,7 +27,7 @@ namespace { class TimestampRewrite final : public mlir::OpRewritePattern { public: - TimestampRewrite(mlir::MLIRContext* ctx, vpux::VPU::ArchKind arch, Logger log) + TimestampRewrite(mlir::MLIRContext* ctx, vpux::config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _log(log), _arch(arch) { } @@ -34,7 +36,7 @@ class TimestampRewrite final : public mlir::OpRewritePattern private: Logger _log; - vpux::VPU::ArchKind _arch; + vpux::config::ArchKind _arch; }; mlir::LogicalResult TimestampRewrite::matchAndRewrite(VPUIP::TimestampOp origOp, @@ -49,7 +51,7 @@ mlir::LogicalResult TimestampRewrite::matchAndRewrite(VPUIP::TimestampOp origOp, uint32_t hwAddress = 0; 
switch (_arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: hwAddress = VPUIP::HW_TIMER_ABSOLUTE_ADDR_37XX; VPUX_THROW_UNLESS(origType.getElementType() == getUInt64Type(getContext()), "Got wrong element type for TimestampOp"); @@ -112,7 +114,7 @@ void ConvertTransferOpsToDMAsPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); mlir::ConversionTarget target(ctx); target.addIllegalOp(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_expand.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_expand.cpp index fa0f56cd51..c6d20dc289 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_expand.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_expand.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_func_args_to_declarations.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_func_args_to_declarations.cpp index 417aafe798..b8fb67cf8c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_func_args_to_declarations.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_func_args_to_declarations.cpp @@ -3,20 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" - -#include "vpux/compiler/core/aliases_info.hpp" -#include 
"vpux/compiler/utils/error.hpp" - -#include - #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/func_dialect.hpp" #include "vpux/compiler/utils/hash.hpp" -#include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include +#include namespace vpux::VPUIP { #define GEN_PASS_DECL_CONVERTFUNCARGSTODECLARATIONS @@ -83,6 +82,10 @@ class ConvertFuncArgsToDeclarationsPass final : void ConvertFuncArgsToDeclarationsPass::safeRunOnModule() { auto moduleOp = getOperation(); + if (moduleOp.getOps().empty()) { + return; + } + const auto buildNewDecl = [](mlir::OpBuilder& argBuilder, mlir::Value val, VPURT::BufferSection section, int64_t sectionIndex) { return argBuilder.create(val.getLoc(), val.getType(), section, sectionIndex, 0) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_pallet_lut_to_const.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_pallet_lut_to_const.cpp new file mode 100644 index 0000000000..bfadd70a74 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_pallet_lut_to_const.cpp @@ -0,0 +1,172 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/sprlut_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp" +#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" + +namespace vpux::VPUIP { +#define GEN_PASS_DECL_CONVERTPALLETLUTTOCONST +#define GEN_PASS_DEF_CONVERTPALLETLUTTOCONST +#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" +} // namespace vpux::VPUIP + +using namespace vpux; + +namespace { + +// +// PalletLUTConverter +// + +class PalletLUTConverter final : public VPUIP::LUTConverterBase { +public: + PalletLUTConverter(mlir::MLIRContext* ctx, Logger log, mlir::func::FuncOp netFunc) + : LUTConverterBase(ctx, log, netFunc) { + setDebugName("ConvertPalletLUTToConstPass::PalletLUTConverter"); + } + +private: + SmallVector fillPalletTable(mlir::Type quantileType, ArrayRef quantilesLut) const; + mlir::Value createLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const override; + void replaceWithConstInput(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value lutNceInput, + mlir::PatternRewriter& rewriter) const override; +}; + +SmallVector PalletLUTConverter::fillPalletTable(mlir::Type quantileType, + ArrayRef quantilesLut) const { + // For 8 bit quantileType, the 16 bit pallet LUT entries get duplicated as 2 x 8 bit elems. + // If the value is not correctly dimensioned for the 8 bit range, truncations will occur. 
+ auto getPalletModeBitValue = [quantileType](const double value) -> uint16_t { + if (quantileType.isF16()) { + vpux::type::float16 f16(static_cast(value)); + return f16.to_bits(); + } else if (quantileType.isBF16()) { + vpux::type::bfloat16 bf16(static_cast(value)); + return bf16.to_bits(); + } else if (quantileType.isUnsignedInteger(8)) { + uint16_t u8Masked = value < 0. ? 0u : static_cast(value); + return u8Masked << 8 | u8Masked; // Duplicate the value for 16-bit representation + } else if (quantileType.isSignedInteger(8)) { + uint16_t i8Masked = static_cast(static_cast(value) & 0x00FF); + return i8Masked << 8 | i8Masked; + } else { + VPUX_THROW("getPalletModeBitValue: Unsupported quantileType for palletization table {0}", quantileType); + } + return 0; + }; + + constexpr unsigned PALLETIZATION_TABLE_16BIT_ENTRIES = 64; + SmallVector lutValues(PALLETIZATION_TABLE_16BIT_ENTRIES, 0); + for (unsigned i = 0; i < quantilesLut.size(); ++i) { + lutValues[i] = getPalletModeBitValue(quantilesLut[i]); + } + + return lutValues; +} + +mlir::Value PalletLUTConverter::createLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const { + const auto weightsType = + mlir::dyn_cast(nceClusterTask.getWeights().getType()).getElementType(); + + const auto [quantileType, quantileLUT] = [&]() { + if (const auto quantileUniformType = mlir::dyn_cast_or_null(weightsType)) { + return std::tuple>(quantileUniformType.getQuantileType(), + quantileUniformType.getQuantiles()); + } + if (const auto quantilePerAxisType = + mlir::dyn_cast_or_null(weightsType)) { + return std::tuple>(quantilePerAxisType.getQuantileType(), + quantilePerAxisType.getQuantiles()); + } + VPUX_THROW("{0}: expected palletized weight type but {1} type was found instead", getDebugName(), weightsType); + }(); + + auto uint16PalletLUT = fillPalletTable(quantileType, quantileLUT); + auto uint16Type = + mlir::IntegerType::get(rewriter.getContext(), 16, 
mlir::IntegerType::SignednessSemantics::Unsigned); + auto palletLUTType = mlir::RankedTensorType::get({checked_cast(uint16PalletLUT.size())}, uint16Type); + auto palletLUTAttr = mlir::DenseElementsAttr::get(palletLUTType, ArrayRef(uint16PalletLUT)); + + const auto bufferType = vpux::getBufferType(palletLUTType); + Const::ContentSetup setup(mlir::cast(bufferType)); + const auto contentAttr = Const::ContentAttr::get(palletLUTAttr, setup); + return rewriter.create(nceClusterTask->getLoc(), bufferType, contentAttr).getOutput(); +} + +void PalletLUTConverter::replaceWithConstInput(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value lutNceInput, + mlir::PatternRewriter& rewriter) const { + auto newInput = [&]() -> mlir::Value { + if (vpux::VPUIP::hasDistributedOperand(nceClusterTask)) { + const auto palletLUTOutType = mlir::dyn_cast(lutNceInput.getType()); + VPUX_THROW_WHEN(palletLUTOutType == nullptr, + "{0}: pallet LUT output type is expected to be DistributedBufferType, but got {1}", + getDebugName(), lutNceInput.getType()); + nceClusterTask.getPalletLookupTableMutable().append(lutNceInput); + } + return lutNceInput; + }(); + rewriter.modifyOpInPlace(nceClusterTask, [&] { + nceClusterTask.getPalletLookupTableMutable().assign(newInput); + }); +} + +// +// ConvertPalletLUTToConstPass +// + +class ConvertPalletLUTToConstPass final : public VPUIP::impl::ConvertPalletLUTToConstBase { +public: + explicit ConvertPalletLUTToConstPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void ConvertPalletLUTToConstPass::safeRunOnFunc() { + auto& ctx = getContext(); + auto func = getOperation(); + + mlir::ConversionTarget palletLutTarget(ctx); + palletLutTarget.addLegalOp(); + palletLutTarget.addDynamicallyLegalOp([](VPUIP::NCEClusterTaskOp op) { + if (op.getPalletLookupTable() != nullptr) { + return true; + } + if (auto weights = op.getWeights()) { + const auto weightsType = 
mlir::dyn_cast(weights.getType()).getElementType(); + if (mlir::isa_and_nonnull( + weightsType)) { + return false; + } + } + return true; + }); + + mlir::RewritePatternSet palletLutPatterns(&ctx); + palletLutPatterns.add(&ctx, _log, func); + if (mlir::failed(applyPartialConversion(func, palletLutTarget, std::move(palletLutPatterns)))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createConvertPalletLUTToConstPass +// + +std::unique_ptr vpux::VPUIP::createConvertPalletLUTToConstPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_sprlut_to_const.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_sprlut_to_const.cpp index 76dc396e80..6185ae741f 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_sprlut_to_const.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_sprlut_to_const.cpp @@ -6,12 +6,12 @@ #include "vpux/compiler/dialect/VPU/utils/sprlut_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/VPUIP/interfaces/common_rewriters/convert_lut_to_const.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" - -#include +#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_CONVERTSPRLUTTOCONST @@ -27,57 +27,24 @@ namespace { // SprLUTConverter // -class SprLUTConverter final : public mlir::OpRewritePattern { +class SprLUTConverter final : public VPUIP::LUTConverterBase { public: SprLUTConverter(mlir::MLIRContext* ctx, Logger log, mlir::func::FuncOp netFunc) - : mlir::OpRewritePattern(ctx), _log(log), _netFunc(netFunc) { + : LUTConverterBase(ctx, log, netFunc) { setDebugName("ConvertSprLUTToConstPass::SprLUTConverter"); } - 
mlir::LogicalResult matchAndRewrite(VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::PatternRewriter& rewriter) const final; - private: - mlir::Value createSprLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::PatternRewriter& rewriter) const; - mlir::Value createCopyDestination(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUTConst, - mlir::PatternRewriter& rewriter) const; - VPU::DistributionInfoAttr createDistributionInfoAttr(VPUIP::DistributedBufferType inputDistribType, - VPUIP::NCEClusterTaskOp nceClusterTask) const; - VPUIP::DistributedBufferType createDistributedBufferType(VPU::DistributionInfoAttr distributedInfo, - VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::Value sprLUTConst) const; - void replaceAttrWithConst(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUT, - mlir::PatternRewriter& rewriter) const; + mlir::Value createLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const override; + void replaceWithConstInput(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUT, + mlir::PatternRewriter& rewriter) const override; void removeSprLUTFromPPE(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::PatternRewriter& rewriter) const; VPU::PPEFpAttr createPPEWithoutSprLUT(VPU::PPEFpAttr prevPPE) const; - -private: - Logger _log; - mutable mlir::func::FuncOp _netFunc; }; -mlir::LogicalResult SprLUTConverter::matchAndRewrite(VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::PatternRewriter& rewriter) const { - _log.trace("[{0}] Got '{1}' at '{2}'", getDebugName(), nceClusterTask->getName(), nceClusterTask->getLoc()); - - const auto sprLUTConst = [&]() { - mlir::OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(&_netFunc.getBody().front().front()); - return createSprLookupTableConst(nceClusterTask, rewriter); - }(); - - const auto copyDst = createCopyDestination(nceClusterTask, sprLUTConst, rewriter); - const auto sprLutNceInput = - 
rewriter.create(nceClusterTask->getLoc(), sprLUTConst, copyDst).getOutput(); - - replaceAttrWithConst(nceClusterTask, sprLutNceInput, rewriter); - - return mlir::success(); -} - -mlir::Value SprLUTConverter::createSprLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::PatternRewriter& rewriter) const { +mlir::Value SprLUTConverter::createLookupTableConst(VPUIP::NCEClusterTaskOp nceClusterTask, + mlir::PatternRewriter& rewriter) const { const auto ppeOps = nceClusterTask.getPpe().getOps(); VPUX_THROW_WHEN(ppeOps.empty(), "{0}: expected PPE inside {1}, but it was not found", getDebugName(), nceClusterTask); @@ -91,66 +58,8 @@ mlir::Value SprLUTConverter::createSprLookupTableConst(VPUIP::NCEClusterTaskOp n return rewriter.create(nceClusterTask->getLoc(), bufferType, contentAttr).getOutput(); } -mlir::Value SprLUTConverter::createCopyDestination(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUTConst, - mlir::PatternRewriter& rewriter) const { - const auto input = nceClusterTask.getInput(); - const auto inputType = input.getType(); - - return llvm::TypeSwitch(inputType) - .Case([&](VPUIP::DistributedBufferType distributedBufferType) { - const auto distributedInfo = createDistributionInfoAttr(distributedBufferType, nceClusterTask); - const auto ditributedBufferType = - createDistributedBufferType(distributedInfo, nceClusterTask, sprLUTConst); - - auto alignment = vpux::getIntAttr(nceClusterTask.getContext(), VPU::SPRLUT_ALIGNMENT_REQUIREMENT); - - auto allocDistributed = rewriter.create( - nceClusterTask.getLoc(), ditributedBufferType, alignment, nullptr); - return allocDistributed->getResult(0); - }) - .Case([&](auto) { - const auto constOutType = mlir::dyn_cast(sprLUTConst.getType()); - VPUX_THROW_WHEN(constOutType == nullptr, - "{0}: sprLUT const output type is expected to be MemRefType, but got {1}", - getDebugName(), sprLUTConst.getType()); - const auto memSpaceCMX = vpux::IndexedSymbolAttr::get(nceClusterTask.getContext(), - 
stringifyEnum(VPU::MemoryKind::CMX_NN), 0); - const auto cmxMemType = mlir::MemRefType::get(constOutType.getShape(), constOutType.getElementType(), - constOutType.getLayout(), memSpaceCMX); - const auto allocOp = rewriter.create(nceClusterTask.getLoc(), cmxMemType); - return allocOp->getResult(0); - }) - .Default([&](mlir::Type inputType) { - VPUX_THROW("{0}: `{1}` is not supported as an input type", getDebugName(), inputType); - return mlir::Value{}; - }); -} - -VPU::DistributionInfoAttr SprLUTConverter::createDistributionInfoAttr(VPUIP::DistributedBufferType inputDistribType, - VPUIP::NCEClusterTaskOp nceClusterTask) const { - auto inputDistribInfo = inputDistribType.getDistribution(); - VPUX_THROW_WHEN(inputDistribInfo == nullptr, "{0}: inputDistribInfo == nullptr for the input type is not allowed", - getDebugName()); - const auto duplicatedDistrModeAttr = - VPU::DistributionModeAttr::get(nceClusterTask.getContext(), VPU::DistributionMode::DUPLICATED); - return VPU::DistributionInfoAttr::get(nceClusterTask.getContext(), duplicatedDistrModeAttr, nullptr, nullptr, - nullptr, nullptr, inputDistribInfo.getNumClusters(), nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, nullptr); -} - -VPUIP::DistributedBufferType SprLUTConverter::createDistributedBufferType(VPU::DistributionInfoAttr distributedInfo, - VPUIP::NCEClusterTaskOp nceClusterTask, - mlir::Value sprLUTConst) const { - const auto memSpaceCMX = - vpux::IndexedSymbolAttr::get(nceClusterTask.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), 0); - const auto ndTypeInterface = mlir::cast(sprLUTConst.getType()); - return VPUIP::DistributedBufferType::get( - nceClusterTask.getContext(), ndTypeInterface.getShape().raw(), ndTypeInterface.getElementType(), - mlir::dyn_cast(sprLUTConst.getType()).getLayout(), memSpaceCMX, distributedInfo); -} - -void SprLUTConverter::replaceAttrWithConst(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUT, - mlir::PatternRewriter& rewriter) const { +void 
SprLUTConverter::replaceWithConstInput(VPUIP::NCEClusterTaskOp nceClusterTask, mlir::Value sprLUT, + mlir::PatternRewriter& rewriter) const { auto newInput = [&]() -> mlir::Value { if (vpux::VPUIP::hasDistributedOperand(nceClusterTask)) { const auto sprLUTOutType = mlir::dyn_cast(sprLUT.getType()); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_to_dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_to_dma.cpp index d6e930ae03..99b08142cf 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_to_dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_to_dma.cpp @@ -3,14 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // +#include #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPU/utils/const_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" @@ -415,6 +417,60 @@ VPUIP::GenericReshapeOp convertMemPermuteHNWCAsDMA(VPUIP::SwKernelOp swKernelOp, return rewriter.create(swKernelOp->getLoc(), outType, secondPermDmaOp); } +// +// Convert MemPermute NCHW->HCWN to 2 permuteDMAs +// MemPermute NCHW->HCWN, Permute pattern: [d0, d1, d2, d3] -> [d2, d1, d3, d0] +// For example, MemPermute NCHW->HCWN: +// Input : 128x2x36x68xf16#NCHW +// | +// MemPermute : memPerm: (d0, d1, d2, d3) -> (d2, d1, d3, d0) +// | +// Output : 36x2x68x128xf16#NCHW +// Convert to: +// Input : 128x2x36x68xf16#NCHW +// | +// 
GenericReshape 1 : 1x256x36x68xf16#NCHW +// | +// PermuteDMA 1 : 1x36x256x68xf16#NCHW ([1, 0, 2]: HWC->WHC) +// | +// GenericReshape 2 : 1x36x128x136xf16#NCHW +// | +// PermuteDMA 2 : 1x36x136x128xf16#NCHW ([0, 2, 1]: HWC->HCW) +// | +// GenericReshape 3 : 36x2x68x128xf16#NCHW +// | +// Output : 36x2x68x128xf16#NCHW +// +VPUIP::GenericReshapeOp convertMemPermuteHCWNAsDMA(VPUIP::SwKernelOp swKernelOp, mlir::Value input, + mlir::PatternRewriter& rewriter) { + const auto outType = mlir::cast(swKernelOp->getResult(0).getType()); + // Create genericReshapeOp for first permuteDMAOp + const auto mergedPerm = DimsOrder::NHCW.toAffineMap(rewriter.getContext()); + auto inGenReshapeOp = createGenericReshape(swKernelOp, input, outType, mergedPerm, rewriter); + auto inGenReshapeType = mlir::dyn_cast(inGenReshapeOp.getOutput().getType()); + // Create first permuteDMAOp: permutation is [d0, d2, d1, d3] + auto dimsOrderDMA = DimsOrder::NHCW; + auto firstPermDmaOp = + createPermuteDMA(swKernelOp, inGenReshapeOp, inGenReshapeType, dimsOrderDMA, outType, rewriter); + auto firstPermDMAType = mlir::dyn_cast(firstPermDmaOp.getOutput().getType()); + // Create genericReshapeOp for second permuteDMAOp + auto midGenReshapeType = firstPermDMAType; + auto outTypeMemShape = Shape(outType.getMemShape().raw()); + auto midGenReshapeNewMemShape = Shape({1, outTypeMemShape[Dims4D::Act::N], outTypeMemShape[Dims4D::Act::W], + outTypeMemShape[Dims4D::Act::H] * outTypeMemShape[Dims4D::Act::C]}); + midGenReshapeType = + VPUIP::changeShapeWithMemShape(&midGenReshapeType, midGenReshapeNewMemShape, outType.getDimsOrder()); + auto midGenReshapeOp = + rewriter.create(swKernelOp->getLoc(), midGenReshapeType, firstPermDmaOp); + auto midGenReshapeOutType = mlir::dyn_cast(midGenReshapeOp.getOutput().getType()); + // Create second permuteDMAOp: permutation is [d0, d1, d3, d2] + auto secondDimsOrderDMA = DimsOrder::NCWH; + auto secondPermDmaOp = + createPermuteDMA(swKernelOp, midGenReshapeOp, midGenReshapeOutType, 
secondDimsOrderDMA, outType, rewriter); + // Create genericReshapeOp for output + return rewriter.create(swKernelOp->getLoc(), outType, secondPermDmaOp); +} + mlir::LogicalResult ConvertToDMAPass::SwKernelMemPermuteConverter::matchAndRewrite( VPUIP::SwKernelOp swKernelOp, mlir::PatternRewriter& rewriter) const { if (!VPUIP::isMemPermSwKernel(swKernelOp)) { @@ -435,6 +491,13 @@ mlir::LogicalResult ConvertToDMAPass::SwKernelMemPermuteConverter::matchAndRewri const auto outputBuf = swKernelOp.getOperand(1); // Check for inversed permutation which needs split into 2 consecutive permuteDMAs // e.g. pattern [d0, d1, d2, d3] -> [d0, d3, d2, d1] + + if (config::getArch(swKernelOp.getOperation()) > config::ArchKind::NPU37XX) { + rewriter.replaceOpWithNewOp(swKernelOp, input, outputBuf, + mlir::AffineMapAttr::get(memPerm.value()), nullptr); + return mlir::success(); + } + auto mergedPerm = vpux::VPUIP::getPermuteDMAMergedMemPerm(inType, memPerm.value()); if (!VPUIP::isSplitNeededForPermuteDMA(inType, memPerm.value())) { rewriter.replaceOpWithNewOp(swKernelOp, input, outputBuf, @@ -463,6 +526,12 @@ mlir::LogicalResult ConvertToDMAPass::SwKernelMemPermuteConverter::matchAndRewri auto newOp = convertMemPermuteCWNH(swKernelOp, input, rewriter); rewriter.replaceOp(swKernelOp, newOp.getOutput()); + return mlir::success(); + } else if (mergedPerm == DimsOrder::HCWN.toAffineMap(rewriter.getContext())) { + // Convert MemPermute NCHW->HCWN to 2 permuteDMAs + auto newOp = convertMemPermuteHCWNAsDMA(swKernelOp, input, rewriter); + rewriter.replaceOp(swKernelOp, newOp.getOutput()); + return mlir::success(); } else if (mergedPerm == DimsOrder::HNWC.toAffineMap(rewriter.getContext())) { // Convert MemPermute NCHW->HNWC to 2 permuteDMAs diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_view_ops_to_declarations.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_view_ops_to_declarations.cpp index 10f10d81b4..2d456cfdf2 100644 --- 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_view_ops_to_declarations.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/convert_view_ops_to_declarations.cpp @@ -5,9 +5,13 @@ #include "vpux/compiler/core/aliases_info.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/IR/ops.hpp" #include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include @@ -69,9 +73,9 @@ Byte ViewLikeRewrite::calculateOffset(mlir::Value val) const { mlir::LogicalResult ViewLikeRewrite::matchAndRewrite(mlir::ViewLikeOpInterface origOp, mlir::PatternRewriter& rewriter) const { - if (!mlir::isa(origOp.getOperation())) { + if (!mlir::isa(origOp.getOperation())) { return matchFailed(rewriter, origOp, "Unknown view-like operation '{0}'", origOp->getName()); } @@ -139,9 +143,9 @@ void ConvertViewOpsToDeclarationsPass::safeRunOnFunc() { target.addLegalOp(); // The logic for ConcatView has been moved to BreakDataFlow pass // Leave ConcatView illegal here for sanity check - target.addIllegalOp(); target.addLegalOp(); target.markOpRecursivelyLegal([&](mlir::Operation*) { diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/copy_op_tiling.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/copy_op_tiling.cpp index 1bcba2e1c4..cf6fd652ec 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/copy_op_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/copy_op_tiling.cpp @@ -10,11 +10,11 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/dma_limits.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/numeric.hpp" #include @@ -44,7 +44,7 @@ bool isLegalCopyOp(VPUIP::CopyOp copyOp) { return true; } - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(copyOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(copyOp)); // If tensor size is greater than max plane size, its no longer legal operation if (getDmaSize(copyOp) > Byte(dmaEngineLimits.getMaxLength())) { @@ -127,7 +127,7 @@ class CopyOpTilingPass final : public VPUIP::impl::CopyOpTilingBase { public: - CopyOpTiling(mlir::MLIRContext* ctx, Logger log, VPU::ArchKind arch, int64_t dmaPortNum) + CopyOpTiling(mlir::MLIRContext* ctx, Logger log, config::ArchKind arch, int64_t dmaPortNum) : mlir::OpRewritePattern(ctx), _log(log), _arch(arch), _dmaPortNum(dmaPortNum) { } @@ -138,7 +138,7 @@ class CopyOpTiling final : public mlir::OpRewritePattern { SmallVector createTiles(VPUIP::CopyOp origOp, mlir::PatternRewriter& rewriter) const; Logger _log; - VPU::ArchKind _arch; + config::ArchKind _arch; int64_t _dmaPortNum; }; @@ -342,7 +342,7 @@ void CopyOpTilingPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); const auto dmaPortNum = dmaOp.getCount(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dispatched_inliner.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dispatched_inliner.cpp index f8c2a9cc0a..9f6dc1ea93 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dispatched_inliner.cpp +++ 
b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dispatched_inliner.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/core/interfaces/attr_interfaces.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_after_barrier_sched.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_after_barrier_sched.cpp index 53801b6568..326e7ebc3c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_after_barrier_sched.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dma_profiling_after_barrier_sched.cpp @@ -11,9 +11,9 @@ #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/dma.hpp" -#include "vpux/compiler/utils/strings.hpp" #include "vpux/utils/profiling/common.hpp" namespace vpux::VPUIP { @@ -33,13 +33,8 @@ namespace { class DMATaskProfilingAfterBarrierSchedPass final : public VPUIP::impl::DMATaskProfilingAfterBarrierSchedBase { public: - explicit DMATaskProfilingAfterBarrierSchedPass(DMAProfilingMode dmaProfilingMode, Logger log) - : _profilingMode(dmaProfilingMode), - _timerType(), - _Cmx0MemKind(), - _timestampSize(), - _profOutputId(), - _hwAddr() { + explicit DMATaskProfilingAfterBarrierSchedPass(Logger log) + : _timerType(), _Cmx0MemKind(), _timestampSize(), _profOutputId(), _hwAddr() { Base::initLogger(log, Base::getArgumentName()); } @@ -48,7 +43,7 @@ class DMATaskProfilingAfterBarrierSchedPass final : mlir::Operation* getDmaTask(VPURT::TaskOp taskOp); int64_t getDMAPortValue(VPURT::TaskOp taskOp); - uint32_t 
getHwProfAddress(VPU::ArchKind arch); + uint32_t getHwProfAddress(config::ArchKind arch); VPURT::TaskOp createProfTask(mlir::OpBuilder& builder, mlir::ValueRange updateBarriers, mlir::ValueRange waitBarriers, int64_t port, size_t address, mlir::Location loc, bool isOutOfOrder, VPUIP::DmaProfilingMetadataAttr profMetadata); @@ -56,7 +51,6 @@ class DMATaskProfilingAfterBarrierSchedPass final : void createCmx2DdrProfDma(mlir::OpBuilder& builder, mlir::ValueRange updateBarriers, int64_t port, size_t srcCmxAddr, size_t dstDdrAddr, size_t sizeBytes); - DMAProfilingMode _profilingMode; mlir::Type _timerType; IndexedSymbolAttr _Cmx0MemKind; int64_t _timestampSize; @@ -64,9 +58,9 @@ class DMATaskProfilingAfterBarrierSchedPass final : uint32_t _hwAddr; }; -uint32_t DMATaskProfilingAfterBarrierSchedPass::getHwProfAddress(VPU::ArchKind arch) { +uint32_t DMATaskProfilingAfterBarrierSchedPass::getHwProfAddress(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUIP::HW_TIMER_ABSOLUTE_ADDR_37XX; default: VPUX_THROW("Unsuported architecture"); @@ -135,19 +129,13 @@ void DMATaskProfilingAfterBarrierSchedPass::createCmx2DdrProfDma(mlir::OpBuilder void DMATaskProfilingAfterBarrierSchedPass::safeRunOnModule() { auto module = getOperation(); auto* ctx = module->getContext(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); - if (enableDMAProfiling.hasValue()) { - _profilingMode = getDMAProfilingMode(arch, enableDMAProfiling.getValue()); - } - - if (_profilingMode != DMAProfilingMode::SW) { - return; - } + VPUX_THROW_UNLESS(arch == config::ArchKind::NPU37XX, "Unsupported platform"); const auto isOutOfOrderOptimizationApplicable = - (arch == VPU::ArchKind::NPU37XX); // For 37XX PROFBEGIN and profiled DMA may be proceeded by different - // channels, which allow such DMAs issued out of order + (arch == config::ArchKind::NPU37XX); // For 37XX PROFBEGIN and profiled DMA may be proceeded by 
different + // channels, which allow such DMAs issued out of order net::NetworkInfoOp netInfo; mlir::func::FuncOp func; net::NetworkInfoOp::getFromModule(module, netInfo, func); @@ -324,7 +312,6 @@ void DMATaskProfilingAfterBarrierSchedPass::safeRunOnModule() { // createDMATaskProfilingAfterBarrierSchedPass // -std::unique_ptr vpux::VPUIP::createDMATaskProfilingAfterBarrierSchedPass(DMAProfilingMode dmaProfilingMode, - Logger log) { - return std::make_unique(dmaProfilingMode, log); +std::unique_ptr vpux::VPUIP::createDMATaskProfilingAfterBarrierSchedPass(Logger log) { + return std::make_unique(log); } diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dpu_profiling.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dpu_profiling.cpp index 28f1024a72..e9ca46f9c1 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dpu_profiling.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dpu_profiling.cpp @@ -17,6 +17,7 @@ #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/utils/profiling/common.hpp" @@ -84,7 +85,7 @@ void DPUProfilingPass::safeRunOnModule() { net::NetworkInfoOp netInfo; mlir::func::FuncOp netFunc; net::NetworkInfoOp::getFromModule(module, netInfo, netFunc); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); OpBuilderLogger builderLog(_log.nest()); mlir::OpBuilder builder(&netFunc.getBody().front().front(), &builderLog); unsigned profilingWorkloadSize = VPUIP::getProfWorkloadSize(module); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dump_statistics_of_task_ops.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dump_statistics_of_task_ops.cpp index 12f379d447..48f6fec268 100644 --- 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dump_statistics_of_task_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/dump_statistics_of_task_ops.cpp @@ -6,7 +6,9 @@ #include "vpux/compiler/core/feasible_memory_scheduler_spilling.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/abstract_tree.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/feasible_allocation.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/feasible_allocation.cpp index 4d92f01611..cffa9a0d14 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/feasible_allocation.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/feasible_allocation.cpp @@ -3,32 +3,30 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" - #include "vpux/compiler/core/async_deps_info.hpp" +#include "vpux/compiler/core/cost_model_utils.hpp" #include "vpux/compiler/core/feasible_memory_scheduler.hpp" #include "vpux/compiler/core/feasible_memory_scheduler_control_edges.hpp" #include "vpux/compiler/core/feasible_memory_scheduler_spilling.hpp" #include "vpux/compiler/core/mem_live_range_info.hpp" #include "vpux/compiler/core/prefetch_data_ops.hpp" #include "vpux/compiler/core/schedule_analysis_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" -#include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" 
+#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/hw_settings.hpp" #include "vpux/compiler/utils/linear_scan.hpp" - +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/checked_cast.hpp" #if defined(VPUX_DEVELOPER_BUILD) || !defined(NDEBUG) - #include "vpux/compiler/core/developer_build_utils.hpp" - #endif // defined(VPUX_DEVELOPER_BUILD) || !defined(NDEBUG) namespace vpux::VPUIP { @@ -653,7 +651,7 @@ void FeasibleAllocationPass::safeRunOnFunc() { linearizeComputeOps(_linearizeSchedule, _enablePipelining, func, depsInfo); // VPUNN cost model - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeCostModelAnalysis = getCachedParentAnalysis(module); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/flatten_sparse_weights_types.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/flatten_sparse_weights_types.cpp index 8864daa0df..204de47c7f 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/flatten_sparse_weights_types.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/flatten_sparse_weights_types.cpp @@ -6,7 +6,9 @@ #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include diff --git 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/linearize_call_ops.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/linearize_call_ops.cpp index 68b4ebd549..8714fe32c5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/linearize_call_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/linearize_call_ops.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/core/async_deps_info.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/move_pure_view_op_before_copy.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/move_pure_view_op_before_copy.cpp index 0edc7daa9e..d2cb393fc4 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/move_pure_view_op_before_copy.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/move_pure_view_op_before_copy.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/allocate_buffers.hpp" #include "vpux/compiler/utils/permute_utils.hpp" @@ -154,7 +155,7 @@ mlir::LogicalResult MoveViewOpToTheFrontOfCopy::matchAndRewrite(mlir::ViewLikeOp } if (mlir::isa(origOp)) { - const auto arch = VPU::getArch(origOp.getOperation()); + const auto arch = config::getArch(origOp.getOperation()); return VPUIP::isDistributedCompatibleAfterShapeChangeForViewOps( distributedType, viewOpOutputShape, viewOpOutputType.getDimsOrder(), arch); } @@ -238,7 +239,7 @@ mlir::LogicalResult MoveViewOpToTheFrontOfCopy::matchAndRewrite(mlir::ViewLikeOp auto getDistributionForViewOpOutput = [&]() -> VPU::DistributionInfoAttr { auto ctx = origOp->getContext(); - const auto arch = 
VPU::getArch(origOp.getOperation()); + const auto arch = config::getArch(origOp.getOperation()); const auto mode = distributedType.getDistribution().getMode().getValue(); const auto origDistribution = distributedType.getDistribution(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/nn_dma_tiling.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/nn_dma_tiling.cpp index 054680e9e2..e0770a279b 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/nn_dma_tiling.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/nn_dma_tiling.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/dma_limits.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/numeric.hpp" @@ -31,7 +32,7 @@ namespace { class SplitNNDMARewriter final : public mlir::OpRewritePattern { public: - SplitNNDMARewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log, VPU::ArchKind arch) + SplitNNDMARewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log, config::ArchKind arch) : mlir::OpRewritePattern(ctx), _log(log), _dmaPortCount(dmaPortCount), _arch(arch) { setDebugName("SplitNNDMARewriter"); @@ -47,7 +48,7 @@ class SplitNNDMARewriter final : public mlir::OpRewritePattern { Logger _log; int64_t _dmaPortCount; mlir::FlatSymbolRefAttr _cmxNameAttr; - VPU::ArchKind _arch; + config::ArchKind _arch; }; Byte getDmaSize(VPUIP::NNDMAOp nndmaOp) { @@ -226,7 +227,7 @@ void NNDMATilingPass::safeRunOnFunc() { auto& ctx = getContext(); auto func = getOperation(); auto module = func->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); auto dmaPortCount = dmaOp.getCount(); diff 
--git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/operation_stubbing.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/operation_stubbing.cpp index 203a2ad507..b869d15a6c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/operation_stubbing.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/operation_stubbing.cpp @@ -4,11 +4,13 @@ // #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_concat_view_copies.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_concat_view_copies.cpp index 0916f475f1..fe690cdbd5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_concat_view_copies.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_concat_view_copies.cpp @@ -5,9 +5,11 @@ #include "vpux/compiler/core/aliases_info.hpp" #include "vpux/compiler/core/attributes/dim.hpp" -#include "vpux/compiler/dialect/IE/utils/concat_utils.hpp" #include "vpux/compiler/dialect/IE/utils/slice_utils.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/ppe_version_config.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" @@ -16,8 +18,8 @@ 
#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/allocate_buffers.hpp" +#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/permute_utils.hpp" -#include "vpux/compiler/utils/reshape_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include @@ -25,7 +27,6 @@ #include #include #include -#include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_OPTIMIZECONCATVIEWCOPIES @@ -244,7 +245,7 @@ mlir::LogicalResult AvoidConcatExtraChannel::checkConcatInputs(mlir::ValueRange auto sizes = parseIntArrayAttr(subview.getStaticSizesAttr()); auto concatDims = getConcatDims(Shape(sizes), concatOutShape); - if (concatDims.size() != 1) { + if (concatDims.size() != 1 && llvm::find(concatDims, Dims4D::Act::C) != concatDims.end()) { return mlir::failure(); } @@ -881,7 +882,7 @@ bool ReuseConcatViewAsInput::isLegalConcatViewInputPattern(VPUIP::ConcatViewOp c return false; } - if (!VPUIP::hasOneOrSameUser(concatViewOp.getOperation())) { + if (!hasOneUniqueUser(concatViewOp.getOperation())) { log.nest().nest().trace("ConcatViewOp has more than one user"); return false; } @@ -2522,7 +2523,7 @@ class SplitUnbalancedDDRConcatBase : public mlir::OpRewritePattern& views, SmallVector& distributedCopies, - Dim newConcatDim, int64_t leftConcatInputSize) const = 0; + Dim newConcatDim, int64_t leftConcatInputSize, int64_t rightConcatInputSize) const = 0; virtual VPUIP::DistributedBufferType updateDistributedType(mlir::Value dst, mlir::Value dstView, ShapeRef copyShape) const = 0; @@ -3024,7 +3025,8 @@ class SplitUnbalancedDDRConcatBase : public mlir::OpRewritePattern&, SmallVector&, Dim, int64_t) const override { + bool isValidSegment(SmallVector&, SmallVector&, Dim, int64_t, + int64_t) const override { return true; } @@ -3290,7 +3293,7 @@ class SplitUnbalancedDDRConcatOnSameAxis : public 
SplitUnbalancedDDRConcatBase { } bool isValidSegment(SmallVector& views, SmallVector& distributedCopies, - Dim newConcatDim, int64_t leftConcatInputSize) const override { + Dim newConcatDim, int64_t leftConcatInputSize, int64_t rightConcatInputSize) const override { for (size_t i = 0; i < distributedCopies.size(); ++i) { VPUIP::CopyOp distributedCopy = distributedCopies[i]; VPUX_THROW_UNLESS(vpux::VPUIP::hasDistributedOperand(distributedCopy), "Expected a distributed Copy op"); @@ -3300,8 +3303,22 @@ class SplitUnbalancedDDRConcatOnSameAxis : public SplitUnbalancedDDRConcatBase { auto dstDistribution = dstDistributedType.getDistribution(); auto dstDistributionInfo = VPU::DistributionInfo::getClassFromAttr(dstDistribution); + const auto bufferShape = dstDistributedType.getShape(); + auto maybeTileIndex = VPUIP::getTilingDimIndex(dstDistributedType); + if (!maybeTileIndex.has_value()) { + return false; + } + const auto tileDim = Dim(maybeTileIndex.value()); + if (leftConcatInputSize >= bufferShape[tileDim]) { + return false; + } + if (isDistributionWithExplicitShapesAndOffsets(dstDistributionInfo)) { - return true; + const auto computeShapes = VPU::arrayAttrToVecOfShapes(dstDistribution.getComputeShapes()); + // After optimization, if the tensor is not evenly split, the distributed copy for the left + // concatenation data will result in a smaller compute shape on the last cluster. + // Ensure that the left branch has sufficient data for the last cluster. 
+ return computeShapes.back()[newConcatDim] > rightConcatInputSize; } const auto distributionMode = dstDistributionInfo.getDistributionMode(); @@ -3552,7 +3569,14 @@ class SplitUnbalancedDDRConcatOnSameAxisDDR : public SplitUnbalancedDDRConcatBas return newConcatDim.ind() == tilingDim; } - bool isValidSegment(SmallVector&, SmallVector&, Dim, int64_t) const override { + bool isValidSegment(SmallVector& subviews, SmallVector&, Dim newConcatDim, + int64_t leftConcatInputSize, int64_t rightConcatInputSize) const override { + for (auto subview : subviews) { + auto outShape = getShape(subview.getResult()); + if (leftConcatInputSize + rightConcatInputSize != outShape[newConcatDim]) { + return false; + } + } return true; } diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_copies.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_copies.cpp index 35db48ab8e..4badc57ca4 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_copies.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_copies.cpp @@ -481,14 +481,9 @@ mlir::LogicalResult CopyOpSequence::matchAndRewrite(VPUIP::CopyOp copyOp, mlir:: nestedLogger.trace("CopyOpSequence: current CopyOp is non-distributed"); // Check ViewLikeOp without output_buff const auto isViewLikeOpWithoutOutputBuff = [&](mlir::Operation* op) -> bool { - if (mlir::isa(op)) { - return true; - } - - return false; + return mlir::isa(op); }; mlir::Value parentCopyOpInputBuff; diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_expand_subview.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_expand_subview.cpp index 659e58c927..a13037353c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_expand_subview.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_expand_subview.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include 
"vpux/compiler/utils/allocate_buffers.hpp" +#include "vpux/compiler/utils/rewriter.hpp" namespace vpux::VPUIP { #define GEN_PASS_DECL_OPTIMIZEEXPANDSUBVIEW diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_parallel_copies.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_parallel_copies.cpp index b94c5fe170..1338786670 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_parallel_copies.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/optimize_parallel_copies.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -262,23 +263,6 @@ bool ParallelCopiesRewriter::isCopyFusable(VPUIP::CopyOp copyOp, Logger& log) co return true; } -bool arePositionsConsecutive(const std::set& positions) { - if (positions.size() < 2) { - return false; - } - - auto it = positions.begin(); - auto prev = *it; - ++it; - for (; it != positions.end(); ++it) { - if (*it != prev + 1) { - return false; - } - prev = *it; - } - return true; -} - void ParallelCopiesRewriter::insertUserPosition(VPUIP::NCEClusterTaskOp nceConvUserOp, std::set& positions) const { uint32_t userOpPos = std::numeric_limits::max(); @@ -395,6 +379,34 @@ mlir::LogicalResult ParallelCopiesRewriter::matchAndRewrite(VPUIP::CopyOp origin return false; }; + auto arch = config::getArch(originCopyOp); + auto arePositionsConsecutive = [&](const std::set& positions) -> bool { + if (positions.size() < 2) { + return false; + } + + size_t countConsecutive = 0; + auto it = positions.begin(); + auto prev = *it; + + for (++it; it != positions.end(); ++it) { + if (*it == prev + 1) { + ++countConsecutive; + } + prev = *it; + } + + double consecutiveRatio = static_cast(countConsecutive) / 
(positions.size() - 1); + // If at least 90% of the positions are consecutive, we consider it as a valid case for optimization + // It's a workaround for E#172473, where we have a case with multi-dim tiling + // and we want to avoid unnecessary spillings in the case. + // This workaround will be removed by another solution on E#172578 + // + // For VPUX3XXX, we need to be more strict and require 100% consecutive positions. + // Otherwise, we will have regressions due to increased dpu cost. More details in E#174330. + return isArchVPUX3XXX(arch) ? consecutiveRatio >= 1.0 : consecutiveRatio >= 0.9; + }; + auto checkSiblingCopies = [&](mlir::Operation* targetOp) -> bool { auto rootCopyOp = mlir::dyn_cast(targetOp); if (rootCopyOp == nullptr) { diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/propagate_compression_scheme.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/propagate_compression_scheme.cpp index 0dc90dc3c4..bd6d770e5a 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/propagate_compression_scheme.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/propagate_compression_scheme.cpp @@ -5,8 +5,10 @@ #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/compiler/utils/types.hpp" @@ -100,24 +102,14 @@ void PropagateSparsityCompression::safeRunOnFunc() { auto func = getOperation(); func.walk([&](Const::DeclareOp constOp) { - const auto& contentAttr = constOp.getContentAttr(); - const auto transformations = contentAttr.getTransformations(); - if (transformations.empty()) { - return; - } - - auto sparsifyTransformationIt = - 
std::find_if(transformations.rbegin(), transformations.rend(), [](Const::TransformAttrInterface tr) { - return mlir::isa(tr); - }); - if (sparsifyTransformationIt == transformations.rend()) { + if (!Const::hasSparsifyTransformation(constOp)) { return; } auto userOp = *constOp.getOutput().getUsers().begin(); auto userGroupOp = mlir::dyn_cast(userOp); VPUX_THROW_UNLESS(userGroupOp != nullptr, "Expected weights user to be a VPUIP.GroupSparseBuffer op, got {0}", - userOp); + userOp->getName()); auto sparsityCompressionAttr = userGroupOp.getSparsityCompressionAttr(); const auto outputType = mlir::cast(constOp.getType()); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/resolve_dma_with_swizzling.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/resolve_dma_with_swizzling.cpp index 040fed8bcb..fd00944fb4 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/resolve_dma_with_swizzling.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/resolve_dma_with_swizzling.cpp @@ -5,7 +5,10 @@ #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_memory_space.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_memory_space.cpp index 80d890fe96..d94b8b1e84 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_memory_space.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_memory_space.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include 
"vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/core/interfaces/ops_interfaces.hpp" #include "vpux/compiler/core/aliases_info.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_zero_offset_weights_table.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_zero_offset_weights_table.cpp index 9d393ce8da..9b0ff1b021 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_zero_offset_weights_table.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/set_zero_offset_weights_table.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/utils/weights_table_reuse_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/static_allocation.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/static_allocation.cpp index 290d8e77af..b09a396f2c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/static_allocation.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/static_allocation.cpp @@ -3,28 +3,21 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/utils/resources.hpp" - -#include "vpux/compiler/core/control_edge_generator.hpp" -#include "vpux/compiler/core/feasible_scheduler_utils.hpp" - #include "vpux/compiler/core/allocation_info.hpp" #include "vpux/compiler/core/async_deps_info.hpp" -#include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/core/control_edge_generator.hpp" #include "vpux/compiler/core/feasible_memory_scheduler_control_edges.hpp" +#include "vpux/compiler/core/feasible_scheduler_utils.hpp" #include "vpux/compiler/core/linear_scan_handler.hpp" #include 
"vpux/compiler/core/mem_live_range_info.hpp" #include "vpux/compiler/core/reserved_memory_info.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/linear_scan.hpp" - #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/error.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/sw_kernel_prefetching_reserve_mem.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/sw_kernel_prefetching_reserve_mem.cpp deleted file mode 100644 index 01974e3bb7..0000000000 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/sw_kernel_prefetching_reserve_mem.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" - -namespace vpux::VPUIP { -#define GEN_PASS_DECL_SWKERNELPREFETCHINGRESERVEMEM -#define GEN_PASS_DEF_SWKERNELPREFETCHINGRESERVEMEM -#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" -} // namespace vpux::VPUIP - -using namespace vpux; - -namespace { - -// -// SWKernelPrefetchingReserveMemPass -// - -class SWKernelPrefetchingReserveMemPass final : - public VPUIP::impl::SWKernelPrefetchingReserveMemBase { -public: - explicit SWKernelPrefetchingReserveMemPass(Logger log) { - Base::initLogger(log, Base::getArgumentName()); - } - -private: - void safeRunOnModule() final; -}; - -bool checkSWKernelOp(mlir::ModuleOp& func) { - bool hasSWKernelOp = false; - func->walk([&](VPUIP::SwKernelOp) { - hasSWKernelOp = true; - return; - }); - - return hasSWKernelOp; -} - -void SWKernelPrefetchingReserveMemPass::safeRunOnModule() { - auto module = getOperation(); - auto* ctx = module->getContext(); - - auto hasSWKernelOp = checkSWKernelOp(module); - if (!hasSWKernelOp) { - return; - } - - auto maxPrefetchDataSize = VPUIP::getMaximalSWKernelPrefetchDataSize(module); - auto memSpaceAttr = mlir::SymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); - auto available = IE::getAvailableMemory(module, memSpaceAttr); - const auto maxSize = available.size(); - auto reservedMemoryResources = IE::getReservedMemoryResources(module, memSpaceAttr); - if (reservedMemoryResources.empty()) { - // Insert a dummy reserved memory when there's no reserved memory - _log.trace("Reserve dummy memory for SW Kernel prefetching - size: '{0}'", maxPrefetchDataSize); - IE::setSWKernelPrefetchingReservedMemory(module, memSpaceAttr, maxPrefetchDataSize); - } 
else { - // Calculate reserved memory total size - int64_t reservedMemTotalSize = 0; - for (auto& resMem : reservedMemoryResources) { - reservedMemTotalSize += resMem.getByteSize(); - } - - // Enlarge the original reserved memory range when total reserved memory is not safe for SW Kernel data - // prefetching - if (reservedMemTotalSize < maxPrefetchDataSize) { - _log.trace("Enlarge the original reserved memory range for SW Kernel prefetching - size: '{0}'", - maxPrefetchDataSize - reservedMemTotalSize); - - auto lastResMem = reservedMemoryResources.back(); - auto lastResMemSize = lastResMem.getByteSize(); - auto newResMemSize = lastResMemSize + maxPrefetchDataSize - reservedMemTotalSize; - lastResMem.setByteSizeAttr(getIntAttr(module->getContext(), newResMemSize)); - } - } - - // Put all reserved memory at the end of CMX - auto newReservedMemoryResources = IE::getReservedMemoryResources(module, memSpaceAttr); - size_t resMemOffset = maxSize.count(); - for (auto& resMem : newReservedMemoryResources) { - auto currResMemSize = resMem.getByteSize(); - resMemOffset -= currResMemSize; - auto currResMemOffset = resMemOffset; - resMem.setOffsetAttr(getIntAttr(module->getContext(), currResMemOffset)); - } -} - -} // namespace - -// -// createSWKernelPrefetchingReserveMemPass -// - -std::unique_ptr vpux::VPUIP::createSWKernelPrefetchingReserveMemPass(Logger log) { - return std::make_unique(log); -} diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/swizzling.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/swizzling.cpp index 08c5804dee..35dbaad399 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/swizzling.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/swizzling.cpp @@ -10,10 +10,10 @@ #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include 
"vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/constant_fusion.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/hw_settings.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/quantization.hpp" @@ -21,9 +21,6 @@ #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/compiler/utils/types.hpp" -#include "vpux/utils/core/numeric.hpp" -#include "vpux/utils/core/range.hpp" - #include #include @@ -61,7 +58,7 @@ class Swizzling final : public VPUIP::impl::SwizzlingBase { bool _enableSwizzlingOfFusedConsts = false; struct DeviceInfo { - VPU::ArchKind archKind; + config::ArchKind archKind; int64_t cmxSize; int64_t reservedCMXSize; }; @@ -85,7 +82,7 @@ class Swizzling final : public VPUIP::impl::SwizzlingBase { DeviceInfo& deviceInfo); template void addSwizzlingAttributesToBuffer(mlir::OpBuilder& builder, InAllocOp inAllocOp, mlir::Type newType, - VPU::ArchKind archKind); + config::ArchKind archKind); void updateConstantTypeForSwizzling(Const::DeclareOp decOp, mlir::Operation* cstLoadOp, int64_t swizzlingKey, DeviceInfo& deviceInfo); ValuesSet getSwizzledOperandsFromFlagsMap(VPUIP::NCEClusterTaskOp nceOp, OpsInfo& opsInfo); @@ -97,7 +94,7 @@ class Swizzling final : public VPUIP::impl::SwizzlingBase { OpsInfo& opsInfo, AliasesInfo& aliasesInfo); }; -void adjustReturnTypesForInputChain(mlir::Value value, int64_t swizzlingKey, VPU::ArchKind archKind) { +void adjustReturnTypesForInputChain(mlir::Value value, int64_t swizzlingKey, config::ArchKind archKind) { auto adjustReturnType = [&](mlir::Value value) { auto adjustedType = setSwizzlingKey(value.getType(), swizzlingKey, archKind); value.setType(adjustedType); @@ -124,7 +121,7 @@ VPUIP::DistributedBufferType getDistributedBufferTypeWithSwizzling(VPUIP::Distri origDistType.getSparsityCompression()); } -bool isSizeAlignmentRequired(Const::DeclareOp decOp, VPU::ArchKind archKind, +bool isSizeAlignmentRequired(Const::DeclareOp decOp, 
config::ArchKind archKind, VPUIP::DistributedBufferType distributedType = nullptr) { auto isAlignmentRequired = [&](NDTypeInterface type) { auto swizzlingSizeAlignment = vpux::getSizeAlignmentForSwizzling(archKind); @@ -416,7 +413,7 @@ bool Swizzling::canSwizzleWeights(VPUIP::NCEClusterTaskOp nceOp, DeviceInfo& dev template void Swizzling::addSwizzlingAttributesToBuffer(mlir::OpBuilder& builder, InAllocOp inAllocOp, mlir::Type newType, - VPU::ArchKind archKind) { + config::ArchKind archKind) { auto swizzlingSchemeAttr = getSwizzlingSchemeAttr(newType); auto addressAlignment = vpux::getAddressAlignmentForSwizzling(swizzlingSchemeAttr.getKey().getInt(), archKind); auto addressAlignmentAttr = getIntAttr(&getContext(), addressAlignment); @@ -769,7 +766,7 @@ void Swizzling::safeRunOnFunc() { auto module = func->getParentOfType(); DeviceInfo deviceInfo; - deviceInfo.archKind = VPU::getArch(module); + deviceInfo.archKind = config::getArch(module); deviceInfo.cmxSize = IE::getAvailableMemory(module, VPU::MemoryKind::CMX_NN).size().count(); deviceInfo.reservedCMXSize = deviceInfo.cmxSize - VPU::getTotalCMXSize(module).count(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/tile_act_shave_kernel_task.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/tile_act_shave_kernel_task.cpp index 813cae69d5..579070730b 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/tile_act_shave_kernel_task.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/tile_act_shave_kernel_task.cpp @@ -3,23 +3,20 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/core/bounded_buffer.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/tiling_info.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" -#include 
"vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" -#include "vpux/compiler/dialect/VPU/utils/generate_tiling.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/compiler/utils/allocate_buffers.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" +#include "vpux/utils/logger/logger.hpp" #include #include @@ -42,6 +39,17 @@ element num less than the threshold. Need to replace it with cost model when the */ constexpr size_t TILING_THRESHOLD_FOR_CONVERT = 8192; +vpux::VPUIP::DistributedBufferType getDistributedBufferTypeFromType(mlir::Type type) { + auto distributedTypeInterface = mlir::dyn_cast(type); + if (distributedTypeInterface == nullptr) { + return nullptr; + } + auto distributedType = + mlir::dyn_cast(distributedTypeInterface.getDistributedTypes().front()); + + return distributedType; +} + Dim convertKernelAxisToDim(mlir::Value tensorArg, int64_t kernelAxis) { const auto inOrder = DimsOrder::fromValue(tensorArg); @@ -158,8 +166,8 @@ bool hasOnlyOneOffset(VPUIP::SwKernelOp swKernelOp, Dim tileDim) { if (!VPUIP::hasDistributedOperand(swKernelOp)) { return true; } - auto distributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); - VPUX_THROW_UNLESS(distributedType != nullptr, "Unsupported type {0}", distributedType); + auto distributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto order = distributedType.getDimsOrder(); auto dimIdx = VPUIP::getTilingDimIndex(distributedType); if (dimIdx.has_value() && 
order.dimPos(Dim(dimIdx.value())) > order.dimPos(tileDim)) { @@ -212,6 +220,12 @@ Dim getSwKernelTileDim(VPUIP::SwKernelOp swKernelOp) { return Dims4D::Act::N; } else if (kernelEntryName == "gru_sequence_last_part") { return Dims4D::Act::N; + } else if (kernelEntryName == "grid_sample") { + const auto numShaves = IE::getTotalNumOfEngines(swKernelOp, VPU::ExecutorKind::SHAVE_ACT); + const auto inShape = mlir::cast(swKernelOp->getOperand(0).getType()).getShape(); + if (inShape[Dim(Dims4D::Act::N)] >= numShaves) { + return Dim(Dims4D::Act::N); + } } else if (kernelEntryName == "lstm_gates") { return Dims4D::Act::H; } else if (kernelEntryName == "lstm_cell") { @@ -420,9 +434,9 @@ bool doesSwKernelSupportTiling(VPUIP::SwKernelOp swKernelOp, vpux::Logger log) { return isDynamicTilingSupported(kernelEntryName); } - const auto arch = VPU::getArch(swKernelOp); + const auto arch = config::getArch(swKernelOp); // this is a workaround to force tiling of an operation with multiple outputs - if ((kernelEntryName == "detection_output_sort") && (arch == VPU::ArchKind::NPU37XX)) { + if ((kernelEntryName == "detection_output_sort") && (arch == config::ArchKind::NPU37XX)) { auto module = swKernelOp.getOperation()->getParentOfType(); auto tileOp = vpux::IE::getTileExecutor(module); VPUX_THROW_UNLESS(tileOp != nullptr, "Expected tileOp executor in order to query SHAVE_ACT executor."); @@ -692,7 +706,7 @@ mlir::FailureOr getSwKernelOutputTiling(VPUIP::SwKernelOp swKernel strideOnTilingDim *= memShape[MemDim(i)]; } } - const auto arch = VPU::getArch(swKernelOp); + const auto arch = config::getArch(swKernelOp); const auto addrAlign = VPUIP::getSwKernelTilingAddressAlignment(swKernelOp, arch); const auto elemSize = mlir::cast(swKernelOp.getOutputs().front().getType()).getElemTypeSize(); @@ -708,13 +722,14 @@ mlir::FailureOr getSwKernelOutputTiling(VPUIP::SwKernelOp swKernel mlir::Value createSubViewOpWithDistributedOutput(mlir::PatternRewriter& rewriter, mlir::Location loc, 
vpux::NDTypeInterface outType, mlir::Value operand, ShapeRef offset) { - auto distributedType = mlir::cast(outType); + auto distributedType = getDistributedBufferTypeFromType(outType); auto distribution = distributedType.getDistribution(); auto mode = distribution.getMode().getValue(); auto ctx = rewriter.getContext(); auto outShape = to_small_vector(outType.getShape()); - if (outType.getShape() == mlir::cast(operand.getType()).getShape()) { + auto inputDistributedType = getDistributedBufferTypeFromType(operand.getType()); + if (outType.getShape() == mlir::cast(inputDistributedType).getShape()) { return operand; } @@ -735,7 +750,7 @@ bool checkSwKernelTilingAlignment(VPUIP::SwKernelOp swKernelOp, const vpux::NDTy // todo: enable unaligned shave on VPUX37XX too // ticket E#114487 - if (!isArchVPUX3XXX(VPU::getArch(swKernelOp))) { + if (!vpux::config::isArchVPUX3XXX(config::getArch(swKernelOp))) { return true; } @@ -1467,7 +1482,7 @@ bool ClusterSwKernelRewriter::requireBalancingShapeCast(VPUIP::SwKernelOp swKern const auto numClusters = distributedType.getDistribution().getNumClusters().getInt(); const auto elemSize = distributedType.getElemTypeSize(); auto mode = distributedType.getDistribution().getMode().getValue(); - const auto arch = VPU::getArch(swKernelOp); + const auto arch = config::getArch(swKernelOp); for (auto clusterId : irange(numClusters)) { // no need to check last shave for (auto shaveId : irange(_shaveCount - 1)) { @@ -1685,6 +1700,7 @@ mlir::FailureOr ClusterSwKernelRewriter::getSWKernelWithFuse mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(swKernelOp); auto newOutType = mlir::cast(swKernelOp->getResult(0).getType()); + const auto dstElemType = newOutType.getElementType(); auto isShapeCastCorrect = [&](VPUIP::DistributedBufferType shapeCastOutType, SmallVector& expectedShapes, SmallVector& expectedOffsets) -> bool { @@ -1759,13 +1775,14 @@ mlir::FailureOr ClusterSwKernelRewriter::getSWKernelWithFuse auto 
origDistribution = distributedType.getDistribution(); const auto mode = origDistribution.getMode().getValue(); if (VPU::bitEnumContainsAny(mode, VPU::DistributionMode::SEGMENTED)) { - if (!VPUIP::isDistributedCompatibleAfterShapeChangeForViewOps( - distributedType, fusedNewShape, distributedType.getDimsOrder(), VPU::getArch(swKernelOp))) { + if (!VPUIP::isDistributedCompatibleAfterShapeChangeForViewOps(distributedType, fusedNewShape, + distributedType.getDimsOrder(), + config::getArch(swKernelOp))) { return mlir::failure(); } } auto newDistribution = VPUIP::getDistributedAttrAfterShapeCast( - distributedType, fuseNewShapeArray, VPU::getArch(swKernelOp)); + distributedType, fuseNewShapeArray, config::getArch(swKernelOp)); auto outType = distributedType.changeShapeForExplicitDistribution(fusedNewShape, newDistribution); auto newShapeCastOutType = @@ -1789,6 +1806,7 @@ mlir::FailureOr ClusterSwKernelRewriter::getSWKernelWithFuse // 512, 512] // input0 needs broadcast. So the new output type should be the same as new input1 newOutType = mlir::cast(inShapeCastOp.getType()); + newOutType = newOutType.changeElemType(dstElemType); } newInputs.push_back(inShapeCastOp); } @@ -1828,12 +1846,12 @@ bool ClusterSwKernelRewriter::checkTilePattern(VPUIP::SwKernelOp swKernelOp, boo return false; } - auto distributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); + auto distributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); if (distributedType == nullptr) { return false; } - auto parentInputDistType = mlir::dyn_cast(swKernelOp->getOperand(0).getType()); + auto parentInputDistType = getDistributedBufferTypeFromType(swKernelOp->getOperand(0).getType()); if (parentInputDistType == nullptr) { return false; } @@ -1897,8 +1915,8 @@ bool ClusterSwKernelRewriter::needInsertSubviewOnly(VPUIP::SwKernelOp swKernelOp auto isOverlapped = [&](mlir::Value val) { auto valueType = val.getType(); - auto distributedType = mlir::dyn_cast(valueType); - 
VPUX_THROW_UNLESS(distributedType != nullptr, "Unsupported type {0}", valueType); + auto distributedType = getDistributedBufferTypeFromType(valueType); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto distribution = distributedType.getDistribution(); auto distributionMode = distribution.getMode().getValue(); @@ -1926,7 +1944,8 @@ std::optional ClusterSwKernelRewriter::calculateOutputTiles(VPUIP: if (!VPUIP::hasDistributedOperand(swKernelOp)) { return std::nullopt; } - auto distributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); + auto distributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto perClusterShapes = distributedType.getPerClusterComputeShapes(); const auto insertSubviewOnly = needInsertSubviewOnly(swKernelOp); @@ -2195,7 +2214,8 @@ std::optional> ClusterSwKernelRewriter::calculateInputT } size_t ClusterSwKernelRewriter::getShaveTileSize(VPUIP::SwKernelOp swKernelOp, const OutputTiling& outTiles) const { - auto distributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); + auto distributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto mode = distributedType.getDistribution().getMode().getValue(); if (mode == VPU::DistributionMode::DUPLICATED) { return outTiles.size(); @@ -2303,7 +2323,8 @@ SmallVector ClusterSwKernelRewriter::createNewOutBuffs(VPUIP::SwKer mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointAfterValue(outBuffs[outputId]); - auto allocType = mlir::cast(outBuffs[outputId].getType()); + auto allocType = getDistributedBufferTypeFromType(outBuffs[outputId].getType()); + VPUX_THROW_WHEN(allocType == nullptr, "Unsupported type {0}", allocType); auto mode = allocType.getDistribution().getMode().getValue(); 
VPUX_THROW_WHEN(mode == VPU::DistributionMode::OVERLAPPED, @@ -2499,7 +2520,8 @@ OutputTiling ClusterSwKernelRewriter::getOuterMostOutputTiling(VPUIP::SwKernelOp auto outTiles = calculateOutputTiles(swKernelOp).value(); VPUX_THROW_WHEN(!VPUIP::hasDistributedOperand(swKernelOp), "Unexpected I/O op type at '{0}'", swKernelOp->getLoc()); - auto distributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); + auto distributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto mode = distributedType.getDistribution().getMode().getValue(); if (mode == VPU::DistributionMode::DUPLICATED) { @@ -2618,7 +2640,8 @@ vpux::NDTypeInterface ClusterSwKernelRewriter::getNewTiledDistributedType( std::function getTileInfo) const { - auto distributedType = mlir::cast(outerOperand.getType()); + auto distributedType = getDistributedBufferTypeFromType(outerOperand.getType()); + VPUX_THROW_WHEN(distributedType == nullptr, "Unsupported type {0}", distributedType); auto distributionAttr = distributedType.getDistribution(); const auto mode = distributionAttr.getMode().getValue(); const auto insertSubview = needInsertSubviewOnly(swKernelOp); @@ -2753,8 +2776,8 @@ std::pair ClusterSwKernelRewriter::getStrideOn if (!insertSubview) { return inputOutputStrides; } - auto inputDistributedType = mlir::dyn_cast(swKernelOp->getOperand(0).getType()); - auto outputDistributedType = mlir::dyn_cast(swKernelOp.getResult(0).getType()); + auto inputDistributedType = getDistributedBufferTypeFromType(swKernelOp.getOperand(0).getType()); + auto outputDistributedType = getDistributedBufferTypeFromType(swKernelOp.getResult(0).getType()); auto ctx = swKernelOp->getContext(); inputOutputStrides.second = getStrideOnEachClusterImpl(outputDistributedType, ctx); // All currently strided operations except memPermute can just use outputStrides for both input/output diff --git 
a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/ungroup_sparse_buffers.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/ungroup_sparse_buffers.cpp index dd8885d656..88ed317295 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/ungroup_sparse_buffers.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/ungroup_sparse_buffers.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp index f57f873e67..e6dbf63840 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_distributed_ops.cpp @@ -7,12 +7,15 @@ #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" +#include "vpux/compiler/dialect/VPU/utils/nce_sparsity.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/compression_utils.hpp" #include "vpux/compiler/utils/memref_attr_utils.hpp" +#include "vpux/compiler/utils/platform_resources.hpp" #include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/strings.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" @@ -611,40 +614,75 @@ void VPUIP::ClusterPerElementDMABaseRewriter::matchAndRewrite(VPUIP::DMATypeOpIn vpurtTask->erase(); } -bool isStorageElementTableConstantOp(Const::DeclareOp constOp) { - auto elementType = mlir::cast(constOp.getType()).getElementType(); 
+std::optional getUniqueNCEInputTypeForPatchingSETable(Const::DeclareOp constOp) { + const auto elementType = mlir::cast(constOp.getType()).getElementType(); if (!elementType.isInteger(32) || constOp.getResult().use_empty()) { - return false; + return std::nullopt; } - for (auto constUser : constOp.getResult().getUsers()) { - auto copyOp = mlir::dyn_cast(constUser); - if (copyOp == nullptr) { - return false; - } + const auto extractNCEInputTypeFromCopyOp = + [](VPUIP::NNDMAOp copyOp) -> std::optional { + VPUIP::DistributedBufferType nceInputType = nullptr; - bool onlyCopyUser = true; for (auto copyUser : copyOp.getOutputBuff().getUsers()) { if (copyUser == copyOp) { continue; } - onlyCopyUser = false; auto nceTask = mlir::dyn_cast(copyUser); if (nceTask == nullptr) { - return false; + return std::nullopt; } if (nceTask.getInputStorageElementTable() != copyOp.getOutputBuff()) { - return false; + return std::nullopt; } + + auto currentNCEInputType = mlir::dyn_cast(nceTask.getInput().getType()); + if (currentNCEInputType == nullptr) { + return std::nullopt; + } + + if (nceInputType == nullptr) { + nceInputType = currentNCEInputType; + } else if (nceInputType != currentNCEInputType) { + VPUX_THROW("SE Table DMA for multi NCEs but these NCE input types are not unique, got {0} and {1}", + nceInputType, currentNCEInputType); + } + } + + return nceInputType ? 
std::optional(nceInputType) : std::nullopt; + }; + + VPUIP::DistributedBufferType uniqueNCEInputType = nullptr; + + for (const auto constUser : constOp.getResult().getUsers()) { + auto copyOp = mlir::dyn_cast(constUser); + if (copyOp == nullptr) { + return std::nullopt; + } + + const auto nceInputType = extractNCEInputTypeFromCopyOp(copyOp); + if (!nceInputType.has_value()) { + return std::nullopt; } - if (onlyCopyUser) { - return false; + + if (uniqueNCEInputType == nullptr) { + uniqueNCEInputType = nceInputType.value(); + } else if (uniqueNCEInputType != nceInputType.value()) { + const auto uniqueNCEInputDistAttr = uniqueNCEInputType.getDistribution(); + const auto currNCEInputDistAttr = nceInputType.value().getDistribution(); + // When the same SETable is shared among multiple NCE operations with different input quantization types + // compare memory shapes and offsets instead of complete types to ensure consistent distributed shapes + VPUX_THROW_UNLESS( + uniqueNCEInputDistAttr.getMemoryShapes() == currNCEInputDistAttr.getMemoryShapes() && + uniqueNCEInputDistAttr.getMemoryOffsets() == currNCEInputDistAttr.getMemoryOffsets(), + "SE Table Const for multi NCEs but these NCE per cluster Shape are not unique, got {0} and {1}", + uniqueNCEInputType, nceInputType.value()); } } - return true; + return uniqueNCEInputType ? 
std::optional(uniqueNCEInputType) : std::nullopt; } // SE pointers have the following format: @@ -663,25 +701,84 @@ bool isStorageElementTableConstantOp(Const::DeclareOp constOp) { // BASE_PTR at Cluster 0: 0 0 0 0 0 0 0 0 0 0 0 0 // BASE_PTR at Cluster 1: 1 1 1 1 1 1 1 1 1 1 1 1 // The third data "2" exists in two clusters on 40XX+ -mlir::Value patchSETableValue(mlir::Location loc, Const::DeclareOp constOp, const int64_t clusterId, - mlir::OpBuilder& builder) { - auto seTableContent = constOp.getContent(); - auto seTableSize = seTableContent.getType().getShape().totalSize(); +mlir::Value VPUIP::patchSETableValue(mlir::Location loc, Const::DeclareOp constOp, + VPUIP::DistributedBufferType nceInputDistType, const int64_t targetClusterId, + mlir::OpBuilder& builder) { + const auto seTableContent = constOp.getContent(); + const auto seTableShape = seTableContent.getType().getShape(); + const auto seTableSize = seTableShape.totalSize(); auto seTableVals = to_small_vector(seTableContent.getValues()); VPUX_THROW_UNLESS(seTableVals.size() == checked_cast(seTableSize), "Unable to correctly obtain the seTable values"); - auto baseSEPointer = *std::min_element(seTableVals.begin(), seTableVals.end(), [&](const auto lhs, const auto rhs) { - if ((lhs & 0x1FF) != clusterId) { - return (rhs & 0x1FF) == clusterId || lhs < rhs; + const auto tileIndex = VPUIP::getTilingDimIndex(nceInputDistType); + VPUX_THROW_UNLESS(tileIndex.has_value(), "Failed to get tiling dim index for input distributed type: {0}", + nceInputDistType); + const auto tileDim = Dim(tileIndex.value()); + VPUX_THROW_UNLESS(tileDim == Dims4D::Act::H || tileDim == Dims4D::Act::W, + "Invalid Tile dim, got {0}, expect tiling on H or W for SEP NCEClusterTask", tileDim); + + const bool isTilingOnH = (tileDim == Dims4D::Act::H); + const int64_t seTableC = seTableShape[Dims4D::Act::C]; + const int64_t seTableH = seTableShape[Dims4D::Act::H]; + const int64_t seTableW = seTableShape[Dims4D::Act::W]; + const int64_t lineCount 
= isTilingOnH ? seTableH : seTableW; + const auto tileStride = static_cast(nceInputDistType.getStrides()[tileDim]); + + const auto extractClusterId = [](int32_t seVal) -> int64_t { + return seVal & 0x1FF; + }; + + // Step 1: Find the smallest data ptr as baseSEPointer for each non-target cluster + llvm::SmallDenseMap baseSEPointers; + for (const auto seVal : seTableVals) { + const int64_t nonTargetClusterId = extractClusterId(seVal); + if (nonTargetClusterId != targetClusterId) { + auto [it, inserted] = baseSEPointers.try_emplace(nonTargetClusterId, seVal); + if (!inserted && seVal < it->second) { + it->second = seVal; + } } - return false; - }); + } - for (int64_t index = 0; index < seTableSize; ++index) { - const int32_t basePtr = seTableVals[index] & 0x1FF; - if (clusterId != basePtr) { - seTableVals[index] = seTableVals[index] - baseSEPointer + clusterId; + // Step 2: Figure out the new start offset newSEPointerOffset for each non-target cluster + llvm::SmallDenseMap> clusterUniqueSeVals; + for (int64_t lineIdx = 0; lineIdx < lineCount; ++lineIdx) { + const int64_t firstElementIdx = isTilingOnH ? 
(lineIdx * seTableW * seTableC) : (lineIdx * seTableC); + if (firstElementIdx < seTableSize) { + const int32_t firstSeVal = seTableVals[firstElementIdx]; + const int64_t nonTargetClusterId = extractClusterId(firstSeVal); + if (nonTargetClusterId != targetClusterId) { + clusterUniqueSeVals[nonTargetClusterId].insert(firstSeVal); + } + } + } + + SmallVector sortedClusterIds; + for (const auto& [clusterId, _] : clusterUniqueSeVals) { + sortedClusterIds.push_back(clusterId); + } + llvm::sort(sortedClusterIds); + + llvm::SmallDenseMap newSEPointerOffsets; + int64_t cumulativeUniqueLines = 0; + for (const auto clusterId : sortedClusterIds) { + const int64_t offsetInBytes = (cumulativeUniqueLines * tileStride.count() >> 4) + << VPU::NCESparsity::BASE_PTR_SIZE; + newSEPointerOffsets[clusterId] = offsetInBytes; + cumulativeUniqueLines += clusterUniqueSeVals[clusterId].size(); + } + + // Step 3: Apply patch: + // patchedSEPointer = currSEPointer - baseSEPointer + newSEPointerOffset + targetClusterId + for (int64_t idx = 0; idx < seTableSize; ++idx) { + const int64_t currClusterId = extractClusterId(seTableVals[idx]); + if (currClusterId != targetClusterId) { + const auto baseIt = baseSEPointers.find(currClusterId); + const auto offsetIt = newSEPointerOffsets.find(currClusterId); + if (baseIt != baseSEPointers.end() && offsetIt != newSEPointerOffsets.end()) { + seTableVals[idx] = seTableVals[idx] - baseIt->second + offsetIt->second + targetClusterId; + } } } @@ -843,11 +940,12 @@ void VPUIP::ClusterPerElementDMABaseRewriter::unrollSegmentedOrOverlapped(mlir:: // The reason for that is because there is no overlap of values. Each depth is generated // separately for each cluster. 
const auto isSOK = distributionAttr != nullptr && VPU::isSegmentedOverC(distributionAttr); - bool requirePatching = isDataOverlapped && !isSOK && isStorageElementTableConstantOp(cst); - if (requirePatching) { - auto newCstOp = subviewOp.getDefiningOp(); - VPUX_THROW_WHEN(newCstOp == nullptr, "Cannot get the constant operation of SETable"); - return patchSETableValue(loc, newCstOp, clusterId, builder); + if (isDataOverlapped && !isSOK) { + if (auto nceInputDistType = getUniqueNCEInputTypeForPatchingSETable(cst)) { + auto newCstOp = subviewOp.getDefiningOp(); + VPUX_THROW_WHEN(newCstOp == nullptr, "Cannot get the constant operation of SETable"); + return VPUIP::patchSETableValue(loc, newCstOp, nceInputDistType.value(), clusterId, builder); + } } return subviewOp; @@ -996,7 +1094,9 @@ void VPUIP::ClusterPerElementDMABaseRewriter::unrollSegmentedOrOverlapped(mlir:: // This is the requirement for DMA load balancing pass. Technically it can works with any number of ports, but // now only 2 is supported - VPUX_THROW_WHEN(_dmaPortCount > 2, "Too much DMA ports"); + auto maxDMAPorts = VPUX40XX_MAX_DMA_PORTS; + + VPUX_THROW_WHEN(_dmaPortCount > maxDMAPorts, "Too many DMA ports"); // Split one of DMAs to load balance on DMA ports if needed const bool isDmaSplitRequired = numClusters % _dmaPortCount != 0; diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_expand_dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_expand_dma.cpp index f17dcef379..78b3c6a900 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_expand_dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_expand_dma.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUIP/interfaces/dma_descriptor_generator.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include 
"vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" @@ -251,7 +252,7 @@ void ExpandDMARewriter::createTilesForLargeSize(VPUIP::ExpandDMAOp origOp, const auto singlePlaneSize = fullCopySize / numPlanesOfFullShape; // Deeply rooted in NPU2.7 representation - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(origOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(origOp)); const auto dmaMaxLength = dmaEngineLimits.getMaxLength(); const auto numPlanesPerTile = (dmaMaxLength / singlePlaneSize.count()); VPUX_THROW_UNLESS(numPlanesPerTile != 0, @@ -345,7 +346,7 @@ mlir::LogicalResult ExpandDMARewriter::matchAndRewrite(VPUIP::ExpandDMAOp expand _log.trace("ExpandDMA's result is not DistributedBufferType"); const auto dmaSize = static_cast(getCompactSize(expandDmaOp.getInput())); - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(expandDmaOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(expandDmaOp)); const auto dmaMaxLength = dmaEngineLimits.getMaxLength(); if (dmaSize > Byte(dmaMaxLength)) { _log.trace("ExpandDMA with input size '{0}' large than limitation '{1}' and need to tile", dmaSize, diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_gather_dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_gather_dma.cpp new file mode 100644 index 0000000000..b05efd84b8 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_gather_dma.cpp @@ -0,0 +1,185 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/core/attributes/shape.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" + +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/utils/dma_limits.hpp" +#include "vpux/compiler/utils/rewriter.hpp" + +#include + +namespace vpux::VPUIP { +#define GEN_PASS_DECL_UNROLLGATHERDMA +#define GEN_PASS_DEF_UNROLLGATHERDMA +#include "vpux/compiler/dialect/VPUIP/passes.hpp.inc" +} // namespace vpux::VPUIP + +using namespace vpux; + +namespace { + +// +// GatherDMARewriter +// + +class GatherDMARewriter final : public mlir::OpRewritePattern { +public: + GatherDMARewriter(mlir::MLIRContext* ctx, int64_t dmaPortCount, Logger log) + : mlir::OpRewritePattern(ctx), _log(log), _ctx(ctx), _dmaPortCount(dmaPortCount) { + setDebugName("GatherDMARewriter"); + + _cmxNameAttr = mlir::FlatSymbolRefAttr::get(ctx, stringifyEnum(VPU::MemoryKind::CMX_NN)); + } + + mlir::LogicalResult matchAndRewrite(VPUIP::GatherDMAOp gatherDmaOp, mlir::PatternRewriter& rewriter) const final; + +private: + Logger _log; + mlir::MLIRContext* _ctx; + int64_t _dmaPortCount; + mlir::FlatSymbolRefAttr _cmxNameAttr; +}; + +mlir::LogicalResult GatherDMARewriter::matchAndRewrite(VPUIP::GatherDMAOp gatherDmaOp, + mlir::PatternRewriter& rewriter) const { + _log.trace("Process GatherDMA op: {0}", gatherDmaOp); + + const auto loc = gatherDmaOp->getLoc(); + const auto input = gatherDmaOp.getInput(); + const auto indices = gatherDmaOp.getIndices(); + const auto output = gatherDmaOp.getOutputBuff(); + + auto declBuff = indices.getDefiningOp(); + auto declBuffType = 
mlir::cast(declBuff.getType()); + const auto memSpaceId = declBuffType.getMemSpace().getIndex(); + if (memSpaceId.has_value()) { + _log.nest().trace("This GatherDMAOp has already been unrolled."); + return mlir::failure(); + } + + const auto distributedIndicesType = mlir::dyn_cast(indices.getType()); + const auto distributedOutputType = mlir::dyn_cast(output.getType()); + + VPUX_THROW_WHEN(distributedIndicesType == nullptr || distributedOutputType == nullptr, + "Indices and output must have DistributedBuffer type"); + + const auto getDistModeAttr = [&](VPUIP::DistributedBufferType distType) { + const auto distAttr = distType.getDistribution(); + VPUX_THROW_WHEN(distAttr == nullptr, "Failed to extract distribution tensor from distributed type"); + return distAttr.getMode(); + }; + + const auto indicesDistModeAttr = getDistModeAttr(distributedIndicesType); + VPUX_THROW_UNLESS( + indicesDistModeAttr != nullptr && indicesDistModeAttr.getValue() == VPU::DistributionMode::DUPLICATED, + "Unsupported input distributed mode: {0}", indicesDistModeAttr); + const auto outputDistModeAttr = getDistModeAttr(distributedOutputType); + VPUX_THROW_UNLESS( + outputDistModeAttr != nullptr && outputDistModeAttr.getValue() == VPU::DistributionMode::SEGMENTED, + "Unsupported output distributed mode: {0}", outputDistModeAttr); + + auto vpurtTask = gatherDmaOp->getParentOfType(); + VPUX_THROW_WHEN(vpurtTask == nullptr, "Can not get VPURT.TaskOp for {0}", gatherDmaOp); + + mlir::SmallVector inputBuffers; + mlir::SmallVector indicesBuffers; + mlir::SmallVector outputBuffers; + if (distributedIndicesType != nullptr && distributedOutputType != nullptr) { + _log.nest().trace("Got single-cluster to multi-cluster case"); + auto tileIndex = VPUIP::getTilingDimIndex(distributedOutputType); + VPUX_THROW_UNLESS(tileIndex.has_value(), "No tiling dimension found"); + auto origInputShape = mlir::dyn_cast(input.getType()).getShape().raw(); + + const auto inputShapes = SmallVector( + 
llvm::map_range(distributedOutputType.getPerClusterMemoryShapes(), [&](ShapeRef outShape) { + auto inShape = Shape(origInputShape); + inShape[Dim(tileIndex.value())] = outShape.raw()[tileIndex.value()]; + return inShape; + })); + + const auto inputShapeOffsets = distributedOutputType.getPerClusterMemoryShapeOffsets(); + + const auto numClusters = checked_cast(inputShapes.size()); + inputBuffers = VPUIP::getSplitBuffers(_ctx, loc, "input", input, inputShapes, inputShapeOffsets, numClusters, + rewriter); + + indicesBuffers = VPUIP::getPerClusterMemoryBuffers(_ctx, loc, "indices", indices, numClusters, rewriter); + outputBuffers = VPUIP::getPerClusterMemoryBuffers(_ctx, loc, "output", output, numClusters, rewriter); + } + + VPUX_THROW_WHEN(inputBuffers.size() != outputBuffers.size(), "Size of input/output buffers list must match"); + const auto numClusters = inputBuffers.size(); + + rewriter.setInsertionPointAfter(vpurtTask); + + int64_t dmaPort = 0; + for (size_t clusterId = 0; clusterId < numClusters; ++clusterId) { + const auto newLoc = appendLoc(gatherDmaOp->getLoc(), "_cluster_{0}", clusterId); + auto newGatherDMAOp = VPURT::wrapIntoTaskOp( + rewriter, vpurtTask.getWaitBarriers(), vpurtTask.getUpdateBarriers(), newLoc, inputBuffers[clusterId], + indicesBuffers[clusterId], outputBuffers[clusterId], gatherDmaOp.getElementSize(), + gatherDmaOp.getPadding(), gatherDmaOp.getPort().value()); + newGatherDMAOp.setChannelType(gatherDmaOp.getChannelType()); + dmaPort = (dmaPort + 1) % _dmaPortCount; + + _log.nest().trace("Insert new newGatherDMAOp: '{0}'", newGatherDMAOp); + } + rewriter.eraseOp(vpurtTask); + + return mlir::success(); +} + +// +// UnrollGatherDMAPass +// + +class UnrollGatherDMAPass final : public VPUIP::impl::UnrollGatherDMABase { +public: + explicit UnrollGatherDMAPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void UnrollGatherDMAPass::safeRunOnFunc() { + markAnalysesPreserved(); 
+ auto analysis = getAnalysis(); + if (!analysis.passNeeded(VPUIP::UnrollDMAAnalysisNeeded::UnrollGatherDMAPass)) { + return; + } + auto& ctx = getContext(); + auto func = getOperation(); + auto module = func->getParentOfType(); + auto dmaOp = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN); + auto dmaPortCount = dmaOp.getCount(); + + mlir::RewritePatternSet patterns(&ctx); + patterns.insert(&ctx, dmaPortCount, _log); + + if (mlir::failed( + mlir::applyPatternsAndFoldGreedily(func, std::move(patterns), vpux::getDefaultGreedyRewriteConfig()))) { + signalPassFailure(); + } +} + +} // namespace + +// +// createUnrollGatherDMAPass +// + +std::unique_ptr vpux::VPUIP::createUnrollGatherDMAPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_per_axis_tile_dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_per_axis_tile_dma.cpp index 7787d08962..34021ea2b6 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_per_axis_tile_dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_per_axis_tile_dma.cpp @@ -15,6 +15,7 @@ #include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" @@ -210,7 +211,7 @@ mlir::LogicalResult PerAxisTileDMARewriter::unrollPerAxisTile(VPUIP::PerAxisTile portIsAlreadyAssigned = false; } - const auto arch = VPU::getArch(perAxisTileDMAOp); + const auto arch = config::getArch(perAxisTileDMAOp); auto subInputShapes = VPUIP::getPerAxisTileDMASubShapes(arch, mergedShapes.first); auto subOutputShapes = VPUIP::getPerAxisTileDMASubShapes(arch, mergedShapes.second); VPUX_THROW_UNLESS(subInputShapes.size() == subOutputShapes.size(), 
diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_upsample_dma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_upsample_dma.cpp index 94bcfa3713..79deef74e7 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_upsample_dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/unroll_upsample_dma.cpp @@ -4,13 +4,14 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" - #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dma_limits.hpp" @@ -139,7 +140,7 @@ mlir::LogicalResult UpsamplingDMARewriter::matchAndRewrite(VPUIP::UpsamplingDMAO } } - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(upsamplingDMAOp)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(upsamplingDMAOp)); const auto dmaMaxLength = dmaEngineLimits.getMaxLength(); const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes(); diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/update_sw_kernel_params.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/update_sw_kernel_params.cpp index 7e9711b08f..7a91ca9575 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/update_sw_kernel_params.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/update_sw_kernel_params.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" 
#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/utils/profiling/common.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/wrap_with_permute_as_nndma.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/wrap_with_permute_as_nndma.cpp index 75c43af004..3da4ce06d7 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/wrap_with_permute_as_nndma.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/wrap_with_permute_as_nndma.cpp @@ -3,8 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/attributes/stride_reqs.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" @@ -13,6 +16,7 @@ #include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/attributes.hpp" @@ -21,6 +25,8 @@ #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/error.hpp" +#include +#include #include #include #include @@ -34,6 +40,7 @@ namespace vpux::VPUIP { using namespace vpux; namespace { +bool checkPattern(mlir::Operation* op, ShapeRef expandInputShape, mlir::ArrayAttr expandPadBegin); template T getTheOnlyUser(mlir::Operation* op) { @@ -84,7 +91,7 @@ bool 
isSplitContinuousBufferType(VPUIP::DistributedBufferType distributedType) { } VPUIP::DistributedBufferType createDMADistributedTensorType(mlir::MLIRContext* ctx, vpux::NDTypeInterface operandType, - mlir::IntegerAttr tileCount, VPU::ArchKind arch, + mlir::IntegerAttr tileCount, config::ArchKind arch, bool uniformDistributedSegments) { const auto distMode = VPU::DistributionModeAttr::get(ctx, VPU::DistributionMode::SEGMENTED); const auto numTiles = getIntArrayAttr(ctx, SmallVector{1, 1, tileCount.getInt(), 1}); @@ -213,8 +220,8 @@ bool checkPermuteWithCopyPattern(VPUIP::SwKernelOp swKernelOp, Logger log) { VPUX_THROW_WHEN(dmaPortNum <= 0, "Invalid number of DMA ports; should be > 0, but actual value is {0}", dmaPortNum); - auto dmaSubShapes = VPUIP::getPermuteDMASubInputShapes(VPU::getArch(swKernelOp), permuteInType, permuteOutType, - memPerm, dmaPortNum, log); + auto dmaSubShapes = VPUIP::getPermuteDMASubInputShapes(config::getArch(swKernelOp), permuteInType, + permuteOutType, memPerm, dmaPortNum, log); // If fuse Permute with next Distributed Copy Op and PermuteDMA need unroll to severl Sub DMA tasks, // Find a scenerior has regression. Need investigate the root cause and find a cost model for that. // For example: Shape size with 1x4420x1x2, mode is DUPLICATED. @@ -375,72 +382,90 @@ bool checkExpandU8Pattern(VPUIP::ExpandOp expandOp, Logger log) { return isExpandOpWrapable(expandOp, log); } -bool checkExpandFP16Pattern(VPUIP::ExpandOp expandOp, Logger log) { - log.trace("Got ExpandOpFP16 at {0}. 
Try to find fuse pattern.", expandOp->getLoc()); +bool checkLastChild(mlir::Operation* op, ShapeRef expandInputShape, mlir::ArrayAttr expandPadBegin) { + const auto expandInput = to_small_vector(expandInputShape); + const auto expandBegin = parseIntArrayAttr(expandPadBegin); + + if (op == nullptr) { + return false; + } - const auto isCopyOpWithOneUser = [&](mlir::Operation* op) -> bool { - if (!VPUIP::hasOneOrSameUser(op)) { + if (auto subviewOp = mlir::dyn_cast(op)) { + if (subviewOp.getStaticStrides().has_value()) { return false; } + const auto staticOffsets = parseIntArrayAttr(subviewOp.getStaticOffsets()); + const auto staticSizes = parseIntArrayAttr(subviewOp.getStaticSizes()); + if (expandBegin[Dims4D::Act::C.ind()] != staticOffsets[Dims4D::Act::C.ind()] || + staticSizes[Dims4D::Act::C.ind()] != expandInput[Dims4D::Act::C.ind()]) { + return false; + } + } else { + // In case there is no SubView operation, it is possible for the NCE op(s) to produce the channels unpadded + // directly + if (expandBegin[Dims4D::Act::C.ind()] != 0) { + return false; + } + const auto operandType = mlir::cast(op->getOperand(0).getType()); + if (operandType.getShape()[Dims4D::Act::C] != expandInput[Dims4D::Act::C.ind()]) { + return false; + } + } + return true; +} + +// ExpandOp aligns channels to multiples of 16 due to HW constraints +// ExpandDMA copies only the actively used data; the expanded portion may contain uninitialized data +// - For operations like Pooling or GroupConvolution, computations are performed per channel +// Thus, any data, including uninitialized memory, can serve as the expanded data without affecting outcomes +// - However, for Convolution operations, the expanded channels are included in computations +// Therefore, filling these channels with abnormal data (e.g., null values) can adversely affect the results +// +// Illegal Pattern: "Expand -> NceExceptConv -> Convolution -> Subview" +// Dirty data impacts Convolution +// Legal Pattern: "Expand -> 
NceExceptConv x N -> Subview" +// Multiple non-conv NCE operations between Expand and Subview are supported +bool checkPattern(mlir::Operation* op, ShapeRef expandInputShape, mlir::ArrayAttr expandPadBegin) { + const auto isCopyOp = [&](mlir::Operation* op) -> bool { return mlir::isa_and_nonnull(op); }; - const auto isNceButNotConvOpWithOneUser = [&](mlir::Operation* op) -> bool { - if (!VPUIP::hasOneOrSameUser(op)) { - return false; - } + const auto isPermuteCastOp = [&](mlir::Operation* op) -> bool { + return mlir::isa_and_nonnull(op); + }; + const auto isNceButNotConvOp = [&](mlir::Operation* op) -> bool { auto nceTask = mlir::dyn_cast(op); return nceTask != nullptr && nceTask.getTaskType() != VPUIP::NCETaskType::CONV; }; - // ExpandOp aligns channels to multiples of 16 due to HW constraints - // ExpandDMA copies only the actively used data; the expanded portion may contain uninitialized data - // - For operations like Pooling or GroupConvolution, computations are performed per channel - // Thus, any data, including uninitialized memory, can serve as the expanded data without affecting outcomes - // - However, for Convolution operations, the expanded channels are included in computations - // Therefore, filling these channels with abnormal data (e.g., null values) can adversely affect the results - // - // Illegal Pattern: "Expand -> NceExceptConv -> Convolution -> Subview" - // Dirty data impacts Convolution - // Legal Pattern: "Expand -> NceExceptConv x N -> Subview" - // Multiple non-conv NCE operations between Expand and Subview are supported - - auto potentialCopyOrNceOperand = expandOp->getUses().begin(); - while (isCopyOpWithOneUser(potentialCopyOrNceOperand->getOwner()) || - isNceButNotConvOpWithOneUser(potentialCopyOrNceOperand->getOwner())) { - potentialCopyOrNceOperand = potentialCopyOrNceOperand->getOwner()->getUses().begin(); - } - - const auto expandInput = to_small_vector(getShape(expandOp.getInput())); - const auto expandBegin = 
parseIntArrayAttr(expandOp.getPadsBegin()); - - if (potentialCopyOrNceOperand->getOwner() != nullptr) { - auto subviewOp = mlir::dyn_cast(potentialCopyOrNceOperand->getOwner()); - if (subviewOp != nullptr) { - if (subviewOp.getStaticStrides().has_value()) { - return false; - } - const auto staticOffsets = parseIntArrayAttr(subviewOp.getStaticOffsets()); - const auto staticSizes = parseIntArrayAttr(subviewOp.getStaticSizes()); - if (expandBegin[Dims4D::Act::C.ind()] != staticOffsets[Dims4D::Act::C.ind()] || - staticSizes[Dims4D::Act::C.ind()] != expandInput[Dims4D::Act::C.ind()]) { - return false; - } + auto returnFlag = true; + for (auto& child : op->getUses()) { + auto childOp = child.getOwner(); + if (isCopyOp(childOp) || isNceButNotConvOp(childOp) || isPermuteCastOp(childOp)) { + returnFlag = returnFlag && checkPattern(childOp, expandInputShape, expandPadBegin); } else { - // In case there is no SubView operation, it is possible for the NCE op(s) to produce the channels unpadded - // directly - if (expandBegin[Dims4D::Act::C.ind()] != 0) { - return false; - } - const auto operandType = mlir::cast(potentialCopyOrNceOperand->get().getType()); - if (operandType.getShape()[Dims4D::Act::C] != expandInput[Dims4D::Act::C.ind()]) { - return false; - } + returnFlag = returnFlag && checkLastChild(childOp, expandInputShape, expandPadBegin); + } + + if (!returnFlag) { + return returnFlag; } } + return returnFlag; +} + +bool checkExpandFP16Pattern(VPUIP::ExpandOp expandOp, Logger log) { + log.trace("Got ExpandOpFP16 at {0}. 
Try to find fuse pattern.", expandOp->getLoc()); + auto shape = getShape(expandOp.getInput()); + const auto expandPadBegin = expandOp.getPadsBegin(); + + if (!checkPattern(expandOp, shape, expandPadBegin)) { + return false; + } + return isExpandOpWrapable(expandOp, log); } @@ -1382,7 +1407,7 @@ mlir::LogicalResult WrapDepthToSpaceAsDistributedNNDMA::matchAndRewrite(VPUIP::S _log.trace("Found DepthToSpace at '{0}' with DistributedCopy pattern", swKernelOp->getLoc()); auto ctx = swKernelOp.getContext(); - auto arch = VPU::getArch(swKernelOp.getOperation()); + auto arch = config::getArch(swKernelOp.getOperation()); // Extract D2S attributes auto d2sAttrs = VPUIP::getDepthToSpaceSwKernelAttr(swKernelOp); diff --git a/src/vpux_compiler/src/dialect/VPUIP/utils/convert_to_dma_utils.cpp b/src/vpux_compiler/src/dialect/VPUIP/utils/convert_to_dma_utils.cpp index b18d6d5595..6c8791ae30 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/utils/convert_to_dma_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/utils/convert_to_dma_utils.cpp @@ -2,20 +2,25 @@ // Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // -#include +#include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/layers.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dma_descriptor_generator.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/dma_limits.hpp" +#include "vpux/utils/core/numeric.hpp" + +#include using namespace vpux; namespace { @@ -63,7 +68,7 @@ bool isBeneficialForUsingSWDepthToSpace(vpux::NDTypeInterface inType, vpux::IE:: return isNHWC && is16bit && ((isBS4 && isC16C128) || (isBS2 && isC16Align)) && isDepthFirst; } -bool isBeneficialForUsingSWDepthToSpace(VPUIP::SwKernelOp swKernelOp, VPU::ArchKind /*arch*/) { +bool isBeneficialForUsingSWDepthToSpace(VPUIP::SwKernelOp swKernelOp, config::ArchKind /*arch*/) { VPUX_THROW_UNLESS(VPUIP::isDepthToSpaceSwKernel(swKernelOp), "SwKernelOp {0} is not DepthToSpace", swKernelOp->getLoc()); const auto inType = mlir::cast(swKernelOp.getInputs()[0].getType()); @@ -74,7 +79,7 @@ bool isBeneficialForUsingSWDepthToSpace(VPUIP::SwKernelOp swKernelOp, VPU::ArchK return isBeneficialForUsingSWDepthToSpace(inType, mode, blockSize); } -SmallVector computeDMASubShape(VPU::ArchKind arch, ShapeRef shape, Dim numPlaneDim, int64_t dmaPortCount) { +SmallVector computeDMASubShape(config::ArchKind arch, ShapeRef 
shape, Dim numPlaneDim, int64_t dmaPortCount) { VPUX_THROW_WHEN(dmaPortCount <= 0, "Invalid number of DMA ports: {0}", dmaPortCount); const auto shapeSize = shape.size(); @@ -122,9 +127,9 @@ SmallVector computeDMASubShape(VPU::ArchKind arch, ShapeRef shape, Dim nu } } // namespace -bool vpux::VPUIP::satisfiesOptimizedMemPermute(VPU::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType) { +bool vpux::VPUIP::satisfiesOptimizedMemPermute(config::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType) { // MemPermute kernel is specially optimized for the conditions below - if (arch == VPU::ArchKind::NPU37XX) { + if (arch == config::ArchKind::NPU37XX) { return false; } const auto inBits = inType.getElemTypeSize().count(); @@ -169,7 +174,7 @@ bool isDMASupportedMemPermuteDistribution(vpux::NDTypeInterface inputType, vpux: return supportedOutputMode; } -bool vpux::VPUIP::isBeneficialForUsingPermuteDMA(VPU::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType, +bool vpux::VPUIP::isBeneficialForUsingPermuteDMA(config::ArchKind arch, NDTypeInterface inType, NDTypeInterface outType, mlir::AffineMap memPerm, int64_t dmaPortCount, vpux::Logger log) { // Check if the memory permutation satisfies optimized conditions for Shave optimizations // If it does, using PermuteDMA is not beneficial @@ -286,6 +291,9 @@ std::optional vpux::VPUIP::getPermuteDMAInputShape(NDTypeInterface inType } else if (mergedMemPerm == DimsOrder::HNWC.toAffineMap(inType.getContext())) { // Check for permute pattern: [d0, d1, d2, d3] -> [d2, d0, d3, d1] return Shape{newInputShape[Dim(2)], newInputShape[Dim(0)], newInputShape[Dim(3)], newInputShape[Dim(1)]}; + } else if (mergedMemPerm == DimsOrder::HCWN.toAffineMap(inType.getContext())) { + // Check for permute pattern: [d0, d1, d2, d3] -> [d2, d1, d3, d0] + return Shape{newInputShape[Dim(2)], newInputShape[Dim(1)], newInputShape[Dim(3)], newInputShape[Dim(0)]}; } else { return std::nullopt; } @@ -305,7 +313,8 @@ std::optional 
vpux::VPUIP::getPermuteDMAOutputShape(NDTypeInterface inTyp return mergedOutputShape; } -std::optional> vpux::VPUIP::getPermuteDMASubInputShapes(VPU::ArchKind arch, NDTypeInterface inType, +std::optional> vpux::VPUIP::getPermuteDMASubInputShapes(config::ArchKind arch, + NDTypeInterface inType, NDTypeInterface outType, mlir::AffineMap perm, int64_t dmaPortCount, vpux::Logger log) { @@ -420,7 +429,8 @@ bool vpux::VPUIP::isSplitNeededForPermuteDMA(vpux::NDTypeInterface inType, mlir: return mergedPerm == DimsOrder::WHC.toAffineMap(ctx) || mergedPerm == DimsOrder::NHCW.toAffineMap(ctx) || mergedPerm == DimsOrder::HCNW.toAffineMap(ctx) || mergedPerm == DimsOrder::NWHC.toAffineMap(ctx) || - mergedPerm == DimsOrder::CWNH.toAffineMap(ctx) || mergedPerm == DimsOrder::HNWC.toAffineMap(ctx); + mergedPerm == DimsOrder::CWNH.toAffineMap(ctx) || mergedPerm == DimsOrder::HNWC.toAffineMap(ctx) || + mergedPerm == DimsOrder::HCWN.toAffineMap(ctx); } SmallVector vpux::VPUIP::getPermuteDMAOutputMergedDimList(vpux::NDTypeInterface outputType, @@ -565,7 +575,7 @@ bool vpux::VPUIP::isLegalConvertToDMA(mlir::Operation* op, vpux::Logger log, boo if (config::getCompilationMode(op) == config::CompilationMode::ReferenceSW) { return false; } - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(arch); const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes(); @@ -654,7 +664,7 @@ bool vpux::VPUIP::isLegalConvertToDMA(mlir::Operation* op, vpux::Logger log, boo auto module = swKernelOp->getParentOfType(); const auto dmaPortNum = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); - if (!VPUIP::getPermuteDMASubInputShapes(VPU::getArch(op), inputType, outputType, memPerm.value(), + if (!VPUIP::getPermuteDMASubInputShapes(config::getArch(op), inputType, outputType, memPerm.value(), dmaPortNum, log) .has_value()) { log.trace("SwKernelOp at {0} doesn't support DMA implementation.", op->getLoc()); 
@@ -743,7 +753,7 @@ bool vpux::VPUIP::isLegalAndBeneficialConvertToDMA(mlir::Operation* op, vpux::Lo if (!isLegalConvertToDMA(op, log)) { return false; } - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); auto module = op->getParentOfType(); const auto dmaPortNum = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); VPUX_THROW_WHEN(dmaPortNum <= 0, "Number of ports should be a positive integer, while it is {0}", dmaPortNum); @@ -759,8 +769,8 @@ bool vpux::VPUIP::isLegalAndBeneficialConvertToDMA(mlir::Operation* op, vpux::Lo const auto inputType = mlir::cast(op->getOperand(0).getType()); const auto outputType = mlir::cast(op->getResult(0).getType()); - return isBeneficialForUsingPermuteDMA(VPU::getArch(op), inputType, outputType, memPerm.value(), dmaPortNum, - log); + return isBeneficialForUsingPermuteDMA(config::getArch(op), inputType, outputType, memPerm.value(), + dmaPortNum, log); } return false; @@ -865,7 +875,7 @@ bool vpux::VPUIP::isCompatibleWithMultiClusterNNDMA(VPU::DepthToSpaceOp op, vpux } const auto inputShape = inputType.getShape(); - const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(VPU::getArch(op)); + const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(config::getArch(op)); const auto dmaMaxNumPlanes = dmaEngineLimits.getMaxNumPlanes(); if (inputShape[Dims4D::Act::H] > dmaMaxNumPlanes) { @@ -1147,7 +1157,7 @@ std::pair vpux::VPUIP::getPerAxisTileDMAMergedShape(vp getMergedShape(outMemShape, inOrder.dimPos(Dim(axis)))); } -SmallVector vpux::VPUIP::getPerAxisTileDMASubShapes(VPU::ArchKind arch, vpux::ShapeRef shape) { +SmallVector vpux::VPUIP::getPerAxisTileDMASubShapes(config::ArchKind arch, vpux::ShapeRef shape) { const auto shapeSize = shape.size(); VPUX_THROW_UNLESS(shapeSize == 3, "PerAxisTile merged Shape size should be 3, but got {0}", shapeSize); diff --git a/src/vpux_compiler/src/dialect/VPUIP/utils/dma_fusion_utils.cpp 
b/src/vpux_compiler/src/dialect/VPUIP/utils/dma_fusion_utils.cpp index d6b4e22534..e066d49e8f 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/utils/dma_fusion_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/utils/dma_fusion_utils.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/dialect/VPUIP/utils/dma_fusion_utils.hpp" - #include "vpux/compiler/core/attributes/stride_reqs.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/utils/memref_attr_utils.hpp" #include "vpux/compiler/utils/rewriter.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUIP/utils/sw_utils.cpp b/src/vpux_compiler/src/dialect/VPUIP/utils/sw_utils.cpp index 98644901a1..980752d914 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/utils/sw_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/utils/sw_utils.cpp @@ -4,20 +4,23 @@ // #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" -#include -#include -#include -#include -#include -#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" - #include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/tiling_info.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/utils/core/range.hpp" +#include +#include +#include +#include + +#include + namespace vpux { namespace VPUIP { @@ -162,7 +165,7 @@ mlir::SymbolRefAttr createBuiltInFunction(mlir::ModuleOp module, VPU::LayerOpInt kernelInfo.sourceFileName, kernelInfo.layerName, log); } -void createRuntimeKernelDefinition(mlir::ModuleOp module, const Logger& log, vpux::VPU::ArchKind arch) { +void createRuntimeKernelDefinition(mlir::ModuleOp module, const Logger& log, vpux::config::ArchKind arch) { auto vpuswModule = 
getVPUSWModule(module, log); static const SmallString runtimeKernelName{"runtime"}; @@ -201,7 +204,7 @@ void createRuntimeKernelDefinition(mlir::ModuleOp module, const Logger& log, vpu constexpr int nShavePerTile = 2; auto tilesUsed = VPUIP::getNumTilesUsed(module); auto maxShaves = tilesUsed * nShavePerTile; - if (arch == vpux::VPU::ArchKind::NPU40XX) { + if (arch == vpux::config::ArchKind::NPU40XX) { maxShaves = std::min(maxShaves, static_cast(12)); } SmallVector stacksArray(maxShaves, defaultStackSize); @@ -296,6 +299,20 @@ SmallVector reversePermutation(mlir::AffineMap map) { return revPerm; } +// special format of dims/order available only on kernel-FW side +int64_t computeReverseMemDim(mlir::Value tensorArg, int64_t dimIdx) { + const auto inOrder = DimsOrder::fromValue(tensorArg); + // Negative value means counting dimension from the end + if (dimIdx < 0) { + dimIdx += inOrder.numDims(); + } + MemDim md = inOrder.toMemDim(Dim(dimIdx)); + + const auto shape = getShape(tensorArg); + auto nDims = checked_cast(shape.size()); + return nDims - 1 - md.ind(); +} + void initSwKernel(VPUIP::SwKernelOp swKernelOp, VPUIP::SwKernelRun swKernelRunOp, const vpux::Logger& log) { auto& bodyRegion = swKernelOp.getBody(); auto& swKernelBlock = bodyRegion.emplaceBlock(); @@ -461,6 +478,99 @@ bool isStridedDataAccessSupported(VPUIP::SwKernelOp swKernelOp) { return false; } +namespace { + +uint64_t getFloatBits(vpux::type::float16 val) { + return static_cast(val.to_bits()); +} + +uint64_t getFloatBits(float val) { + uint32_t f32Bits = llvm::bit_cast(val); + return static_cast(f32Bits); +} + +template +void packAsFpIntoU64(const SmallVector& values, SmallVector& params) { + static constexpr uint32_t PACKED_VALUES_COUNT = sizeof(int64_t) / sizeof(OT); + static constexpr uint64_t bitWidth = sizeof(OT) * CHAR_BIT; + OT fltValue[PACKED_VALUES_COUNT]; + size_t packIdx = 0; + + auto pack = [](OT fltVals[PACKED_VALUES_COUNT]) -> uint64_t { + uint64_t ret = 0; + for (uint32_t i = 0; i < 
PACKED_VALUES_COUNT; i++) { + ret |= getFloatBits(fltVals[i]) << (bitWidth * i); + } + return ret; + }; + + for (const auto val : values) { + fltValue[packIdx++] = static_cast(val); + if (packIdx == PACKED_VALUES_COUNT) { + params.push_back(pack(fltValue)); + packIdx = 0; // reset pack index + } + } + + // Store trailing elements + if (packIdx) { + // Pad with zeros up to U64 alignment + while (packIdx < PACKED_VALUES_COUNT) { + fltValue[packIdx++] = 0; + } + params.push_back(pack(fltValue)); + } +} + +} // namespace + +void getQuantParamsAttr(mlir::Value qValue, mlir::Type pType, mlir::ArrayAttr& paramsAttr, int64_t tileSize, + int64_t tileOffset) { + SmallVector scales; + SmallVector zeroes; + int64_t quantDim = -1; + const auto qType = mlir::cast(qValue.getType()).getElementType(); + + if (mlir::isa(qType)) { + auto quantParams = mlir::cast(qType); + scales = {quantParams.getScale()}; + zeroes = {quantParams.getZeroPoint()}; + } else if (mlir::isa(qType)) { + auto quantParams = mlir::cast(qType); + quantDim = computeReverseMemDim(qValue, quantParams.getQuantizedDimension()); + scales = {quantParams.getScales().begin(), quantParams.getScales().end()}; + zeroes = {quantParams.getZeroPoints().begin(), quantParams.getZeroPoints().end()}; + } else { + VPUX_THROW("Unsupported quantized type {0}", qType); + } + + typedef decltype(scales)::value_type TS; + typedef decltype(zeroes)::value_type TZ; + + // Convert & pack float values into u64 words for serialization + + if (tileSize != 0) { // Multi-Cluster/Shave tiling context: + VPUX_THROW_UNLESS(tileOffset + tileSize <= (int64_t)scales.size(), "Slice exceeds full size"); + scales = SmallVector(scales.begin() + tileOffset, scales.begin() + tileOffset + tileSize); + zeroes = SmallVector(zeroes.begin() + tileOffset, zeroes.begin() + tileOffset + tileSize); + } + + llvm::SmallVector params; + params.push_back(quantDim); + params.push_back(scales.size()); + if (pType.isF16()) { + packAsFpIntoU64(scales, params); + 
packAsFpIntoU64(zeroes, params); + } else if (pType.isF32()) { + packAsFpIntoU64(scales, params); + packAsFpIntoU64(zeroes, params); + } else { + pType.dump(); + VPUX_THROW("Supported non-quantized type : f16/f32"); + } + paramsAttr = getIntArrayAttr(qValue.getContext(), std::move(params)); +} + namespace { // reverse int attribute from the physical order int64_t reverseMemDim(DimsOrder inOrder, int64_t dimIdx) { @@ -917,6 +1027,41 @@ SmallVector getPadSwkernelNewAttrsAfterTiling(VPUIP::SwKernelOp return newAttrs; } +SmallVector getDequantizeSwkernelNewAttrsAfterTiling(VPUIP::SwKernelOp swKernelOp, + ArrayRef origAttr, + const TileInfo& outTile, Logger log) { + auto kernelRun = *swKernelOp.getBody().getOps().begin(); + auto attrs = kernelRun.getAttrs().value(); + VPUX_THROW_UNLESS(origAttr.size() == attrs.size(), "Unmatched attr size found at '{0}'", swKernelOp); + + const auto input = swKernelOp.getInputs()[0]; + const auto inType = mlir::cast(input.getType()); + const auto elementType = inType.getElementType(); + + Dim quantDim; + bool attrNeedUpdates = false; + if (auto quantParams = mlir::dyn_cast(elementType)) { + auto quantAxis = quantParams.getQuantizedDimension(); + quantDim = Dim(quantAxis); + if (outTile.axis[quantDim] > 1) { + log.trace("update attrs for Dequantize SwKernel Op at '{0}' for out tile {1}", swKernelOp, outTile); + attrNeedUpdates = true; + } + } + + if (!attrNeedUpdates) { + return SmallVector{origAttr}; + } + + const auto oType = mlir::cast(swKernelOp.getOutputs()[0].getType()); + int64_t sliceSize = outTile.shape[quantDim]; + int64_t sliceOffset = outTile.offsets[quantDim]; + mlir::ArrayAttr paramsAttr; + getQuantParamsAttr(input, oType.getElementType(), paramsAttr, sliceSize, sliceOffset); + + return SmallVector{paramsAttr}; +} + SmallVector getLstmSequenceSwkernelNewAttrsAfterTiling(VPUIP::SwKernelOp swKernelOp, ArrayRef origAttr, const TileInfo& outTile, Logger log) { @@ -1440,7 +1585,7 @@ InputTiling 
backInferMvn1NormSwKernelInputTile(VPUIP::SwKernelOp swKernelOp, con InputTiling backInferSwKernelInputTile(VPUIP::SwKernelOp swKernelOp, const SmallVector& outputTiles, int tileId, Logger log) { auto kernelEntryName = getSwKernelEntryName(swKernelOp); - const auto arch = VPU::getArch(swKernelOp); + const auto arch = config::getArch(swKernelOp); const auto& outputTile = outputTiles[tileId]; if (kernelEntryName == "interpolate") { return backInferInterpolateSwKernelInputTile(swKernelOp, outputTile, log); @@ -1488,7 +1633,7 @@ InputTiling backInferSwKernelInputTile(VPUIP::SwKernelOp swKernelOp, const Small return backInferRandomUniformSwKernelInputTile(swKernelOp, outputTile, log); } else if (kernelEntryName == "roll") { return backInferRollSwKernelInputTile(swKernelOp, outputTile, log); - } else if ((kernelEntryName == "detection_output_sort") && (arch == VPU::ArchKind::NPU37XX)) { + } else if ((kernelEntryName == "detection_output_sort") && (arch == config::ArchKind::NPU37XX)) { return vpux::VPU::DetectionOutputSortOpInputTilingOnShave(swKernelOp, outputTile, tileId, outputTiles.size(), log); } else if (kernelEntryName == "reorder") { @@ -1531,6 +1676,8 @@ SmallVector getSwkernelNewAttrsAfterTiling(VPUIP::SwKernelOp sw return getLstmSequenceSwkernelNewAttrsAfterTiling(swKernelOp, origAttr, outTile, log); } else if (kernelEntryName == "gatherND") { return getGatherNDSwkernelNewAttrsAfterTiling(swKernelOp, origAttr, outTile, log); + } else if (kernelEntryName == "dequantize") { + return getDequantizeSwkernelNewAttrsAfterTiling(swKernelOp, origAttr, outTile, log); } else { return SmallVector(origAttr.begin(), origAttr.end()); } @@ -1771,8 +1918,8 @@ bool hasInputsInDDR(VPUIP::SwKernelOp swKernelTask) { }); } -int64_t getSwKernelTilingAddressAlignment(VPUIP::SwKernelOp swkernelOp, VPU::ArchKind arch) { - if (arch == VPU::ArchKind::NPU37XX) { +int64_t getSwKernelTilingAddressAlignment(VPUIP::SwKernelOp swkernelOp, config::ArchKind arch) { + if (arch == 
config::ArchKind::NPU37XX) { return 1; } @@ -1783,10 +1930,10 @@ int64_t getSwKernelTilingAddressAlignment(VPUIP::SwKernelOp swkernelOp, VPU::Arc return NPU40XX_SW_KERNEL_ADDRESS_ALIGNMENT; } -std::pair getSwKernelInstructionPrefetchConfig(VPU::ArchKind arch) { +std::pair getSwKernelInstructionPrefetchConfig(config::ArchKind arch) { // Return {useDummyKernelForInstructionPrefetch, minimumShaveStartTimeForPrefetch} switch (arch) { - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_pair(true, MIN_FREE_CYCLES_FOR_PREFETCH_280K); default: VPUX_THROW("Unsupported Arch {0} to do Shave Instruction Prefetch", arch); diff --git a/src/vpux_compiler/src/dialect/VPUIP/utils/unroll_dma_analysis.cpp b/src/vpux_compiler/src/dialect/VPUIP/utils/unroll_dma_analysis.cpp index f2af626f20..621ddc1c69 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/utils/unroll_dma_analysis.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/utils/unroll_dma_analysis.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPUIP/utils/unroll_dma_analysis.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" using namespace vpux; @@ -13,7 +14,7 @@ void ProcessOp(VPURT::TaskOp vpurtTask, VPUIP::UnrollDMAAnalysis::StorageType& l if (vpurtTask.getInnerTaskOpOfType() != nullptr) { lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollExpandDMAPass)] = 1; } else if (vpurtTask.getInnerTaskOpOfType() != nullptr) { - lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollPermuteToNNDMAPass)] = 1; + lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollPermuteDMAPass)] = 1; } else if (vpurtTask.getInnerTaskOpOfType() != nullptr) { lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollDepthToSpaceDMAPass)] = 1; } else if (vpurtTask.getInnerTaskOpOfType() != nullptr) { @@ -22,6 +23,8 @@ void ProcessOp(VPURT::TaskOp vpurtTask, 
VPUIP::UnrollDMAAnalysis::StorageType& l lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollUpsamplingDMAPass)] = 1; } else if (vpurtTask.getInnerTaskOpOfType() != nullptr) { lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollPerAxisTileDMAPass)] = 1; + } else if (vpurtTask.getInnerTaskOpOfType() != nullptr) { + lookupArray[static_cast(VPUIP::UnrollDMAAnalysisNeeded::UnrollGatherDMAPass)] = 1; } } } // namespace diff --git a/src/vpux_compiler/src/dialect/VPUIP/utils/utils.cpp b/src/vpux_compiler/src/dialect/VPUIP/utils/utils.cpp index 02f7b1c732..c5d83b85c2 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/utils/utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/utils/utils.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" - #include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/core/layers.hpp" @@ -14,11 +13,12 @@ #include "vpux/compiler/dialect/VPU/utils/max_kernel_size_utils.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp" -#include "vpux/compiler/dialect/VPUIP/utils/convert_to_dma_utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/attributes.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/dma_limits.hpp" @@ -35,27 +35,10 @@ using namespace vpux; -// -// Wlm status utils -// - -void vpux::VPUIP::setWlmStatus(mlir::ModuleOp module, vpux::VPUIP::WlmStatus status) { - module->setAttr(vpux::VPUIP::WlmStatusAttr::name, vpux::VPUIP::WlmStatusAttr::get(module->getContext(), status)); -} - -vpux::VPUIP::WlmStatus 
vpux::VPUIP::getWlmStatus(mlir::ModuleOp module) { - auto wlmStatus = vpux::VPUIP::WlmStatus::ENABLED; - if (module->hasAttr(vpux::VPUIP::WlmStatusAttr::name)) { - auto wlmAttr = module->getAttr(vpux::VPUIP::WlmStatusAttr::name); - wlmStatus = mlir::cast(wlmAttr).getValue(); - } - return wlmStatus; -} - uint16_t vpux::VPUIP::getProfWorkloadSize(mlir::ModuleOp module) { uint16_t profilingWorkloadSize; - switch (VPU::getArch(module)) { - case VPU::ArchKind::NPU37XX: + switch (config::getArch(module)) { + case config::ArchKind::NPU37XX: profilingWorkloadSize = VPUIP::HW_DPU_PROFILING_SIZE_BYTES_37XX; break; default: @@ -86,11 +69,11 @@ double vpux::VPUIP::getMemoryDerateFactor(IE::MemoryResourceOp mem) { VPUX_THROW_UNLESS(mlir::isa(mem.getKind()), "Unsupported memory resource kind '{0}'", mem.getKind()); - auto attr = mem->getAttr(VPU::getMemoryDerateAttrName()); + auto attr = mem->getAttr(config::getMemoryDerateAttrName()); VPUX_THROW_UNLESS(attr != nullptr, "Memory resource '{0}' has no '{1}' attribute", mem.getKind(), - VPU::getMemoryDerateAttrName()); + config::getMemoryDerateAttrName()); VPUX_THROW_UNLESS(mlir::isa(attr), "Memory resource '{0}' has wrong '{1}' attribute : '{2}'", - mem.getKind(), VPU::getMemoryDerateAttrName(), attr); + mem.getKind(), config::getMemoryDerateAttrName(), attr); return mlir::cast(attr).getValueAsDouble(); } @@ -100,11 +83,11 @@ uint32_t vpux::VPUIP::getMemoryBandwidth(IE::MemoryResourceOp mem) { VPUX_THROW_UNLESS(mlir::isa(mem.getKind()), "Unsupported memory resource kind '{0}'", mem.getKind()); - auto attr = mem->getAttr(VPU::getMemoryBandwidthAttrName()); + auto attr = mem->getAttr(config::getMemoryBandwidthAttrName()); VPUX_THROW_UNLESS(attr != nullptr, "Memory resource '{0}' has no '{1}' attribute", mem.getKind(), - VPU::getMemoryBandwidthAttrName()); + config::getMemoryBandwidthAttrName()); VPUX_THROW_UNLESS(mlir::isa(attr), "Memory resource '{0}' has wrong '{1}' attribute : '{2}'", - mem.getKind(), 
VPU::getMemoryBandwidthAttrName(), attr); + mem.getKind(), config::getMemoryBandwidthAttrName(), attr); return checked_cast(mlir::cast(attr).getInt()); } @@ -116,12 +99,12 @@ int64_t vpux::VPUIP::getNumTilesUsed(mlir::ModuleOp module) { return tileOp.getCount(); } -int64_t getMaxBarriersPerInference(VPU::ArchKind arch) { +int64_t getMaxBarriersPerInference(config::ArchKind arch) { // TODO: E#78647 refactor to use api/vpu_cmx_info_{arch}.h switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return 64; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return 96; default: VPUX_THROW("Unable to get MaxBarriersPerInference for arch {0}", arch); @@ -129,7 +112,7 @@ int64_t getMaxBarriersPerInference(VPU::ArchKind arch) { } int64_t vpux::VPUIP::getNumAvailableBarriers(mlir::Operation* parentOp) { - const auto arch = VPU::getArch(parentOp); + const auto arch = config::getArch(parentOp); auto module = parentOp->getParentOfType(); @@ -182,12 +165,12 @@ int64_t vpux::VPUIP::getNumberOfIndependentDmaQueues(mlir::Operation* parentOp) VPUX_THROW_UNLESS(dmaPorts != nullptr, "Failed to get DMA information"); auto dmaCount = dmaPorts.getCount(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); // On VPU4+ there is a dedicated Link Agent exposed depending on DMA // channel (CMX and DDR) thus the number of independent DMA FIFOs that // compiler needs to track is twice the number of DMA ports - if (arch >= vpux::VPU::ArchKind::NPU40XX) { + if (arch >= vpux::config::ArchKind::NPU40XX) { return 2 * dmaCount; } @@ -195,11 +178,11 @@ int64_t vpux::VPUIP::getNumberOfIndependentDmaQueues(mlir::Operation* parentOp) } bool vpux::VPUIP::supportsPerVariantBarrierConfiguration(mlir::ModuleOp module) { - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); // If there are more than one DPU per tile, then all variants should consume/produce barriers. 
If there's only one // DPU per tile, then it is sufficient that only first variant of an invariant consumes a barrier and the last // variant of that invariant produces a barrier. - return arch >= VPU::ArchKind::NPU40XX; + return arch >= config::ArchKind::NPU40XX; } // @@ -1218,11 +1201,15 @@ SmallVector vpux::VPUIP::getSplitBuffers(mlir::MLIRContext* ctx, ml shapes.size(), splitNum); VPUX_THROW_UNLESS(shapeOffsets.size() == checked_cast(splitNum), "Mismatch in shape offsets '{0}' and buffers '{1}'", shapeOffsets.size(), splitNum); - - const auto memSpaceId = declBuffType.getMemSpace().getIndex(); + vpux::IndexedSymbolAttr symbolAttr; const auto memKind = declBuffType.getMemoryKind(); - VPUX_THROW_UNLESS(memSpaceId.has_value(), "Failed to extract section id"); - const auto symbolAttr = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(memKind), memSpaceId.value()); + const auto memSpaceId = declBuffType.getMemSpace().getIndex(); + if (memKind == VPU::MemoryKind::CMX_NN) { + VPUX_THROW_UNLESS(memSpaceId.has_value(), "Failed to extract section id"); + symbolAttr = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(memKind), memSpaceId.value()); + } else { + symbolAttr = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(memKind)); + } const auto originStride = operandType.getStrides(); auto insertionPoint = declBuff.getOperation(); @@ -1239,9 +1226,16 @@ SmallVector vpux::VPUIP::getSplitBuffers(mlir::MLIRContext* ctx, ml } const auto newLoc = appendLoc(loc, "_{0}_split_{1}", bufferName, bufferId); - auto newCmxBuffer = - VPURT::createOp(builder, insertionPoint, newLoc, cmxBuffType, - declBuff.getSection(), memSpaceId.value(), cmxOffset.count()); + VPURT::DeclareBufferOp newCmxBuffer; + if (memSpaceId.has_value()) { + newCmxBuffer = VPURT::createOp(builder, insertionPoint, newLoc, cmxBuffType, + declBuff.getSection(), memSpaceId.value(), + cmxOffset.count()); + } else { + newCmxBuffer = VPURT::createOp(builder, insertionPoint, newLoc, cmxBuffType, + declBuff.getSection(), 
nullptr, cmxOffset.count(), + declBuff.getSwizzlingKeyAttr()); + } insertionPoint = newCmxBuffer.getOperation(); buffers[bufferId] = newCmxBuffer; @@ -1905,7 +1899,7 @@ vpux::Dim vpux::VPUIP::getCopyDMATilingDimForLargePlaneNum(mlir::Operation* op) // CopyOp or NNDMAop is split needed for large plane number in one of below two conditions: // 1.Input has level 2 stride and input plane number is larger than 255 // 2.Output has level 2 stride and output plane number is larger than 255 -bool vpux::VPUIP::isSplitNeededForLargePlanesNum(const VPU::ArchKind arch, const vpux::NDTypeInterface& type, +bool vpux::VPUIP::isSplitNeededForLargePlanesNum(const config::ArchKind arch, const vpux::NDTypeInterface& type, ShapeRef shape) { const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(arch); const auto maxStridingLevel = dmaEngineLimits.getMaxStrideCount(); @@ -1931,7 +1925,7 @@ bool vpux::VPUIP::isSplitNeededForLargePlanesNum(const VPU::ArchKind arch, const bool vpux::VPUIP::isSplitNeededForLargePlanesNum(mlir::Operation* op) { VPUX_THROW_UNLESS((mlir::isa(op)), "isSplitNeededForLargePlanesNum: not a CopyOp or NNDMAOp"); - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); const auto inShape = getShape(op->getOperand(0)); const auto inType = mlir::cast(VPUIP::extractDataType(op->getOperand(0))); const auto outShape = getShape(op->getResult(0)); @@ -1946,7 +1940,7 @@ bool vpux::VPUIP::isSplitNeededForLargePlanesNum(mlir::Operation* op) { bool vpux::VPUIP::hasLegalStridingLevel(mlir::Operation* op) { VPUX_THROW_WHEN(mlir::dyn_cast(op) == nullptr && mlir::dyn_cast(op) == nullptr, "hasLegalStridingLevel: not a CopyOp or NNDMAOp"); - const auto arch = VPU::getArch(op); + const auto arch = config::getArch(op); const auto& dmaEngineLimits = VPUIP::DMA::getEngineLimits(arch); const auto maxStridingLevel = dmaEngineLimits.getMaxStrideCount(); const auto inputStridingLevel = getStridingLevel(op->getOperand(0)); @@ -2294,7 +2288,7 @@ VPURT::TaskOp 
VPUIP::createBarProgDMA(mlir::OpBuilder& builder, mlir::Value inpu } int64_t vpux::VPUIP::getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t numClusters, bool isInputSparse, - VPU::ArchKind arch) { + config::ArchKind arch) { return VPU::getSOHMinimalHeightAlignment(shape, numClusters, isInputSparse, arch); } @@ -2303,11 +2297,11 @@ int64_t vpux::VPUIP::getSOHMinimalHeightAlignment(vpux::ShapeRef shape, int64_t // int64_t vpux::VPUIP::getMaximalSWKernelPrefetchDataSize(mlir::ModuleOp module) { - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return VPUIP::MAX_SW_KERNEL_PREFETCH_DATA_SIZE_37XX; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return VPUIP::MAX_SW_KERNEL_PREFETCH_DATA_SIZE_40XX; default: VPUX_THROW("Unable to get MaximalSWKernelPrefetchDataSize for arch {0}", arch); @@ -2325,22 +2319,6 @@ std::pair vpux::VPUIP::getSplitPartSizes(NDTypeInterface buffe return {firstPartSize, secondPartSize}; } -// -// Check user utils -// - -bool VPUIP::hasOneOrSameUser(mlir::Operation* op) { - auto users = op->getUsers(); - if (users.empty()) { - return false; - } - - auto firstUser = *users.begin(); - return std::all_of(std::next(users.begin()), users.end(), [&](mlir::Operation* userOp) { - return firstUser == userOp; - }); -} - std::unordered_set VPUIP::getConcatAxes(VPUIP::ConcatViewOp concatViewOp) { std::unordered_set res; @@ -2774,3 +2752,17 @@ bool vpux::VPUIP::isSubViewCompatibleWithDistributedBuffer(VPUIP::SubViewOp subV // Be compatible if SubView does not shrink segmented axis return origShape[Dim(tileIndexVal)] == subShape[Dim(tileIndexVal)]; } + +VPURT::TaskOp VPUIP::createEnqueueDMA(mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, int port, + mlir::ValueRange waitBarriers, mlir::ValueRange updateBarriers, + VPUIP::EnqueueDMAAttr enqueueDMAAttr, llvm::StringLiteral opName) { + auto ctx = 
builder.getContext(); + auto syncDmaLoc = mlir::NameLoc::get(mlir::StringAttr::get(ctx, opName)); + auto portAttr = vpux::getIntAttr(ctx, port); + + auto enqueueDMAOp = VPURT::wrapIntoTaskOp( + builder, waitBarriers, updateBarriers, syncDmaLoc, input, output, portAttr, + /*isOutOfOrder*/ nullptr, /*isCritical*/ nullptr, /*dmaHwpId*/ nullptr, + /*dmaProfilingMetaData*/ nullptr, enqueueDMAAttr); + return enqueueDMAOp->getParentOfType(); +} diff --git a/src/vpux_compiler/src/dialect/VPUIPDPU/ops.cpp b/src/vpux_compiler/src/dialect/VPUIPDPU/ops.cpp index 1af3d3affa..dac775f5db 100644 --- a/src/vpux_compiler/src/dialect/VPUIPDPU/ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUIPDPU/ops.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/dialect.hpp" #include "vpux/compiler/dialect/VPUIPDPU/ops_interfaces.hpp" +#include "vpux/compiler/utils/error.hpp" #include "vpux/compiler/utils/traits_utils.hpp" #include @@ -29,7 +30,7 @@ using namespace mlir; // mlir::LogicalResult vpux::VPUIPDPU::DPUInvariantOp::verify() { - if (!hasMandatorySingleInstanceChildren(*this)) { + if (!hasMandatorySingleInstanceChildren(*this)) { return errorAt(getLoc(), "Operation {0}: missing mandatory child ops", getOperationName()); } if (!hasOptionalSingleInstanceChildren(*this)) { diff --git a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_invariant_rewriter.cpp b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_invariant_rewriter.cpp index 568afe26fe..6dbd4c8be2 100644 --- a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_invariant_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_invariant_rewriter.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPUIPDPU/rewriters/dpu_invariant_rewriter.hpp" #include "vpux/compiler/dialect/VPUIPDPU/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace 
vpux::VPUIPDPU; @@ -32,9 +33,11 @@ mlir::LogicalResult insertInvBlockArgs(VPUASM::DPUInvariantOp op, const Logger& std::unordered_map& invBlockArgsPos, ELF::SymbolReferenceMap& symRefMap) { // input activations - auto inputType = getBufferType(symRefMap.lookupSymbol(op.getInput())); - invBlock->addArgument(inputType, op.getLoc()); - invBlockArgsPos[BlockArg::ACT_IN] = invBlock->getNumArguments() - 1; + if (op.getInput()) { + auto inputType = getBufferType(symRefMap.lookupSymbol(op.getInput().value())); + invBlock->addArgument(inputType, op.getLoc()); + invBlockArgsPos[BlockArg::ACT_IN] = invBlock->getNumArguments() - 1; + } // input storage elements if (op.getInputStorageElementTable()) { @@ -176,7 +179,7 @@ mlir::LogicalResult DPUInvariantRewriter::matchAndRewrite(VPUASM::DPUInvariantOp auto dpuInvariantExpandIface = mlir::dyn_cast(op.getOperation()); if (dpuInvariantExpandIface == nullptr) { _log.error("Missing expand DPU invariant configuration interface for arch {0}", - stringifyArchKind(VPU::getArch(op)).str()); + stringifyArchKind(config::getArch(op)).str()); return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_variant_rewriter.cpp b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_variant_rewriter.cpp index c3414e595e..a6db1dac1c 100644 --- a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_variant_rewriter.cpp +++ b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/dpu_variant_rewriter.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/VPUIPDPU/rewriters/dpu_variant_rewriter.hpp" #include "vpux/compiler/dialect/VPUIPDPU/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" namespace vpux { namespace VPUIPDPU { @@ -27,7 +28,7 @@ mlir::LogicalResult DPUVariantRewriter::matchAndRewrite(VPUASM::DPUVariantOp op, auto dpuVariantExpandIface = mlir::dyn_cast(op.getOperation()); if (dpuVariantExpandIface == nullptr) { _log.error("Missing expand DPU variant configuration interface for 
arch {0}", - stringifyArchKind(VPU::getArch(op)).str()); + stringifyArchKind(config::getArch(op)).str()); return mlir::failure(); } diff --git a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/utils.cpp b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/utils.cpp index 6c783dd2e0..cd71217a6b 100644 --- a/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUIPDPU/passes/rewriters/utils.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPUIPDPU/rewriters/utils.hpp" #include "vpux/compiler/dialect/VPUASM/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIPDPU/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" namespace vpux { @@ -207,5 +208,20 @@ int64_t getRangeSize(int64_t start, int64_t end) { return end - start + 1; } +int64_t getZeroPoint(vpux::NDTypeInterface type) { + int64_t zeroPointVal = 0; + auto elemType = type.getElementType(); + if (auto qType = mlir::dyn_cast(elemType)) { + if (auto intType = mlir::dyn_cast(qType.getStorageType())) { + if (intType.getWidth() == 8) { + zeroPointVal = VPUIPDPU::getZeroPoints(type.getElementType())[0]; + } else if (intType.getWidth() == 16) { + zeroPointVal = VPUIPDPU::getZeroPoints(type.getElementType())[0]; + } + } + } + return zeroPointVal; +} + } // namespace VPUIPDPU } // namespace vpux diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/dialect.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/dialect.cpp index 3acea756b6..ba0c505400 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/dialect.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/dialect.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPURegMapped/dialect.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include #include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_kernel_range.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_kernel_range.cpp index 0eda6094dd..e76443f878 100644 
--- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_kernel_range.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_kernel_range.cpp @@ -3,13 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "vpux/compiler/dialect/ELFNPU37XX/utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/sw_utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" +#include "vpux/compiler/utils/error.hpp" #include +#include + using namespace vpux; using namespace npu37xx; diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_shave_rt.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_shave_rt.cpp index 036a2aaef6..07250a7449 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_shave_rt.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/act_shave_rt.cpp @@ -5,14 +5,13 @@ #include #include -#include #include #include #include #include "vpux/compiler/act_kernels/shave_binary_resources.h" #include "vpux/compiler/dialect/ELFNPU37XX/utils.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" -#include "vpux/utils/core/scope_exit.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" using namespace vpux; @@ -24,7 +23,7 @@ void vpux::VPUMI37XX::ActShaveRtOp::serialize(elf::writer::BinaryDataSectiongetOperation())); + const SmallString arch = ELFNPU37XX::getSwKernelArchString(config::getArch(this->getOperation())); const auto elfBlob = kernelInfo.getElf(kernel, arch); @@ -37,7 +36,7 @@ size_t vpux::VPUMI37XX::ActShaveRtOp::getBinarySize() { auto kernel = getKernelPath(); const auto& kernelInfo = ShaveBinaryResources::getInstance(); - const SmallString arch = ELFNPU37XX::getSwKernelArchString(VPU::getArch(this->getOperation())); + const SmallString arch = ELFNPU37XX::getSwKernelArchString(config::getArch(this->getOperation())); const auto elfBlob = kernelInfo.getElf(kernel, arch); @@ -50,7 +49,7 @@ uint32_t vpux::VPUMI37XX::ActShaveRtOp::getKernelEntry() { auto kernel = getKernelPath(); const auto& kernelInfo = 
ShaveBinaryResources::getInstance(); - const SmallString arch = ELFNPU37XX::getSwKernelArchString(VPU::getArch(this->getOperation())); + const SmallString arch = ELFNPU37XX::getSwKernelArchString(config::getArch(this->getOperation())); const auto elfBlob = kernelInfo.getElf(kernel, arch); @@ -65,7 +64,7 @@ uint32_t vpux::VPUMI37XX::ActShaveRtOp::getVersion() { auto kernel = getKernelPath(); const auto& kernelInfo = ShaveBinaryResources::getInstance(); - const SmallString arch = ELFNPU37XX::getSwKernelArchString(VPU::getArch(this->getOperation())); + const SmallString arch = ELFNPU37XX::getSwKernelArchString(config::getArch(this->getOperation())); const auto elfBlob = kernelInfo.getElf(kernel, arch); diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/dpu.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/dpu.cpp index 5ff2429f8c..47abd6ea56 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/dpu.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/dpu.cpp @@ -3,15 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - +#include "vpux/compiler/dialect/VPUMI37XX/blob_writer.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" +#include "vpux/compiler/dialect/VPUMI37XX/utils.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" - #include "vpux/compiler/utils/ELF/utils.hpp" #include "vpux/utils/core/checked_cast.hpp" -#include "vpux/compiler/dialect/VPUMI37XX/blob_writer.hpp" +#include #include @@ -22,11 +21,7 @@ #include #include -#include "vpux/compiler/dialect/VPUMI37XX/utils.hpp" - -namespace { #include "external/runtime_dpu_parser_imports.cpp.inc" -} using namespace vpux; using namespace npu37xx; diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/mapped_inference.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/mapped_inference.cpp index de8c509cee..e467040c20 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/mapped_inference.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/mapped_inference.cpp @@ -7,6 +7,7 @@ #include 
"vpux/compiler/act_kernels/shave_binary_resources.h" #include "vpux/compiler/core/profiling.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" #include "vpux/utils/core/mem_size.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/platform_info.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/platform_info.cpp index 7f684de717..2355098119 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops/platform_info.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops/platform_info.cpp @@ -46,5 +46,5 @@ vpux::ELFNPU37XX::SectionFlagsAttr vpux::VPUMI37XX::PlatformInfoOp::getUserProcs } void vpux::VPUMI37XX::PlatformInfoOp::build(::mlir::OpBuilder& odsBuilder, ::mlir::OperationState& odsState) { - build(odsBuilder, odsState, VPU::ArchKind::NPU37XX); + build(odsBuilder, odsState, config::ArchKind::NPU37XX); } diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/ops_interfaces.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/ops_interfaces.cpp index 6cf3c7008f..253342dc34 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/ops_interfaces.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/ops_interfaces.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPUMI37XX/ops_interfaces.hpp" +#include "vpux/compiler/dialect/VPURegMapped/types.hpp" #include "vpux/compiler/utils/error.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPUMI37XX/passes/assign_full_kernel_path.cpp b/src/vpux_compiler/src/dialect/VPUMI37XX/passes/assign_full_kernel_path.cpp index c0ee2830f2..c1eda9a358 100644 --- a/src/vpux_compiler/src/dialect/VPUMI37XX/passes/assign_full_kernel_path.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI37XX/passes/assign_full_kernel_path.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUMI37XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI37XX/passes.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/utils/core/format.hpp" @@ -53,7 +54,7 @@ void AssignFullKernelPathPass::safeRunOnFunc() { auto kernelParamsOpInputs = kernelParamsOp.getInputs(); auto kernelPath = kernelParamsOp.getKernelType(); - auto cpu = ELFNPU37XX::getSwKernelArchString(VPU::getArch(origOp)); + auto cpu = ELFNPU37XX::getSwKernelArchString(config::getArch(origOp)); std::string newKernelType; bool hasDDRInputBuffers = !VPUIP::getDDRBuffers(kernelParamsOpInputs).empty(); diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/ops/barrier.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/ops/barrier.cpp index 9cff60c773..12a8b5a210 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/ops/barrier.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/ops/barrier.cpp @@ -7,6 +7,7 @@ #include #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/error.hpp" using namespace vpux; @@ -45,7 +46,7 @@ void ConfigureBarrierOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState mlir::LogicalResult ConfigureBarrierOp::verify() { // Skip checks if architecture is unknown since all of them depend on the architecture used - if (VPU::getArch(getOperation()) == VPU::ArchKind::UNKNOWN) { + if (config::getArch(getOperation()) == config::ArchKind::UNKNOWN) { return mlir::success(); } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/ops/dma.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/ops/dma.cpp index 6ef5325040..4bc6786764 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/ops/dma.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/ops/dma.cpp @@ -22,7 +22,7 @@ void NNDMAOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState, updateBarriers, 0, 0, false, false, false, 0, VPUIP::DMAAccMode::DISABLE, /*act_compression_size_entry*/ nullptr, /*act_compression_sparsity_map*/ 
nullptr, dma_transaction, dma_descriptor, 0, nullptr, 0, nullptr, /*enqueue_target_barrier*/ nullptr, /*wlmPage*/ nullptr, - /*physicalBarrierRangeAttr*/ nullptr); + /*physicalBarrierRangeAttr*/ nullptr, /*enqueueDMAAttr*/ nullptr, /*fetchDMAAttr*/ nullptr); } void NNDMAOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState, mlir::Type index, @@ -38,7 +38,7 @@ void NNDMAOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState, updateBarriers, start_after, clean_after, is_out_of_order, is_critical, enable_msc, port, acceleration_mode, act_compression_size_entry, act_compression_sparsity_map, dma_transaction, dma_descriptor, dma_hwp_id, profilingMetadata, allow_different_in_out_shapes, indices, enqueueBarrier, wlmPage, - /*physicalBarrierRangeAttr*/ nullptr); + /*physicalBarrierRangeAttr*/ nullptr, /*enqueueDMAAttr*/ nullptr, /*fetchDMAAttr*/ nullptr); } mlir::LogicalResult NNDMAOp::verify() { diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_bootstrap_work_items.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_bootstrap_work_items.cpp index 5c128d4a5e..a6876061ba 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_bootstrap_work_items.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_bootstrap_work_items.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/utils/passes.hpp" @@ -24,12 +25,15 @@ namespace { class AddBootstrapWorkItemsPass : public VPUMI40XX::impl::AddBootstrapWorkItemsBase { public: - explicit AddBootstrapWorkItemsPass(Logger log) { + explicit AddBootstrapWorkItemsPass(const WorkloadManagementMode workloadManagementMode, Logger log) + : _workloadManagementMode(workloadManagementMode) { Base::initLogger(log, 
Base::getArgumentName()); } private: void safeRunOnFunc() final; + + WorkloadManagementMode _workloadManagementMode; }; void reindexEnqueueOps(llvm::SmallVector enquOps) { @@ -54,7 +58,16 @@ void reindexEnqueueOps(llvm::SmallVector enquOps) { return; } -bool hasEnqueue(VPURegMapped::TaskOpInterface task) { +bool hasEnqueue(VPURegMapped::TaskOpInterface task, std::optional firstTaskIdxWithEnqueueDma) { + // Check if task is enqueued by enqueue DMA (Full WLM) + if (firstTaskIdxWithEnqueueDma.has_value()) { + auto taskIdx = mlir::cast(task.getResult().getType()).getValue(); + if (taskIdx >= firstTaskIdxWithEnqueueDma.value()) { + return true; + } + } + + // Check if task is enqueued by EnqueueOp (Partial WLM) auto users = task.getResult().getUsers(); auto enquIt = llvm::find_if(users, [](mlir::Operation* user) { return mlir::isa(user); @@ -63,7 +76,8 @@ bool hasEnqueue(VPURegMapped::TaskOpInterface task) { } int64_t addEnqueueForOp(mlir::MLIRContext* ctx, mlir::func::FuncOp netFunc, mlir::Value listHead, - const VPURegMapped::TaskType taskType, VPURegMapped::EnqueueOp firstEnqueue) { + const VPURegMapped::TaskType taskType, VPURegMapped::EnqueueOp firstEnqueueOp, + std::optional firstTaskIdxWithEnqueueDma) { auto mpi = VPUMI40XX::getMPI(netFunc); auto builder = mlir::OpBuilder(mpi.getOperation()); int64_t bootstrapWorkItems = 0; @@ -72,11 +86,11 @@ int64_t addEnqueueForOp(mlir::MLIRContext* ctx, mlir::func::FuncOp netFunc, mlir } auto curTask = mlir::cast(listHead.getDefiningOp()); - if (!hasEnqueue(curTask)) { + if (!hasEnqueue(curTask, firstTaskIdxWithEnqueueDma)) { auto startTask = curTask; auto endTask = curTask; while (auto nextTask = VPUMI40XX::getNextOp(endTask)) { - if (!hasEnqueue(nextTask)) { + if (!hasEnqueue(nextTask, firstTaskIdxWithEnqueueDma)) { endTask = nextTask; } else { break; @@ -86,9 +100,9 @@ int64_t addEnqueueForOp(mlir::MLIRContext* ctx, mlir::func::FuncOp netFunc, mlir auto bootstrapEnqueue = builder.create( startTask->getLoc(), 
trivialIndexType, nullptr, nullptr, /*previousTaskIdxOnSameBarrier*/ nullptr, taskType, startTask->getResult(0), endTask->getResult(0)); - if (firstEnqueue) { + if (firstEnqueueOp) { bootstrapEnqueue.getOperation()->moveBefore( - mlir::cast(firstEnqueue).getOperation()); + mlir::cast(firstEnqueueOp).getOperation()); } bootstrapWorkItems++; @@ -104,32 +118,84 @@ void AddBootstrapWorkItemsPass::safeRunOnFunc() { auto parentModule = netFunc.getOperation()->getParentOfType(); const auto tilesCount = IE::getTileExecutor(parentModule).getCount(); - const auto shavesCountPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); + + if (workloadManagementModeOpt.hasValue()) { + _workloadManagementMode = workloadManagementModeOpt.getValue(); + } + + // Check if there are any Enqueue DMAs present in the schedule + mlir::DenseMap firstEnqueueDmaPerHwQueue; + + if (_workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { + auto dmaTile0List0Task = mpi.getListHead(VPURegMapped::TaskType::DMA, 0, 0).getDefiningOp(); + do { + auto enqueueDmaAttr = dmaTile0List0Task.getEnqueueDmaAttr(); + if (enqueueDmaAttr.has_value()) { + auto taskType = VPUMI40XX::convertExecutorKindToExecutableTaskType( + enqueueDmaAttr.value().getTargetExecutorKindAttr().getValue()); + auto tileIdx = static_cast(enqueueDmaAttr.value().getTileIdx().getValue().getSExtValue()); + auto listIdx = static_cast(enqueueDmaAttr.value().getListIdx().getValue().getSExtValue()); + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + + if (firstEnqueueDmaPerHwQueue.find(hwQueue) == firstEnqueueDmaPerHwQueue.end()) { + auto firstTaskIdx = enqueueDmaAttr.value().getStartTaskIdx().getValue().getSExtValue(); + firstEnqueueDmaPerHwQueue[hwQueue] = firstTaskIdx; + _log.trace("Found Enqueue DMA for task type {0} on tile {1}, list {2} with first task index {3}", + taskType, tileIdx, listIdx, firstTaskIdx); + } + } + dmaTile0List0Task = 
VPUMI40XX::getNextOp(dmaTile0List0Task); + } while (dmaTile0List0Task); + } VPURegMapped::EnqueueOp firstEnqueue = nullptr; if (mpi.getWorkItemTasks()) { firstEnqueue = mlir::cast(mpi.getWorkItemTasks().getDefiningOp()); } - int totalNumberBootstrapworkItems = 0; + VPUX_THROW_WHEN(firstEnqueue != nullptr && !firstEnqueueDmaPerHwQueue.empty(), + "Enqueue ops should not yet be present if there are enqueue DMAs"); - for (int64_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { - for (int64_t listIdx = 0; listIdx < 2; listIdx++) { - auto curHead = mpi.getListHead(VPURegMapped::TaskType::DMA, tileIdx, listIdx); - totalNumberBootstrapworkItems += - addEnqueueForOp(ctx, netFunc, curHead, VPURegMapped::TaskType::DMA, firstEnqueue); - } - } + int totalNumberBootstrapWorkItems = 0; - for (int64_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { - auto curVariantHead = mpi.getListHead(VPURegMapped::TaskType::DPUVariant, tileIdx); - totalNumberBootstrapworkItems += - addEnqueueForOp(ctx, netFunc, curVariantHead, VPURegMapped::TaskType::DPUVariant, firstEnqueue); + uint32_t shavesCountPerTile = 0; + auto actInvosCount = parseIntArrayOfArrayAttr(mpi.getActKernelInvocationsCount()); + llvm::for_each(actInvosCount, [&](auto actInvosCountForTile) { + shavesCountPerTile = std::max(shavesCountPerTile, static_cast(actInvosCountForTile.size())); + }); - for (int64_t listIdx = 0; listIdx < shavesCountPerTile; listIdx++) { - auto curActKernelHead = mpi.getListHead(VPURegMapped::TaskType::ActKernelInvocation, tileIdx, listIdx); - totalNumberBootstrapworkItems += addEnqueueForOp(ctx, netFunc, curActKernelHead, - VPURegMapped::TaskType::ActKernelInvocation, firstEnqueue); + uint32_t dmaCountPerTile = 0; + auto dmasCount = parseIntArrayOfArrayAttr(mpi.getDmaCount()); + llvm::for_each(dmasCount, [&](auto dmasCountForTile) { + dmaCountPerTile = std::max(dmaCountPerTile, static_cast(dmasCountForTile.size())); + }); + + const mlir::DenseSet> taskTypesWithListCountPerTile = { + 
{{VPURegMapped::TaskType::DMA, dmaCountPerTile}, + {VPURegMapped::TaskType::DPUVariant, 1}, + {VPURegMapped::TaskType::ActKernelInvocation, shavesCountPerTile}}}; + + for (uint32_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { + for (const auto& [taskType, listCount] : taskTypesWithListCountPerTile) { + for (uint32_t listIdx = 0; listIdx < listCount; listIdx++) { + _log.trace("Check task type {0} on tile {1}, list {2} if bootstrap work items are needed", taskType, + tileIdx, listIdx); + auto curHead = mpi.getListHead(taskType, tileIdx, listIdx); + + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + std::optional firstTaskIdxWithEnqueueDma = std::nullopt; + if (_workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { + auto firstTaskIdxWithEnqueueDmaIt = firstEnqueueDmaPerHwQueue.find(hwQueue); + if (firstTaskIdxWithEnqueueDmaIt != firstEnqueueDmaPerHwQueue.end()) { + firstTaskIdxWithEnqueueDma = firstTaskIdxWithEnqueueDmaIt->second; + } + } + + auto bootstrapWorkItems = + addEnqueueForOp(ctx, netFunc, curHead, taskType, firstEnqueue, firstTaskIdxWithEnqueueDma); + _log.nest().trace("Added {0} bootstrap work items", bootstrapWorkItems); + totalNumberBootstrapWorkItems += bootstrapWorkItems; + } } } @@ -138,7 +204,7 @@ void AddBootstrapWorkItemsPass::safeRunOnFunc() { reindexEnqueueOps(enquOps); mpi.getWorkItemTasksMutable().assign(enquOps[0].getResult()); mpi.setWorkItemCount(enquOps.size()); - mpi.setBootsrapWorkItemsCountAttr(builder.getI64IntegerAttr(totalNumberBootstrapworkItems)); + mpi.setBootsrapWorkItemsCountAttr(builder.getI64IntegerAttr(totalNumberBootstrapWorkItems)); } else { VPUX_THROW("We expect at least one enqueue operation in the function."); } @@ -150,6 +216,7 @@ void AddBootstrapWorkItemsPass::safeRunOnFunc() { // createAddBootstrapWorkItemsPass // -std::unique_ptr vpux::VPUMI40XX::createAddBootstrapWorkItemsPass(Logger log) { - return std::make_unique(log); +std::unique_ptr 
vpux::VPUMI40XX::createAddBootstrapWorkItemsPass( + WorkloadManagementMode workloadManagementMode, Logger log) { + return std::make_unique(workloadManagementMode, log); } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_dma_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_dma_ops.cpp index 211ae58347..263914da4e 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_dma_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_dma_ops.cpp @@ -24,11 +24,7 @@ using namespace vpux; namespace { class AddEnqueueDMAOps : public VPUMI40XX::impl::AddEnqueueDMAOpsBase { public: - explicit AddEnqueueDMAOps(const WorkloadManagementMode workloadManagementMode, Logger log) - : _workloadManagementMode(workloadManagementMode), - _dpuEnqueueDMACount(0), - _shvEnqueueDMACount(0), - _actShavePerTile(0) { + explicit AddEnqueueDMAOps(Logger log): _dpuEnqueueDMACount(0), _shvEnqueueDMACount(0), _actShavePerTile(0) { Base::initLogger(log, Base::getArgumentName()); } @@ -44,7 +40,6 @@ class AddEnqueueDMAOps : public VPUMI40XX::impl::AddEnqueueDMAOpsBase _lastDmaTile0List0ByPage; llvm::DenseMap _opFetchDMAMap; llvm::DenseMap, mlir::Value> _regBufferCache; @@ -324,9 +319,6 @@ void AddEnqueueDMAOps::createDMAToPushTaskInFIFO(mlir::OpBuilder& builder, VPURe } void AddEnqueueDMAOps::safeRunOnFunc() { - if (_workloadManagementMode != WorkloadManagementMode::FWLM_V1_PAGES) { - return; - } auto netFunc = getOperation(); auto mpi = VPUMI40XX::getMPI(netFunc); auto module = netFunc->getParentOfType(); @@ -419,7 +411,6 @@ void AddEnqueueDMAOps::safeRunOnFunc() { // createAddEnqueueDMAOps // -std::unique_ptr vpux::VPUMI40XX::createAddEnqueueDMAOps(WorkloadManagementMode workloadManagementMode, - Logger log) { - return std::make_unique(workloadManagementMode, log); +std::unique_ptr vpux::VPUMI40XX::createAddEnqueueDMAOps(Logger log) { + return std::make_unique(log); } diff --git 
a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_ops.cpp index 1fceed5395..a2c0d93fa6 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_enqueue_ops.cpp @@ -4,13 +4,18 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/utils/options.hpp" #include "vpux/compiler/utils/passes.hpp" +#include "vpux/compiler/utils/stl_extras.hpp" + +#include namespace vpux::VPUMI40XX { #define GEN_PASS_DECL_ADDENQUEUEOPS @@ -140,7 +145,7 @@ mlir::LogicalResult verifyEnqueueBarrierIsNotBlockedByFutureTask( do { dmaOp = nextDmaOp; - if (auto executableTaskOp = mlir::dyn_cast(dmaOp)) { + if (auto executableTaskOp = mlir::dyn_cast(dmaOp)) { for (const auto& waitBar : executableTaskOp.waitBarriers()) { auto barrierIdx = mlir::cast(waitBar.getType()).getValue(); VPUX_THROW_WHEN(barrierIdx >= barrierCount, @@ -854,21 +859,21 @@ void AddEnqueueOpsPass::safeRunOnFunc() { if (mlir::failed(addEnqusForTasksWithFetch(mpi, VPURegMapped::TaskType::DPUInvariant, VPURegMapped::TaskType::DPUVariant, globalPreviousEnqu, builder, globalEnquCounter, cache, _log, tilesCount))) { - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } if (mlir::failed(addEnqusForTasksWithFetch( mpi, VPURegMapped::TaskType::ActKernelRange, VPURegMapped::TaskType::ActKernelInvocation, globalPreviousEnqu, 
builder, globalEnquCounter, cache, _log, tilesCount, shavesCountPerTile))) { - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } if (mlir::failed(addEnqusForDmas(mpi, tilesCount, globalPreviousEnqu, builder, globalEnquCounter, lastDmaWithNoEnqueue, cache, _log))) { - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } @@ -907,7 +912,7 @@ void AddEnqueueOpsPass::safeRunOnFunc() { // Verify enqueue ops can be enqueued at given barriers if (mlir::failed(verifyEnqueueBarrierIsNotBlockedByFutureTask(mpi, enquOps, barriers, lastDmaWithNoEnqueue, tilesCount, _log))) { - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } @@ -915,7 +920,7 @@ void AddEnqueueOpsPass::safeRunOnFunc() { // Check if enqueues order for given HW FIFO is not enqueueing tasks // for this FIFO out of order - task N needs to be enqueued before task N+1 if (mlir::failed(verifyEnqueueOpsOrderIsAlignedWithPerFifoTaskOrder(enquOps, _log))) { - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_fetch_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_fetch_ops.cpp index 0d6bdd17a1..33d1ed3a6c 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_fetch_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_fetch_ops.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include 
"vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" @@ -12,6 +13,7 @@ #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/options.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/stl_extras.hpp" @@ -203,8 +205,7 @@ mlir::LogicalResult addFetchTasks(VPUMI40XX::MappedInferenceOp mpi, const size_t // wlmPage is not assigned for first Fetch on each tile as technically they don't belong to a page rather // than bootstrap auto firstFetch = builder.create( - firstGroup.getLoc(), dummyIndexType, - nullptr, // no previous + firstGroup.getLoc(), dummyIndexType, mlir::ValueRange({}), mlir::ValueRange({}), nullptr, firstGroup.getStartIndexes()[0], firstGroup.getEndIndexes()[0], firstGroup.getStartIndexes()[1], firstGroup.getEndIndexes()[1], VPURegMapped::TaskTypeAttr::get(builder.getContext(), taskType), mlir::IntegerAttr::get(uint64Type, tileIdx), mlir::IntegerAttr::get(uint64Type, groupIdx), @@ -238,11 +239,12 @@ mlir::LogicalResult addFetchTasks(VPUMI40XX::MappedInferenceOp mpi, const size_t builder.setInsertionPointAfter(insertionDma.getOperation()); auto fetchTaskOp = builder.create( - travelingGroup.getLoc(), dummyIndexType, insertionDma.getResult(), - travelingGroup.getStartIndexes()[0], travelingGroup.getEndIndexes()[0], - travelingGroup.getStartIndexes()[1], travelingGroup.getEndIndexes()[1], - VPURegMapped::TaskTypeAttr::get(ctx, taskType), mlir::IntegerAttr::get(uint64Type, tileIdx), - mlir::IntegerAttr::get(uint64Type, groupIdx), insertionDma.getWlmPageAttr()); + travelingGroup.getLoc(), dummyIndexType, mlir::ValueRange({}), mlir::ValueRange({}), + insertionDma.getResult(), travelingGroup.getStartIndexes()[0], + travelingGroup.getEndIndexes()[0], travelingGroup.getStartIndexes()[1], + travelingGroup.getEndIndexes()[1], VPURegMapped::TaskTypeAttr::get(ctx, 
taskType), + mlir::IntegerAttr::get(uint64Type, tileIdx), mlir::IntegerAttr::get(uint64Type, groupIdx), + insertionDma.getWlmPageAttr()); // set the previousIdx to the fetchOp insertionDma.getResult().replaceAllUsesExcept(fetchTaskOp.getIndex(), fetchTaskOp.getOperation()); @@ -275,13 +277,13 @@ void AddFetchOpsPass::safeRunOnFunc() { if (mlir::failed(addFetchTasks(mpi, DMA_WLM_TILEIDX, DMA_DDR2CMX_LISTIDX, VPURegMapped::TaskType::DPUInvariant, _log, tilesCount))) { - vpux::VPUIP::setWlmStatus(parentModule, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(parentModule, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } if (mlir::failed(addFetchTasks(mpi, DMA_WLM_TILEIDX, DMA_DDR2CMX_LISTIDX, VPURegMapped::TaskType::ActKernelRange, _log, tilesCount, shavesCountPerTile))) { - vpux::VPUIP::setWlmStatus(parentModule, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(parentModule, VPU::WorkloadManagementStatus::FAILED); signalPassFailure(); return; } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_mapped_inference_version_op.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_mapped_inference_version_op.cpp index bb86145c40..32f1f2b976 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_mapped_inference_version_op.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/add_mapped_inference_version_op.cpp @@ -14,6 +14,9 @@ namespace vpux::VPUMI40XX { #define GEN_PASS_DEF_ADDMAPPEDINFERENCEVERSIONOP #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp.inc" } // namespace vpux::VPUMI40XX + +using namespace vpux; + namespace { // TODO: E111344 class AddMappedInferenceVersionOpPass : diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/barrier_topological_mapping.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/barrier_topological_mapping.cpp index 5b8de92b95..1e082b3e75 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/barrier_topological_mapping.cpp +++ 
b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/barrier_topological_mapping.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/types.hpp" #include "vpux/compiler/utils/passes.hpp" +#include "vpux/compiler/utils/stl_extras.hpp" #include #include diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/convert_fetch_dmas_to_fetch_task_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/convert_fetch_dmas_to_fetch_task_ops.cpp new file mode 100644 index 0000000000..5c52fbfe5a --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/convert_fetch_dmas_to_fetch_task_ops.cpp @@ -0,0 +1,217 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" +#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" + +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/options.hpp" +#include "vpux/compiler/utils/passes.hpp" +#include "vpux/compiler/utils/stl_extras.hpp" + +namespace vpux::VPUMI40XX { +#define GEN_PASS_DECL_CONVERTFETCHDMASTOFETCHTASKOPS +#define GEN_PASS_DEF_CONVERTFETCHDMASTOFETCHTASKOPS +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp.inc" +} // namespace vpux::VPUMI40XX + +using namespace vpux; + +struct FetchDMAKey { + int64_t tile; + int64_t list; + int64_t group; + VPURegMapped::TaskType taskType; + + bool operator==(const FetchDMAKey& other) const { + return tile == other.tile && list == other.list && group == other.group && taskType == other.taskType; + } +}; + +namespace llvm { +template <> +struct 
DenseMapInfo { + static inline FetchDMAKey getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), static_cast(-1)}; + } + + static inline FetchDMAKey getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), static_cast(-2)}; + } + + static unsigned getHashValue(const FetchDMAKey& key) { + return hash_combine(key.tile, key.list, key.group, + static_cast>(key.taskType)); + } + + static bool isEqual(const FetchDMAKey& lhs, const FetchDMAKey& rhs) { + return lhs == rhs; + } +}; +} // namespace llvm + +namespace { +class ConvertFetchDmasToFetchTaskOpsPass : + public VPUMI40XX::impl::ConvertFetchDmasToFetchTaskOpsBase { +public: + explicit ConvertFetchDmasToFetchTaskOpsPass(Logger log): _log(log) { + } + +private: + Logger _log; + void safeRunOnFunc() final; + llvm::DenseMap _placeHolderFetchDMAMap; +}; + +VPURegMapped::TaskType convertTargetToTaskType(VPU::ExecutorKind kind) { + VPURegMapped::TaskType returnType; + switch (kind) { + case VPU::ExecutorKind::DPU: + returnType = VPURegMapped::TaskType::DPUInvariant; + break; + case VPU::ExecutorKind::SHAVE_ACT: + returnType = VPURegMapped::TaskType::ActKernelRange; + break; + default: + VPUX_THROW("Unsupported executor kind passed for FetchTask"); + } + + return returnType; +} + +mlir::LogicalResult addFetchTasks(VPUMI40XX::MappedInferenceOp mpi, const VPURegMapped::TaskType taskType, + const int64_t tilesCount, + llvm::DenseMap& placeHolderFetchDMAMap, + SmallVector& fetchTasks, const int64_t listsCount = 1) { + auto ctx = mpi.getContext(); + auto builder = mlir::OpBuilder(mpi); + + for (int64_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { + for (int64_t listIdx = 0; listIdx < listsCount; listIdx++) { + auto startingInvValue = mpi.getListHead(taskType, tileIdx, listIdx); + // theoretically there can be cases where we run for 6 tiles, but only 4 tiles have Variants associated + if 
(!startingInvValue) { + continue; + } + + auto currentGroup = + mlir::dyn_cast_or_null(startingInvValue.getDefiningOp()); + if (!currentGroup) { + continue; + } + + int64_t groupIdx = 0; + + while (currentGroup) { + FetchDMAKey searchKey{tileIdx, listIdx, groupIdx, taskType}; + if (!placeHolderFetchDMAMap.contains(searchKey)) { + VPUX_THROW("Placeholder FetchDMA not found for {0} {1} {2} {3}", tileIdx, listIdx, groupIdx, + taskType); + } + auto insertionDma = placeHolderFetchDMAMap[searchKey]; + builder.setInsertionPointAfter(insertionDma.getOperation()); + + auto wlmPageAttr = groupIdx < 2 ? mlir::IntegerAttr::get(getInt64Type(ctx), static_cast(-1)) + : insertionDma.getWlmPageAttr(); + + auto fetchTaskOp = builder.create( + currentGroup.getLoc(), insertionDma.getIndexType(), insertionDma.getWaitBarriers(), + insertionDma.getUpdateBarriers(), insertionDma.getPreviousTask(), + currentGroup.getStartIndexes()[0], currentGroup.getEndIndexes()[0], + currentGroup.getStartIndexes()[1], currentGroup.getEndIndexes()[1], + VPURegMapped::TaskTypeAttr::get(ctx, taskType), + mlir::IntegerAttr::get(getUInt64Type(ctx), tileIdx), + mlir::IntegerAttr::get(getUInt64Type(ctx), groupIdx), wlmPageAttr); + + // set the previousIdx to the fetchOp + insertionDma.getResult().replaceAllUsesWith(fetchTaskOp.getResult()); + if (insertionDma->use_empty()) { + insertionDma->erase(); + } + + fetchTasks.push_back(fetchTaskOp); + currentGroup = VPUMI40XX::getNextGroup(currentGroup); + ++groupIdx; + } + } + } + + return mlir::success(); +} + +void ConvertFetchDmasToFetchTaskOpsPass::safeRunOnFunc() { + auto netFunc = getOperation(); + + auto parentModule = netFunc.getOperation()->getParentOfType(); + const auto tilesCount = IE::getTileExecutor(parentModule).getCount(); + const auto shavesCountPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); + + auto mpi = VPUMI40XX::getMPI(netFunc); + + const size_t DMA_DDR2CMX_LISTIDX = 0; + const size_t DMA_WLM_TILEIDX = 
0; // all WLM dma's should be on tile0 for now; + + auto dmaTaskOps = netFunc.getOps(); + + _log.trace("Get placeholder Fetch DMAs"); + for (auto dmaOp : llvm::make_early_inc_range(llvm::make_filter_range(dmaTaskOps, [](auto dma) { + return dma.getFetchDmaAttr() != nullptr; + }))) { + auto fetchAttr = dmaOp.getFetchDmaAttr(); + + const auto tileIdx = fetchAttr.getTileIdx().getValue().getSExtValue(); + const auto listIdx = fetchAttr.getListIdx().getValue().getSExtValue(); + const auto groupIdx = fetchAttr.getExecGroupIdx().getValue().getSExtValue(); + const auto targetExecutorKind = fetchAttr.getTargetExecutorKindAttr(); + + FetchDMAKey key{tileIdx, listIdx, groupIdx, convertTargetToTaskType(targetExecutorKind.getValue())}; + _placeHolderFetchDMAMap[key] = dmaOp; + } + + _log.trace("Add Fetch Tasks"); + SmallVector fetchTasks; + if (mlir::failed(addFetchTasks(mpi, VPURegMapped::TaskType::DPUInvariant, tilesCount, _placeHolderFetchDMAMap, + fetchTasks))) { + VPU::setWorkloadManagementStatus(parentModule, VPU::WorkloadManagementStatus::FAILED); + signalPassFailure(); + return; + } + if (mlir::failed(addFetchTasks(mpi, VPURegMapped::TaskType::ActKernelRange, tilesCount, _placeHolderFetchDMAMap, + fetchTasks, shavesCountPerTile))) { + VPU::setWorkloadManagementStatus(parentModule, VPU::WorkloadManagementStatus::FAILED); + signalPassFailure(); + return; + } + + _log.trace("Reindex list"); + auto firstFetch = + *std::min_element(fetchTasks.begin(), fetchTasks.end(), [](mlir::Operation* lhs, mlir::Operation* rhs) { + auto lhsDma = mlir::cast(lhs); + auto rhsDma = mlir::cast(rhs); + return lhsDma.getType().getValue() < rhsDma.getType().getValue(); + }); + + VPUMI40XX::reindexList(mpi, firstFetch, DMA_WLM_TILEIDX, DMA_DDR2CMX_LISTIDX); + + return; +} + +} // namespace + +// +// createConvertFetchDmasToFetchTaskOpsPass +// + +std::unique_ptr vpux::VPUMI40XX::createConvertFetchDmasToFetchTaskOpsPass(Logger log) { + return std::make_unique(log); +} diff --git 
a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/dump_statistics_of_wlm_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/dump_statistics_of_wlm_ops.cpp index 786bb55bd3..a80612ec63 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/dump_statistics_of_wlm_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/dump_statistics_of_wlm_ops.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" +#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/utils/passes.hpp" diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/group_exec_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/group_exec_ops.cpp index e13c9af714..962520b3c7 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/group_exec_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/group_exec_ops.cpp @@ -11,8 +11,10 @@ #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/dialect/VPURegMapped/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/shave.hpp" +#include "vpux/compiler/utils/stl_extras.hpp" namespace vpux::VPUMI40XX { #define GEN_PASS_DECL_GROUPEXECUTIONOPS @@ -125,7 +127,7 @@ VPUMI40XX::ExecutableTaskOpInterface getBarrieredOp(VPURegMapped::TaskOpInterfac return nullptr; } -size_t getMetadataSize(mlir::Operation* op, VPURegMapped::TaskType taskType, VPU::ArchKind archKind) { +size_t getMetadataSize(mlir::Operation* op, VPURegMapped::TaskType taskType, config::ArchKind archKind) { // TODO: E109456 VPU::TaskType vpuTaskType = VPURegMapped::TaskTypeMapper::map(taskType); switch (vpuTaskType) { @@ -147,7 +149,7 @@ size_t getMetadataSize(mlir::Operation* op, VPURegMapped::TaskType taskType, VPU void 
groupExecOps(VPUMI40XX::MappedInferenceOp mpi, const VPURegMapped::TaskType primary, const VPURegMapped::TaskType secondary, int64_t tilesCount, int64_t listsCount = 1) { - auto archKind = VPU::getArch(mpi.getOperation()); + auto archKind = config::getArch(mpi.getOperation()); for (int64_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { for (int64_t listIdx = 0; listIdx < listsCount; listIdx++) { auto startingVal = mpi.getListHead(primary, tileIdx, listIdx); diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/link_enqueue_targets.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/link_enqueue_targets.cpp index 2fed941985..b707ed8073 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/link_enqueue_targets.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/link_enqueue_targets.cpp @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/shave.hpp" @@ -22,16 +24,110 @@ namespace { class LinkEnqueueTargetsPass : public VPUMI40XX::impl::LinkEnqueueTargetsBase { public: - explicit LinkEnqueueTargetsPass(Logger log) { + explicit LinkEnqueueTargetsPass(const WorkloadManagementMode workloadManagementMode, Logger log) + : _workloadManagementMode(workloadManagementMode) { Base::initLogger(log, Base::getArgumentName()); } private: void safeRunOnFunc() final; + void processEnqueueDmaOps(mlir::func::FuncOp netFunc); + void processEnqueueOps(mlir::func::FuncOp netFunc); + + WorkloadManagementMode _workloadManagementMode; }; -void LinkEnqueueTargetsPass::safeRunOnFunc() { - auto netFunc = getOperation(); +// Identify enqueue DMAs ops and process all tasks. 
If multiple tasks of the same type are enqueued +// by the same enqueue DMA and those tasks support task linking then enable the link. Later pass when +// updating enqueue DMA will only enqueue the head of the linked list of tasks. +void LinkEnqueueTargetsPass::processEnqueueDmaOps(mlir::func::FuncOp netFunc) { + auto mpi = VPUMI40XX::getMPI(netFunc); + + auto dmaTile0List0Head = mpi.getListHead(VPURegMapped::TaskType::DMA, 0, 0); + if (!dmaTile0List0Head) { + return; + } + + auto parentModule = netFunc.getOperation()->getParentOfType(); + const auto tilesCount = IE::getTileExecutor(parentModule).getCount(); + const auto shavesCountPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); + + auto firstDmaTile0List0Op = dmaTile0List0Head.getDefiningOp(); + auto enqueueDmasPerHwQueue = VPUMI40XX::getEnqueueDmaData(firstDmaTile0List0Op, _log); + + if (enqueueDmasPerHwQueue.empty()) { + _log.trace("No Enqueue DMAs available for task linking"); + return; + } + + const mlir::DenseSet> taskTypesWithListCountPerTile = { + {{VPURegMapped::TaskType::DPUVariant, 1}, + {VPURegMapped::TaskType::ActKernelInvocation, shavesCountPerTile}}}; + + // Iterate over DPU/SHV tasks on each tile and list and check if multiple tasks are enqueued by the + // same enqueue DMA. If yes, check if those tasks support task linking and if so, link them. 
+ for (uint32_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { + for (const auto& [taskType, listCount] : taskTypesWithListCountPerTile) { + for (uint32_t listIdx = 0; listIdx < listCount; listIdx++) { + auto listHead = mpi.getListHead(taskType, tileIdx, listIdx); + if (!listHead) { + continue; + } + + _log.trace("Check task type {0} on tile {1}, list {2} if task linking is possible", taskType, tileIdx, + listIdx); + auto taskOp = mlir::cast(listHead.getDefiningOp()); + + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + + VPUX_THROW_WHEN(enqueueDmasPerHwQueue.find(hwQueue) == enqueueDmasPerHwQueue.end(), + "No Enqueue DMAs available for task type {0} on tile {1}, list {2}", taskType, tileIdx, + listIdx); + // Initial tasks must be enqueued by first enqueue DMA for given HW queue type + size_t curEnqueueIndex = 0; + // Get start and end task range enqueued by enqueue DMA to understand what range of tasks + // are processed by a single enqueue DMA + auto [enqueueDmaStartIdx, enqueueDmaEndIdx, _] = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex]; + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, enqueueDmaEndIdx); + + // Iterate over tasks in the list and check if enabling link to previous is possible + do { + auto taskInd = mlir::cast(taskOp.getResult().getType()).getValue(); + + // If current task index is greater than end index of current enqueue DMA it means that + // this task is enqueued by next enqueue DMA -> switch to next enqueue DMA + if (taskInd > enqueueDmaEndIdx) { + _log.trace("Task {0} is after end task {1} of current enqueue DMA. 
Move to next enqueue DMA op", + taskInd, enqueueDmaEndIdx); + // Move to next enqueue DMA and get start and end indexes + curEnqueueIndex++; + VPUX_THROW_UNLESS( + curEnqueueIndex < enqueueDmasPerHwQueue[hwQueue].size(), + "No enqueue DMAs available for task type {0} on tile {1}, list {2} at index {3}", + taskType, tileIdx, listIdx, curEnqueueIndex); + enqueueDmaStartIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].startTaskIdx; + enqueueDmaEndIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].endTaskIdx; + + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, enqueueDmaEndIdx); + } + + // If task index is larger than first task enqueued by a DMA than link it to previous task + if (taskInd > enqueueDmaStartIdx) { + _log.trace("Link task {0} to previous", taskInd); + taskOp.linkToPreviousTask(); + } + + taskOp = taskOp.getNextTask(); + } while (taskOp); + } + } + } +} + +// Iterate over all enqueue ops. For each op that enqueues a range of tasks of the given type +// if the tasks support linking, enable task linking and update the op to enqueue only the head +// of the linked task chain +void LinkEnqueueTargetsPass::processEnqueueOps(mlir::func::FuncOp netFunc) { bool fifoPerShaveEngineEnabled = VPU::isFifoPerShaveEngineEnabled(netFunc); for (auto enqueue : netFunc.getOps()) { @@ -94,8 +190,19 @@ void LinkEnqueueTargetsPass::safeRunOnFunc() { } } } +} + +void LinkEnqueueTargetsPass::safeRunOnFunc() { + auto netFunc = getOperation(); - return; + if (workloadManagementModeOpt.hasValue()) { + _workloadManagementMode = workloadManagementModeOpt.getValue(); + } + + if (_workloadManagementMode == WorkloadManagementMode::FWLM_V1_PAGES) { + processEnqueueDmaOps(netFunc); + } + processEnqueueOps(netFunc); } } // namespace @@ -103,6 +210,7 @@ void LinkEnqueueTargetsPass::safeRunOnFunc() { // createLinkEnqueueTargetsPass // -std::unique_ptr vpux::VPUMI40XX::createLinkEnqueueTargetsPass(Logger log) { - return std::make_unique(log); +std::unique_ptr 
vpux::VPUMI40XX::createLinkEnqueueTargetsPass(WorkloadManagementMode workloadManagementMode, + Logger log) { + return std::make_unique(workloadManagementMode, log); } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_WLM_task_locations.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_WLM_task_locations.cpp index fcc686d680..31fc2edd0d 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_WLM_task_locations.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_WLM_task_locations.cpp @@ -10,7 +10,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" -#include "vpux/compiler/dialect/VPURegMapped/utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/shave.hpp" @@ -45,7 +45,7 @@ void ResolveWLMTaskLocationPass::safeRunOnFunc() { const auto availableShaveEnginesPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); - auto archKind = VPU::getArch(netFunc); + auto archKind = config::getArch(netFunc); const llvm::DenseMap sizes = { {VPURegMapped::TaskType::DPUInvariant, VPU::getConstraint(netFunc, VPU::METADATA_MAX_INVARIANT_COUNT) / 2}, {VPURegMapped::TaskType::DPUVariant, VPU::getConstraint(netFunc, VPU::METADATA_MAX_VARIANT_COUNT) / 2}, diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_task_location.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_task_location.cpp index 93d1133ae0..25aa5e9b8f 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_task_location.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/resolve_task_location.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPURegMapped/passes.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/shave.hpp" @@ -200,7 +201,7 @@ void ResolveTaskLocationPass::createTaskLocationBuffers(VPURegMapped::TaskBuffer void ResolveTaskLocationPass::safeRunOnFunc() { auto funcOp = getOperation(); VPUX_THROW_WHEN(VPU::isFifoPerShaveEngineEnabled(funcOp), "Dedicated Shave FIFOs for non-Wlm are not supported."); - const auto arch = VPU::getArch(funcOp); + const auto arch = config::getArch(funcOp); MetadataBuffersContainer metadataBuffers; MaxTileInfo maxTileInfo; diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/setup_profiling_VPUMI40XX.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/setup_profiling_VPUMI40XX.cpp index d286cc692d..ad1a533a9e 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/setup_profiling_VPUMI40XX.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/setup_profiling_VPUMI40XX.cpp @@ -4,7 +4,6 @@ // #include "vpux/compiler/core/profiling.hpp" -#include "vpux/compiler/dialect/ELFNPU37XX/dialect.hpp" #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" @@ -12,7 +11,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" -#include "vpux/compiler/dialect/VPURegMapped/dialect.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/passes.hpp" @@ -35,13 +34,12 @@ namespace { class SetupProfilingVPUMI40XXPass final : public VPUMI40XX::impl::SetupProfilingVPUMI40XXBase { public: - explicit SetupProfilingVPUMI40XXPass(DMAProfilingMode dmaProfilingMode, Logger log) - : _dmaProfilingMode(dmaProfilingMode) { + explicit SetupProfilingVPUMI40XXPass(const std::string& enableDmaProfiling, Logger log) + : 
SetupProfilingVPUMI40XXBase({enableDmaProfiling}) { Base::initLogger(log, Base::getArgumentName()); } private: - DMAProfilingMode _dmaProfilingMode; void safeRunOnModule() final; mlir::Value createDmaHwpBaseStatic(mlir::OpBuilder builderFunc, VPUIP::ProfilingSectionOp dmaSection) { @@ -104,11 +102,12 @@ class SetupProfilingVPUMI40XXPass final : return dmaHwpScratch.getResult(); } - void addDmaHwpBase(mlir::OpBuilder builderFunc, mlir::ModuleOp moduleOp, VPUMI40XX::MappedInferenceOp mpi) { + void addDmaHwpBase(DMAProfilingMode dmaProfilingMode, mlir::OpBuilder builderFunc, mlir::ModuleOp moduleOp, + VPUMI40XX::MappedInferenceOp mpi) { _log.trace("addDmaHwpBase"); mlir::Value dmaHwpBase = nullptr; - switch (_dmaProfilingMode) { + switch (dmaProfilingMode) { case DMAProfilingMode::DYNAMIC_HWP: { dmaHwpBase = createDmaHwpBaseDynamic(builderFunc, moduleOp); break; @@ -164,15 +163,10 @@ class SetupProfilingVPUMI40XXPass final : void SetupProfilingVPUMI40XXPass::safeRunOnModule() { auto moduleOp = getOperation(); - auto arch = VPU::getArch(moduleOp); + auto arch = config::getArch(moduleOp); - if (enableDMAProfiling.hasValue()) { - _dmaProfilingMode = getDMAProfilingMode(arch, enableDMAProfiling.getValue()); - } - - if (_dmaProfilingMode == DMAProfilingMode::DISABLED) { - return; - } + VPUX_THROW_UNLESS(enableDMAProfiling.hasValue(), "No option"); + auto dmaProfilingMode = getDMAProfilingMode(arch, enableDMAProfiling); net::NetworkInfoOp netInfo; mlir::func::FuncOp funcOp; @@ -182,7 +176,7 @@ void SetupProfilingVPUMI40XXPass::safeRunOnModule() { auto mpi = VPUMI40XX::getMPI(funcOp); // create DMA hardware profiling base ref in MI - addDmaHwpBase(builderFunc, moduleOp, mpi); + addDmaHwpBase(dmaProfilingMode, builderFunc, moduleOp, mpi); // create workpoint cfg ref in MI for hardware profiling addWorkpointCapture(builderFunc, moduleOp, mpi); @@ -194,7 +188,7 @@ void SetupProfilingVPUMI40XXPass::safeRunOnModule() { // createSetupProfilingVPUMI40XXPass // -std::unique_ptr 
vpux::VPUMI40XX::createSetupProfilingVPUMI40XXPass(DMAProfilingMode dmaProfilingMode, +std::unique_ptr vpux::VPUMI40XX::createSetupProfilingVPUMI40XXPass(const std::string& enableDmaProfiling, Logger log) { - return std::make_unique(dmaProfilingMode, log); + return std::make_unique(enableDmaProfiling, log); } diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/split_enqueue_dma_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/split_enqueue_dma_ops.cpp new file mode 100644 index 0000000000..659dc33e67 --- /dev/null +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/split_enqueue_dma_ops.cpp @@ -0,0 +1,272 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" +#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/utils/passes.hpp" + +#include + +#include + +namespace vpux::VPUMI40XX { +#define GEN_PASS_DECL_SPLITENQUEUEDMAOPS +#define GEN_PASS_DEF_SPLITENQUEUEDMAOPS +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp.inc" +} // namespace vpux::VPUMI40XX + +using namespace vpux; + +namespace { + +// DMA(enqueue) = [var1, var2(breakingPoint), var3, var4(breakingPoint), var5] +// we are going to replace it with 3 enqueue DMAs +// DMA(enqueue1) = [var1, var2] +// DMA(enqueue2) = [var3, var4] +// DMA(enqueue3) = [var5] + +class SplitEnqueueDmaOpsPass : public VPUMI40XX::impl::SplitEnqueueDmaOpsBase { +public: + explicit SplitEnqueueDmaOpsPass(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; + VPUMI40XX::NNDMAOp createEnqueueDma(int64_t startTaskIdx, int64_t endTaskIdx, mlir::ValueRange waitBarriers, + 
mlir::ValueRange updateBarriers, VPUMI40XX::NNDMAOp origEnqDmaOp, + mlir::Value prevDmaVal); +}; + +// Create new enqueue DMA based on original one but with updated enqueue DMA attribute task index range and barriers. +VPUMI40XX::NNDMAOp SplitEnqueueDmaOpsPass::createEnqueueDma(int64_t startTaskIdx, int64_t endTaskIdx, + mlir::ValueRange waitBarriers, + mlir::ValueRange updateBarriers, + VPUMI40XX::NNDMAOp origEnqDmaOp, mlir::Value prevDmaVal) { + mlir::OpBuilder builder(origEnqDmaOp); + auto newOp = builder.clone(*origEnqDmaOp); + auto ctx = origEnqDmaOp.getContext(); + + auto newEnqueueDmaOp = mlir::cast(newOp); + newEnqueueDmaOp.getPreviousTaskMutable().clear(); + if (prevDmaVal != nullptr) { + newEnqueueDmaOp.getPreviousTaskMutable().assign(prevDmaVal); + } + + newEnqueueDmaOp.getWaitBarriersMutable().clear(); + if (!waitBarriers.empty()) { + newEnqueueDmaOp.getWaitBarriersMutable().assign(waitBarriers); + } + + newEnqueueDmaOp.getUpdateBarriersMutable().clear(); + if (!updateBarriers.empty()) { + newEnqueueDmaOp.getUpdateBarriersMutable().assign(updateBarriers); + } + + auto enqueueDmaAttr = origEnqDmaOp.getEnqueueDmaAttrAttr(); + + auto executorKindAttr = enqueueDmaAttr.getTargetExecutorKindAttr(); + auto tileIdxAttr = enqueueDmaAttr.getTileIdx(); + auto listIdxAttr = enqueueDmaAttr.getListIdx(); + auto startTaskIdxAttr = mlir::IntegerAttr::get(getInt64Type(ctx), startTaskIdx); + auto endTaskIdxAttr = mlir::IntegerAttr::get(getInt64Type(ctx), endTaskIdx); + + auto newEnqueueDmaAttr = VPUIP::EnqueueDMAAttr::get(ctx, executorKindAttr, tileIdxAttr, listIdxAttr, + startTaskIdxAttr, endTaskIdxAttr); + newEnqueueDmaOp.setEnqueueDmaAttrAttr(newEnqueueDmaAttr); + + return newEnqueueDmaOp; +} + +void SplitEnqueueDmaOpsPass::safeRunOnFunc() { + auto netFunc = getOperation(); + auto ctx = &(getContext()); + auto mpi = VPUMI40XX::getMPI(netFunc); + + auto parentModule = netFunc.getOperation()->getParentOfType(); + const auto tilesCount = 
IE::getTileExecutor(parentModule).getCount(); + const auto shavesCountPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); + + auto dmaTile0List0Head = mpi.getListHead(VPURegMapped::TaskType::DMA, 0, 0); + if (!dmaTile0List0Head) { + _log.trace("No list where enqueue DMAs are located"); + return; + } + + auto firstDmaTile0List0Op = dmaTile0List0Head.getDefiningOp(); + auto enqueueDmasPerHwQueue = VPUMI40XX::getEnqueueDmaData(firstDmaTile0List0Op, _log); + + const mlir::DenseSet> taskTypesWithListCountPerTile = { + {{VPURegMapped::TaskType::DPUVariant, 1}, + {VPURegMapped::TaskType::ActKernelInvocation, shavesCountPerTile}}}; + + // Iterate over DPU/SHV tasks on each tile and list and check if group end is encountered - presence + // of lastSecondaryTaskInExecutionGroup attribute. If yes and such task is not the last one that is enqueued + // by enqueue DMA then such enqueue DMA needs to be split - old enqueue DMA is erased and multiple new ones are + // created. When split is done enqueueDmaAttribute start/end task indexes are updated and barrier dependencies + // changed so that only the first DMA resulting from split has wait barrier and only last one has update barrier. 
+ // Example: + // Task0, Task1(lastSecondaryTaskInExecutionGroup), Task2, Task3(lastSecondaryTaskInExecutionGroup) + // Before: + // DMA {enqueueDmaAttribute(startTaskIdx=0, endTaskIdx=3)} waitBarrier(BAR0) updateBarrier(BAR1) + // After: + // DMA {enqueueDmaAttribute(startTaskIdx=0, endTaskIdx=1)} waitBarrier(BAR0) + // DMA {enqueueDmaAttribute(startTaskIdx=2, endTaskIdx=3)} updateBarrier(BAR1) + // + for (uint32_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { + for (const auto& [taskType, listCount] : taskTypesWithListCountPerTile) { + for (uint32_t listIdx = 0; listIdx < listCount; listIdx++) { + auto listHead = mpi.getListHead(taskType, tileIdx, listIdx); + if (!listHead) { + continue; + } + + _log.trace("Check task type {0} on tile {1}, list {2} if enqueue DMA split is needed", taskType, + tileIdx, listIdx); + auto taskOp = mlir::cast(listHead.getDefiningOp()); + + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + + VPUX_THROW_WHEN(enqueueDmasPerHwQueue.find(hwQueue) == enqueueDmasPerHwQueue.end(), + "No Enqueue DMAs available for task type {0} on tile {1}, list {2}", taskType, tileIdx, + listIdx); + // Initial tasks must be enqueued by first enqueue DMA for given HW queue type + size_t curEnqueueIndex = 0; + // Get start and end task range enqueued by enqueue DMA to understand what range of tasks + // are processed by a single enqueue DMA + auto [enqueueDmaStartIdx, enqueueDmaEndIdx, enqueueDmaOp] = + enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex]; + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, enqueueDmaEndIdx); + + // When splitting DMAs and creating new ones code needs to correctly maintain connections + // to previous DMA + auto prevDmaVal = enqueueDmaOp.getPreviousTask(); + + _log = _log.nest(); + bool breakPointDetected = false; + // Flag to indicate if when encountering next breakpoint it is the first one for given enqueue DMA. 
+ // This is used to indicate which newly created enqueue DMA should have wait barriers - only the first + // one + bool nextBreakPointIsFirstForGivenEnqueueDma = true; + // Iterate over tasks in the list and check for lastSecondaryTaskInExecutionGroup + do { + auto taskInd = mlir::cast(taskOp.getResult().getType()).getValue(); + // If current task index is greater than end index of current enqueue DMA it means that + // this task is enqueued by next enqueue DMA -> switch to next enqueue DMA + if (taskInd > enqueueDmaEndIdx) { + _log.trace("Task {0} is after end task {1} of current enqueue DMA. Move to next enqueue DMA op", + taskInd, enqueueDmaEndIdx); + if (breakPointDetected) { + // If breakpoint was detected before, when switching to using next enqueue DMA first need to + // create new enqueue DMA that will handle enqueue of last tasks for this enqueue DMA + _log.trace("Create new enqueue DMA with range {0} - {1} to handle enqueue of last tasks in " + "group", + enqueueDmaStartIdx, enqueueDmaEndIdx); + auto newEnqueueDmaOp = + createEnqueueDma(enqueueDmaStartIdx, enqueueDmaEndIdx, {}, + enqueueDmaOp.getUpdateBarriers(), enqueueDmaOp, prevDmaVal); + + enqueueDmaOp.replaceAllUsesWith(newEnqueueDmaOp.getIndex()); + enqueueDmaOp.erase(); + } + // Move to next enqueue DMA and get start and end indexes + curEnqueueIndex++; + VPUX_THROW_UNLESS( + curEnqueueIndex < enqueueDmasPerHwQueue[hwQueue].size(), + "No enqueue DMAs available for task type {0} on tile {1}, list {2} at index {3}", + taskType, tileIdx, listIdx, curEnqueueIndex); + enqueueDmaStartIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].startTaskIdx; + enqueueDmaEndIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].endTaskIdx; + enqueueDmaOp = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].enqDmaOp; + prevDmaVal = enqueueDmaOp.getPreviousTask(); + breakPointDetected = false; + nextBreakPointIsFirstForGivenEnqueueDma = true; + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, 
enqueueDmaEndIdx); + } + VPUX_THROW_UNLESS(taskInd >= enqueueDmaStartIdx && taskInd <= enqueueDmaEndIdx, + "Task index {0} is out of range for Enqueue DMA: {1} - {2}", taskInd, + enqueueDmaStartIdx, enqueueDmaEndIdx); + + // Check if current task is the last one in execution group but also is not the last one + // that is enqueued by enqueue DMA. In such case enqueue DMA needs to be split + if (taskOp->hasAttr(VPUMI40XX::lastSecondaryTaskInExecutionGroup) && taskInd < enqueueDmaEndIdx) { + _log.trace("Task index {0} - breakpoint detected", taskInd); + breakPointDetected = true; + + _log.trace("Create new enqueue DMA with range {0} - {1}", enqueueDmaStartIdx, taskInd); + mlir::ValueRange waitBarriers = {}; + if (nextBreakPointIsFirstForGivenEnqueueDma) { + waitBarriers = enqueueDmaOp.getWaitBarriers(); + } + nextBreakPointIsFirstForGivenEnqueueDma = false; + + auto newEnqueueDmaOp = createEnqueueDma(enqueueDmaStartIdx, taskInd, waitBarriers, {}, + enqueueDmaOp, prevDmaVal); + prevDmaVal = newEnqueueDmaOp.getIndex(); + + // If this enqueue DMA is also first one in the list then when replacing it with new one update + // its usage in MappedInferenceOp and EnqueueOp responsible for enqueueing this DMA + if (firstDmaTile0List0Op == enqueueDmaOp) { + firstDmaTile0List0Op.getResult().replaceUsesWithIf( + newEnqueueDmaOp, [](mlir::OpOperand& operand) { + if (mlir::isa(operand.getOwner())) { + return true; + } + + if (auto enqueueOp = + mlir::dyn_cast(operand.getOwner())) { + if (enqueueOp.getStartMutable() == operand) { + return true; + } + } + + return false; + }); + firstDmaTile0List0Op = newEnqueueDmaOp; + } + + // If new enqueue DMA was created then update the enqueueDmaStartIdx to point to the next task + // so that next new enqueue DMA will have correct start index + enqueueDmaStartIdx = taskInd + 1; + } + taskOp = taskOp.getNextTask(); + } while (taskOp); + + if (breakPointDetected) { + _log.trace("After processing all tasks create new enqueue DMA with range {0} - 
{1} to handle " + "enqueue of last tasks of this type", + enqueueDmaStartIdx, enqueueDmaEndIdx); + auto newEnqueueDmaOp = createEnqueueDma(enqueueDmaStartIdx, enqueueDmaEndIdx, {}, + enqueueDmaOp.getUpdateBarriers(), enqueueDmaOp, prevDmaVal); + enqueueDmaOp.replaceAllUsesWith(newEnqueueDmaOp.getIndex()); + enqueueDmaOp.erase(); + } + + _log = _log.unnest(); + } + } + } + + // After splitting enqueue DMAs their indexes and count needs to be updated + auto newCount = VPUMI40XX::reindexList(firstDmaTile0List0Op); + auto dmasCount = parseIntArrayOfArrayAttr(mpi.getDmaCount()); + dmasCount[0][0] = newCount; + mpi.setDmaCountAttr(getIntArrayOfArray(ctx, dmasCount)); +} + +} // namespace + +// +// createSplitEnqueueDmaOpsPass +// + +std::unique_ptr vpux::VPUMI40XX::createSplitEnqueueDmaOpsPass(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/unroll_fetch_task_ops.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/unroll_fetch_task_ops.cpp index d76b8e16e4..d756019e9f 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/passes/unroll_fetch_task_ops.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/passes/unroll_fetch_task_ops.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" #include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/rewriter.hpp" @@ -30,7 +31,7 @@ namespace { class RewriteFetchTaskToDma final : public mlir::OpRewritePattern { public: - RewriteFetchTaskToDma(mlir::MLIRContext* ctx, VPU::ArchKind arch, Logger log) + RewriteFetchTaskToDma(mlir::MLIRContext* ctx, config::ArchKind arch, Logger log) : mlir::OpRewritePattern(ctx), _arch(arch), _log(log) { setDebugName("FetchTaskOpRewriter"); } @@ -39,16 +40,13 @@ class RewriteFetchTaskToDma final : public mlir::OpRewritePattern +#include +#include 
"vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/ops.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" +#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/utils/passes.hpp" +namespace vpux::VPUMI40XX { +#define GEN_PASS_DECL_UPDATEENQUEUEDMAINPUTANDOUTPUT +#define GEN_PASS_DEF_UPDATEENQUEUEDMAINPUTANDOUTPUT +#include "vpux/compiler/dialect/VPUMI40XX/passes.hpp.inc" +} // namespace vpux::VPUMI40XX + +using namespace vpux; + +namespace { +class UpdateEnqueueDMAInputAndOutput : + public VPUMI40XX::impl::UpdateEnqueueDMAInputAndOutputBase { +public: + explicit UpdateEnqueueDMAInputAndOutput(Logger log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; + + mlir::Value getOrCreateRegisterBuffer(mlir::OpBuilder& builder, mlir::Operation* bufferInsertionPoint, + mlir::MemRefType memType, uint32_t fifoAddr); + Const::DeclareOp createEnqueueConstant(mlir::OpBuilder& builder, mlir::Operation* insertionPoint, + const uint32_t& val); + + void updateInputAndOutput(mlir::OpBuilder& builder, VPUMI40XX::NNDMAOp enqueueDma, + VPURegMapped::TaskOpInterface taskOp, uint32_t fifoAddr, + mlir::Operation* bufferInsertionPoint, mlir::Operation* cstInsertionPoint); + + llvm::DenseMap, mlir::Value> _regBufferCache; +}; + +uint32_t getFifoAddr(VPURegMapped::TaskType type, size_t tile, size_t list, int64_t actShavePerTile) { + VPUX_THROW_WHEN(tile >= VPUMI40XX::NPU_MAX_TILES || list > 1, "Invalid tile index {0} or list index {1}", tile, + list); + const auto offset = type == VPURegMapped::TaskType::DPUVariant + ? 
VPUMI40XX::DPU_FIFO_OFFSETS[tile] + : VPUMI40XX::SHV_FIFO_OFFSETS[(actShavePerTile * tile) + list]; + const auto base = type == VPURegMapped::TaskType::DPUVariant ? VPUMI40XX::NNCMX_DPU_CMX_CTRL_BASE + : VPUMI40XX::NNCMX_SHV_CMX_CTRL_BASE; + return base + offset; +} + +Const::DeclareOp UpdateEnqueueDMAInputAndOutput::createEnqueueConstant(mlir::OpBuilder& builder, + mlir::Operation* insertionPoint, + const uint32_t& val) { + const Shape valShape = {1}; + const auto dataStorageType = mlir::RankedTensorType::get(valShape.raw(), getUInt32Type(builder.getContext())); + const auto dataAttr = mlir::DenseElementsAttr::get(dataStorageType, ArrayRef(val)); + + auto memType = mlir::MemRefType::get(dataStorageType.getShape(), dataStorageType.getElementType()); + builder.setInsertionPoint(insertionPoint); + auto configurationConstOp = + builder.create(builder.getUnknownLoc(), memType, Const::ContentAttr::get(dataAttr)); + + return configurationConstOp; +} + +mlir::Value UpdateEnqueueDMAInputAndOutput::getOrCreateRegisterBuffer(mlir::OpBuilder& builder, + mlir::Operation* bufferInsertionPoint, + mlir::MemRefType memType, uint32_t fifoAddr) { + std::pair key = {memType, fifoAddr}; + + auto it = _regBufferCache.find(key); + if (it != _regBufferCache.end()) { + return it->second; + } + + builder.setInsertionPoint(bufferInsertionPoint); + auto declBuf = builder.create(builder.getUnknownLoc(), memType, + VPURT::BufferSection::Register, fifoAddr); + + mlir::Value buffer = declBuf.getBuffer(); + _regBufferCache[key] = buffer; + return buffer; +} + +// For given enqueue DMA operation create a new enqueue DMA that will have correct input and output buffers and will +// replace original op +void UpdateEnqueueDMAInputAndOutput::updateInputAndOutput(mlir::OpBuilder& builder, VPUMI40XX::NNDMAOp enqueueDma, + VPURegMapped::TaskOpInterface taskOp, uint32_t fifoAddr, + mlir::Operation* bufferInsertionPoint, + mlir::Operation* cstInsertionPoint) { + auto ctx = builder.getContext(); + auto 
cmxTaskLocationBuf = mlir::cast(taskOp.getTaskLocation().getDefiningOp()); + + // ---------------------------------------- + // Step 1: Prepare DMA-specific Attributes + // ---------------------------------------- + + // Convert CMX offset to metadata-relative offset in 32-byte units. + // 15360 = VPU_METADATA_OFFSET = start of metadata region in CMX. + const auto descriptorOffsetInCMX = (cmxTaskLocationBuf.getOffset().value() + 15360) >> 5; + auto enqueueConstOp = createEnqueueConstant(builder, cstInsertionPoint, descriptorOffsetInCMX); + const auto constOutputType = mlir::cast(enqueueConstOp.getOutput().getType()); + + const auto layout = mlir::MemRefLayoutAttrInterface{}; + const auto regMemSpace = vpux::IndexedSymbolAttr::get(ctx, stringifyEnum(VPU::MemoryKind::Register)); + auto outputType = constOutputType.changeMemSpace(regMemSpace); + + // ---------------------------------------- + // Step 2: Declare Buffers for Register + // ---------------------------------------- + + builder.setInsertionPoint(bufferInsertionPoint); + auto memType = mlir::MemRefType::get(outputType.getShape().raw(), outputType.getElementType(), layout, + outputType.getMemSpace()); + mlir::Value dstBuffer = getOrCreateRegisterBuffer(builder, bufferInsertionPoint, memType, fifoAddr); + + // ---------------------------------------- + // Step 3: Create new DMA operation with proper input and output + // to replace original op that had dummy buffers + // ---------------------------------------- + mlir::IRMapping mapper; + mapper.map(enqueueDma.getInput(), enqueueConstOp.getOutput()); + mapper.map(enqueueDma.getOutputBuffs().front(), dstBuffer); + + builder.setInsertionPointAfter(enqueueDma); + auto newOp = builder.clone(*enqueueDma, mapper); + auto newEnqueueDma = mlir::cast(newOp); + + // TODO: Check if setting DMA descriptor explicitly is really needed + const auto dmaDescriptorAttr = VPUIP::DMADescriptorAttr::get(ctx, + /*numPlane*/ getIntAttr(ctx, 0), + /*len*/ getIntAttr(ctx, 4), + 
/*srcWidth*/ getIntAttr(ctx, 4), + /*srcStride*/ getIntAttr(ctx, 0), + /*srcPlaneStride*/ getIntAttr(ctx, 0), + /*dstWidth*/ getIntAttr(ctx, 4), + /*dstStride*/ getIntAttr(ctx, 0), + /*dstPlaneStride*/ getIntAttr(ctx, 0)); + + newEnqueueDma.setDmaDescriptorAttr(dmaDescriptorAttr); + + enqueueDma.replaceAllUsesWith(newEnqueueDma.getResult()); + enqueueDma.erase(); +} + +void UpdateEnqueueDMAInputAndOutput::safeRunOnFunc() { + auto netFunc = getOperation(); + auto mpi = VPUMI40XX::getMPI(netFunc); + + auto parentModule = netFunc.getOperation()->getParentOfType(); + const auto tilesCount = IE::getTileExecutor(parentModule).getCount(); + const auto shavesCountPerTile = IE::getAvailableExecutor(parentModule, VPU::ExecutorKind::SHAVE_ACT).getCount(); + + auto dmaTile0List0Head = mpi.getListHead(VPURegMapped::TaskType::DMA, 0, 0); + if (!dmaTile0List0Head) { + return; + } + + auto builder = mlir::OpBuilder(mpi.getOperation()); + + // Set insertion point where new buffers representing HW FIFO register address will be placed + auto bufferOps = netFunc.getOps(); + auto bufferInsertionPoint = !bufferOps.empty() ? *bufferOps.begin() : &netFunc.getBody().front().front(); + + // Set insertion point where new constant ops storing task descriptor location will be placed + auto declOps = netFunc.getOps(); + auto cstInsertionPoint = !declOps.empty() ? *declOps.begin() : &netFunc.getBody().front().front(); + + // Gather data about all enqueue DMAs which are always on DMA tile 0 list 0 (Port 0, channel DDR) + auto firstDmaTile0List0Op = dmaTile0List0Head.getDefiningOp(); + auto enqueueDmasPerHwQueue = VPUMI40XX::getEnqueueDmaData(firstDmaTile0List0Op, _log); + + const mlir::DenseSet> taskTypesWithListCountPerTile = { + {{VPURegMapped::TaskType::DPUVariant, 1}, + {VPURegMapped::TaskType::ActKernelInvocation, shavesCountPerTile}}}; + + // Iterate over DPU/SHV tasks on each tile and list and check if task is the head of the enqueued task by the + // enqueue DMA. 
If yes update input and output buffer of the enqueue DMA so that it will push task + // descriptor to HW FIFO register. + for (uint32_t tileIdx = 0; tileIdx < tilesCount; tileIdx++) { + for (const auto& [taskType, listCount] : taskTypesWithListCountPerTile) { + for (uint32_t listIdx = 0; listIdx < listCount; listIdx++) { + auto listHead = mpi.getListHead(taskType, tileIdx, listIdx); + if (!listHead) { + continue; + } + + _log.trace("Check enqueue DMAs for task type {0} on tile {1}, list {2}", taskType, tileIdx, listIdx); + auto taskOp = mlir::cast(listHead.getDefiningOp()); + + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + auto fifoAddr = getFifoAddr(taskType, tileIdx, listIdx, shavesCountPerTile); + + VPUX_THROW_WHEN(enqueueDmasPerHwQueue.find(hwQueue) == enqueueDmasPerHwQueue.end(), + "No Enqueue DMAs available for task type {0} on tile {1}, list {2}", taskType, tileIdx, + listIdx); + // Initial tasks must be enqueued by first enqueue DMA for given HW queue type + size_t curEnqueueIndex = 0; + // Get start and end task range enqueued by enqueue DMA to understand what range of tasks + // are processed by a single enqueue DMA + auto [enqueueDmaStartIdx, enqueueDmaEndIdx, enqueueDmaOp] = + enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex]; + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, enqueueDmaEndIdx); + + // Iterate over tasks in the list and check if enabling link to previous is possible + do { + auto taskInd = mlir::cast(taskOp.getResult().getType()).getValue(); + + // If current task index is greater than end index of current enqueue DMA it means that + // this task is enqueued by next enqueue DMA -> switch to next enqueue DMA + if (taskInd > enqueueDmaEndIdx) { + _log.trace("Task {0} is after end task {1} of current enqueue DMA. 
Move to next enqueue DMA op", + taskInd, enqueueDmaEndIdx); + // Move to next enqueue DMA and get start and end indexes + curEnqueueIndex++; + VPUX_THROW_UNLESS( + curEnqueueIndex < enqueueDmasPerHwQueue[hwQueue].size(), + "No enqueue DMAs available for task type {0} on tile {1}, list {2} at index {3}", + taskType, tileIdx, listIdx, curEnqueueIndex); + enqueueDmaStartIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].startTaskIdx; + enqueueDmaEndIdx = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].endTaskIdx; + enqueueDmaOp = enqueueDmasPerHwQueue[hwQueue][curEnqueueIndex].enqDmaOp; + + _log.trace("Enqueue DMA task range: {0} - {1}", enqueueDmaStartIdx, enqueueDmaEndIdx); + } + + // If task index is same as first task enqueued by a DMA then update enqueue DMA input and + // output buffers so that this DMA will push task descriptor to HW FIFO register + if (taskInd == enqueueDmaStartIdx) { + _log.trace("Update input and output data of enqueue DMA for task {0} ", taskInd); + updateInputAndOutput(builder, enqueueDmaOp, taskOp, fifoAddr, bufferInsertionPoint, + cstInsertionPoint); + } + + taskOp = taskOp.getNextTask(); + } while (taskOp); + } + } + } +} + +} // namespace + +// +// createUpdateEnqueueDMAInputAndOutput +// + +std::unique_ptr vpux::VPUMI40XX::createUpdateEnqueueDMAInputAndOutput(Logger log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/utils.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/utils.cpp index 324170cd5f..559276ae2f 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/utils.cpp @@ -221,9 +221,28 @@ const std::unordered_map taskBinarySize40XX = { // TODO: E#121934 Add method for VPURegMapped TaskType to be able to directly return its binary size in an // arch-specific way -size_t getTaskBinarySize(VPURegMapped::TaskType taskType, [[maybe_unused]] VPU::ArchKind arch) { +size_t getTaskBinarySize(VPURegMapped::TaskType taskType, [[maybe_unused]] 
config::ArchKind arch) { return taskBinarySize40XX.at(taskType); } +VPURegMapped::TaskType convertExecutorKindToExecutableTaskType(VPU::ExecutorKind kind) { + VPURegMapped::TaskType returnType; + switch (kind) { + case VPU::ExecutorKind::DMA_NN: + returnType = VPURegMapped::TaskType::DMA; + break; + case VPU::ExecutorKind::DPU: + returnType = VPURegMapped::TaskType::DPUVariant; + break; + case VPU::ExecutorKind::SHAVE_ACT: + returnType = VPURegMapped::TaskType::ActKernelInvocation; + break; + default: + VPUX_THROW("Unsupported executor kind {0}", kind); + } + + return returnType; +} + } // namespace VPUMI40XX } // namespace vpux diff --git a/src/vpux_compiler/src/dialect/VPUMI40XX/wlm_utils.cpp b/src/vpux_compiler/src/dialect/VPUMI40XX/wlm_utils.cpp index 65ab01b700..b059cf3a29 100644 --- a/src/vpux_compiler/src/dialect/VPUMI40XX/wlm_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPUMI40XX/wlm_utils.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/VPUMI40XX/wlm_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/dialect/VPUMI40XX/utils.hpp" namespace vpux { namespace VPUMI40XX { @@ -533,5 +535,37 @@ void logFetchOpsDetails(mlir::func::FuncOp netFunc, Logger log) { } } +// Check if there are any Enqueue DMAs present in the schedule and gather data +// about them +// Map: +// - key - HwQueueType - {taskType, tileIdx, listIdx} +// - value - vector of {startTaskIndex, endTaskIndex, dmaOp} +mlir::DenseMap> getEnqueueDmaData( + VPUMI40XX::NNDMAOp firstDmaTile0List0Op, Logger log) { + mlir::DenseMap> enqueueDmasPerHwQueue; + + // Iterate over all DMAs on tile0 list 0 (Port 0, Channel DDR) and check for EnqueueDma attribute + auto dmaTile0List0Task = firstDmaTile0List0Op; + do { + auto enqueueDmaAttr = dmaTile0List0Task.getEnqueueDmaAttr(); + if (enqueueDmaAttr.has_value()) { + auto taskType = VPUMI40XX::convertExecutorKindToExecutableTaskType( + enqueueDmaAttr.value().getTargetExecutorKindAttr().getValue()); + auto tileIdx = 
static_cast(enqueueDmaAttr.value().getTileIdx().getValue().getSExtValue()); + auto listIdx = static_cast(enqueueDmaAttr.value().getListIdx().getValue().getSExtValue()); + auto hwQueue = VPUMI40XX::HwQueueType{taskType, tileIdx, listIdx}; + + auto startTaskIdx = enqueueDmaAttr.value().getStartTaskIdx().getValue().getSExtValue(); + auto endTaskIdx = enqueueDmaAttr.value().getEndTaskIdx().getValue().getSExtValue(); + enqueueDmasPerHwQueue[hwQueue].push_back(EnqDmaInfo{startTaskIdx, endTaskIdx, dmaTile0List0Task}); + log.trace("Found Enqueue DMA for task type {0} on tile {1}, list {2} with task index range {3} - {4}", + taskType, tileIdx, listIdx, startTaskIdx, endTaskIdx); + } + dmaTile0List0Task = VPUMI40XX::getNextOp(dmaTile0List0Task); + } while (dmaTile0List0Task); + + return enqueueDmasPerHwQueue; +} + } // namespace VPUMI40XX } // namespace vpux diff --git a/src/vpux_compiler/src/dialect/VPURT/IR/dialect.cpp b/src/vpux_compiler/src/dialect/VPURT/IR/dialect.cpp index 13d4546d1f..81bfd41511 100644 --- a/src/vpux_compiler/src/dialect/VPURT/IR/dialect.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/IR/dialect.cpp @@ -8,6 +8,7 @@ #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include #include namespace { diff --git a/src/vpux_compiler/src/dialect/VPURT/IR/ops.cpp b/src/vpux_compiler/src/dialect/VPURT/IR/ops.cpp index 2dc1f23c37..320fad1e2b 100644 --- a/src/vpux_compiler/src/dialect/VPURT/IR/ops.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/IR/ops.cpp @@ -4,6 +4,7 @@ // #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPURT/IR/task.hpp" #include diff --git a/src/vpux_compiler/src/dialect/VPURT/IR/ops/barrier.cpp b/src/vpux_compiler/src/dialect/VPURT/IR/ops/barrier.cpp index 87c1374b6a..0cf6043451 100644 --- a/src/vpux_compiler/src/dialect/VPURT/IR/ops/barrier.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/IR/ops/barrier.cpp @@ 
-4,7 +4,7 @@ // #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" - +#include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/format.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/VPURT/IR/ops/buffer.cpp b/src/vpux_compiler/src/dialect/VPURT/IR/ops/buffer.cpp index f5218032f3..30380e4f73 100644 --- a/src/vpux_compiler/src/dialect/VPURT/IR/ops/buffer.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/IR/ops/buffer.cpp @@ -4,8 +4,8 @@ // #include "vpux/compiler/dialect/ELFNPU37XX/utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" - #include "vpux/compiler/utils/ELF/utils.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/error.hpp" diff --git a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/assign_physical_barriers.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/assign_physical_barriers.cpp index 6c3282e025..e5884da5b2 100644 --- a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/assign_physical_barriers.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/assign_physical_barriers.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/core/barrier_info.hpp" @@ -10,6 +11,8 @@ #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/barrier_simulator.hpp" #include "vpux/compiler/dialect/VPURT/utils/color_bin_barrier_assignment.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/options.hpp" #include #include @@ -121,7 +124,7 @@ void AssignPhysicalBarriersPass::safeRunOnFunc() { const auto numBarriers = numBarriersOpt.hasValue() ? 
numBarriersOpt.getValue() : VPUIP::getNumAvailableBarriers(func); - auto wlmFlag = vpux::VPUIP::getWlmStatus(module) == vpux::VPUIP::WlmStatus::ENABLED; + auto wlmFlag = VPU::getWorkloadManagementStatus(module) == VPU::WorkloadManagementStatus::ENABLED; const auto barrierColorBinFlag = colorBinEnableOpt.hasValue() ? static_cast(colorBinEnableOpt.getValue()) : _barrierColorBinFlag; @@ -142,19 +145,24 @@ void AssignPhysicalBarriersPass::safeRunOnFunc() { _log.trace("WLM flag turned off because number of barrier is above threshold {0} > {1}", numVirtualBarriers, virtualBarrierThresholdForWlm.value()); wlmFlag = false; - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); } if (!barrierSim.isDynamicBarriers()) { return; } - if (mlir::failed(barrierSim.checkProducerCount(_log.nest()))) { - signalPassFailure(); - return; - } - if (mlir::failed(barrierSim.checkProducerAndConsumerCount(_log.nest()))) { - signalPassFailure(); - return; + + if (wlmFlag == false || _workloadManagementMode < WorkloadManagementMode::PWLM_V2_PAGES) { + // No need to verify below for newer WLM modes as later pass - OptimizeBarriersSlotsUsage + // pass will take care of it + if (mlir::failed(barrierSim.checkProducerCount(_log.nest()))) { + signalPassFailure(); + return; + } + if (mlir::failed(barrierSim.checkProducerAndConsumerCount(_log.nest()))) { + signalPassFailure(); + return; + } } if (barrierColorBinFlag && numVirtualBarriers <= numBarriers) { @@ -170,7 +178,7 @@ void AssignPhysicalBarriersPass::safeRunOnFunc() { if (barrierColorBinFlag && numVirtualBarriers > numBarriers) { auto& barrierGraphInfo = getAnalysis(); - auto arch = VPU::getArch(func); + auto arch = config::getArch(func); VPURT::BarrierColorBin BarrierColorBinAssignment(numBarriers, arch, _log); // Apply color binning algorithm for physical barrier assignment diff --git 
a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/inference_execution_analysis.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/inference_execution_analysis.cpp index c2ad38cc5e..f7f07370fd 100644 --- a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/inference_execution_analysis.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/inference_execution_analysis.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/inference_execution_simulator.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/strings.hpp" @@ -149,7 +150,7 @@ class InferenceExecutionAnalysisPass final : void InferenceExecutionAnalysisPass::safeRunOnFunc() { auto funcOp = getOperation(); auto moduleOp = funcOp->getParentOfType(); - const auto arch = VPU::getArch(moduleOp); + const auto arch = config::getArch(moduleOp); auto maybeCostModelAnalysis = getCachedParentAnalysis(moduleOp); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); CycleCostInfo cycleCostInfo(std::move(costModel), funcOp); diff --git a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/insert_barrier_marking_end_of_descriptor_groups.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/insert_barrier_marking_end_of_descriptor_groups.cpp index 2962bc4498..d5b685a77c 100644 --- a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/insert_barrier_marking_end_of_descriptor_groups.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/insert_barrier_marking_end_of_descriptor_groups.cpp @@ -4,9 +4,11 @@ // #include "vpux/compiler/core/barrier_info.hpp" -#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include 
"vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/options.hpp" #include "vpux/compiler/utils/wlm_legalization_utils.hpp" namespace vpux::VPURT { @@ -50,8 +52,10 @@ class InsertBarrierToMarkTheEndOfDescriptorGroupPass final : InsertBarrierToMarkTheEndOfDescriptorGroupPass> { public: explicit InsertBarrierToMarkTheEndOfDescriptorGroupPass( - std::optional workloadManagementBarrierCountThreshold, Logger log) - : _workloadManagementBarrierCountThreshold(workloadManagementBarrierCountThreshold) { + std::optional workloadManagementBarrierCountThreshold, + std::optional workloadManagementMode, Logger log) + : _workloadManagementBarrierCountThreshold(workloadManagementBarrierCountThreshold), + _workloadManagementMode(workloadManagementMode) { Base::initLogger(log, Base::getArgumentName()); } @@ -67,6 +71,7 @@ class InsertBarrierToMarkTheEndOfDescriptorGroupPass final : private: std::optional _workloadManagementBarrierCountThreshold; + std::optional _workloadManagementMode; void safeRunOnFunc() final; }; @@ -268,14 +273,20 @@ void InsertBarrierToMarkTheEndOfDescriptorGroupPass::insertBarriersForQueue( void InsertBarrierToMarkTheEndOfDescriptorGroupPass::safeRunOnFunc() { auto netFunc = getOperation(); auto module = netFunc->getParentOfType(); - auto isWlmEnabled = (vpux::VPUIP::getWlmStatus(module) == vpux::VPUIP::WlmStatus::ENABLED) && - !isArchVPUX3XXX(VPU::getArch(module)); + auto isWlmEnabled = (VPU::getWorkloadManagementStatus(module) == VPU::WorkloadManagementStatus::ENABLED) && + !config::isArchVPUX3XXX(config::getArch(module)); if (!isWlmEnabled) { legalizeScheduleForNonWlm(netFunc); return; } + // createAddPlaceholderFetchDMAsPass inserts placeholder FetchDMAs + if (_workloadManagementMode.has_value() && + _workloadManagementMode.value() == WorkloadManagementMode::FWLM_V1_PAGES) { + return; + } + mlir::OpBuilder 
builder(netFunc); auto taskOps = netFunc.getOps(); @@ -290,7 +301,7 @@ void InsertBarrierToMarkTheEndOfDescriptorGroupPass::safeRunOnFunc() { numVirtualBarriers > _workloadManagementBarrierCountThreshold.value()) { _log.info("Skip WLM schedule legalization due to high number of barriers: {0}, threshold: {1}", numVirtualBarriers, _workloadManagementBarrierCountThreshold.value()); - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); legalizeScheduleForNonWlm(netFunc); return; } @@ -347,7 +358,8 @@ void InsertBarrierToMarkTheEndOfDescriptorGroupPass::safeRunOnFunc() { // std::unique_ptr vpux::VPURT::createInsertBarrierToMarkTheEndOfDescriptorGroupPass( - std::optional workloadManagementBarrierCountThreshold, Logger log) { + std::optional workloadManagementBarrierCountThreshold, + std::optional workloadManagementMode, Logger log) { return std::make_unique(workloadManagementBarrierCountThreshold, - log); + workloadManagementMode, log); } diff --git a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/reduce_exceeding_active_count_barriers.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/reduce_exceeding_active_count_barriers.cpp index d50a9fbe9e..981164e480 100644 --- a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/reduce_exceeding_active_count_barriers.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/reduce_exceeding_active_count_barriers.cpp @@ -5,10 +5,13 @@ #include "vpux/compiler/core/barrier_info.hpp" #include "vpux/compiler/core/execution_group_analysis.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/barrier_simulator.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" +#include "vpux/compiler/utils/options.hpp" namespace vpux::VPURT { #define GEN_PASS_DECL_REDUCEEXCEEDINGACTIVECOUNTBARRIERS @@ -272,7 +275,7 @@ void ReduceExceedingActiveCountBarriersPass::verifyFinalBarrier(mlir::func::Func void ReduceExceedingActiveCountBarriersPass::safeRunOnFunc() { auto func = getOperation(); auto module = func->getParentOfType(); - auto arch = VPU::getArch(module); + auto arch = config::getArch(module); const auto numBarriersToUse = numBarriers.hasValue() ? checked_cast(numBarriers.getValue()) : checked_cast(VPUIP::getNumAvailableBarriers(func)); @@ -288,7 +291,8 @@ void ReduceExceedingActiveCountBarriersPass::safeRunOnFunc() { VPUX_THROW_UNLESS(numBarriersToUse > 1, "Not possible to satisfy barrier requirement numBarriersToUse '{0}'", numBarriersToUse); - auto wlmFlag = (vpux::VPUIP::getWlmStatus(module) == vpux::VPUIP::WlmStatus::ENABLED) && !isArchVPUX3XXX(arch); + auto wlmFlag = (VPU::getWorkloadManagementStatus(module) == VPU::WorkloadManagementStatus::ENABLED) && + !config::isArchVPUX3XXX(arch); _shareWaitAndUpdateBarriers = VPURT::isShareWaitAndUpdateBarriersNeeded(_workloadManagementMode); auto shareWaitAndUpdateBarriers = shareWaitAndUpdateBarriersOpt.hasValue() @@ -312,7 +316,7 @@ void ReduceExceedingActiveCountBarriersPass::safeRunOnFunc() { _log.trace("WLM flag turned off because number of barrier is above threshold {0} > {1}", barrierInfo.getNumOfBarrierOps(), _virtualBarrierThresholdForWlm.value()); wlmFlag = false; - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); } if (wlmFlag) { diff --git a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/satisfy_one_wait_barrier_per_task.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/satisfy_one_wait_barrier_per_task.cpp index 2c8e1fa852..085fff1d78 100644 --- 
a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/satisfy_one_wait_barrier_per_task.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/satisfy_one_wait_barrier_per_task.cpp @@ -4,10 +4,12 @@ // #include "vpux/compiler/core/barrier_info.hpp" +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/dialect/VPURT/interfaces/barrier_simulator.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/utils/options.hpp" #include @@ -47,7 +49,7 @@ void SatisfyOneWaitBarrierPerTaskPass::safeRunOnFunc() { barrierInfo.enableUnevenVariantSplit(); } - auto wlmFlag = vpux::VPUIP::getWlmStatus(module) == vpux::VPUIP::WlmStatus::ENABLED; + auto wlmFlag = VPU::getWorkloadManagementStatus(module) == VPU::WorkloadManagementStatus::ENABLED; // In case of WLM all tasks need to be driven by single barrier as this is one of the constraints // to make each schedule feasible for WLM enabling @@ -58,7 +60,7 @@ void SatisfyOneWaitBarrierPerTaskPass::safeRunOnFunc() { _log.trace("WLM flag turned off because number of barrier is above threshold {0} > {1}", barrierInfo.getNumOfBarrierOps(), _virtualBarrierThresholdForWlm.value()); wlmFlag = false; - vpux::VPUIP::setWlmStatus(module, vpux::VPUIP::WlmStatus::FAILED); + VPU::setWorkloadManagementStatus(module, VPU::WorkloadManagementStatus::FAILED); } const auto maxAvailableSlots = maxVariantCount.hasValue() ? 
checked_cast(maxVariantCount.getValue()) diff --git a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/simplify_schedule.cpp b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/simplify_schedule.cpp index 133cdda4a5..00c5b48b41 100644 --- a/src/vpux_compiler/src/dialect/VPURT/transforms/passes/simplify_schedule.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/transforms/passes/simplify_schedule.cpp @@ -10,6 +10,7 @@ #include "vpux/compiler/dialect/VPURT/interfaces/inference_execution_simulator.hpp" #include "vpux/compiler/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/utils/barrier_legalization_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include @@ -334,7 +335,7 @@ void SimplifySchedulePass::safeRunOnFunc() { auto& barrierInfo = getAnalysis(); auto module = funcOp->getParentOfType(); - const auto arch = VPU::getArch(module); + const auto arch = config::getArch(module); auto maybeCostModelAnalysis = getCachedParentAnalysis(module); auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); CycleCostInfo cycleCostInfo(std::move(costModel), funcOp); diff --git a/src/vpux_compiler/src/dialect/VPURT/utils/barrier_legalization_utils.cpp b/src/vpux_compiler/src/dialect/VPURT/utils/barrier_legalization_utils.cpp index eb7ba48ec5..62357b499e 100644 --- a/src/vpux_compiler/src/dialect/VPURT/utils/barrier_legalization_utils.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/utils/barrier_legalization_utils.cpp @@ -200,10 +200,17 @@ void VPURT::orderExecutionTasksAndBarriers(mlir::func::FuncOp funcOp, BarrierInf }; // simulate per FIFO execution - all FIFOs must reach end + SmallVector lastReadyOps; while (!VPURT::allQueuesReachedEnd(frontTasks, taskOpQueues)) { const auto readyOps = VPURT::findReadyOpsFromTaskOpQueues(frontTasks, taskOpQueues, barrierInfo); - // at each step there must be some ready ops - VPUX_THROW_WHEN(readyOps.empty(), "Failed to simulate execution"); + + // If 
simulation fails (no ready ops), dump previous + if (readyOps.empty()) { + if (!lastReadyOps.empty()) { + log.error("Last readyOps before failure: {0}", lastReadyOps); + } + VPUX_THROW("Failed to simulate execution. Possible deadlock or barrier misconfiguration"); + } for (auto& readyOp : readyOps) { log.trace("Task '{0}' is ready", readyOp); @@ -211,6 +218,7 @@ void VPURT::orderExecutionTasksAndBarriers(mlir::func::FuncOp funcOp, BarrierInf newTaskOpOrder.push_back(readyOp); removeBarrierProducer(readyOp); } + lastReadyOps = std::move(readyOps); } // ensure number of tasks remains the same diff --git a/src/vpux_compiler/src/dialect/VPURT/utils/color_bin_barrier_assignment.cpp b/src/vpux_compiler/src/dialect/VPURT/utils/color_bin_barrier_assignment.cpp index f0c821ad93..a8f3a0d5f4 100644 --- a/src/vpux_compiler/src/dialect/VPURT/utils/color_bin_barrier_assignment.cpp +++ b/src/vpux_compiler/src/dialect/VPURT/utils/color_bin_barrier_assignment.cpp @@ -18,11 +18,11 @@ constexpr double THRESHOLD_FOR_MIN_BARRIER_BIN = 20; // Targets which need to delay the reuse of physical barriers, since on those hardware platform, the runtime has to // reprogram the barrier count before the next reuse -const std::set compatibleTargets = {VPU::ArchKind::NPU37XX}; +const std::set compatibleTargets = {config::ArchKind::NPU37XX}; namespace { -size_t getBarrierGracePeriod(VPU::ArchKind arch) { +size_t getBarrierGracePeriod(config::ArchKind arch) { if (compatibleTargets.find(arch) != compatibleTargets.end()) { return BARRIER_GRACE_PERIOD; } @@ -30,7 +30,7 @@ size_t getBarrierGracePeriod(VPU::ArchKind arch) { } } // namespace -VPURT::BarrierColorBin::BarrierColorBin(size_t numBarriers, VPU::ArchKind arch, Logger log) +VPURT::BarrierColorBin::BarrierColorBin(size_t numBarriers, config::ArchKind arch, Logger log) : _numBarriers(numBarriers), _log(log) { _gracePeriod = getBarrierGracePeriod(arch); } diff --git a/src/vpux_compiler/src/dialect/VPURegMapped/ops/task_buffer_layout.cpp 
b/src/vpux_compiler/src/dialect/VPURegMapped/ops/task_buffer_layout.cpp index 17294aa9b3..14389dc2ed 100644 --- a/src/vpux_compiler/src/dialect/VPURegMapped/ops/task_buffer_layout.cpp +++ b/src/vpux_compiler/src/dialect/VPURegMapped/ops/task_buffer_layout.cpp @@ -6,6 +6,8 @@ #include #include +#include + using namespace vpux; using namespace VPURegMapped; diff --git a/src/vpux_compiler/src/dialect/VPURegMapped/passes/deduce_dynamic_mapped_inference_version.cpp b/src/vpux_compiler/src/dialect/VPURegMapped/passes/deduce_dynamic_mapped_inference_version.cpp index 3b070295df..e771261757 100644 --- a/src/vpux_compiler/src/dialect/VPURegMapped/passes/deduce_dynamic_mapped_inference_version.cpp +++ b/src/vpux_compiler/src/dialect/VPURegMapped/passes/deduce_dynamic_mapped_inference_version.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/dialect/VPURegMapped/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPURegMapped/passes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" namespace vpux::VPURegMapped { @@ -54,7 +55,7 @@ void DeduceDynamicMappedInferenceVersion::safeRunOnModule() { return maxVersion; }; - const auto newVersion = getVersionFromOps(elfMain); + auto newVersion = getVersionFromOps(elfMain); auto setNewVersion = [&](ELF::MainOp main) -> void { for (auto dataSection : main.getOps()) { diff --git a/src/vpux_compiler/src/dialect/config/IR/attributes.cpp b/src/vpux_compiler/src/dialect/config/IR/attributes.cpp index f0774b03ed..164ff4b153 100644 --- a/src/vpux_compiler/src/dialect/config/IR/attributes.cpp +++ b/src/vpux_compiler/src/dialect/config/IR/attributes.cpp @@ -4,8 +4,13 @@ // #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/config/IR/dialect.hpp" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/platform_resources.hpp" #include "vpux/utils/core/string_ref.hpp" #include @@ -37,6 +42,17 @@ void config::ConfigDialect::registerAttributes() { // namespace { +// +// Run-time resources +// + +constexpr llvm::StringLiteral archAttrName = "config.arch"; +constexpr Byte DDR_HEAP_SIZE = 64000_MB; + +constexpr llvm::StringLiteral derateFactorAttrName = "config.derateFactor"; +constexpr llvm::StringLiteral bandwidthAttrName = "config.bandwidth"; /*!< This attribute corresponds to a single JSON + field nested at header>resources>memory_bandwidth>number in the deserialized version of the blob. + */ constexpr StringLiteral compilationModeAttrName = "config.compilationMode"; @@ -63,3 +79,268 @@ config::CompilationMode vpux::config::getCompilationMode(mlir::Operation* op) { // Use DefaultHW as a default mode return config::CompilationMode::DefaultHW; } + +StringLiteral vpux::config::getMemoryDerateAttrName() { + return derateFactorAttrName; +} + +StringLiteral vpux::config::getMemoryBandwidthAttrName() { + return bandwidthAttrName; +} + +// +// ArchKind +// + +namespace { + +struct Resources { + int numOfDPUGroups = 1; + std::optional numOfDMAPorts = std::nullopt; + std::optional availableCMXMemory = std::nullopt; + + Resources(int numOfDPUGroups, std::optional numOfDMAPorts, std::optional availableCMXMemory) + : numOfDPUGroups(numOfDPUGroups), numOfDMAPorts(numOfDMAPorts), availableCMXMemory(availableCMXMemory) { + } +}; + +struct SetResoursesFuncs { + using AddExecutorFuncType = FuncRef; + using AddTileExecutorFuncType = FuncRef; + using AddSubExecutorFuncType = FuncRef; + using AddMemoryFuncType = FuncRef; + using AddMemoryWithAttrsFuncType = FuncRef; + using AddInnerMemoryFuncType = FuncRef; + using AddInnerMemoryWithAttrsFuncType = + FuncRef; + + AddExecutorFuncType addExecutor; + AddTileExecutorFuncType 
addTileExecutor; + AddSubExecutorFuncType addSubExecutor; + AddMemoryFuncType addMemory; + AddMemoryWithAttrsFuncType addMemoryWithAttrs; + AddInnerMemoryFuncType addInnerMemory; + AddInnerMemoryWithAttrsFuncType addInnerMemoryWithAttrs; + + SetResoursesFuncs(AddExecutorFuncType addExecutor, AddTileExecutorFuncType addTileExecutor, + AddSubExecutorFuncType addSubExecutor, AddMemoryFuncType addMemory, + AddMemoryWithAttrsFuncType addMemoryWithAttrs, AddInnerMemoryFuncType addInnerMemory, + AddInnerMemoryWithAttrsFuncType addInnerMemoryWithAttrs) + : addExecutor(addExecutor), + addTileExecutor(addTileExecutor), + addSubExecutor(addSubExecutor), + addMemory(addMemory), + addMemoryWithAttrs(addMemoryWithAttrs), + addInnerMemory(addInnerMemory), + addInnerMemoryWithAttrs(addInnerMemoryWithAttrs) { + } +}; + +void setArch(mlir::ModuleOp module, config::ArchKind kind, const Resources& res, const SetResoursesFuncs& funcs, + bool allowCustom) { + VPUX_THROW_WHEN(!allowCustom && module->hasAttr(archAttrName), + "Architecture is already defined, probably you run '--init-compiler' twice"); + + if (!module->hasAttr(archAttrName)) { + module->setAttr(archAttrName, config::ArchKindAttr::get(module.getContext(), kind)); + } + + auto numOfDPUGroups = res.numOfDPUGroups; + auto numOfDMAPorts = res.numOfDMAPorts; + auto availableCMXMemory = res.availableCMXMemory; + + const auto getNumOfDMAPortsVal = [&](int maxDmaPorts) { + int numOfDMAPortsVal = numOfDMAPorts.has_value() ? 
numOfDMAPorts.value() : maxDmaPorts; + return numOfDMAPortsVal; + }; + + IE::TileResourceOp nceCluster; + + const auto ddrSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), stringifyEnum(VPU::MemoryKind::DDR)); + const auto cmxSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); + const auto cmxFragAwareSymbolAttr = mlir::SymbolRefAttr::get(module.getContext(), VPU::CMX_NN_FragmentationAware); + + switch (kind) { + case config::ArchKind::NPU37XX: { + const auto workspaceCMXSize = + availableCMXMemory.has_value() ? availableCMXMemory.value() : VPUX37XX_CMX_WORKSPACE_SIZE; + const auto workspaceFragmentationAwareSize = + availableCMXMemory.has_value() + ? Byte(static_cast(availableCMXMemory.value().count()) * FRAGMENTATION_AVOID_RATIO) + : VPUX37XX_CMX_WORKSPACE_FRAGMENTATION_AWARE_SIZE; + + funcs.addMemoryWithAttrs(ddrSymbolAttr, DDR_HEAP_SIZE, 0.6, 8); + + // Have NN_DMA as shared resource across clusters + funcs.addExecutor(VPU::ExecutorKind::DMA_NN, getNumOfDMAPortsVal(VPUX37XX_MAX_DMA_PORTS)); + nceCluster = funcs.addTileExecutor(numOfDPUGroups); + funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::DPU, 1); + funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_NN, 1); + funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_ACT, VPUX37XX_MAX_SHAVES_PER_TILE); + funcs.addInnerMemoryWithAttrs(nceCluster, cmxSymbolAttr, workspaceCMXSize, 1.0, 32); + funcs.addInnerMemory(nceCluster, cmxFragAwareSymbolAttr, workspaceFragmentationAwareSize); + + break; + } + case config::ArchKind::NPU40XX: { + const auto workspaceCMXSize = + availableCMXMemory.has_value() ? availableCMXMemory.value() : VPUX40XX_CMX_WORKSPACE_SIZE; + const auto workspaceFragmentationAwareSize = + availableCMXMemory.has_value() + ? 
Byte(static_cast(availableCMXMemory.value().count()) * FRAGMENTATION_AVOID_RATIO) + : VPUX40XX_CMX_WORKSPACE_FRAGMENTATION_AWARE_SIZE; + + funcs.addMemoryWithAttrs(ddrSymbolAttr, DDR_HEAP_SIZE, 0.6, 64); + + // Have NN_DMA as shared resource across clusters + auto numClusters = numOfDPUGroups; + funcs.addExecutor(VPU::ExecutorKind::DMA_NN, + getNumOfDMAPortsVal(std::min(numClusters, VPUX40XX_MAX_DMA_PORTS))); + funcs.addExecutor(VPU::ExecutorKind::M2I, 1); + nceCluster = funcs.addTileExecutor(numClusters); + funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::DPU, 1); + funcs.addSubExecutor(nceCluster, VPU::ExecutorKind::SHAVE_ACT, VPUX40XX_MAX_SHAVES_PER_TILE); + funcs.addInnerMemoryWithAttrs(nceCluster, cmxSymbolAttr, workspaceCMXSize, 1.0, 64); + funcs.addInnerMemory(nceCluster, cmxFragAwareSymbolAttr, workspaceFragmentationAwareSize); + + break; + } + default: + VPUX_THROW("Unsupported architecture '{0}'", kind); + } + + VPUX_THROW_WHEN(!allowCustom && nceCluster.hasProcessorFrequency(), + "Processor frequencyis already defined, probably you run '--init-compiler' twice"); +} +} // namespace + +void vpux::config::setArch(mlir::ModuleOp module, config::ArchKind kind, int numOfDPUGroups, + std::optional numOfDMAPorts, std::optional availableCMXMemory, + bool allowCustomValues) { + const auto addExecutor = [&](VPU::ExecutorKind kind, size_t count) { + VPUX_THROW_WHEN(!allowCustomValues && IE::hasExecutor(module, kind), + "Available executor kind '{0}' was already added", kind); + if (IE::hasExecutor(module, kind)) { + return IE::getAvailableExecutor(module, kind); + } + + return IE::addAvailableExecutor(module, kind, count); + }; + + const auto addTileExecutor = [&](size_t count) { + VPUX_THROW_WHEN(!allowCustomValues && IE::hasTileExecutor(module), "Available tile executor was already added"); + if (IE::hasTileExecutor(module)) { + return IE::getTileExecutor(module); + } + + return IE::addTileExecutor(module, count); + }; + + const auto addSubExecutor = 
[&](IE::TileResourceOp tileResOp, VPU::ExecutorKind kind, size_t count) { + VPUX_THROW_WHEN(!allowCustomValues && tileResOp.hasSubExecutor(kind), + "Available executor kind '{0}' was already added", kind); + if (tileResOp.hasSubExecutor(kind)) { + return tileResOp.getSubExecutor(kind); + } + + return tileResOp.addSubExecutor(kind, count); + }; + + const auto addAvailableMemory = [&](mlir::SymbolRefAttr memSpace, Byte size) { + VPUX_THROW_WHEN(!allowCustomValues && IE::hasAvailableMemory(module, memSpace), + "Available memory kind '{0}' was already added", memSpace); + if (IE::hasAvailableMemory(module, memSpace)) { + return IE::getAvailableMemory(module, memSpace); + } + + return IE::addAvailableMemory(module, memSpace, size); + }; + + const auto addMemWithAttrs = [&](mlir::SymbolRefAttr memSpace, Byte size, double derateFactor, size_t bandwidth) { + auto mem = addAvailableMemory(memSpace, size); + if (!mem->hasAttr(derateFactorAttrName)) { + mem->setAttr(derateFactorAttrName, getFPAttr(module.getContext(), derateFactor)); + } + + if (!mem->hasAttr(bandwidthAttrName)) { + mem->setAttr(bandwidthAttrName, getIntAttr(module.getContext(), bandwidth)); + } + }; + + const auto addInnerAvailableMemory = [&](IE::TileResourceOp tileResOp, mlir::SymbolRefAttr memSpace, Byte size) { + VPUX_THROW_WHEN(!allowCustomValues && tileResOp.hasAvailableMemory(memSpace), + "Available memory kind '{0}' was already added", memSpace); + if (tileResOp.hasAvailableMemory(memSpace)) { + return tileResOp.getAvailableMemory(memSpace); + } + + return tileResOp.addAvailableMemory(memSpace, size); + }; + + const auto addInnerAvailableMemoryWithAttrs = [&](IE::TileResourceOp tileResOp, mlir::SymbolRefAttr memSpace, + Byte size, double derateFactor, size_t bandwidth) { + auto mem = addInnerAvailableMemory(tileResOp, memSpace, size); + if (!mem->hasAttr(derateFactorAttrName)) { + mem->setAttr(derateFactorAttrName, getFPAttr(module.getContext(), derateFactor)); + } + + if 
(!mem->hasAttr(bandwidthAttrName)) { + mem->setAttr(bandwidthAttrName, getIntAttr(module.getContext(), bandwidth)); + } + }; + + ::Resources res(numOfDPUGroups, numOfDMAPorts, availableCMXMemory); + ::SetResoursesFuncs funcs(addExecutor, addTileExecutor, addSubExecutor, addAvailableMemory, addMemWithAttrs, + addInnerAvailableMemory, addInnerAvailableMemoryWithAttrs); + + return ::setArch(module, kind, res, funcs, allowCustomValues); +} + +config::ArchKind vpux::config::getArch(mlir::Operation* op) { + auto module = getModuleOp(op); + + if (auto attr = module->getAttr(archAttrName)) { + VPUX_THROW_UNLESS(mlir::isa(attr), + "Module attribute '{0}' has unsupported value '{1}'", archAttrName, attr); + return mlir::cast(attr).getValue(); + } + + return config::ArchKind::UNKNOWN; +} + +bool vpux::config::isArchVPUX3XXX(config::ArchKind arch) { + return (arch == config::ArchKind::NPU37XX); +} + +// +// RevisionID +// + +namespace { + +constexpr StringLiteral revisionIDAttrName = "config.revisionID"; + +} // namespace + +void vpux::config::setRevisionID(mlir::ModuleOp module, RevisionID revisionID) { + module->setAttr(revisionIDAttrName, config::RevisionIDAttr::get(module.getContext(), revisionID)); +} + +bool vpux::config::hasRevisionID(mlir::ModuleOp module) { + return module->hasAttr(revisionIDAttrName); +} + +config::RevisionID vpux::config::getRevisionID(mlir::Operation* op) { + auto module = getModuleOp(op); + + if (module->hasAttr(revisionIDAttrName)) { + if (auto attr = module->getAttr(revisionIDAttrName)) { + VPUX_THROW_UNLESS(mlir::isa(attr), + "Module attribute '{0}' has unsupported value '{1}'", revisionIDAttrName, attr); + + return mlir::cast(attr).getValue(); + } + } + + return config::RevisionID::REVISION_NONE; +} diff --git a/src/vpux_compiler/src/dialect/const/attributes/convert_elem_type.cpp b/src/vpux_compiler/src/dialect/const/attributes/convert_elem_type.cpp index 404eb1dc83..360e93bbe9 100644 --- 
a/src/vpux_compiler/src/dialect/const/attributes/convert_elem_type.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/convert_elem_type.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/utils/transformations.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/convert_utils.hpp" #include "vpux/compiler/utils/stable_hash.hpp" #include "vpux/utils/core/format.hpp" diff --git a/src/vpux_compiler/src/dialect/const/attributes/dequantize.cpp b/src/vpux_compiler/src/dialect/const/attributes/dequantize.cpp index 188adc5f6d..c146c73bd9 100644 --- a/src/vpux_compiler/src/dialect/const/attributes/dequantize.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/dequantize.cpp @@ -46,25 +46,30 @@ Const::Content vpux::Const::DequantizeAttr::transform(vpux::Const::Content& inpu auto output = Const::Content::allocTempBuffer(inferOutputType(input.getType()), mlir::Float32Type::get(getContext()), inferOutputSplat(input.isSplat(), input.getType())); - const auto qVals = input.getValues(); auto realVals = output.getTempBuf(); if (const auto uniformType = mlir::dyn_cast(qElemType)) { const auto scale = uniformType.getScale(); const auto zeroPoint = uniformType.getZeroPoint(); - if (const auto quantileUniformType = mlir::dyn_cast(qElemType)) { - const auto quantilesLUT = quantileUniformType.getQuantiles(); - for (size_t i = 0; i < realVals.size(); ++i) { - realVals[i] = dequantizeDouble(quantilesLUT[qVals[i]], scale, zeroPoint); + input.read([&](auto qVals) { + if (const auto quantileUniformType = mlir::dyn_cast(qElemType)) { + const auto quantilesLUT = quantileUniformType.getQuantiles(); + for (size_t i = 0; i < realVals.size(); ++i) { + const auto qVal = checked_cast(qVals[i]); + realVals[i] = dequantizeDouble(quantilesLUT[qVal], scale, zeroPoint); + } + } else { + for (size_t i = 0; i < realVals.size(); ++i) { + const 
auto qVal = checked_cast(qVals[i]); + realVals[i] = dequantize(qVal, scale, zeroPoint); + } } - } else { - for (size_t i = 0; i < realVals.size(); ++i) { - realVals[i] = dequantize(qVals[i], scale, zeroPoint); - } - } + }); } else if (const auto uniformPerAxisType = mlir::dyn_cast(qElemType)) { + const auto qVals = input.getValues(); + const auto scales = uniformPerAxisType.getScales(); const auto zeroPoints = uniformPerAxisType.getZeroPoints(); const auto axis = Dim(uniformPerAxisType.getQuantizedDimension()); diff --git a/src/vpux_compiler/src/dialect/const/attributes/quantize.cpp b/src/vpux_compiler/src/dialect/const/attributes/quantize.cpp index ae165f2104..342b6aceab 100644 --- a/src/vpux_compiler/src/dialect/const/attributes/quantize.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/quantize.cpp @@ -108,7 +108,6 @@ template Const::Content transformImpl(mlir::quant::QuantizedType qElemType, mlir::Type outType, mlir::MLIRContext* ctx, vpux::Const::Content& input) { auto output = allocateTempBuffer(qElemType, outType, input.isSplat()); - const auto realVals = input.getValues(); auto qVals = output.getTempBuf(); if (const auto uniformType = mlir::dyn_cast(qElemType)) { @@ -119,10 +118,15 @@ Const::Content transformImpl(mlir::quant::QuantizedType qElemType, mlir::Type ou // qVals.size is 1 when the input is splat, while realVals.size can be greater than 1 // realVals must contain same element at every index when the input is splat. // Use qVals.size to terminate the loop early in this scenario. 
- for (size_t i = 0; i < qVals.size(); ++i) { - qVals[i] = static_cast(quantizer(realVals[i])); - } + input.read([&](auto realVals) { + for (size_t i = 0; i < qVals.size(); ++i) { + const auto realVal = checked_cast(realVals[i]); + qVals[i] = static_cast(quantizer(realVal)); + } + }); } else if (const auto uniformType = mlir::dyn_cast(qElemType)) { + const auto realVals = input.getValues(); + const auto scales = uniformType.getScales(); const auto zeroPoints = uniformType.getZeroPoints(); const auto axis = Dim(uniformType.getQuantizedDimension()); diff --git a/src/vpux_compiler/src/dialect/const/attributes/reverse.cpp b/src/vpux_compiler/src/dialect/const/attributes/reverse.cpp index 816ede8df7..95bcc6cbb2 100644 --- a/src/vpux_compiler/src/dialect/const/attributes/reverse.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/reverse.cpp @@ -9,7 +9,7 @@ #include "vpux/compiler/utils/quantization.hpp" #include "vpux/compiler/utils/types.hpp" -#include +#include using namespace vpux; @@ -56,14 +56,10 @@ bool vpux::Const::ReverseAttr::inferOutputSplat(bool inputIsSplat, vpux::NDTypeI return inputIsSplat; } -template -Const::Content reverseImpl(Const::Content& input, NDTypeInterface outputType, int64_t axis) { - const bool nothingToDo = input.isSplat(); - if (nothingToDo) { - return Const::Content::moveBuffer(outputType, std::move(input)); - } +template +Const::Content reverseImpl(ArrayRef inputValues, NDTypeInterface inputType, int64_t axis) { + assert(inputValues.size() > 1 && "Splat case is handled outside of this function"); - const auto inputType = input.getType(); auto inputShape = ShapeRef(inputType.getShape()); const auto inputRank = inputType.getRank(); VPUX_THROW_UNLESS(axis >= 0 && axis < inputRank - 1, @@ -74,14 +70,16 @@ Const::Content reverseImpl(Const::Content& input, NDTypeInterface outputType, in spatialDims *= inputShape[Dim(axisIt)]; } - auto output = - Const::Content::allocTempBuffer(outputType, outputType.getElementType(), - 
Const::ReverseAttr::inferOutputSplat(input.isSplat(), input.getType())); + const auto outputType = inputType; // Note: in reverse, input type == output type + auto output = Const::Content::allocTempBuffer(outputType, outputType.getElementType(), + Const::ReverseAttr::inferOutputSplat(false, inputType)); auto outBuf = output.getTempBuf(); - const auto inputValues = input.getValues(); - std::copy(inputValues.begin(), inputValues.end(), outBuf.begin()); - + std::transform(inputValues.begin(), inputValues.end(), outBuf.begin(), [](InputType x) { + // E#160869: use CvtHelper here because checked_cast<> cannot properly + // resolve non-standard floating types and "noop" conversion case. + return Const::details::CvtHelper::cvt(x); + }); for (auto it = outBuf.begin(); it < outBuf.end(); it += spatialDims) { std::reverse(it, it + spatialDims); } @@ -96,25 +94,25 @@ Const::Content reverseImpl(Const::Content& input, NDTypeInterface outputType, in Const::Content vpux::Const::ReverseAttr::transform(vpux::Const::Content& input) const { auto inputType = input.getType(); auto inputElementType = inputType.getElementType(); - auto outputType = inferOutputType(input.getType()); + assert(inferOutputType(inputType) == inputType && "reverse transformation cannot change the type"); const auto axis = getAxis().getInt(); if (auto qtype = mlir::dyn_cast_or_null(inputElementType)) { inputElementType = normalizeQuantStorageType(qtype); } - if (inputElementType.isSignedInteger(8)) { - return reverseImpl(input, outputType, axis); - } else if (inputElementType.isUnsignedInteger(8)) { - return reverseImpl(input, outputType, axis); - } else if (inputElementType.isF16()) { - return reverseImpl(input, outputType, axis); - } else if (inputElementType.isBF16()) { - return reverseImpl(input, outputType, axis); - } else if (inputElementType.isF32()) { - return reverseImpl(input, outputType, axis); + VPUX_THROW_UNLESS(inputElementType.isSignedInteger(8) || inputElementType.isUnsignedInteger(8) || + 
inputElementType.isF16() || inputElementType.isBF16() || inputElementType.isF32(), + "Unexpected data type: {0}", inputElementType); + + if (bool nothingToDo = input.isSplat(); nothingToDo) { + return Const::Content::moveBuffer(inputType, std::move(input)); } - VPUX_THROW("Unexpected data type: {0}", inputElementType); + // Note: reverse could happen after CastElemType and in this case we must + // perform explicit type conversion - dispatch by input element type. + return input.read(inputElementType, [&](auto inputValues, auto dummy) { + return reverseImpl(inputValues, inputType, axis); + }); } // diff --git a/src/vpux_compiler/src/dialect/const/attributes/sparsify.cpp b/src/vpux_compiler/src/dialect/const/attributes/sparsify.cpp index 622c1eadc6..3f3a1f1bc6 100644 --- a/src/vpux_compiler/src/dialect/const/attributes/sparsify.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/sparsify.cpp @@ -1,17 +1,15 @@ // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/utils/sparsity.hpp" - #include "vpux/utils/core/numeric.hpp" #include -#include - using namespace vpux; // diff --git a/src/vpux_compiler/src/dialect/const/attributes/swizzle_transform.cpp b/src/vpux_compiler/src/dialect/const/attributes/swizzle_transform.cpp index e0f52a63ab..b89fb9d599 100644 --- a/src/vpux_compiler/src/dialect/const/attributes/swizzle_transform.cpp +++ b/src/vpux_compiler/src/dialect/const/attributes/swizzle_transform.cpp @@ -19,7 +19,7 @@ using namespace vpux::BufferTransform; // // vpux::BufferTransform::BufferSwizzleTransform // -BufferSwizzleTransform::BufferSwizzleTransform(uint32_t swizzleKey, VPU::ArchKind archKind) +BufferSwizzleTransform::BufferSwizzleTransform(uint32_t swizzleKey, config::ArchKind 
archKind) : _addressTransform(swizzleKey, archKind) { } @@ -42,12 +42,12 @@ void AddressTransform::setStaggerBits(uint32_t bits) { _shift = LOG2_RAM_CUT_BYTES - _staggerAddressBits; switch (_archKind) { - case VPU::ArchKind::NPU40XX: // NPU40XX - NN CMX ram cut data width = 32B + case config::ArchKind::NPU40XX: // NPU40XX - NN CMX ram cut data width = 32B _shift++; _log2RamCutDataWidth++; _ramCutAddressMask = (1u << (LOG2_RAM_CUT_BYTES + 1)) - 1u; break; - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: break; default: VPUX_THROW("Unsuported ArchKind {0}", _archKind); @@ -128,7 +128,7 @@ mlir::Attribute vpux::Const::SwizzleConstantAttr::parse(mlir::AsmParser& parser, vpux::NDTypeInterface vpux::Const::SwizzleConstantAttr::inferOutputType(vpux::NDTypeInterface inputType) const { const uint32_t arch = static_cast(*getArch().getValue().getRawData()); - VPU::ArchKind archKind = static_cast(arch); + config::ArchKind archKind = static_cast(arch); const auto newSize = alignSizeForSwizzling(inputType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind)); // Create a flat type with aligned size based on HW requirements @@ -164,7 +164,7 @@ Const::Content vpux::Const::SwizzleConstantAttr::transform(vpux::Const::Content& const uint32_t swizzleKey = checked_cast(*getSwizzleKey().getValue().getRawData()); const uint32_t dataWidth = checked_cast(input.getType().getElemTypeSize().count()); const uint32_t arch = static_cast(getArch().getValue().getSExtValue()); - VPU::ArchKind archKind = static_cast(arch); + config::ArchKind archKind = static_cast(arch); auto outputType = inferOutputType(input.getType()); BufferSwizzleTransform bufferSwizzleTransform{swizzleKey, archKind}; diff --git a/src/vpux_compiler/src/dialect/const/ops.cpp b/src/vpux_compiler/src/dialect/const/ops.cpp index c297d8877c..c5d108e5d3 100644 --- a/src/vpux_compiler/src/dialect/const/ops.cpp +++ b/src/vpux_compiler/src/dialect/const/ops.cpp @@ -8,9 +8,8 @@ #include 
"vpux/compiler/dialect/ELFNPU37XX/utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" - +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" -#include "vpux/compiler/utils/passes.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include diff --git a/src/vpux_compiler/src/dialect/const/passes/apply_swizzling.cpp b/src/vpux_compiler/src/dialect/const/passes/apply_swizzling.cpp index efad536d15..5c5af199ec 100644 --- a/src/vpux_compiler/src/dialect/const/passes/apply_swizzling.cpp +++ b/src/vpux_compiler/src/dialect/const/passes/apply_swizzling.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/passes.hpp" @@ -48,7 +49,7 @@ void ApplySwizzlingPass::safeRunOnFunc() { auto newContentAttr = constOp.getContentAttr() .transform() - .swizzleConstant(getSwizzlingKey(constType), static_cast(VPU::getArch(module))) + .swizzleConstant(getSwizzlingKey(constType), static_cast(config::getArch(module))) .get(); mlir::OpBuilder builder(constOp); auto newConstOp = diff --git a/src/vpux_compiler/src/dialect/const/utils/affine_reshape.cpp b/src/vpux_compiler/src/dialect/const/utils/affine_reshape.cpp index 8cf110493f..760a9f0423 100644 --- a/src/vpux_compiler/src/dialect/const/utils/affine_reshape.cpp +++ b/src/vpux_compiler/src/dialect/const/utils/affine_reshape.cpp @@ -4,6 +4,8 @@ // #include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" +#include "vpux/compiler/utils/attributes.hpp" + #include using namespace vpux; diff --git a/src/vpux_compiler/src/dialect/const/utils/content.cpp b/src/vpux_compiler/src/dialect/const/utils/content.cpp index 9a19983eab..11842ce95b 100644 --- 
a/src/vpux_compiler/src/dialect/const/utils/content.cpp +++ b/src/vpux_compiler/src/dialect/const/utils/content.cpp @@ -125,7 +125,7 @@ void vpux::Const::Content::copySubByteContent(MutableArrayRef targetData, if (_storageElemType.isInteger(1)) { subByteValue = _data.data().front() & 1; } else { - subByteValue = getValues()[0] & mask; + subByteValue = getSplatValue() & mask; } for (int64_t shift = 0; shift < numShifts; shift++) { @@ -179,18 +179,6 @@ void vpux::Const::Content::copyTo(MutableArrayRef targetData) const { return; } - // E#160872: float16 splats are special due to (obscure) overflow semantics - // handling, but float16 non-splats are not special?! - const bool isTrivialStorage = (elemType == _storageElemType); - if (!_isSplat && isTrivialStorage) { - VPUX_THROW_UNLESS(targetData.size() >= _data.size(), - "Byte sizes of the target buffer '{0}' is smaller then storage buffer '{1}' ", - targetData.size(), _data.size()); - auto srcData = _data.data(); - std::memcpy(targetData.data(), srcData.data(), srcData.size()); - return; - } - read(elemType, [&](auto srcData, auto dummy) { fillBuf(srcData, targetData); }); diff --git a/src/vpux_compiler/src/dialect/const/utils/content_setup.cpp b/src/vpux_compiler/src/dialect/const/utils/content_setup.cpp index dc3fe10ece..6443c9ed4d 100644 --- a/src/vpux_compiler/src/dialect/const/utils/content_setup.cpp +++ b/src/vpux_compiler/src/dialect/const/utils/content_setup.cpp @@ -6,7 +6,10 @@ #include #include "vpux/compiler/dialect/const/attr_interfaces.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" +#ifdef BACKGROUND_FOLDING_ENABLED #include "vpux/compiler/dialect/const/utils/constant_folding_cache.hpp" +#endif +#include "vpux/compiler/utils/attributes.hpp" namespace vpux::Const { namespace detail { diff --git a/src/vpux_compiler/src/dialect/const/utils/transformations.cpp b/src/vpux_compiler/src/dialect/const/utils/transformations.cpp index 1b875d8b77..3605e47ce5 100644 --- 
a/src/vpux_compiler/src/dialect/const/utils/transformations.cpp +++ b/src/vpux_compiler/src/dialect/const/utils/transformations.cpp @@ -9,6 +9,7 @@ #include "vpux/compiler/dialect/const/utils/affine_reshape.hpp" #include "vpux/compiler/dialect/const/utils/constant_folding_cache.hpp" #include "vpux/compiler/dialect/const/utils/mem_permute_optimized.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/loop.hpp" #include "vpux/compiler/utils/permute_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" diff --git a/src/vpux_compiler/src/dialect/const/utils/utils.cpp b/src/vpux_compiler/src/dialect/const/utils/utils.cpp index caf4313f85..15e69002eb 100644 --- a/src/vpux_compiler/src/dialect/const/utils/utils.cpp +++ b/src/vpux_compiler/src/dialect/const/utils/utils.cpp @@ -63,7 +63,7 @@ mlir::StringRef getOvKey(Const::DeclareOp declareOp) { static_assert(std::is_same_v, "Cannot return StringRef if the underlying getResourceName() doesn't return it - potential dangling " "reference otherwise"); - if (key.starts_with(Const::OPENVINO_CONST_PREFIX)) { + if (key.starts_with(Const::IMPORTED_WEIGHT_PREFIX)) { return key; } return {}; @@ -248,4 +248,22 @@ void appendContentToVector(Const::Content& content, MutableArrayRef buffer start += bufSizeBytes; } +bool hasSparsifyTransformation(const Const::DeclareOp& constOp) { + const auto& contentAttr = constOp.getContentAttr(); + const auto transformations = contentAttr.getTransformations(); + if (transformations.empty()) { + return false; + } + + auto sparsifyTransformationIt = + std::find_if(transformations.rbegin(), transformations.rend(), [](Const::TransformAttrInterface tr) { + return mlir::isa(tr); + }); + if (sparsifyTransformationIt == transformations.rend()) { + return false; + } + + return true; +} + } // namespace vpux::Const diff --git a/src/vpux_compiler/src/dialect/core/IR/dynamic_attrs.cpp b/src/vpux_compiler/src/dialect/core/IR/dynamic_attrs.cpp index 30d040788e..e299f1d394 100644 
--- a/src/vpux_compiler/src/dialect/core/IR/dynamic_attrs.cpp +++ b/src/vpux_compiler/src/dialect/core/IR/dynamic_attrs.cpp @@ -10,13 +10,13 @@ using namespace vpux; namespace { inline void assertBound(int64_t dimValue, int64_t bound) { - VPUX_THROW_WHEN(bound < 1, "Got non-positive shape dim bound: '{0}'", bound); + VPUX_THROW_WHEN(bound < 0, "Got negative shape dim bound: '{0}'", bound); VPUX_THROW_WHEN(dimValue != bound && dimValue > 0, "Got mismatching shape dim size: '{0}' and bound: '{1}'", dimValue, bound); } inline void assertMask(int64_t dimValue) { - VPUX_THROW_WHEN(dimValue < 1, "Got non-positive shape dim size or bound: '{0}'", dimValue); + VPUX_THROW_WHEN(dimValue < 0, "Got negative shape dim size or bound: '{0}'", dimValue); } } // namespace @@ -51,16 +51,16 @@ int64_t BoundedDim::reifiedSize() const { return _bound; } -BoundedDim vpux::operator+(const BoundedDim& x, const BoundedDim& y) { - return BoundedDim::apply(x, y, std::plus<>()); +BoundedDim BoundedDim::operator+(const BoundedDim& other) const { + return apply(*this, other, std::plus<>()); } -BoundedDim vpux::operator-(const BoundedDim& x, const BoundedDim& y) { - return BoundedDim::apply(x, y, std::minus<>()); +BoundedDim BoundedDim::operator-(const BoundedDim& other) const { + return apply(*this, other, std::minus<>()); } -BoundedDim vpux::operator*(const BoundedDim& x, const BoundedDim& y) { - return BoundedDim::apply(x, y, std::multiplies<>()); +BoundedDim BoundedDim::operator*(const BoundedDim& other) const { + return apply(*this, other, std::multiplies<>()); } BoundedDim& BoundedDim::operator+=(const BoundedDim& other) { @@ -75,28 +75,64 @@ BoundedDim& BoundedDim::operator*=(const BoundedDim& other) { return *this = *this * other; } -bool vpux::operator==(const BoundedDim& x, const BoundedDim& y) { - return x.reifiedSize() == y.reifiedSize(); +bool BoundedDim::operator==(const BoundedDim& other) const { + return this->reifiedSize() == other.reifiedSize(); } -bool vpux::operator!=(const 
BoundedDim& x, const BoundedDim& y) { +bool BoundedDim::operator!=(const BoundedDim& other) const { + return !(*this == other); +} + +bool BoundedDim::operator<(const BoundedDim& other) const { + return this->reifiedSize() < other.reifiedSize(); +} + +bool BoundedDim::operator>(const BoundedDim& other) const { + return this->reifiedSize() > other.reifiedSize(); +} + +bool BoundedDim::operator<=(const BoundedDim& other) const { + return this->reifiedSize() <= other.reifiedSize(); +} + +bool BoundedDim::operator>=(const BoundedDim& other) const { + return this->reifiedSize() >= other.reifiedSize(); +} + +BoundedDim vpux::operator+(int64_t x, const BoundedDim& y) { + return BoundedDim::apply(BoundedDim(x), y, std::plus<>()); +} + +BoundedDim vpux::operator-(int64_t x, const BoundedDim& y) { + return BoundedDim::apply(BoundedDim(x), y, std::minus<>()); +} + +BoundedDim vpux::operator*(int64_t x, const BoundedDim& y) { + return BoundedDim::apply(BoundedDim(x), y, std::multiplies<>()); +} + +bool vpux::operator==(int64_t x, const BoundedDim& y) { + return x == y.reifiedSize(); +} + +bool vpux::operator!=(int64_t x, const BoundedDim& y) { return !(x == y); } -bool vpux::operator<(const BoundedDim& x, const BoundedDim& y) { - return x.reifiedSize() < y.reifiedSize(); +bool vpux::operator<(int64_t x, const BoundedDim& y) { + return x < y.reifiedSize(); } -bool vpux::operator>(const BoundedDim& x, const BoundedDim& y) { - return x.reifiedSize() > y.reifiedSize(); +bool vpux::operator>(int64_t x, const BoundedDim& y) { + return x > y.reifiedSize(); } -bool vpux::operator<=(const BoundedDim& x, const BoundedDim& y) { - return x.reifiedSize() <= y.reifiedSize(); +bool vpux::operator<=(int64_t x, const BoundedDim& y) { + return x <= y.reifiedSize(); } -bool vpux::operator>=(const BoundedDim& x, const BoundedDim& y) { - return x.reifiedSize() >= y.reifiedSize(); +bool vpux::operator>=(int64_t x, const BoundedDim& y) { + return x >= y.reifiedSize(); } // @@ -129,16 +165,16 @@ 
int64_t MaskedDim::reifiedSize() const { return _dimValue; } -MaskedDim vpux::operator+(const MaskedDim& x, const MaskedDim& y) { - return MaskedDim::apply(x, y, std::plus<>()); +MaskedDim MaskedDim::operator+(const MaskedDim& other) const { + return apply(*this, other, std::plus<>()); } -MaskedDim vpux::operator-(const MaskedDim& x, const MaskedDim& y) { - return MaskedDim::apply(x, y, std::minus<>()); +MaskedDim MaskedDim::operator-(const MaskedDim& other) const { + return apply(*this, other, std::minus<>()); } -MaskedDim vpux::operator*(const MaskedDim& x, const MaskedDim& y) { - return MaskedDim::apply(x, y, std::multiplies<>()); +MaskedDim MaskedDim::operator*(const MaskedDim& other) const { + return apply(*this, other, std::multiplies<>()); } MaskedDim& MaskedDim::operator+=(const MaskedDim& other) { @@ -153,26 +189,62 @@ MaskedDim& MaskedDim::operator*=(const MaskedDim& other) { return *this = *this * other; } -bool vpux::operator==(const MaskedDim& x, const MaskedDim& y) { - return x.reifiedSize() == y.reifiedSize(); +bool MaskedDim::operator==(const MaskedDim& other) const { + return this->reifiedSize() == other.reifiedSize(); +} + +bool MaskedDim::operator!=(const MaskedDim& other) const { + return !(*this == other); +} + +bool MaskedDim::operator<(const MaskedDim& other) const { + return this->reifiedSize() < other.reifiedSize(); +} + +bool MaskedDim::operator>(const MaskedDim& other) const { + return this->reifiedSize() > other.reifiedSize(); +} + +bool MaskedDim::operator<=(const MaskedDim& other) const { + return this->reifiedSize() <= other.reifiedSize(); +} + +bool MaskedDim::operator>=(const MaskedDim& other) const { + return this->reifiedSize() >= other.reifiedSize(); +} + +MaskedDim vpux::operator+(int64_t x, const MaskedDim& y) { + return MaskedDim::apply(MaskedDim(x), y, std::plus<>()); +} + +MaskedDim vpux::operator-(int64_t x, const MaskedDim& y) { + return MaskedDim::apply(MaskedDim(x), y, std::minus<>()); +} + +MaskedDim 
vpux::operator*(int64_t x, const MaskedDim& y) { + return MaskedDim::apply(MaskedDim(x), y, std::multiplies<>()); +} + +bool vpux::operator==(int64_t x, const MaskedDim& y) { + return x == y.reifiedSize(); } -bool vpux::operator!=(const MaskedDim& x, const MaskedDim& y) { +bool vpux::operator!=(int64_t x, const MaskedDim& y) { return !(x == y); } -bool vpux::operator<(const MaskedDim& x, const MaskedDim& y) { - return x.reifiedSize() < y.reifiedSize(); +bool vpux::operator<(int64_t x, const MaskedDim& y) { + return x < y.reifiedSize(); } -bool vpux::operator>(const MaskedDim& x, const MaskedDim& y) { - return x.reifiedSize() > y.reifiedSize(); +bool vpux::operator>(int64_t x, const MaskedDim& y) { + return x > y.reifiedSize(); } -bool vpux::operator<=(const MaskedDim& x, const MaskedDim& y) { - return x.reifiedSize() <= y.reifiedSize(); +bool vpux::operator<=(int64_t x, const MaskedDim& y) { + return x <= y.reifiedSize(); } -bool vpux::operator>=(const MaskedDim& x, const MaskedDim& y) { - return x.reifiedSize() >= y.reifiedSize(); +bool vpux::operator>=(int64_t x, const MaskedDim& y) { + return x >= y.reifiedSize(); } diff --git a/src/vpux_compiler/src/dialect/core/IR/memref_attr.cpp b/src/vpux_compiler/src/dialect/core/IR/memref_attr.cpp index b1d860a1cb..cc2ca93249 100644 --- a/src/vpux_compiler/src/dialect/core/IR/memref_attr.cpp +++ b/src/vpux_compiler/src/dialect/core/IR/memref_attr.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/core/attributes/stride_reqs.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/error.hpp" #include "vpux/utils/core/range.hpp" diff --git a/src/vpux_compiler/src/dialect/core/interfaces/type_interfaces.cpp b/src/vpux_compiler/src/dialect/core/interfaces/type_interfaces.cpp index 78f8834733..9bf6b98066 100644 --- a/src/vpux_compiler/src/dialect/core/interfaces/type_interfaces.cpp +++ 
b/src/vpux_compiler/src/dialect/core/interfaces/type_interfaces.cpp @@ -7,11 +7,13 @@ #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/core/IR/attributes.hpp" #include "vpux/compiler/dialect/core/IR/dynamic_attrs.hpp" #include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/compression_utils.hpp" #include "vpux/compiler/utils/memref_attr_utils.hpp" #include "vpux/compiler/utils/quantization.hpp" @@ -377,23 +379,28 @@ vpux::NDTypeInterface TensorNDTypeInterface::pad(mlir::Type type, vpux::ShapeRef VPUX_THROW_UNLESS(padBefore.size() == padAfter.size(), "Got non consistent 'padBefore' and 'padAfter' values"); VPUX_THROW_UNLESS(origShape.size() == padBefore.size(), "Paddings and input shape are not consistent"); - Shape newShape(origShape.size()); - for (auto ind : irange(newShape.size())) { - const auto d = Dim(ind); - newShape[d] = origShape[d] + padBefore[d] + padAfter[d]; - } + return callOnShapeOf(type, [&](const auto& inShape) { + auto outShape = copyShape(inShape); + for (auto ind : irange(inShape.size())) { + const auto d = Dim(ind); + outShape[d] = inShape[d] + padBefore[d] + padAfter[d]; + } + auto [outStaticShape, outBounds, outDimMask] = splitShapeAndRepresentation(outShape); - auto elemType = getElementType(type); - if (const auto perAxisQType = mlir::dyn_cast(elemType)) { - elemType = expandScalesAndZP(perAxisQType, padBefore, padAfter); - } + auto elemType = getElementType(type); + if (const auto perAxisQType = mlir::dyn_cast(elemType)) { + elemType = expandScalesAndZP(perAxisQType, padBefore, padAfter); + } - const auto newType = vpux::getTensorType(newShape, elemType, getDimsOrder(type), getMemSpace(type), 
getBounds(type), - getDynamicDimsMask(type)); - const auto loc = mlir::UnknownLoc::get(type.getContext()); - VPUX_THROW_UNLESS(vpux::validateQuantElemType(loc, newType).succeeded(), "Got invalid ShapedType '{0}'", newType); + const auto newType = vpux::getTensorType(outStaticShape, elemType, getDimsOrder(type), getMemSpace(type), + outBounds, outDimMask); - return newType; + const auto loc = mlir::UnknownLoc::get(type.getContext()); + VPUX_THROW_UNLESS(vpux::validateQuantElemType(loc, newType).succeeded(), "Got invalid ShapedType '{0}'", + newType); + + return newType; + }); } // diff --git a/src/vpux_compiler/src/dialect/core/transforms/passes/add_netinfo_to_module.cpp b/src/vpux_compiler/src/dialect/core/transforms/passes/add_netinfo_to_module.cpp index c1d0c113ca..1697bde5da 100644 --- a/src/vpux_compiler/src/dialect/core/transforms/passes/add_netinfo_to_module.cpp +++ b/src/vpux_compiler/src/dialect/core/transforms/passes/add_netinfo_to_module.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/net/network_info_utils.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/types.hpp" namespace vpux::Core { @@ -24,18 +25,19 @@ namespace { class AddNetInfoToModule final : public Core::impl::AddNetInfoToModuleBase { public: - explicit AddNetInfoToModule(Logger log) { + explicit AddNetInfoToModule(Logger log, bool hasTensorSemantics) { Base::initLogger(log, Base::getArgumentName()); + this->hasTensorSemantics = hasTensorSemantics; } private: void safeRunOnModule() final; }; -net::NetworkInfoOp createNetInfoForFuncOp(mlir::func::FuncOp funcOp) { +net::NetworkInfoOp createNetInfoForFuncOp(mlir::func::FuncOp funcOp, bool hasTensorSemantics) { auto ctx = funcOp.getContext(); mlir::OpBuilder builder(ctx); - auto netInfo = builder.create(funcOp.getLoc(), + auto netInfo = builder.create(appendLoc(funcOp.getLoc(), "nested_network_info"), 
mlir::FlatSymbolRefAttr::get(ctx, funcOp.getName()), false); net::setupSections(netInfo); @@ -45,10 +47,14 @@ net::NetworkInfoOp createNetInfoForFuncOp(mlir::func::FuncOp funcOp) { auto& inputRegion = netInfo.getInputsInfo(); builder.setInsertionPointToStart(&inputRegion.front()); - for (unsigned i = 0; i < funcType.getNumInputs() - funcType.getNumResults(); ++i) { + auto numOfInputs = + hasTensorSemantics ? funcType.getNumInputs() : funcType.getNumInputs() - funcType.getNumResults(); + + for (unsigned i = 0; i < numOfInputs; ++i) { auto argType = mlir::cast(funcType.getInput(i)); const auto newType = mlir::RankedTensorType::get(argType.getShape(), argType.getElementType(), nullptr); - builder.create(funcOp.getLoc(), formatv("in_{0}", i).str(), newType); + auto name = formatv("in_{0}", i).str(); + builder.create(appendLoc(funcOp.getLoc(), name), name, newType); } // Handle outputs @@ -58,7 +64,8 @@ net::NetworkInfoOp createNetInfoForFuncOp(mlir::func::FuncOp funcOp) { for (unsigned i = 0; i < funcType.getNumResults(); ++i) { auto resType = mlir::cast(funcType.getResult(i)); const auto newType = mlir::RankedTensorType::get(resType.getShape(), resType.getElementType(), nullptr); - builder.create(funcOp.getLoc(), formatv("out_{0}", i).str(), newType); + auto name = formatv("out_{0}", i).str(); + builder.create(appendLoc(funcOp.getLoc(), name), name, newType); } return netInfo; @@ -72,6 +79,12 @@ void AddNetInfoToModule::safeRunOnModule() { auto funcOps = module.getOps(); auto it = funcOps.begin(); + // Module without funcOp indicates reserved memory module. Skip this pass + // for such modules. 
+ if (std::distance(it, funcOps.end()) == 0) { + return; + } + if (std::distance(it, funcOps.end()) != 1) { module->emitError("Module must contain exactly one function to add NetworkInfoOp"); return signalPassFailure(); @@ -82,7 +95,7 @@ void AddNetInfoToModule::safeRunOnModule() { return signalPassFailure(); } - auto netInfo = createNetInfoForFuncOp(*it); + auto netInfo = createNetInfoForFuncOp(*it, hasTensorSemantics); builder.setInsertionPointToStart(module.getBody()); builder.insert(netInfo); _log.trace("Added NetworkInfoOp to module '{0}'", module.getSymName()); @@ -94,6 +107,6 @@ void AddNetInfoToModule::safeRunOnModule() { // createAddNetInfoToModulePass // -std::unique_ptr vpux::Core::createAddNetInfoToModulePass(Logger log) { - return std::make_unique(log); +std::unique_ptr vpux::Core::createAddNetInfoToModulePass(Logger log, bool hasTensorSemantics) { + return std::make_unique(log, hasTensorSemantics); } diff --git a/src/vpux_compiler/src/dialect/core/transforms/passes/unpack_nested_modules.cpp b/src/vpux_compiler/src/dialect/core/transforms/passes/unpack_nested_modules.cpp index dfd51759ed..6fcbe4edb8 100644 --- a/src/vpux_compiler/src/dialect/core/transforms/passes/unpack_nested_modules.cpp +++ b/src/vpux_compiler/src/dialect/core/transforms/passes/unpack_nested_modules.cpp @@ -3,15 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/core/IR/ops.hpp" #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/utils/analysis.hpp" -#include "vpux/compiler/utils/logging.hpp" -#include "vpux/compiler/utils/passes.hpp" +#include "vpux/utils/core/array_ref.hpp" + +#include +#include +#include namespace vpux::Core { #define GEN_PASS_DECL_UNPACKNESTEDMODULES @@ -63,6 +62,15 @@ SmallVector UnpackNestedModulesPass::collectTopLevelNestedModule return mlir::WalkResult::advance(); } + auto funcOps = nestedModule.getOps(); + auto it = 
funcOps.begin(); + + // Module without funcOp indicates reserved memory module. Skip this pass + // for such modules. + if (std::distance(it, funcOps.end()) == 0) { + return mlir::WalkResult::advance(); + } + const bool directChildOfMainModule = (nestedModule->getParentOfType() == mainModule); if (directChildOfMainModule) { _log.trace("Found top-level nested module '{0}' inside '{1}'", nestedModule.getSymName(), diff --git a/src/vpux_compiler/src/dialect/core/transforms/passes/ws_fold_reinterpret_cast_into_const.cpp b/src/vpux_compiler/src/dialect/core/transforms/passes/ws_fold_reinterpret_cast_into_const.cpp new file mode 100644 index 0000000000..82390b0fe5 --- /dev/null +++ b/src/vpux_compiler/src/dialect/core/transforms/passes/ws_fold_reinterpret_cast_into_const.cpp @@ -0,0 +1,74 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/const/attributes/content.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/core/IR/ops.hpp" +#include "vpux/compiler/dialect/core/transforms/passes.hpp" +#include "vpux/compiler/utils/logging.hpp" + +#include + +namespace vpux::Core { +#define GEN_PASS_DECL_WSFOLDREINTERPRETCASTINTOCONST +#define GEN_PASS_DEF_WSFOLDREINTERPRETCASTINTOCONST +#include "vpux/compiler/dialect/core/passes.hpp.inc" +} // namespace vpux::Core + +using namespace vpux; + +namespace { + +class FoldReinterpretCastIntoConst final : + public Core::impl::WsFoldReinterpretCastIntoConstBase { +public: + explicit FoldReinterpretCastIntoConst(const Logger& log) { + Base::initLogger(log, Base::getArgumentName()); + } + +private: + void safeRunOnFunc() final; +}; + +void FoldReinterpretCastIntoConst::safeRunOnFunc() { + auto funcOp = getOperation(); + + OpBuilderLogger listener(_log); + mlir::OpBuilder builder(&getContext(), &listener); + + funcOp.walk([&](Core::ReinterpretCastOp castOp) { + auto constOp = castOp.getInput().getDefiningOp(); + if (constOp == 
nullptr) { + return; + } + const auto outputType = mlir::cast(castOp.getOutput().getType()); + + auto contentAttr = constOp.getContentAttr(); + + // Note: this pass performs *eager* constant folding which means + // potentially significant RAM usage during compilation. this is fine + // here because the pass is *never* supposed to be used outside of + // testing pipelines. + auto content = contentAttr.fold(); + const auto tensorType = mlir::RankedTensorType::get(outputType.getShape(), outputType.getElementType()); + auto denseAttr = mlir::DenseElementsAttr::getFromRawBuffer(tensorType, content.getRawStorageBuf()); + + auto newContentAttr = Const::ContentAttr::get(denseAttr); + + builder.setInsertionPoint(castOp); + auto newConstOp = builder.create(castOp->getLoc(), newContentAttr.getType(), newContentAttr); + castOp.replaceAllUsesWith(newConstOp.getResult()); + castOp.erase(); + if (constOp->getUses().empty()) { + constOp.erase(); + } + }); +} + +} // namespace + +std::unique_ptr vpux::Core::createWsFoldReinterpretCastIntoConstPass(const Logger& log) { + return std::make_unique(log); +} diff --git a/src/vpux_compiler/src/dialect/net/IR/ops/network_info.cpp b/src/vpux_compiler/src/dialect/net/IR/ops/network_info.cpp index 88dd2e689e..bc020b970d 100644 --- a/src/vpux_compiler/src/dialect/net/IR/ops/network_info.cpp +++ b/src/vpux_compiler/src/dialect/net/IR/ops/network_info.cpp @@ -13,6 +13,7 @@ #include "vpux/compiler/utils/error.hpp" #include "vpux/utils/core/range.hpp" +#include #include using namespace vpux; @@ -108,13 +109,24 @@ void net::NetworkInfoOp::build(mlir::OpBuilder& builder, mlir::OperationState& s } mlir::LogicalResult net::NetworkInfoOp::verifySymbolUses(mlir::SymbolTableCollection& symbolTable) { + auto& cnnOp = *this; + const bool hostCompileMode = config::getCompilationMode(cnnOp) == config::CompilationMode::HostCompile; auto netFunc = symbolTable.lookupNearestSymbolFrom(*this, getEntryPointAttr()); if (netFunc == nullptr) { + if (hostCompileMode) 
{ + // For host compilation, mlir::func::FuncOp is transformed to LLVMFuncOp in ConvertFuncToLLVMPass + // So, if netFunc is null and llvmFuncOp is not null, skip netinfo verification + // Later, revisit here if an additional pass is added to remove netinfo or transform it to something + // global binary + auto llvmFuncOp = symbolTable.lookupNearestSymbolFrom(*this, getEntryPointAttr()); + if (llvmFuncOp != nullptr) { + return mlir::success(); + } + } return errorAt(*this, "entryPoint '@{0}' doesn't refer to existing Function", getEntryPoint()); } - auto& cnnOp = *this; auto inputsInfo = to_small_vector(this->getInputsInfo().getOps()); auto outputsInfo = to_small_vector(this->getOutputsInfo().getOps()); SmallVector profilingOutputsInfo; @@ -128,7 +140,6 @@ mlir::LogicalResult net::NetworkInfoOp::verifySymbolUses(mlir::SymbolTableCollec const bool hoistedIOs = (netFuncType.getNumInputs() == 0) && (netFuncType.getNumResults() == 0); // Note: host compilation pipeline generate LLVM main function w/ no return value in ConvertToLLVMUMDCallsPass // This is to alleviate output buffer verification for host compilation - const bool hostCompileMode = config::getCompilationMode(cnnOp) == config::CompilationMode::HostCompile; const bool resultVerificationDisabled = hoistedIOs || (hostCompileMode && (netFuncType.getResults().size() == 0)); if (checkFunctionPrototype(cnnOp, netFunc, inputsInfo, outputsInfo, profilingOutputsInfo, @@ -180,7 +191,7 @@ mlir::LogicalResult net::NetworkInfoOp::verifySymbolUses(mlir::SymbolTableCollec }); size_t argOffset = 0; - ArrayRef outputTypes; + ArrayRef outputTypes; if (isArgsBufferized && args.size() > inputsInfo.size()) { argOffset = inputsInfo.size(); outputTypes = netFuncType.getInputs(); diff --git a/src/vpux_compiler/src/frontend/IE.cpp b/src/vpux_compiler/src/frontend/IE.cpp index 0406c0e226..b2c6c8432b 100644 --- a/src/vpux_compiler/src/frontend/IE.cpp +++ b/src/vpux_compiler/src/frontend/IE.cpp @@ -4,32 +4,45 @@ // #include 
"vpux/compiler/frontend/IE.hpp" - #include "vpux/compiler/core/attributes/dims_order.hpp" #include "vpux/compiler/core/types/quantile_float/dialect.hpp" #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/attributes/content.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/dialect/const/utils/sub_byte.hpp" #include "vpux/compiler/dialect/const/utils/utils.hpp" -#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" #include "vpux/compiler/utils/IE/locations.hpp" #include "vpux/compiler/utils/attributes.hpp" #include 
"vpux/compiler/utils/cal_range_data.hpp" -#include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/logging.hpp" #include "vpux/compiler/utils/net/network_info_utils.hpp" +#include "vpux/compiler/utils/range_bound.hpp" #include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/strings.hpp" #include "vpux/compiler/utils/types.hpp" - #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/error.hpp" @@ -42,13 +55,19 @@ #include "intel_npu/config/config.hpp" +#include +#include #include +#include +#include #include #include +#include #include #include #include +#include #include #include #include @@ -118,14 +137,6 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - #include #include @@ -1057,11 +1068,34 @@ void NGraphImporter::parseNode(mlir::OpBuilder& builder, const std::shared_ptrget_data_ptr(), bufferSize); // DenseElementsAttr has very limited support for sub byte type (only I1 is supported). // Therefore, we need to avoid using DenseElementsAttr to store sub byte type. 
+ if (!vpux::Const::isSubByte(bitWidth) && !_sharedConstants) { return mlir::DenseElementsAttr::getFromRawBuffer(tensorType, rawBuffer); } - return vpux::Const::createExternalConstContent(tensorType, rawBuffer, vpux::Const::OPENVINO_CONST_PREFIX); + std::string cstName; + ov::RTMap& runtimeInfoMap = origNode->get_rt_info(); + + cstName = [&]() -> std::string { + const auto& weightlessCacheAttrIt = + runtimeInfoMap.find(ov::WeightlessCacheAttribute::get_type_info_static()); + + if (weightlessCacheAttrIt != runtimeInfoMap.end()) { + auto& weightlessCacheAttr = weightlessCacheAttrIt->second.as(); + if (origNode->get_element_type() == weightlessCacheAttr.original_dtype) { + return formatv("{0}{1}", vpux::Const::IMPORTED_WEIGHT_PREFIX, weightlessCacheAttr.bin_offset).str(); + } else { + Logger::global().debug( + "Weightless cache attribute type {0} does not match constant type {1} for node '{2}'", + weightlessCacheAttr.original_dtype, origNode->get_element_type(), + origNode->get_friendly_name()); + } + } + + return "INTERNAL_CONSTANT"; + }(); + + return Const::createExternalConstContent(tensorType, rawBuffer, cstName); }(); Const::ContentSetup contentSetup(value.getType()); @@ -3626,7 +3660,7 @@ void NGraphImporter::parseNode(mlir::OpBuilder& builder, const std::shared_ptrget_variable_id()); - auto op = builder.create(createLocation(origNode), inputs[0], nameAttr); + auto op = builder.create(createLocation(origNode), inputs[0], nameAttr, nullptr, nullptr); addOutputs(origNode, op); } @@ -3635,12 +3669,23 @@ void NGraphImporter::parseNode(mlir::OpBuilder& builder, const std::shared_ptrget_friendly_name(), inputs.size()); const auto nameAttr = mlir::StringAttr::get(_ctx, origNode->get_variable_id()); + const auto elementType = importPrecision(_ctx, origNode->get_element_type()); + const auto shape = importShape(origNode->get_shape()); + mlir::Value inputTensor; + if (inputs.size() == 0) { + // If the 1st input is not provided, ReadValue returns the tensor with zero 
values + const auto inputType = mlir::RankedTensorType::get({shape}, elementType); + inputTensor = Const::createZerosConst(builder, createLocation(origNode), inputType); + } else { + inputTensor = inputs[0]; + } - auto op = builder.create(createLocation(origNode), inputs[0], nameAttr); + auto op = builder.create(createLocation(origNode), inputTensor, nameAttr, + mlir::TypeAttr::get(elementType), getIntArrayAttr(_ctx, shape)); addOutputs(origNode, op); } @@ -4982,7 +5027,8 @@ static void addCommonOptimizationsPasses(ov::pass::Manager& manager) { manager.register_pass(); } -void NGraphPasses::runNGraphPasses(const std::shared_ptr& netGraph, mlir::TimingScope& rootTiming) { +void NGraphPasses::runNGraphPasses(const std::shared_ptr& netGraph, mlir::TimingScope& rootTiming, + bool isWeightsSeparationPath) { auto scopeTiming = rootTiming.nest("Common nGraph passes"); ov::pass::Manager manager; @@ -5003,7 +5049,11 @@ void NGraphPasses::runNGraphPasses(const std::shared_ptr& netGraph, m manager.register_pass(decompression_precisions, /*fold_subtract_const=*/true); manager.register_pass(); passConfig->set_callback( - [](const std::shared_ptr& node) -> bool { + [&](const std::shared_ptr& node) -> bool { + if (isWeightsSeparationPath) { + return false; + } + return skipKeepConstAndDecompressionForNode(node); }); manager.register_pass(); @@ -5206,9 +5256,8 @@ void dynamicToStaticShape(const std::shared_ptr& model) { mlir::OwningOpRef vpux::IE::importNetwork( mlir::MLIRContext* ctx, const std::shared_ptr& model, const std::vector>& originalParameters, - const std::vector>& originalResults, bool sharedConstants, - mlir::TimingScope& rootTiming, bool enableProfiling, vpux::DummyOpMode stubLayers, bool dynamicShapeToStatic, - Logger log) { + const std::vector>& originalResults, mlir::TimingScope& rootTiming, + const vpux::IE::ImportNetworkConfig& importCfg, Logger log) { log.setName("IE::FrontEnd::importNetwork"); log.trace("Load IE::FrontEnd dependent Dialects"); @@ -5217,12 
+5266,12 @@ mlir::OwningOpRef vpux::IE::importNetwork( ctx->loadDialect(); ctx->loadDialect(); - if (dynamicShapeToStatic && model->is_dynamic()) { + if (importCfg.dynamicShapeToStatic && model->is_dynamic()) { dynamicToStaticShape(model); } log.trace("Run common nGraph passes"); - NGraphPasses::runNGraphPasses(model, rootTiming); + NGraphPasses::runNGraphPasses(model, rootTiming, importCfg.enableWeightsSeparationPath); const auto moduleLoc = IE::createLayerLocation(ctx, "module", "Module"); auto module = mlir::ModuleOp::create(moduleLoc, StringRef(model->get_friendly_name())); @@ -5233,11 +5282,14 @@ mlir::OwningOpRef vpux::IE::importNetwork( auto builder = mlir::OpBuilder::atBlockBegin(module.getBody(), &builderLog); log.trace("Add NetworkInfo Operation"); - addNetworkInfoOp(builder, mainFuncName, model, originalParameters, originalResults, rootTiming, enableProfiling); + addNetworkInfoOp(builder, mainFuncName, model, originalParameters, originalResults, rootTiming, + importCfg.enableProfiling); log.trace("Import nGraph function"); - NGraphImporter importer(ctx, model, sharedConstants, log); - importer.buildMainFunc(builder, mainFuncName.getValue(), rootTiming, stubLayers, dynamicShapeToStatic); + + NGraphImporter importer(ctx, model, importCfg.sharedConstants, log); + importer.buildMainFunc(builder, mainFuncName.getValue(), rootTiming, importCfg.stubLayers, + importCfg.dynamicShapeToStatic); log.trace("Validate MLIR module"); auto finalTiming = rootTiming.nest("Validate MLIR module"); diff --git a/src/vpux_compiler/src/init.cpp b/src/vpux_compiler/src/init.cpp index b827db18b8..a6d860d59a 100644 --- a/src/vpux_compiler/src/init.cpp +++ b/src/vpux_compiler/src/init.cpp @@ -4,39 +4,35 @@ // #include "vpux/compiler/init.hpp" - #include "vpux/compiler/NPU37XX/dialect/NPUReg37XX/ops.hpp" -#include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" -#include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp" +#include "vpux/compiler/NPU40XX/dialect/ELF/dialect.hpp" +#include 
"vpux/compiler/NPU40XX/dialect/NPUReg40XX/dialect.hpp" #include "vpux/compiler/conversion/passes/VPU2VPUIP/bufferizable_ops_interface.hpp" +#include "vpux/compiler/core/types/quantile_float/dialect.hpp" +#include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/dialect.hpp" -#include "vpux/compiler/dialect/ELFNPU37XX/ops.hpp" +#include "vpux/compiler/dialect/HostExec/IR/dialect.hpp" +#include "vpux/compiler/dialect/HostExec/IR/ops.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPUASM/dialect.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIPDPU/dialect.hpp" #include "vpux/compiler/dialect/VPUMI37XX/dialect.hpp" -#include "vpux/compiler/dialect/VPUMI37XX/ops.hpp" #include "vpux/compiler/dialect/VPUMI40XX/dialect.hpp" -#include "vpux/compiler/dialect/VPURT/IR/ops.hpp" +#include "vpux/compiler/dialect/VPURT/IR/dialect.hpp" #include "vpux/compiler/dialect/VPURegMapped/dialect.hpp" -#include "vpux/compiler/dialect/VPURegMapped/ops.hpp" #include "vpux/compiler/dialect/config/IR/dialect.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/core/IR/dialect.hpp" #include "vpux/compiler/dialect/net/IR/dialect.hpp" #include "vpux/compiler/utils/rewriter.hpp" -#include "vpux/compiler/core/types/quantile_float/dialect.hpp" -#include "vpux/compiler/core/types/quantile_float/types.hpp" - #include +#include +#include +#include #include #include #include @@ -58,8 +54,10 @@ #include #include +#include #include #include +#include using namespace vpux; @@ -95,6 +93,7 @@ void 
registerDialects(mlir::DialectRegistry& registry) { vpux::VPUASM::VPUASMDialect, // vpux::VPURegMapped::VPURegMappedDialect, // vpux::ELF::ELFDialect, // + vpux::HostExec::HostExecDialect, // vpux::NPUReg37XX::NPUReg37XXDialect, // vpux::NPUReg40XX::NPUReg40XXDialect, // vpux::ELFNPU37XX::ELFNPU37XXDialect, // @@ -144,7 +143,7 @@ mlir::DialectRegistry vpux::createDialectRegistry(DummyOpMode dummyOpMode) { mlir::registerLLVMDialectTranslation(registry); mlir::registerConvertMemRefToLLVMInterface(registry); mlir::registerConvertFuncToLLVMInterface(registry); - + mlir::cf::registerConvertControlFlowToLLVMInterface(registry); if (dummyOpMode == DummyOpMode::ENABLED) { VPUIP::VPUIPDialect::setupExtraInterfacesAdditional(registry); } diff --git a/src/vpux_compiler/src/interfaces_registry.cpp b/src/vpux_compiler/src/interfaces_registry.cpp index 3348245e86..5567861166 100644 --- a/src/vpux_compiler/src/interfaces_registry.cpp +++ b/src/vpux_compiler/src/interfaces_registry.cpp @@ -19,11 +19,11 @@ namespace vpux { // createInterfaceRegistry // -std::unique_ptr createInterfacesRegistry(VPU::ArchKind arch) { +std::unique_ptr createInterfacesRegistry(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: VPUX_THROW("Unsupported arch kind: {0}", arch); diff --git a/src/vpux_compiler/src/passes_register.cpp b/src/vpux_compiler/src/passes_register.cpp index 2a0e4ab981..edde3a4f1a 100644 --- a/src/vpux_compiler/src/passes_register.cpp +++ b/src/vpux_compiler/src/passes_register.cpp @@ -14,11 +14,11 @@ using namespace vpux; // createPassesRegistry // -std::unique_ptr vpux::createPassesRegistry(VPU::ArchKind arch) { +std::unique_ptr vpux::createPassesRegistry(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); - case 
VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: VPUX_THROW("Unsupported arch kind: {0}", arch); diff --git a/src/vpux_compiler/src/pipelines/compilation_options.cpp b/src/vpux_compiler/src/pipelines/compilation_options.cpp index cf478100ad..f68f5731ee 100644 --- a/src/vpux_compiler/src/pipelines/compilation_options.cpp +++ b/src/vpux_compiler/src/pipelines/compilation_options.cpp @@ -22,28 +22,19 @@ void checkCompilerOptions(const intel_npu::Config& config) { VPUX_THROW_WHEN(options == nullptr, "Failed to parse COMPILATION_MODE_PARAMS"); } -template -void checkCompilerOptions(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - checkCompilerOptions(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - checkCompilerOptions(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - checkCompilerOptions(config); - } -} - void checkCompilerOptions(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - checkCompilerOptions(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - checkCompilerOptions(config); + if (arch == config::ArchKind::NPU37XX) { + checkCompilerOptions(config); + } else if (arch == config::ArchKind::NPU40XX) { + checkCompilerOptions(config); } } +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +/// See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + /// Parse in the next argument from the given options string. Returns a tuple /// containing [the key of the option, the value of the option, updated /// `options` string pointing after the parsed option]. 
diff --git a/src/vpux_compiler/src/pipelines/options_mapper.cpp b/src/vpux_compiler/src/pipelines/options_mapper.cpp index 8a919e025c..99141e115e 100644 --- a/src/vpux_compiler/src/pipelines/options_mapper.cpp +++ b/src/vpux_compiler/src/pipelines/options_mapper.cpp @@ -71,29 +71,12 @@ std::optional getPerformanceHintOverride(const intel_npu::Config& c return options->performanceHintOverride; } -template -std::optional getPerformanceHintOverride(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getPerformanceHintOverride(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::WSMonolithic || - compilationMode == config::CompilationMode::WSInit || - compilationMode == config::CompilationMode::HostCompile) { - return getPerformanceHintOverride(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getPerformanceHintOverride(config); - } else { - return std::nullopt; - } -} - std::optional getPerformanceHintOverride(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - return getPerformanceHintOverride(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getPerformanceHintOverride(config); + if (arch == config::ArchKind::NPU37XX) { + return getPerformanceHintOverride(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getPerformanceHintOverride(config); } else { return std::nullopt; } @@ -168,15 +151,15 @@ namespace vpux { // getArchKind // -VPU::ArchKind getArchKind(const intel_npu::Config& config) { +config::ArchKind getArchKind(const intel_npu::Config& config) { const std::string platform = ov::intel_npu::Platform::standardize(config.get()); if (platform == ov::intel_npu::Platform::AUTO_DETECT) { - return VPU::ArchKind::UNKNOWN; + return 
config::ArchKind::UNKNOWN; } else if (platform == ov::intel_npu::Platform::NPU3720) { - return VPU::ArchKind::NPU37XX; + return config::ArchKind::NPU37XX; } else if (platform == ov::intel_npu::Platform::NPU4000) { - return VPU::ArchKind::NPU40XX; + return config::ArchKind::NPU40XX; } else { VPUX_THROW("Unsupported VPUX platform"); } @@ -313,19 +296,20 @@ std::optional getEnableVerifiers(const intel_npu::Config& config) { if (options == nullptr) { return std::nullopt; } + + // TODO: E174237, need to enable verifiers for WSMonolithic compilation mode + if (getCompilationMode(config) == config::CompilationMode::WSMonolithic) { + return std::nullopt; + } return options->enableVerifiers; } -template std::optional getEnableVerifiers(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getEnableVerifiers(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - return getEnableVerifiers(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getEnableVerifiers(config); + const auto arch = getArchKind(config); + if (arch == config::ArchKind::NPU37XX) { + return getEnableVerifiers(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getEnableVerifiers(config); } else { return std::nullopt; } @@ -344,9 +328,9 @@ std::optional getWlmEnabled(const intel_npu::Config& config) { std::optional getWlmEnabled(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { + if (arch == config::ArchKind::NPU37XX) { return std::nullopt; - } else if (arch == VPU::ArchKind::NPU40XX) { + } else if (arch == config::ArchKind::NPU40XX) { return getWlmEnabled(config); } else { return std::nullopt; @@ -366,9 +350,9 @@ std::optional getWlmRollback(const intel_npu::Config& config) { std::optional 
getWlmRollback(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { + if (arch == config::ArchKind::NPU37XX) { return std::nullopt; - } else if (arch == VPU::ArchKind::NPU40XX) { + } else if (arch == config::ArchKind::NPU40XX) { return getWlmRollback(config); } else { return std::nullopt; @@ -385,17 +369,6 @@ std::optional getQDQOptimization(const intel_npu::Config& config) { return std::nullopt; } -std::optional getEnableVerifiers(const intel_npu::Config& config) { - const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - return getEnableVerifiers(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getEnableVerifiers(config); - } else { - return std::nullopt; - } -} - template std::optional getEnableMemoryUsageCollector(const intel_npu::Config& config) { const auto options = @@ -406,27 +379,12 @@ std::optional getEnableMemoryUsageCollector(const intel_npu::Config& confi return options->enableMemoryUsageCollector; } -template -std::optional getEnableMemoryUsageCollector(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getEnableMemoryUsageCollector(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - return getEnableMemoryUsageCollector(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getEnableMemoryUsageCollector(config); - } else { - return std::nullopt; - } -} - std::optional getEnableMemoryUsageCollector(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - return getEnableMemoryUsageCollector(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getEnableMemoryUsageCollector(config); + if (arch == config::ArchKind::NPU37XX) { + return 
getEnableMemoryUsageCollector(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getEnableMemoryUsageCollector(config); } else { return std::nullopt; } @@ -442,27 +400,12 @@ std::optional getEnableFunctionStatisticsInstrumentation(const intel_npu:: return options->enableFunctionStatisticsInstrumentation; } -template -std::optional getEnableFunctionStatisticsInstrumentation(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getEnableFunctionStatisticsInstrumentation(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - return getEnableFunctionStatisticsInstrumentation(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getEnableFunctionStatisticsInstrumentation(config); - } else { - return std::nullopt; - } -} - std::optional getEnableFunctionStatisticsInstrumentation(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - return getEnableFunctionStatisticsInstrumentation(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getEnableFunctionStatisticsInstrumentation(config); + if (arch == config::ArchKind::NPU37XX) { + return getEnableFunctionStatisticsInstrumentation(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getEnableFunctionStatisticsInstrumentation(config); } else { return std::nullopt; } @@ -478,27 +421,12 @@ std::optional getDummyOpReplacement(const intel_npu::Config& config return options->enableDummyOpReplacement ? 
DummyOpMode::ENABLED : DummyOpMode::DISABLED; } -template -std::optional getDummyOpReplacement(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getDummyOpReplacement(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - return getDummyOpReplacement(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getDummyOpReplacement(config); - } else { - return std::nullopt; - } -} - std::optional getDummyOpReplacement(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == VPU::ArchKind::NPU37XX) { - return getDummyOpReplacement(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getDummyOpReplacement(config); + if (arch == config::ArchKind::NPU37XX) { + return getDummyOpReplacement(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getDummyOpReplacement(config); } else { return std::nullopt; } @@ -527,27 +455,12 @@ std::optional getConstantFoldingInBackground(const intel_ options->constantFoldingInBackgroundCacheCleanThreshold}; } -template -std::optional getConstantFoldingInBackground(const intel_npu::Config& config) { - const auto compilationMode = getCompilationMode(config); - if (compilationMode == config::CompilationMode::ReferenceSW) { - return getConstantFoldingInBackground(config); - } else if (compilationMode == config::CompilationMode::DefaultHW || - compilationMode == config::CompilationMode::HostCompile) { - return getConstantFoldingInBackground(config); - } else if (compilationMode == config::CompilationMode::ShaveCodeGen) { - return getConstantFoldingInBackground(config); - } else { - return std::nullopt; - } -} - std::optional getConstantFoldingInBackground(const intel_npu::Config& config) { const auto arch = getArchKind(config); - if (arch == 
VPU::ArchKind::NPU37XX) { - return getConstantFoldingInBackground(config); - } else if (arch == VPU::ArchKind::NPU40XX) { - return getConstantFoldingInBackground(config); + if (arch == config::ArchKind::NPU37XX) { + return getConstantFoldingInBackground(config); + } else if (arch == config::ArchKind::NPU40XX) { + return getConstantFoldingInBackground(config); } else { return std::nullopt; } diff --git a/src/vpux_compiler/src/pipelines/pipeline_strategies.cpp b/src/vpux_compiler/src/pipelines/pipeline_strategies.cpp index 0927704f17..7a4b7d0729 100644 --- a/src/vpux_compiler/src/pipelines/pipeline_strategies.cpp +++ b/src/vpux_compiler/src/pipelines/pipeline_strategies.cpp @@ -95,7 +95,11 @@ void ReferenceSWStrategy::buildPipeline(mlir::OpPassManager& pm) { auto strategy = _createPipelineStrategy(config::CompilationMode::ReferenceSW); strategy->initializePipeline(pm, _log); - strategy->buildReferenceSWPipeline(pm, _log); + strategy->buildIEPipeline(pm, _log); + strategy->buildLowerIE2VPUPipeline(pm, _log); + strategy->buildVPUPipeline(pm, _log); + strategy->buildLowerVPU2VPUIPPipeline(pm, _log); + strategy->buildVPUIPPipeline(pm, _log); } // @@ -121,6 +125,7 @@ void WSMonolithicStrategy::buildPipeline(mlir::OpPassManager& pm) { auto& nestedPm = pm.nest(); { auto initStrategy = _createPipelineStrategy(config::CompilationMode::WSInit); + nestedPm.addPass(vpux::Core::createAddNetInfoToModulePass(_log, true /* hasTensorSemantics */)); initStrategy->initializePipeline(nestedPm, _log); initStrategy->buildIEPipeline(nestedPm, _log); initStrategy->buildLowerIE2VPUPipeline(nestedPm, _log); diff --git a/src/vpux_compiler/src/pipelines_register.cpp b/src/vpux_compiler/src/pipelines_register.cpp index 8e55b2e84c..45a008a357 100644 --- a/src/vpux_compiler/src/pipelines_register.cpp +++ b/src/vpux_compiler/src/pipelines_register.cpp @@ -14,11 +14,11 @@ using namespace vpux; // createPipelineRegistry // -std::unique_ptr vpux::createPipelineRegistry(VPU::ArchKind arch) { 
+std::unique_ptr vpux::createPipelineRegistry(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return std::make_unique(); - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return std::make_unique(); default: VPUX_THROW("Unsupported arch kind: {0}", arch); diff --git a/src/vpux_compiler/src/tools/options.cpp b/src/vpux_compiler/src/tools/options.cpp index e26ace4692..9a5431e747 100644 --- a/src/vpux_compiler/src/tools/options.cpp +++ b/src/vpux_compiler/src/tools/options.cpp @@ -18,7 +18,7 @@ using namespace vpux; // parseArchKind // -vpux::VPU::ArchKind vpux::parseArchKind(int argc, char* argv[], StringRef helpHeader) { +vpux::config::ArchKind vpux::parseArchKind(int argc, char* argv[], StringRef helpHeader) { static llvm::cl::OptionCategory vpuxOptOptions("NPU Options"); // Please use this option to test pipelines only (DefaultHW, ReferenceSW, etc.) @@ -54,13 +54,13 @@ vpux::VPU::ArchKind vpux::parseArchKind(int argc, char* argv[], StringRef helpHe VPUX_THROW_WHEN(archOpt.empty() && initCompiler.empty(), "Can't get ArchKind value"); const auto getArchFromString = [](vpux::StringRef archOptStr) { - auto archKind = vpux::VPU::symbolizeEnum(archOptStr); + auto archKind = vpux::config::symbolizeEnum(archOptStr); VPUX_THROW_UNLESS(archKind.has_value(), "Unknown VPU architecture : '{0}'", archOpt.getValue()); return archKind.value(); }; - auto arch = vpux::VPU::ArchKind::UNKNOWN; + auto arch = vpux::config::ArchKind::UNKNOWN; if (!archOpt.empty()) { arch = getArchFromString(archOpt); } else { diff --git a/src/vpux_compiler/src/utils/ELF/utils.cpp b/src/vpux_compiler/src/utils/ELF/utils.cpp index 5a41bdbb74..b68b7b1700 100644 --- a/src/vpux_compiler/src/utils/ELF/utils.cpp +++ b/src/vpux_compiler/src/utils/ELF/utils.cpp @@ -10,6 +10,7 @@ #include #include "vpux/compiler/NPU40XX/dialect/ELF/ops.hpp" #include "vpux/compiler/act_kernels/shave_binary_resources.h" +#include 
"vpux/compiler/dialect/config/IR/utils.hpp" #include "vpux/compiler/dialect/net/IR/ops.hpp" ArrayRef vpux::ELF::getDataAndSizeOfElfSection(ArrayRef elfBlob, @@ -77,7 +78,7 @@ vpux::ELF::MainOp vpux::ELF::getElfMainOp(mlir::func::FuncOp funcOp) { ArrayRef vpux::ELF::getKernelELF(mlir::Operation* operation, StringRef kernelPath, ArrayRef sectionNames) { const auto& kernelInfo = ShaveBinaryResources::getInstance(); - const auto archKind = VPU::getArch(operation); + const auto archKind = config::getArch(operation); const auto arch = ShaveBinaryResources::getSwKernelArchString(archKind); llvm::ArrayRef elfBlob = kernelInfo.getElf(kernelPath, arch); @@ -148,14 +149,14 @@ void ELF::SymbolReferenceMap::walkAllSymbols() { // namespace { -const std::unordered_map vpuToElfArchEnumMap = { - {VPU::ArchKind::UNKNOWN, elf::platform::ArchKind::UNKNOWN}, - {VPU::ArchKind::NPU37XX, elf::platform::ArchKind::VPUX37XX}, - {VPU::ArchKind::NPU40XX, elf::platform::ArchKind::VPUX40XX}, +const std::unordered_map vpuToElfArchEnumMap = { + {config::ArchKind::UNKNOWN, elf::platform::ArchKind::UNKNOWN}, + {config::ArchKind::NPU37XX, elf::platform::ArchKind::VPUX37XX}, + {config::ArchKind::NPU40XX, elf::platform::ArchKind::VPUX40XX}, }; } // namespace -elf::platform::ArchKind vpux::ELF::mapVpuArchKindToElfArchKind(const VPU::ArchKind& archKind) { +elf::platform::ArchKind vpux::ELF::mapVpuArchKindToElfArchKind(const config::ArchKind& archKind) { return vpuToElfArchEnumMap.at(archKind); } diff --git a/src/vpux_compiler/src/utils/IE/function_outlining_splitter_naive.cpp b/src/vpux_compiler/src/utils/IE/function_outlining_splitter_naive.cpp index c2586145df..be7269431a 100644 --- a/src/vpux_compiler/src/utils/IE/function_outlining_splitter_naive.cpp +++ b/src/vpux_compiler/src/utils/IE/function_outlining_splitter_naive.cpp @@ -3,7 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" #include "vpux/utils/core/dense_map.hpp" diff --git a/src/vpux_compiler/src/utils/IE/function_outlining_splitter_repeating_blocks.cpp b/src/vpux_compiler/src/utils/IE/function_outlining_splitter_repeating_blocks.cpp index 50d48482b3..9413631801 100644 --- a/src/vpux_compiler/src/utils/IE/function_outlining_splitter_repeating_blocks.cpp +++ b/src/vpux_compiler/src/utils/IE/function_outlining_splitter_repeating_blocks.cpp @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" #include "vpux/compiler/utils/hash.hpp" - #include "vpux/utils/core/array_ref.hpp" #include "vpux/utils/core/dense_map.hpp" diff --git a/src/vpux_compiler/src/utils/ShaveCodeGen/linalg_type_conversion.cpp b/src/vpux_compiler/src/utils/ShaveCodeGen/linalg_type_conversion.cpp index 1283c4cd01..11de7270d1 100644 --- a/src/vpux_compiler/src/utils/ShaveCodeGen/linalg_type_conversion.cpp +++ b/src/vpux_compiler/src/utils/ShaveCodeGen/linalg_type_conversion.cpp @@ -4,7 +4,8 @@ // #include "vpux/compiler/utils/ShaveCodeGen/linalg_type_conversion.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" +#include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include #include diff --git a/src/vpux_compiler/src/utils/VPU/function_outlining_splitter.cpp b/src/vpux_compiler/src/utils/VPU/function_outlining_splitter.cpp index 09f28e7a3b..3432fa918b 100644 --- 
a/src/vpux_compiler/src/utils/VPU/function_outlining_splitter.cpp +++ b/src/vpux_compiler/src/utils/VPU/function_outlining_splitter.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/utils/VPU/function_outlining_splitter.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/dialect/const/utils/utils.hpp" #include "vpux/compiler/utils/stl_extras.hpp" using namespace vpux; @@ -229,6 +230,12 @@ void VFOutliningSplitter::createOutliningInstanceFromStorage(ValueOrderedSet& st } else if (mlir::isa(operand.getType())) { // TODO: E#140551 support GroupSparseTensorOp as function arg return; + } else if (auto constOp = operand.getDefiningOp()) { + if (Const::hasSparsifyTransformation(constOp)) { + // If a constant op has sparsify transformation, its user must be the VPU.GroupSparseTensor op + // TODO: E#173628 duplicating the Const::DeclareOp and VPU.GroupSparseTensorOp for function outlining + return; + } } } @@ -307,7 +314,7 @@ SmallVector VFOutliningSplitter::getOutliningInstances(mlir:: return instanceOps.find(op) != instanceOps.end(); }; - const auto isParallelConcatInput = [&](mlir::Operation* op) { + const auto isParallelConcatInput = [&](mlir::Operation* op, bool tiledOnMultiDims) { /* ... ... Op \ | / \ Check for pattern: ... 
Concat Concat @@ -326,8 +333,10 @@ SmallVector VFOutliningSplitter::getOutliningInstances(mlir:: if (user == concatOp || !mlir::isa_and_nonnull(user)) { continue; } - if (vfOpInStorage == concatOpInStorage && + if (!tiledOnMultiDims && vfOpInStorage == concatOpInStorage && concatOpInStorage == isOpInCurrentOutliningInstance(user)) { + // if VF is tiled on multiple dimensions, need to further avoid separating VF ops into + // different function ops continue; } // VFOp is a consumer of parallel concat, can not outline since @@ -345,10 +354,13 @@ SmallVector VFOutliningSplitter::getOutliningInstances(mlir:: const auto tilingStrategy = parseIntArrayAttr(vfOp.getTilingStrategy()); const auto numTiles = std::accumulate(tilingStrategy.begin(), tilingStrategy.end(), size_t(1), std::multiplies()); + const auto tiledOnMultiDims = llvm::count_if(tilingStrategy, [](auto value) { + return value > 1; + }) > 1; if (numTiles < _verticalFusionTileThreshold) { return false; } - if (isParallelConcatInput(op)) { + if (isParallelConcatInput(op, tiledOnMultiDims)) { return false; } return true; diff --git a/src/vpux_compiler/src/utils/VPU/tile_utils.cpp b/src/vpux_compiler/src/utils/VPU/tile_utils.cpp index ba09a6c989..f71b83577a 100644 --- a/src/vpux_compiler/src/utils/VPU/tile_utils.cpp +++ b/src/vpux_compiler/src/utils/VPU/tile_utils.cpp @@ -6,8 +6,10 @@ #include "vpux/compiler/utils/VPU/tile_utils.hpp" #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" #include "vpux/compiler/utils/analysis.hpp" #include @@ -554,16 +556,17 @@ SmallVector getTileTypesCommon(mlir::Operation* origOp, c VPUX_THROW_UNLESS(inTiles.size() == origOp->getOperands().size(), "Unexpected inputTile size '{0}' and Op operands 
size '{1}'", inTiles.size(), origOp->getOperands().size()); + inputTileTypes.reserve(origOp->getNumOperands()); for (const auto& input : origOp->getOperands() | indexed) { const auto inputType = mlir::cast(input.value().getType()); - inputTileTypes.push_back( + inputTileTypes.emplace_back( inputType.extractDenseTile(inTiles[input.index()].offsets, inTiles[input.index()].shape)); } const auto outputTileType = outputType.extractDenseTile(outTile.offsets, outTile.shape); if (!origOp->hasAttr(VPU::multiClusterStrategy)) { - inputTileTypes.push_back(outputTileType); + inputTileTypes.emplace_back(outputTileType); return inputTileTypes; } @@ -574,15 +577,19 @@ SmallVector getTileTypesCommon(mlir::Operation* origOp, c clusteredOp.getMultiClusterStrategy().value()); SmallVector distributedTensorTypes; + distributedTensorTypes.reserve(inputTileTypes.size()); for (const auto& [idx, inputTileType] : inputTileTypes | indexed) { - auto inDistributedType = VPU::getDistributedActivationTypeFromOp( - clusteredOp, clusteredOp->getOperand(idx), inputTileType, numClusters, outputTileType, outTile); - distributedTensorTypes.push_back(mlir::cast(inDistributedType)); + auto inDistributedType = + idx != 0 && inputTileType == inputTileTypes.front() + ? 
distributedTensorTypes.front() + : VPU::getDistributedActivationTypeFromOp(clusteredOp, clusteredOp->getOperand(idx), + inputTileType, numClusters, outputTileType, outTile); + distributedTensorTypes.emplace_back(mlir::cast(inDistributedType)); } auto outDistributedType = VPU::getDistributedOutputTypeFromOp(clusteredOp, outputTileType, numClusters, inputTileTypes); - distributedTensorTypes.push_back(mlir::cast(outDistributedType)); + distributedTensorTypes.emplace_back(mlir::cast(outDistributedType)); return distributedTensorTypes; } diff --git a/src/vpux_compiler/src/utils/VPUIP/function_outlining_splitter.cpp b/src/vpux_compiler/src/utils/VPUIP/function_outlining_splitter.cpp index 0113dc1921..13f4bacbe5 100644 --- a/src/vpux_compiler/src/utils/VPUIP/function_outlining_splitter.cpp +++ b/src/vpux_compiler/src/utils/VPUIP/function_outlining_splitter.cpp @@ -4,23 +4,19 @@ // #include "vpux/compiler/utils/VPUIP/function_outlining_splitter.hpp" -#include "vpux/compiler/conversion.hpp" #include "vpux/compiler/core/aliases_info.hpp" #include "vpux/compiler/core/async_deps_info.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" -#include "vpux/compiler/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/dialect/VPURT/IR/ops.hpp" -#include "vpux/compiler/dialect/core/transforms/passes.hpp" - -#include "vpux/compiler/utils/allocate_buffers.hpp" -#include "vpux/compiler/utils/analysis.hpp" #include "vpux/compiler/utils/async_dialect_utils.hpp" -#include "vpux/compiler/utils/logging.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" +#include #include #include -#include using namespace vpux; diff --git a/src/vpux_compiler/src/utils/analysis.cpp b/src/vpux_compiler/src/utils/analysis.cpp index 750b1837e7..6ee3d6bfdf 100644 --- a/src/vpux_compiler/src/utils/analysis.cpp +++ 
b/src/vpux_compiler/src/utils/analysis.cpp @@ -4,7 +4,7 @@ // #include "vpux/compiler/utils/analysis.hpp" - +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/utils/core/error.hpp" #include @@ -29,6 +29,22 @@ mlir::Operation* vpux::getFirstUser(mlir::Value output) { return firstUser == users.end() ? nullptr : *firstUser; } +// +// hasOneUniqueUser +// + +bool vpux::hasOneUniqueUser(mlir::Operation* op) { + auto users = op->getUsers(); + if (users.empty()) { + return false; + } + + auto firstUser = *users.begin(); + return std::all_of(std::next(users.begin()), users.end(), [&](mlir::Operation* userOp) { + return firstUser == userOp; + }); +} + // // isBufAllocOp // diff --git a/src/vpux_compiler/src/utils/async_dialect_utils.cpp b/src/vpux_compiler/src/utils/async_dialect_utils.cpp index 6f92ff4ed2..72af20d763 100644 --- a/src/vpux_compiler/src/utils/async_dialect_utils.cpp +++ b/src/vpux_compiler/src/utils/async_dialect_utils.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/utils/async_dialect_utils.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops_interfaces.hpp" using namespace vpux; diff --git a/src/vpux_compiler/src/utils/bit_compactor_codec.cpp b/src/vpux_compiler/src/utils/bit_compactor_codec.cpp index 97f48d796b..d8fd5cc64b 100644 --- a/src/vpux_compiler/src/utils/bit_compactor_codec.cpp +++ b/src/vpux_compiler/src/utils/bit_compactor_codec.cpp @@ -7,12 +7,12 @@ using namespace vpux; -vpux::BitCompactorCodec::BitCompactorCodec(VPU::ArchKind arch_kind) { +vpux::BitCompactorCodec::BitCompactorCodec(config::ArchKind arch_kind) { switch (arch_kind) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: arch_type_ = vpux::bitc::ArchType::NPU27; break; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: arch_type_ = vpux::bitc::ArchType::NPU4; break; default: diff --git a/src/vpux_compiler/src/utils/codec_factory.cpp b/src/vpux_compiler/src/utils/codec_factory.cpp index 
f5b7a23ca4..550b861e67 100644 --- a/src/vpux_compiler/src/utils/codec_factory.cpp +++ b/src/vpux_compiler/src/utils/codec_factory.cpp @@ -9,11 +9,11 @@ namespace vpux { -std::unique_ptr getBitCompactorCodec(VPU::ArchKind arch) { +std::unique_ptr getBitCompactorCodec(config::ArchKind arch) { return std::make_unique(arch); } -std::unique_ptr makeCodec(const ICodec::CompressionAlgorithm algo, VPU::ArchKind arch) { +std::unique_ptr makeCodec(const ICodec::CompressionAlgorithm algo, config::ArchKind arch) { switch (algo) { case ICodec::CompressionAlgorithm::BITCOMPACTOR_CODEC: return getBitCompactorCodec(arch); diff --git a/src/vpux_compiler/src/utils/compression_utils.cpp b/src/vpux_compiler/src/utils/compression_utils.cpp index d6bbb89dc9..0b7ed0aec9 100644 --- a/src/vpux_compiler/src/utils/compression_utils.cpp +++ b/src/vpux_compiler/src/utils/compression_utils.cpp @@ -7,6 +7,8 @@ #include "vpux/compiler/core/layers.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" +#include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" #include "vpux/compiler/utils/memref_attr_utils.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" diff --git a/src/vpux_compiler/src/utils/constant_fusion.cpp b/src/vpux_compiler/src/utils/constant_fusion.cpp index e19ca33d74..5019c50839 100644 --- a/src/vpux_compiler/src/utils/constant_fusion.cpp +++ b/src/vpux_compiler/src/utils/constant_fusion.cpp @@ -7,7 +7,7 @@ #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/explicit_distribution_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" -#include "vpux/utils/core/custom_float.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include diff --git a/src/vpux_compiler/src/utils/dma.cpp b/src/vpux_compiler/src/utils/dma.cpp index 2146ea06d2..c8838764c9 100644 
--- a/src/vpux_compiler/src/utils/dma.cpp +++ b/src/vpux_compiler/src/utils/dma.cpp @@ -21,8 +21,8 @@ int64_t vpux::getDMAPortValue(mlir::Operation* wrappedTaskOp) { VPUX_THROW("Could not cast to DMA task '{0}'", *wrappedTaskOp); } -SmallVector vpux::getDMAChannelsWithIndependentLinkAgents(VPU::ArchKind arch) { - if (arch <= VPU::ArchKind::NPU37XX) { +SmallVector vpux::getDMAChannelsWithIndependentLinkAgents(config::ArchKind arch) { + if (arch <= config::ArchKind::NPU37XX) { return {VPUIP::DmaChannelType::NOT_SPECIFIED}; } @@ -40,8 +40,8 @@ int64_t vpux::getDMAQueueIdEncoding(std::optional c return getDMAQueueIdEncoding(0, static_cast(channel.value_or(VPUIP::DmaChannelType::NOT_SPECIFIED))); } -int64_t vpux::getDMAQueueIdEncoding(VPU::MemoryKind srcMemKind, VPU::ArchKind arch) { - if (arch <= VPU::ArchKind::NPU37XX) { +int64_t vpux::getDMAQueueIdEncoding(VPU::MemoryKind srcMemKind, config::ArchKind arch) { + if (arch <= config::ArchKind::NPU37XX) { return getDMAQueueIdEncoding(std::nullopt); } @@ -55,24 +55,24 @@ int64_t vpux::getDMAPortFromEncodedId(int64_t dmaQueueIdEncoding) { return dmaQueueIdEncoding / (VPUIP::getMaxEnumValForDmaChannelType() + 1); } -VPUIP::DmaChannelType vpux::getDMAChannelTypeFromEncodedId(int64_t dmaQueueIdEncoding, VPU::ArchKind arch) { - if (arch <= VPU::ArchKind::NPU37XX) { +VPUIP::DmaChannelType vpux::getDMAChannelTypeFromEncodedId(int64_t dmaQueueIdEncoding, config::ArchKind arch) { + if (arch <= config::ArchKind::NPU37XX) { return VPUIP::DmaChannelType::NOT_SPECIFIED; } return static_cast(dmaQueueIdEncoding % (VPUIP::getMaxEnumValForDmaChannelType() + 1)); } -std::string vpux::getDMAChannelTypeAsString(VPUIP::DmaChannelType channelType, VPU::ArchKind arch) { - if (arch <= VPU::ArchKind::NPU37XX) { +std::string vpux::getDMAChannelTypeAsString(VPUIP::DmaChannelType channelType, config::ArchKind arch) { + if (arch <= config::ArchKind::NPU37XX) { return ""; } return stringifyEnum(channelType).str(); } -std::string 
vpux::getDMAChannelTypeAsString(int64_t dmaQueueIdEncoding, VPU::ArchKind arch) { - if (arch <= VPU::ArchKind::NPU37XX) { +std::string vpux::getDMAChannelTypeAsString(int64_t dmaQueueIdEncoding, config::ArchKind arch) { + if (arch <= config::ArchKind::NPU37XX) { return ""; } diff --git a/src/vpux_compiler/src/utils/dma_limits.cpp b/src/vpux_compiler/src/utils/dma_limits.cpp index cc96018e87..847198a5e1 100644 --- a/src/vpux_compiler/src/utils/dma_limits.cpp +++ b/src/vpux_compiler/src/utils/dma_limits.cpp @@ -61,11 +61,11 @@ const EngineLimits NPU40XX_ENGINE_LIMITS_DEFAULT = { DimLimits(SizeLimits(1, 0x10000), StrideLimits(0, 0xFFFFFFFF), {}), /**/ })}; -const EngineLimits& getEngineLimits(VPU::ArchKind arch) { +const EngineLimits& getEngineLimits(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return NPU37XX_ENGINE_LIMITS_DEFAULT; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return NPU40XX_ENGINE_LIMITS_DEFAULT; default: return DEFAULT_ENGINE_LIMITS_DEFAULT; diff --git a/src/vpux_compiler/src/utils/dma_transaction_utils.cpp b/src/vpux_compiler/src/utils/dma_transaction_utils.cpp index de91851cb1..b0875dbcb9 100644 --- a/src/vpux_compiler/src/utils/dma_transaction_utils.cpp +++ b/src/vpux_compiler/src/utils/dma_transaction_utils.cpp @@ -229,6 +229,12 @@ DMATransaction getDMATransactionFromPermutation(vpux::NDTypeInterface inType, vp auto ctx = inType.getContext(); + VPUX_THROW_WHEN(inType.getRank() != outType.getRank(), "Rank mismatch between input and output types"); + VPUX_THROW_WHEN(inType.getRank() != mappingOrder.getNumDims(), + "Rank mismatch between input type and mapping order"); + VPUX_THROW_WHEN(inType.getRank() != static_cast(loopOrder.size()), + "Rank mismatch between input type and loop order"); + // Mapping order maps out logical dims to in logical dims // This mapping allows to find the in logical dim corresponding to a given out logical dim diff --git 
a/src/vpux_compiler/src/utils/dynamic_shape_propagation.cpp b/src/vpux_compiler/src/utils/dynamic_shape_propagation.cpp index 81a5a88566..f12e09ba1d 100644 --- a/src/vpux_compiler/src/utils/dynamic_shape_propagation.cpp +++ b/src/vpux_compiler/src/utils/dynamic_shape_propagation.cpp @@ -4,6 +4,7 @@ // #include +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" namespace vpux { diff --git a/src/vpux_compiler/src/utils/infer_output_shape.cpp b/src/vpux_compiler/src/utils/infer_output_shape.cpp index 9edb5280d5..4f93129346 100644 --- a/src/vpux_compiler/src/utils/infer_output_shape.cpp +++ b/src/vpux_compiler/src/utils/infer_output_shape.cpp @@ -4,9 +4,12 @@ // #include "vpux/compiler/utils/infer_output_shape.hpp" - +#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/IE/utils/type_padding.hpp" +#include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/IE/transposed_convolution_utils.hpp" #include "vpux/compiler/utils/error.hpp" +#include "vpux/compiler/utils/rewriter.hpp" #include "vpux/utils/core/checked_cast.hpp" #include "vpux/utils/core/range.hpp" @@ -23,9 +26,8 @@ #include #include #include + #include -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" -#include "vpux/compiler/dialect/IE/utils/type_padding.hpp" using namespace vpux; @@ -407,6 +409,32 @@ ShapeInfo vpux::inferGroupConvolutionOutputShapeInfo(ShapeInfo& inShapeInfo, Sha return createShapeInfoFromPartialShape(op.get_output_partial_shape(0)); } +ShapeInfo vpux::inferTransposedConvBackpropOutputShapeInfo( + const ShapeInfo& inShapeInfo, const ShapeInfo& filterShapeInfo, ArrayRef windowStrides, + ArrayRef dataPaddingBelow, ArrayRef dataPaddingAbove, ArrayRef windowDilations, + ArrayRef outputPadding) { + const auto inPartialShape = createPartialShapeFromShapeInfo(inShapeInfo); + const auto filterPartialShape = createPartialShapeFromShapeInfo(filterShapeInfo); + + auto backpropFilter = to_std_vector(filterPartialShape); + 
backpropFilter[Dims4D::Filter::OC.ind()] = inPartialShape[Dims4D::Act::C.ind()]; + + auto ovOpShape = ov::op::v1::ConvolutionBackpropData( + std::make_shared(ov::element::f32, inPartialShape), + std::make_shared(ov::element::f32, backpropFilter), + ov::Strides(windowStrides.begin(), windowStrides.end()), + ov::CoordinateDiff(dataPaddingBelow.begin(), dataPaddingBelow.end()), + ov::CoordinateDiff(dataPaddingAbove.begin(), dataPaddingAbove.end()), + ov::Strides(windowDilations.begin(), windowDilations.end()), ov::op::PadType::EXPLICIT, + ov::CoordinateDiff(outputPadding.begin(), outputPadding.end())) + .get_output_partial_shape(0); + + ovOpShape[Dims4D::Act::N.ind()] = inPartialShape[Dims4D::Act::N.ind()]; + ovOpShape[Dims4D::Act::C.ind()] = filterPartialShape[Dims4D::Filter::OC.ind()]; + + return createShapeInfoFromPartialShape(ovOpShape); +} + // // Tensor Reifiers // @@ -588,74 +616,64 @@ mlir::FailureOr> vpux::reifyEltwiseTensors(mlir: return errorAt(loc, "Unsupported BroadcastType '{0}'", broadcastType); } -namespace { +mlir::FailureOr> vpux::reifyConvPoolTensors( + mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, mlir::Value kernel, + ArrayRef kernelSize, ArrayRef strides, ArrayRef padBegin, ArrayRef padEnd, + mlir::Location loc) { + const auto inputShapedType = mlir::cast(input.getType()); + const auto outputShapedType = mlir::cast(output.getType()); -// 3d: [batch, channels, columns] -> 1 spatial dimension -// 4d: [batch, channels, rows, columns] -> 2 spatial dimensions -// 5d: [batch, channels, depth, rows, columns] -> 3 spatial dimensions -// Subtract 2 to exclude batch and channels. 
-int64_t calculateMul(const int64_t dim, const ArrayRef strides) { - const int64_t spatialDim = dim - 2; - VPUX_THROW_UNLESS(spatialDim >= 0 && spatialDim < checked_cast(strides.size()), - "Cannot get stride by index {0}", dim); - return strides[spatialDim]; -} + VPUX_THROW_WHEN(inputShapedType.getRank() != 4 || outputShapedType.getRank() != 4, + "reifyConvPoolTensors: Unsupported input or output rank: {0} , {1}", inputShapedType.getRank(), + outputShapedType.getRank()); -int64_t calculateAddend(int64_t dim, const ArrayRef kernelSize, const ArrayRef strides, - const ArrayRef padBegin, const ArrayRef padEnd) { - const int64_t spatialDim = dim - 2; - VPUX_THROW_UNLESS(spatialDim >= 0 && spatialDim < checked_cast(kernelSize.size()), - "Cannot get kernel size by index {0}", dim); - VPUX_THROW_UNLESS(spatialDim >= 0 && spatialDim < checked_cast(strides.size()), - "Cannot get stride by index {0}", dim); - VPUX_THROW_UNLESS(spatialDim >= 0 && spatialDim < checked_cast(padBegin.size()), - "Cannot get pad begin by index {0}", dim); - VPUX_THROW_UNLESS(spatialDim >= 0 && spatialDim < checked_cast(padEnd.size()), - "Cannot get pad end by index {0}", dim); - return kernelSize[spatialDim] - strides[spatialDim] - padBegin[spatialDim] - padEnd[spatialDim]; -} + if (kernel != nullptr) { + const auto kernelShapedType = mlir::cast(kernel.getType()); + VPUX_THROW_WHEN(kernelShapedType.getRank() != 4, "reifyConvPoolTensors: Unsupported kernel rank: {0}", + kernelShapedType.getRank()); + } -}; // namespace + auto makeIndex = [&](int64_t value) { + return builder.createOrFold(loc, value); + }; -mlir::FailureOr> vpux::reifyConvPoolTensors( - mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, ArrayRef kernelSize, - ArrayRef strides, ArrayRef padBegin, ArrayRef padEnd, mlir::Location loc) { - const auto inputShapedType = mlir::cast(input.getType()); - const auto outputShapedType = mlir::cast(output.getType()); + auto calculateDimSize = [&](mlir::Value inputDim, int64_t 
kernelDim, int64_t padBegin, int64_t padEnd, + int64_t stride) { + // output = (input + padBegin + padEnd - kernelDim + stride) / stride + auto padConst = padBegin + padEnd - kernelDim + stride; + auto sum = builder.createOrFold(takeOpLoc(inputDim.getDefiningOp(), "_add"), inputDim, + makeIndex(padConst)); + return builder.createOrFold(takeOpLoc(sum.getDefiningOp(), "_div"), sum, + makeIndex(stride)); + }; + + // Use generator functions based on index for each output dimension + auto computeShapeForDim = [&](int64_t idx) -> mlir::OpFoldResult { + auto dimLoc = appendLoc(loc, llvm::StringLiteral("_dim_{0}"), idx); + if (idx == Dims4D::Act::N.ind()) { + return reifyDim(builder, input, idx, dimLoc); + } else if (idx == Dims4D::Act::C.ind()) { + return kernel == nullptr ? reifyDim(builder, input, Dims4D::Act::C.ind(), dimLoc) + : reifyDim(builder, kernel, Dims4D::Filter::OC.ind(), dimLoc); + } else if (idx == Dims4D::Act::H.ind() || idx == Dims4D::Act::W.ind()) { + auto inputDim = reifyDim(builder, input, idx, dimLoc); + auto inputDimVal = inputDim.dyn_cast(); + VPUX_THROW_WHEN(inputDimVal == nullptr, "Failed to reify input dimension {0} for input {1} at location {2}", + idx, input, loc); + auto adjustedIdx = idx - 2; + return calculateDimSize(inputDimVal, kernelSize[adjustedIdx], padBegin[adjustedIdx], padEnd[adjustedIdx], + strides[adjustedIdx]); + } else { + VPUX_THROW("Unexpected dimension index {0}", idx); + } + }; SmallVector shapes; for (const auto dim : llvm::seq(0, outputShapedType.getRank())) { - if (!outputShapedType.isDynamicDim(dim)) { - // Static dim: Return IntegerAttr. - shapes.push_back(builder.getIndexAttr(inputShapedType.getDimSize(dim))); + if (outputShapedType.isDynamicDim(dim)) { + shapes.push_back(mlir::getValueOrCreateConstantIndexOp(builder, loc, computeShapeForDim(dim))); } else { - // Dynamic dim: Return Value. 
- // in_x = kernel_x + stride_x * (out_x - 1) - pad_begin_x - pad_end_x - // in_x = kernel_x + stride_x * out_x - stride_x - pad_begin_x - pad_end_x - // multiplier = stride_x - // addend = kernel_x - stride_x - pad_begin_x - pad_end_x - const auto inputMul = calculateMul(dim, strides); - const auto dimOp = builder.createOrFold(loc, input, dim); - - const auto applyMul = [&](mlir::Value value) { - if (inputMul > 1) { - mlir::Value constOp = builder.createOrFold(loc, inputMul); - return builder.createOrFold(loc, value, constOp); - } - return value; - }; - const auto afterMul = applyMul(dimOp); - - const auto addend = calculateAddend(dim, kernelSize, strides, padBegin, padEnd); - const auto applyAddend = [&](mlir::Value value) { - if (addend != 0) { - mlir::Value constOp = builder.createOrFold(loc, addend); - return builder.createOrFold(loc, value, constOp); - } - return value; - }; - const auto afterAddend = applyAddend(afterMul); - shapes.push_back(getValueOrCreateConstantIndexOp(builder, loc, afterAddend)); + shapes.push_back(builder.getIndexAttr(outputShapedType.getDimSize(dim))); } } diff --git a/src/vpux_compiler/src/utils/llvm_to_binary.cpp b/src/vpux_compiler/src/utils/llvm_to_binary.cpp index 8c47d416e6..316fc6efad 100644 --- a/src/vpux_compiler/src/utils/llvm_to_binary.cpp +++ b/src/vpux_compiler/src/utils/llvm_to_binary.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/utils/llvm_to_binary.hpp" #include "shave_ld.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -21,22 +22,22 @@ using namespace vpux; namespace { -std::string getMoviToolsArchArgument(VPU::ArchKind arch) { +std::string getMoviToolsArchArgument(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return "3720xx"; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return "4000xx"; default: VPUX_THROW("Invalid ArchKind for MoviTools usage"); } } 
-std::string getMoviLDArchPath(VPU::ArchKind arch) { +std::string getMoviLDArchPath(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return "37xxxx"; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return "40xxxx"; default: VPUX_THROW("Invalid ArchKind for Movi LLD path resolution"); @@ -137,8 +138,8 @@ void vpux::lowerLLVMToBinary(mlir::ModuleOp moduleOp, mlir::SymbolRefAttr swKern auto llvmFuncOp = moduleOp.lookupSymbol(swKernelSymbol); VPUX_THROW_UNLESS(llvmFuncOp != nullptr, "llvmFuncOp should be valid"); - const auto arch = VPU::getArch(moduleOp); - VPUX_THROW_UNLESS(arch != VPU::ArchKind::UNKNOWN, "Could not identify arch"); + const auto arch = config::getArch(moduleOp); + VPUX_THROW_UNLESS(arch != config::ArchKind::UNKNOWN, "Could not identify arch"); auto archArgument = getMoviToolsArchArgument(arch); @@ -170,15 +171,17 @@ void vpux::lowerLLVMToBinary(mlir::ModuleOp moduleOp, mlir::SymbolRefAttr swKern auto prgMCStr = std::string(mvToolsPathCompleteStr) + "/linux64/bin/moviCompile"; llvm::StringRef prgMC = prgMCStr; std::string mcpuStr = std::string("-mcpu=") + archArgument; - llvm::SmallVector runArgsMC = {prgMC, // Movicompile tool - mcpuStr, // CPU - "-S", // Only run preprocess and compilation steps - "-o", // Write output to: - "sw_layer.s", // file sw_layer.s - "-x", // Treat subsequent input files as having: - "ir", // type ir - "-O3", // optimize code - "sw_layer.ll"}; // Output file + llvm::SmallVector runArgsMC = {prgMC, // Movicompile tool + mcpuStr, // CPU + "-S", // Only run preprocess and compilation steps + "-o", // Write output to: + "sw_layer.s", // file sw_layer.s + "-x", // Treat subsequent input files as having: + "ir", // type ir + "-O3", // optimize code + "-mllvm", // Next option is for llvm + "-enable-loop-flatten", // Enable the loop flatten optimization + "sw_layer.ll"}; // Input file const auto procErrMC = llvm::sys::ExecuteAndWait(prgMC, runArgsMC, 
/*Env=*/std::nullopt, redirects, /*SecondsToWait*/ 100, /*MemoryLimit=*/0, &errMsg); diff --git a/src/vpux_compiler/src/utils/locations_verifier.cpp b/src/vpux_compiler/src/utils/locations_verifier.cpp index bf98627cf7..fc65eef9f3 100644 --- a/src/vpux_compiler/src/utils/locations_verifier.cpp +++ b/src/vpux_compiler/src/utils/locations_verifier.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/utils/locations_verifier.hpp" #include "vpux/compiler/core/developer_build_utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" #include "vpux/compiler/dialect/IE/IR/ops_interfaces.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" diff --git a/src/vpux_compiler/src/utils/logging.cpp b/src/vpux_compiler/src/utils/logging.cpp index 70194e94ab..38addb9ee7 100644 --- a/src/vpux_compiler/src/utils/logging.cpp +++ b/src/vpux_compiler/src/utils/logging.cpp @@ -9,7 +9,9 @@ #include #include +#include "vpux/compiler/dialect/VPU/utils/workload_management_status_utils.hpp" #include "vpux/compiler/dialect/VPUIP/utils/utils.hpp" +#include "vpux/compiler/utils/options.hpp" using namespace vpux; @@ -86,7 +88,7 @@ class PassLogging final : public mlir::PassInstrumentation { return; } auto module = op->getParentOfType(); - if (vpux::VPUIP::getWlmStatus(module) == vpux::VPUIP::WlmStatus::FAILED) { + if (VPU::getWorkloadManagementStatus(module) == VPU::WorkloadManagementStatus::FAILED) { _log.warning("WLM Failed Pass {0} on Operation {1}", pass->getName(), op->getLoc()); } else { _log.error("Failed Pass {0} on Operation {1}", pass->getName(), op->getLoc()); diff --git a/src/vpux_compiler/src/utils/memref_attr_utils.cpp b/src/vpux_compiler/src/utils/memref_attr_utils.cpp index 639f6c96a3..a86bc95dff 100644 --- a/src/vpux_compiler/src/utils/memref_attr_utils.cpp +++ b/src/vpux_compiler/src/utils/memref_attr_utils.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 
// #include "vpux/compiler/utils/memref_attr_utils.hpp" - #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/compiler/utils/types.hpp" diff --git a/src/vpux_compiler/src/utils/net/network_info_utils.cpp b/src/vpux_compiler/src/utils/net/network_info_utils.cpp index 67d2fe894a..5538701289 100644 --- a/src/vpux_compiler/src/utils/net/network_info_utils.cpp +++ b/src/vpux_compiler/src/utils/net/network_info_utils.cpp @@ -34,4 +34,26 @@ void eraseSectionEntries(mlir::Region& section, size_t begin) { } } +mlir::func::FuncOp findEntryPointFunc(mlir::Operation* op, Logger& log) { + if (op == nullptr) { + return nullptr; + } + + auto topModuleOp = getTopModuleOp(op); + auto netOps = to_small_vector(topModuleOp.getOps()); + if (netOps.size() != 1) { + log.warning("Expected exactly one net::NetworkInfoOp, found {0}", netOps.size()); + return nullptr; + } + + auto netInfo = netOps.front(); + auto netFunc = topModuleOp.lookupSymbol(netInfo.getEntryPointAttr()); + if (netFunc == nullptr) { + log.warning("Entry point function '{0}' not found", netInfo.getEntryPoint()); + return nullptr; + } + + return netFunc; +} + } // namespace vpux::net diff --git a/src/vpux_compiler/src/utils/options.cpp b/src/vpux_compiler/src/utils/options.cpp new file mode 100644 index 0000000000..c7c9ba758b --- /dev/null +++ b/src/vpux_compiler/src/utils/options.cpp @@ -0,0 +1,22 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// +#include "vpux/compiler/utils/options.hpp" +#include +#include + +namespace llvm { +inline ::llvm::raw_ostream& operator<<(::llvm::raw_ostream& p, vpux::WorkloadManagementMode value) { + auto valueStr = vpux::stringifyEnum(value); + return p << valueStr; +} + +template <> +struct format_provider { + static void format(const vpux::WorkloadManagementMode& val, raw_ostream& OS, StringRef /*Options*/) { + OS << vpux::stringifyEnum(val); + } +}; + +} // namespace llvm diff --git a/src/vpux_compiler/src/utils/passes.cpp b/src/vpux_compiler/src/utils/passes.cpp index f918ab6ba7..0c940052f7 100644 --- a/src/vpux_compiler/src/utils/passes.cpp +++ b/src/vpux_compiler/src/utils/passes.cpp @@ -66,6 +66,21 @@ StringLiteral vpux::stringifyEnum(WeightsTableReuseMode val) { } } +StringLiteral vpux::stringifyEnum(WorkloadManagementMode val) { + switch (val) { + case WorkloadManagementMode::PWLM_V0_LCA: + return "PWLM_V0_LCA"; + case WorkloadManagementMode::PWLM_V1_BARRIER_FIFO: + return "PWLM_V1_BARRIER_FIFO"; + case WorkloadManagementMode::PWLM_V2_PAGES: + return "PWLM_V2_PAGES"; + case WorkloadManagementMode::FWLM_V1_PAGES: + return "FWLM_V1_PAGES"; + default: + return "UNKNOWN"; + } +} + // // PatternBenefit // diff --git a/src/vpux_compiler/src/utils/permute_utils.cpp b/src/vpux_compiler/src/utils/permute_utils.cpp index a312084c95..7b4675a91c 100644 --- a/src/vpux_compiler/src/utils/permute_utils.cpp +++ b/src/vpux_compiler/src/utils/permute_utils.cpp @@ -4,6 +4,9 @@ // #include "vpux/compiler/utils/permute_utils.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/VPU/utils/distributed_tensor_utils.hpp" using namespace vpux; @@ -421,3 +424,18 @@ Dim vpux::inferDimAfterPermutation(Dim dim, DimsOrder srcOrder, DimsOrder dstOrd const auto dstDimPos = DimsOrder::fromAffineMap(perm).dimPos(Dim(srcMemDim.ind())); return 
dstOrder.dimAt(dstDimPos); } + +bool vpux::isSuitableToAdjustMemPermuteShape(vpux::NDTypeInterface inType, vpux::NDTypeInterface outType, + mlir::AffineMap permuteMap) { + // Calculate merged dims, for cases mem_perm [0, 1, 3, 2], [0, 2, 1], [1, 0] + auto [mergedPermutation, mergedMemShape] = vpux::getMergedPermutationAndShape(inType, permuteMap); + + if (outType.getDimsOrder() != DimsOrder::NCHW || inType.getDimsOrder() != DimsOrder::NCHW) { + return false; + } + if (mergedPermutation != SmallVector{1, 0} && mergedPermutation != SmallVector{0, 2, 1}) { + return false; + } + + return true; +} diff --git a/src/vpux_compiler/src/utils/quantization.cpp b/src/vpux_compiler/src/utils/quantization.cpp index 2a1f364183..239992bfa2 100644 --- a/src/vpux_compiler/src/utils/quantization.cpp +++ b/src/vpux_compiler/src/utils/quantization.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/utils/quantization.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" #include "vpux/compiler/dialect/VPU/utils/eltwise_utils.hpp" @@ -224,11 +225,14 @@ mlir::Type vpux::expandScalesAndZP(mlir::Type perAxisQType, ShapeRef padBefore, return getPerAxisTypeElem(perAxisUniformQType, newScales, newZeroPoints); } -mlir::Type vpux::tileScalesAndZP(mlir::Type perAxisQType, ShapeRef shape, ShapeRef offsets) { +mlir::Type vpux::tileScalesAndZP(mlir::Type perAxisQType, ShapeRef shape, ShapeRef offsets, ShapeRef strides) { const auto perAxisUniformQType = mlir::dyn_cast(perAxisQType); VPUX_THROW_UNLESS(perAxisUniformQType != nullptr, "perAxisQType should be a UniformQuantizedPerAxisType!"); VPUX_THROW_UNLESS(offsets.size() == shape.size(), "Offsets '{0}' doesn't match shape '{1}'", offsets, shape); + VPUX_THROW_UNLESS(strides.empty() || strides.size() == shape.size(), + "Strides '{0}' are not empty and do not match shape '{1}'", strides, shape); + VPUX_THROW_UNLESS(shape.size() >= 
static_cast(perAxisUniformQType.getQuantizedDimension()), "Unsupported shape size {0}. Quantized dimension index {1}", shape.size(), perAxisUniformQType.getQuantizedDimension()); @@ -248,15 +252,50 @@ mlir::Type vpux::tileScalesAndZP(mlir::Type perAxisQType, ShapeRef shape, ShapeR return perAxisUniformQType; } - const auto newScales = scales.slice(qSliceOffset, qSliceSize); - const auto newZeroPoints = zeroPoints.slice(qSliceOffset, qSliceSize); + auto getTiledElemType = [&](ArrayRef tiledScale, ArrayRef tiledZp) -> mlir::Type { + if (const auto perAxisQuantileQType = + mlir::dyn_cast(perAxisUniformQType)) { + return getPerAxisTypeElem(perAxisQuantileQType, tiledScale, tiledZp); + } - if (const auto perAxisQuantileQType = - mlir::dyn_cast(perAxisUniformQType)) { - return getPerAxisTypeElem(perAxisQuantileQType, newScales, newZeroPoints); + return getPerAxisTypeElem(perAxisUniformQType, tiledScale, tiledZp); + }; + + if (strides.empty() || strides[qDim] <= 1) { + return getTiledElemType(scales.slice(qSliceOffset, qSliceSize), zeroPoints.slice(qSliceOffset, qSliceSize)); } - return getPerAxisTypeElem(perAxisUniformQType, newScales, newZeroPoints); + SmallVector newScales; + SmallVector newZeroPoints; + + for (auto offset = qSliceOffset; newScales.size() < qSliceSize; offset += strides[qDim]) { + newScales.push_back(scales[offset]); + newZeroPoints.push_back(zeroPoints[offset]); + } + + return getTiledElemType(newScales, newZeroPoints); +} + +mlir::Type vpux::tileScalesAndZP(mlir::Type perAxisQType, ArrayRef offsets, ArrayRef sizes) { + const auto perAxisUniformQType = mlir::dyn_cast(perAxisQType); + VPUX_THROW_UNLESS(perAxisUniformQType != nullptr, "perAxisQType should be a UniformQuantizedPerAxisType!"); + + const auto inScales = perAxisUniformQType.getScales(); + const auto inZeroes = perAxisUniformQType.getZeroPoints(); + + std::vector newScales; + std::vector newZeroes; + auto nClusters = offsets.size(); + int64_t length = inScales.size(); + + for (size_t k = 0; 
k < nClusters; k++) { + VPUX_THROW_UNLESS(offsets[k] + sizes[k] <= length, "Slice exceeds full type length: {0} + {1} > {2}", + offsets[k], sizes[k], length); + newScales.insert(newScales.end(), inScales.begin() + offsets[k], inScales.begin() + offsets[k] + sizes[k]); + newZeroes.insert(newZeroes.end(), inZeroes.begin() + offsets[k], inZeroes.begin() + offsets[k] + sizes[k]); + } + + return getPerAxisTypeElem(perAxisUniformQType, newScales, newZeroes); } mlir::Type vpux::changeAxis(mlir::Type perAxisQType, int32_t newAxis) { diff --git a/src/vpux_compiler/src/utils/shave.cpp b/src/vpux_compiler/src/utils/shave.cpp index 66b4a9d852..06cf47fb71 100644 --- a/src/vpux_compiler/src/utils/shave.cpp +++ b/src/vpux_compiler/src/utils/shave.cpp @@ -21,13 +21,13 @@ bool vpux::VPU::isFifoPerShaveEngineEnabled(mlir::Operation* op) { return VPU::getConstraint(op, VPU::USE_DEDICATED_FIFO_PER_SHAVE_ENGINE); } -bool vpux::VPU::hasSupportForFifoPerShaveEngine(VPU::ArchKind arch, bool enableWorkloadManagement) { +bool vpux::VPU::hasSupportForFifoPerShaveEngine(config::ArchKind arch, bool enableWorkloadManagement) { if (!enableWorkloadManagement) { return false; } switch (arch) { - case VPU::ArchKind::NPU37XX: { + case config::ArchKind::NPU37XX: { return false; } default: { diff --git a/src/vpux_compiler/src/utils/swizzling_utils.cpp b/src/vpux_compiler/src/utils/swizzling_utils.cpp index e427336ef3..2ecedbce3d 100644 --- a/src/vpux_compiler/src/utils/swizzling_utils.cpp +++ b/src/vpux_compiler/src/utils/swizzling_utils.cpp @@ -4,18 +4,18 @@ // #include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/compiler/core/attributes/stride_reqs.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" #include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/types.hpp" using namespace vpux; -int64_t vpux::getSizeAlignmentForSwizzling(VPU::ArchKind arch) { 
+int64_t vpux::getSizeAlignmentForSwizzling(config::ArchKind arch) { switch (arch) { - case VPU::ArchKind::NPU37XX: + case config::ArchKind::NPU37XX: return SWIZZLING_SIZE_ALIGNMENT_VPUX37XX; - case VPU::ArchKind::NPU40XX: + case config::ArchKind::NPU40XX: return SWIZZLING_SIZE_ALIGNMENT_VPUX40XX; default: { } @@ -23,7 +23,7 @@ int64_t vpux::getSizeAlignmentForSwizzling(VPU::ArchKind arch) { VPUX_THROW("Architecture {0} does not support swizzling", arch); } -VPUIP::SwizzlingSchemeAttr vpux::createSwizzlingSchemeAttr(mlir::MLIRContext* ctx, VPU::ArchKind archKind, +VPUIP::SwizzlingSchemeAttr vpux::createSwizzlingSchemeAttr(mlir::MLIRContext* ctx, config::ArchKind archKind, int64_t swizzlingKey) { VPUIP::SwizzlingSchemeAttr swizzlingSchemeAttr = nullptr; if (swizzlingKey < 1 || swizzlingKey > 5) { @@ -38,7 +38,7 @@ VPUIP::SwizzlingSchemeAttr vpux::createSwizzlingSchemeAttr(mlir::MLIRContext* ct return swizzlingSchemeAttr; } -int64_t vpux::getAddressAlignmentForSwizzling(int64_t swizzlingKey, VPU::ArchKind archKind) { +int64_t vpux::getAddressAlignmentForSwizzling(int64_t swizzlingKey, config::ArchKind archKind) { if (swizzlingKey < 1 || swizzlingKey > 5) { return 0; } @@ -49,7 +49,7 @@ int64_t vpux::getAddressAlignmentForSwizzling(int64_t swizzlingKey, VPU::ArchKin {3, 4096}, {4, 8192}, {5, 16384}}; - int64_t archMultiplier = archKind >= VPU::ArchKind::NPU40XX ? 2 : 1; + int64_t archMultiplier = archKind >= config::ArchKind::NPU40XX ? 
2 : 1; return swizzlingAddressAlignment.at(swizzlingKey) * archMultiplier; } @@ -146,7 +146,7 @@ int64_t vpux::getSwizzlingKey(mlir::Type type) { return 0; } -mlir::Type vpux::setSwizzlingKey(mlir::Type type, mlir::IntegerAttr swizzlingKeyAttr, VPU::ArchKind archKind) { +mlir::Type vpux::setSwizzlingKey(mlir::Type type, mlir::IntegerAttr swizzlingKeyAttr, config::ArchKind archKind) { VPUX_THROW_WHEN(type == nullptr, "NULL type provided"); if (!swizzlingKeyAttr) { @@ -201,7 +201,7 @@ mlir::Type vpux::setSwizzlingKey(mlir::Type type, mlir::IntegerAttr swizzlingKey VPUX_THROW("Unsupported type for storing swizzling setting"); } -mlir::Type vpux::setSwizzlingKey(mlir::Type type, int64_t swizzlingKey, VPU::ArchKind archKind) { +mlir::Type vpux::setSwizzlingKey(mlir::Type type, int64_t swizzlingKey, config::ArchKind archKind) { if (swizzlingKey < 1 || swizzlingKey > 5) { return type; } diff --git a/src/vpux_compiler/src/utils/types.cpp b/src/vpux_compiler/src/utils/types.cpp index da0ce4c246..607daa8c50 100644 --- a/src/vpux_compiler/src/utils/types.cpp +++ b/src/vpux_compiler/src/utils/types.cpp @@ -4,25 +4,24 @@ // #include "vpux/compiler/utils/types.hpp" - #include "vpux/compiler/core/attributes/stride_reqs.hpp" #include "vpux/compiler/core/attributes/strides.hpp" #include "vpux/compiler/core/types/quantile_float/types.hpp" #include "vpux/compiler/dialect/IE/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" -#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/interfaces/type_interfaces.hpp" #include "vpux/compiler/utils/attributes.hpp" #include "vpux/compiler/utils/quantization.hpp" -#include "vpux/compiler/utils/swizzling_utils.hpp" #include "vpux/utils/core/error.hpp" -#include - #include +#include #include + #include -#include using 
namespace vpux; diff --git a/src/vpux_compiler/src/utils/wlm_legalization_utils.cpp b/src/vpux_compiler/src/utils/wlm_legalization_utils.cpp index eb01581e6e..4d598c3dac 100644 --- a/src/vpux_compiler/src/utils/wlm_legalization_utils.cpp +++ b/src/vpux_compiler/src/utils/wlm_legalization_utils.cpp @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // #include "vpux/compiler/utils/wlm_legalization_utils.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/ops.hpp" + #include -#include "vpux/compiler/dialect/IE/utils/resources.hpp" -#include "vpux/compiler/dialect/VPU/utils/wlm_constraint_utils.hpp" namespace vpux { @@ -175,4 +175,38 @@ bool inSameTaskBlock(size_t task1, size_t task2, const BlockRange& blockRange) { }); } +/// Utils for adding placeholder fetch DMAs + +// Function returns index of a task +size_t getIndexOfTask(IndexType indexType, ArrayRef dummyDMAs, BarrierInfo& barrierInfo) { + if (indexType.second == Type::Dummy) { + return barrierInfo.getIndex(dummyDMAs[indexType.first]); + } + return indexType.first; +} + +// Function returns index of a barrier +size_t getIndexOfBarrier(IndexType indexType, ArrayRef dummyBarriers, + BarrierInfo& barrierInfo) { + if (indexType.second == Type::Dummy) { + return barrierInfo.getIndex(dummyBarriers[indexType.first]); + } + return indexType.first; +} + +VPURT::TaskOp createFetchDMA(mlir::OpBuilder& builder, mlir::Value input, mlir::Value output, int port, + mlir::ValueRange waitBarriers, mlir::ValueRange updateBarriers, + VPUIP::FetchDMAAttr fetchDMAAttr, llvm::StringLiteral opName) { + auto* ctx = builder.getContext(); + auto syncDmaLoc = mlir::NameLoc::get(mlir::StringAttr::get(ctx, opName)); + auto portAttr = vpux::getIntAttr(ctx, port); + + auto fetchDMAOp = VPURT::wrapIntoTaskOp( + builder, waitBarriers, updateBarriers, syncDmaLoc, input, output, portAttr, + /*isOutOfOrder*/ nullptr, /*isCritical*/ nullptr, /*dmaHwpId*/ nullptr, + /*dmaProfilingMetaData*/ nullptr, fetchDMAAttr); + + return 
fetchDMAOp->getParentOfType(); +} + } // namespace vpux diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/IE/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/IE/passes.td index 777d643127..684c19bd6c 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/IE/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/IE/passes.td @@ -28,96 +28,6 @@ def InsertIdentityPoolBeforeOp : PassBase<"insert-identity-pool-before-op", "vpu ]; } -// -// MapBilinearInterpolateOnDPUPass -// - -def MapBilinearInterpolateOnDPUPass : PassBase<"map-bilinear-interpolate-on-dpu", "vpux::FunctionPass"> { - let summary = "Convert bilinear interpolate op to strided concat, AvgPool and some depthwise convolution Ops"; - - let description = [{ - Purpose: - This pass replaces `Bilinear Interpolate` for which tiling is required to fit in CMX - with sequences of operation that can be mapped on DPU and DMA. - - How it works: - The supported interpolation axis currently supported are H and W. - For each of these axis the scaling is happening individually, first perform vertical scaling and after perform horizontal scaling. - On each axis the processing is split in three main regions BEGIN, MIDDLE and END. - These three regions refers to slices from the output tensor and are influenced by the coordinate transformation mode attribute. 
- * BEGIN - refers to the slice from output for which it is only needed to duplicate the first line/column from input - * MIDDLE - refers to the slice from output where: - * for each output line/column from the output it is required to take two consecutive lines/colums from the input - * based on the coordinate transformation mode attribute compute the weight coefficients each of two lines/columns - has on theresulting output line/column - * each output line/column is computed with a GroupConvolution operation for which the weights are obtained by expanding - the weight coefficients of the input lines/columns - * END - refers to the slice from output for which it is only needed to duplicate the last line/column from input - ``` - Vertical scaling: Horizontal scaling - ________________________ ____________________________ - | BEGIN | | | | | - |______________________| | | | | - | | | B | M | | - | | | E | I | E | - | MIDDLE | | G | D | N | - | | | I | D | D | - |______________________| | N | L | | - | END | | | E | | - |______________________| |___|__________________|___| - ``` - - The rewrite implemented per each region is described below: - BEGIN region: - ``` Input - | - Slice - first line/column - | ... | - Identity Identity - AvgPool AvgPool - - MIDDLE region - Input - ---------|--------- - | | - Slice ... Slice - two lines/colums two lines/colums - | | - GroupConv GroupConv - one output line/colum one output line/colum - - END region: - Input - | - Slice - last line/column - | ... | - Identity Identity - AvgPool AvgPool - ``` - At the end the results of all the operation resulted are concatenated together on the scaling axis. - - In case the `interpolateAsSEOp` option is set to true, only cases that cannot be executed - using the Storage Element hardware feature will be converted to concats. 
- }]; - - let constructor = "vpux::IE::arch37xx::createMapBilinearInterpolateOnDPUPass()"; - - let dependentDialects = [ - "vpux::IE::IEDialect" - ]; - - - let options = [ - Option< - "interpolateAsSEOp", "interpolate-as-se-op", - "bool", "false", - "Flag which identifies whether an Interpolate operation can be executed using the Storage Element hardware feature" - > - ]; -} - // // OptimizeSliceExpand // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/NPUReg37XX/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/NPUReg37XX/CMakeLists.txt index 01245ea6d9..3b60d22900 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/NPUReg37XX/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/NPUReg37XX/CMakeLists.txt @@ -4,4 +4,5 @@ # add_vpux_dialect(NPUReg37XX) +add_vpux_ops(NPUReg37XX GENERIC) add_vpux_type(NPUReg37XX) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIP/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIP/passes.td index c214ec6427..fd6813277b 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIP/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIP/passes.td @@ -85,4 +85,21 @@ def UnrollSpaceToDepthDMA : PassBase<"unroll-space-to-depth-dma", "vpux::Functio let constructor = "vpux::VPUIP::arch37xx::createUnrollSpaceToDepthDMAPass()"; } +// +// UnrollPermuteDMA +// + +def UnrollPermuteDMA : PassBase<"unroll-permute-dma", "vpux::FunctionPass"> { + let summary = "Unroll PermuteDMA tasks"; + + let description = [{ + This pass unrolls PermuteDMA tasks to one or several PermuteDMA tasks. + The number of new PermuteDMA tasks depends on the number of planes (num_planes <= 256). + 1. NCHW -> NHWC: The number of planes is C. + 2. NHWC -> NCHW: The number of planes is H * W, and W must be <= 256. 
+ }]; + + let constructor = "vpux::VPUIP::arch37xx::createUnrollPermuteDMAPass()"; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.td index dcdbc66c4f..41c3680e99 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU37XX/dialect/VPUIPDPU/ops.td @@ -7,6 +7,7 @@ #define VPUX_COMPILER_DIALECT_VPUIPDPU_OPS include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/config/ops_interfaces.td" include "vpux/compiler/dialect/VPUIPDPU/attributes.td" include "vpux/compiler/dialect/VPUIPDPU/dialect.td" include "vpux/compiler/dialect/VPUIPDPU/ops_interfaces.td" @@ -22,7 +23,7 @@ class VPUIPDPU_Op traits = []> : !listconcat( traits, [ - DefinedInArch<"vpux::VPU::ArchKind::NPU37XX"> + DefinedInArch<"vpux::config::ArchKind::NPU37XX"> ] ) >; @@ -58,7 +59,7 @@ def VPUIPDPU_IDUWeights37XXOp : VPUIPDPU_Op<"IDUWeights37XX", [ HasParent<"vpux::VPUIPDPU::IDUCfgOp">, - LimitedToArch<["vpux::VPU::ArchKind::NPU37XX"]> + LimitedToArch<["vpux::config::ArchKind::NPU37XX"]> ] > { let summary = "IDU Weight offset NPU37XX"; @@ -532,7 +533,7 @@ def VPUIPDPU_IDUSESegmentOp : VPUIPDPU_Op<"IDUSESegment", [ HasParent<"vpux::VPUIPDPU::IDUCfgOp">, - LimitedToArch<["vpux::VPU::ArchKind::NPU37XX"]> + LimitedToArch<["vpux::config::ArchKind::NPU37XX"]> ] > { @@ -576,7 +577,7 @@ def VPUIPDPU_IDUSPSegmentOp : VPUIPDPU_Op<"IDUSPSegment", [ HasParent<"vpux::VPUIPDPU::IDUCfgOp">, - LimitedToArch<["vpux::VPU::ArchKind::NPU37XX"]> + LimitedToArch<["vpux::config::ArchKind::NPU37XX"]> ] > { let summary = "IDU sparsity segmentation config"; @@ -1021,7 +1022,7 @@ def VPUIPDPU_ODUCastOp : VPUIPDPU_Op<"ODUCast", [ HasParent<"vpux::VPUIPDPU::ODUCfgOp">, - LimitedToArch<["vpux::VPU::ArchKind::NPU37XX"]> + LimitedToArch<["vpux::config::ArchKind::NPU37XX"]> ] > { let summary = "ODU cast config."; diff 
--git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/CMakeLists.txt index 24e3a12ec2..39eb9b9e52 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(ELF) +add_vpux_ops(ELF GENERIC) add_vpux_ops_interface(ELF NPU40XX/dialect/ELF/) add_vpux_attribute(ELF ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) add_vpux_pass(ELF ELF NPU40XX/dialect/ELF/) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.td index efea74aa76..92e80641df 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/ELF/ops_interfaces.td @@ -196,7 +196,7 @@ def ELF_BinarySizeOpInterface : OpInterface<"BinarySizeOpInterface"> { // E#136377 "Get the size in bytes of the serialized form of this object; it maybe not the same as sizeof for non-POD types", "size_t", - "getBinarySize", (ins "::vpux::VPU::ArchKind":$arch), [{}], + "getBinarySize", (ins "::vpux::config::ArchKind":$arch), [{}], /*defaultImplementation*/ [{ VPUX_THROW("Unexpected call to interface implementation."); }] @@ -205,7 +205,7 @@ def ELF_BinarySizeOpInterface : OpInterface<"BinarySizeOpInterface"> { InterfaceMethod< "Get the size in bytes of the serialized form of this object, using a symbol reference map for faster look-up", "size_t", - "getBinarySizeCached", (ins "ELF::SymbolReferenceMap&":$symRefMap, "::vpux::VPU::ArchKind":$arch), [{}], + "getBinarySizeCached", (ins "ELF::SymbolReferenceMap&":$symRefMap, "::vpux::config::ArchKind":$arch), [{}], /*defaultImplementation*/ [{ return $_op.getBinarySize(arch); }] @@ -214,7 +214,7 @@ def ELF_BinarySizeOpInterface : OpInterface<"BinarySizeOpInterface"> { 
InterfaceMethod< "Get the alignment requirements for the serialization of this op", "size_t", - "getAlignmentRequirements", (ins "::vpux::VPU::ArchKind":$arch), [{}], + "getAlignmentRequirements", (ins "::vpux::config::ArchKind":$arch), [{}], /*defaultImplementation*/ [{ return ELF::VPUX_NO_ALIGNMENT; }] diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/IE/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/IE/passes.td index 41a5ca903a..9731fd1e72 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/IE/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/IE/passes.td @@ -8,96 +8,6 @@ include "mlir/Pass/PassBase.td" -// -// MapBilinearInterpolateOnDPUPass -// - -def MapBilinearInterpolateOnDPUPass : PassBase<"map-bilinear-interpolate-on-dpu", "vpux::FunctionPass"> { - let summary = "Convert bilinear interpolate op to strided concat, AvgPool and some depthwise convolution Ops"; - - let description = [{ - Purpose: - This pass replaces `Bilinear Interpolate` for which tiling is required to fit in CMX - with sequences of operation that can be mapped on DPU and DMA. - - How it works: - The supported interpolation axis currently supported are H and W. - For each of these axis the scaling is happening individually, first perform vertical scaling and after perform horizontal scaling. - On each axis the processing is split in three main regions BEGIN, MIDDLE and END. - These three regions refers to slices from the output tensor and are influenced by the coordinate transformation mode attribute. 
- * BEGIN - refers to the slice from output for which it is only needed to duplicate the first line/column from input - * MIDDLE - refers to the slice from output where: - * for each output line/column from the output it is required to take two consecutive lines/colums from the input - * based on the coordinate transformation mode attribute compute the weight coefficients each of two lines/columns - has on theresulting output line/column - * each output line/column is computed with a GroupConvolution operation for which the weights are obtained by expanding - the weight coefficients of the input lines/columns - * END - refers to the slice from output for which it is only needed to duplicate the last line/column from input - ``` - Vertical scaling: Horizontal scaling - ________________________ ____________________________ - | BEGIN | | | | | - |______________________| | | | | - | | | B | M | | - | | | E | I | E | - | MIDDLE | | G | D | N | - | | | I | D | D | - |______________________| | N | L | | - | END | | | E | | - |______________________| |___|__________________|___| - ``` - - The rewrite implemented per each region is described below: - BEGIN region: - ``` Input - | - Slice - first line/column - | ... | - Identity Identity - AvgPool AvgPool - - MIDDLE region - Input - ---------|--------- - | | - Slice ... Slice - two lines/colums two lines/colums - | | - GroupConv GroupConv - one output line/colum one output line/colum - - END region: - Input - | - Slice - last line/column - | ... | - Identity Identity - AvgPool AvgPool - ``` - At the end the results of all the operation resulted are concatenated together on the scaling axis. - - In case the `interpolateAsSEOp` option is set to true, only cases that cannot be executed - using the Storage Element hardware feature will be converted to concats. 
- }]; - - let constructor = "vpux::IE::arch40xx::createMapBilinearInterpolateOnDPUPass()"; - - let dependentDialects = [ - "vpux::IE::IEDialect" - ]; - - - let options = [ - Option< - "interpolateAsSEOp", "interpolate-as-se-op", - "bool", "false", - "Flag which identifies whether an Interpolate operation can be executed using the Storage Element hardware feature" - > - ]; -} - // // ReduceNumTilesForSmallModelsPass // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/CMakeLists.txt index a7de56f069..729c4abd7b 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/CMakeLists.txt @@ -4,8 +4,10 @@ # add_vpux_dialect(NPUReg40XX) +add_vpux_ops(NPUReg40XX GENERIC) add_vpux_ops_interface(NPUReg40XX dialect/NPUReg40XX/) add_vpux_attribute(NPUReg40XX ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) +add_vpux_pass(NPUReg40XX NPUReg40XX NPU40XX/dialect/NPUReg40XX/) # E#135032: add_vpux_type to be replaced with add_npu_reg_type add_vpux_type(NPUReg40XX) add_npu_reg_type(NPU40XX) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.td new file mode 100644 index 0000000000..e61d22f332 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.td @@ -0,0 +1,11 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_NPUREG40XX_PASSES +#define VPUX_COMPILER_DIALECT_NPUREG40XX_PASSES + +include "mlir/Pass/PassBase.td" + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/types.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/types.td index 3380f673c4..4c1a1f27be 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/NPUReg40XX/types.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIP/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIP/passes.td index 1b5f76019e..fb6c621982 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIP/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIP/passes.td @@ -172,15 +172,6 @@ def AddStartBarrier: PassBase<"add-start-barrier", "vpux::FunctionPass"> { DMATask -> Barrier -> DMATask -> other tasks }]; - let options = [ - Option< - "enableCompilerBarrierProgramming", "enable-compiler-barrier-programming", - "bool", "false", - "Indicates whether barrier programming was done by the compiler or the runtime" - > - ]; - - let constructor = "vpux::VPUIP::arch40xx::createAddStartBarrierPass()"; } @@ -227,11 +218,11 @@ def FuseSegmentedDMA : PassBase<"fuse-segmented-dma", "vpux::FunctionPass"> { } // -// LegalizeScheduleForWlmFetchDmas +// LegalizeScheduleForPartialWlmFetchDmas // -def LegalizeScheduleForWlmFetchDmas : PassBase<"legalize-schedule-for-wlm", "vpux::FunctionPass"> { - let summary = "Pass inserts dummy dmas to facilitate workload-management pass"; +def LegalizeScheduleForPartialWlmFetchDmas : PassBase<"legalize-schedule-for-partial-wlm", "vpux::FunctionPass"> { + let summary = "Pass 
inserts dummy dmas to facilitate add-fetch-ops pass"; let description = [{ Workload Management pass expects to always find a DMA on tile 0 and list 0 for attaching a @@ -242,7 +233,7 @@ def LegalizeScheduleForWlmFetchDmas : PassBase<"legalize-schedule-for-wlm", "vpu dummy DMAs such we would always find DMAs during fetch task insertion while maintaining a valid schedule }]; - let constructor = "vpux::VPUIP::arch40xx::createLegalizeScheduleForWlmFetchDmasPass()"; + let constructor = "vpux::VPUIP::arch40xx::createLegalizeScheduleForPartialWlmFetchDmasPass()"; } // @@ -278,4 +269,50 @@ def UnrollSpaceToDepthDMA : PassBase<"unroll-space-to-depth-dma", "vpux::Functio let constructor = "vpux::VPUIP::arch40xx::createUnrollSpaceToDepthDMAPass()"; } +// +// AddPlaceholderFetchDMAs +// + +def AddPlaceholderFetchDMAs : PassBase<"add-placeholder-fetch-dmas", "vpux::FunctionPass"> { + let summary = "Pass inserts fetch dmas to facilitate add-fetch-ops pass"; + + let description = [{ + [Group N-2]..[Group N-1]..[Group N] + For the first 2 available groups it inserts FetchDMAs at beginning of the DMA list on port 0 ch:DDR + + For the rest of groups it inserts FetchDMA with barriers. It creates 2 barriers such that last task of Group N-2 updates + barrier X and last task of Group N-1 waits for barrier Y. 
In such case it inserts FetchDMA which waits for barrier X and + updates barrier Y ensuring fetching for Group N is finished before Group N-1 is finished execution + + These FetchDMAs are replaced with actual fetch DMAs by add-fetch-dma-ops-full-wlm pass + To identify which DMA belongs to which group on which queue we store fetchDMAAttr which contains information like + FetchDMAAttr{ + executorKind + tileIdx + listIdx + executionGroupIdx + } + Any FetchDMA can be uniquely identified by above parameters and replaced + }]; + + let constructor = "vpux::VPUIP::arch40xx::createAddPlaceholderFetchDMAsPass()"; +} + +// +// UnrollPermuteDMA +// + +def UnrollPermuteDMA : PassBase<"unroll-permute-dma", "vpux::FunctionPass"> { + let summary = "Unroll PermuteDMA tasks"; + + let description = [{ + This pass unrolls PermuteDMA tasks to one or several PermuteDMA tasks. + The multi-cluster unrolling is carried over from NPU37XX and the single-cluster unrolling is completely new + for NPU40XX in order to leverage the more flexible DMA capabilities of NPU40XX. + After this pass completes, all PermuteDMA tasks will have an InternalDataFlowAttr attached. 
+ }]; + + let constructor = "vpux::VPUIP::arch40xx::createUnrollPermuteDMAPass()"; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.td index 80aef086ec..d5be18cd29 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPUIPDPU/ops.td @@ -7,6 +7,7 @@ #define VPUX_COMPILER_DIALECT_VPUIPDPU_OPS include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/config/ops_interfaces.td" include "vpux/compiler/dialect/VPUIPDPU/attributes.td" include "vpux/compiler/dialect/VPUIPDPU/dialect.td" include "vpux/compiler/dialect/VPUIPDPU/ops_interfaces.td" @@ -22,7 +23,7 @@ class VPUIPDPU_Op traits = []> : !listconcat( traits, [ - DefinedInArch<"vpux::VPU::ArchKind::NPU40XX"> + DefinedInArch<"vpux::config::ArchKind::NPU40XX"> ] ) >; @@ -50,7 +51,7 @@ Setting this value will trigger pool_wt_rd_dis and disable reading of weights fr wt_plt_cfg - This config allows to specify the weight palletization in the IDU. -quantiles_lut - Optiona attribute that stores the weight palletization table values, when palletization is enabled. +quantiles_lut - Optional attribute that stores the weight palletization table values, when palletization is enabled. 
}]; let arguments =(ins diff --git a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPURT/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPURT/passes.td index 9779aed00f..b92020969b 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPURT/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/NPU40XX/dialect/VPURT/passes.td @@ -201,6 +201,30 @@ def WlmInsertDummyDmasInPages : PassBase<"wlm-insert-dummy-dmas-in-pages", "vpux ]; } +// +// WlmInsertDummyBarriersInPages +// + +def WlmInsertDummyBarriersInPages : PassBase<"wlm-insert-dummy-barriers-in-pages", "vpux::FunctionPass"> { + let summary = "For WLM pages split insert dummy barriers if given page doesn't use half of physical barriers"; + + let description = [{ + This pass checks if in each page except last one number of barriers is equal to half of + number of physical barriers. If not, then it inserts dummy barriers to fill the gap. + Dummy barriers will be inserted parallel to some existing barrier with same consumer and producer. 
+ }]; + + let constructor = "vpux::VPURT::arch40xx::createWlmInsertDummyBarriersInPagesPass()"; + + let options = [ + Option< + "numBarriersOpt", "num-barriers", + "int", "", + "Number of physical barriers, available for use" + > + ]; +} + // // OptimizeBarriersSlotsUsage // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/conversion/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/conversion/passes.td index 9b511ab6f1..cc191d0583 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/conversion/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/conversion/passes.td @@ -288,7 +288,7 @@ def ConvertVPUMI40XX2VPUASM : PassBase<"convert-VPUMI40XX-to-VPUASM", "vpux::Mod let options = [ Option< "enablePWLMOpt", "workload-management-enable", - "bool", "false", + "bool", "true", "Flag which identifies enablement of partial workload management" > ]; diff --git a/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_eltwise_layers_to_math.td b/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_eltwise_layers_to_math.td index 91f10749b2..3ca9897b2f 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_eltwise_layers_to_math.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_eltwise_layers_to_math.td @@ -6,7 +6,7 @@ #ifndef VPUX_COMPILER_CONVERSION_REWRITERS_CONVERT_ELTWISE_LAYERS_TO_MATH #define VPUX_COMPILER_CONVERSION_REWRITERS_CONVERT_ELTWISE_LAYERS_TO_MATH -include "vpux/compiler/dialect/IE/ops.td" +include "vpux/compiler/dialect/IE/ops/arithmetic.td" include "mlir/Dialect/Math/IR/MathOps.td" include "mlir/IR/OpBase.td" diff --git a/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_layers_to_VPU.td b/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_layers_to_VPU.td index 8ba71b965e..b9b026647c 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_layers_to_VPU.td +++ 
b/src/vpux_compiler/tblgen/vpux/compiler/conversion/rewriters/convert_layers_to_VPU.td @@ -6,9 +6,43 @@ #ifndef VPUX_COMPILER_CONVERSION_REWRITERS_CONVERT_LAYERS_TO_VPU #define VPUX_COMPILER_CONVERSION_REWRITERS_CONVERT_LAYERS_TO_VPU -include "vpux/compiler/dialect/IE/ops.td" +include "vpux/compiler/dialect/IE/ops/activation.td" +include "vpux/compiler/dialect/IE/ops/arithmetic.td" +include "vpux/compiler/dialect/IE/ops/bitwise.td" +include "vpux/compiler/dialect/IE/ops/comparison.td" +include "vpux/compiler/dialect/IE/ops/control_flow.td" +include "vpux/compiler/dialect/IE/ops/convolution.td" include "vpux/compiler/dialect/IE/ops/data_movement.td" -include "vpux/compiler/dialect/VPU/ops.td" +include "vpux/compiler/dialect/IE/ops/data_type.td" +include "vpux/compiler/dialect/IE/ops/eltwise.td" +include "vpux/compiler/dialect/IE/ops/image.td" +include "vpux/compiler/dialect/IE/ops/logical.td" +include "vpux/compiler/dialect/IE/ops/normalization.td" +include "vpux/compiler/dialect/IE/ops/pooling.td" +include "vpux/compiler/dialect/IE/ops/recurrent.td" +include "vpux/compiler/dialect/IE/ops/reduce.td" +include "vpux/compiler/dialect/IE/ops/resources.td" +include "vpux/compiler/dialect/IE/ops/shape_manipulation.td" +include "vpux/compiler/dialect/IE/ops/specialized.td" + +include "vpux/compiler/dialect/VPU/ops/activation.td" +include "vpux/compiler/dialect/VPU/ops/arithmetic.td" +include "vpux/compiler/dialect/VPU/ops/bitwise.td" +include "vpux/compiler/dialect/VPU/ops/comparison.td" +include "vpux/compiler/dialect/VPU/ops/control_flow.td" +include "vpux/compiler/dialect/VPU/ops/convolution.td" +include "vpux/compiler/dialect/VPU/ops/data_movement.td" +include "vpux/compiler/dialect/VPU/ops/data_type.td" +include "vpux/compiler/dialect/VPU/ops/eltwise.td" +include "vpux/compiler/dialect/VPU/ops/image.td" +include "vpux/compiler/dialect/VPU/ops/internal.td" +include "vpux/compiler/dialect/VPU/ops/logical.td" +include "vpux/compiler/dialect/VPU/ops/normalization.td" 
+include "vpux/compiler/dialect/VPU/ops/pooling.td" +include "vpux/compiler/dialect/VPU/ops/recurrent.td" +include "vpux/compiler/dialect/VPU/ops/reduce.td" +include "vpux/compiler/dialect/VPU/ops/shape_manipulation.td" +include "vpux/compiler/dialect/VPU/ops/specialized.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" @@ -1426,22 +1460,6 @@ def RewritePad : (createPadOp $input, $pads_begin, $pads_end, $pad_value, $pads_begin_attr, $pads_end_attr, $pad_value_attr, $mode, $output_padding, $input_padding) >; -// -// IE.Copy -> VPU.Copy -// - -def createCopyOp : - NativeCodeCall<[{ - $_builder.create( - $_loc, $0, $1) - }]>; - -def RewriteCopy : - Pat< - (IE_CopyOp $input, $out_mem_space), - (createCopyOp $input, $out_mem_space) - >; - // // IE.Slice -> VPU.Slice // @@ -2264,8 +2282,8 @@ def RewriteRoPE : def createSDPAOp : NativeCodeCall<[{ - $_builder.create($_loc, $0, $1, $2, - $3.size() == 1 ? $3[0] : nullptr, + $_builder.create($_loc, $0, $1, $2, + $3.size() == 1 ? $3[0] : nullptr, $4.size() == 1 ? $4[0] : nullptr, $5.size() == 1 ? 
$5[0] : nullptr, nullptr) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/ELFNPU37XX/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/ELFNPU37XX/CMakeLists.txt index 40098ea2d0..1ee8c15cfe 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/ELFNPU37XX/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/ELFNPU37XX/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(ELFNPU37XX) +add_vpux_ops(ELFNPU37XX GENERIC) add_vpux_ops_interface(ELFNPU37XX dialect/ELFNPU37XX/) add_vpux_attribute(ELFNPU37XX ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) add_vpux_type(ELFNPU37XX) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/HostExec/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/HostExec/CMakeLists.txt index d110cc269e..8cc0cdc285 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/HostExec/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/HostExec/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(HostExec) +add_vpux_ops(HostExec GENERIC) add_vpux_pass(HostExec HostExec dialect/HostExec/) add_vpux_attribute(HostExec ENABLE_VPUX_ATTR) add_vpux_type(HostExec) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/CMakeLists.txt index 888c6ce978..fc99993522 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/CMakeLists.txt @@ -4,6 +4,24 @@ # add_vpux_dialect(IE) +add_vpux_ops_granular(IE GENERIC ops/ activation) +add_vpux_ops_granular(IE GENERIC ops/ arithmetic) +add_vpux_ops_granular(IE GENERIC ops/ bitwise) +add_vpux_ops_granular(IE GENERIC ops/ comparison) +add_vpux_ops_granular(IE GENERIC ops/ control_flow) +add_vpux_ops_granular(IE GENERIC ops/ convolution) +add_vpux_ops_granular(IE GENERIC ops/ data_movement) +add_vpux_ops_granular(IE GENERIC ops/ data_type) +add_vpux_ops_granular(IE GENERIC ops/ eltwise) 
+add_vpux_ops_granular(IE GENERIC ops/ image) +add_vpux_ops_granular(IE GENERIC ops/ logical) +add_vpux_ops_granular(IE GENERIC ops/ normalization) +add_vpux_ops_granular(IE GENERIC ops/ pooling) +add_vpux_ops_granular(IE GENERIC ops/ recurrent) +add_vpux_ops_granular(IE GENERIC ops/ reduce) +add_vpux_ops_granular(IE GENERIC ops/ resources) +add_vpux_ops_granular(IE GENERIC ops/ shape_manipulation) +add_vpux_ops_granular(IE GENERIC ops/ specialized) add_vpux_ops_interface(IE dialect/IE/) add_vpux_attr_interface(IE dialect/IE/) add_vpux_attribute(IE ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops.td deleted file mode 100644 index d99c29ab44..0000000000 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops.td +++ /dev/null @@ -1,60 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef VPUX_COMPILER_DIALECT_IE_OPS -#define VPUX_COMPILER_DIALECT_IE_OPS - -include "vpux/compiler/dialect/IE/ops/activation.td" -include "vpux/compiler/dialect/IE/ops/arithmetic.td" -include "vpux/compiler/dialect/IE/ops/bitwise.td" -include "vpux/compiler/dialect/IE/ops/comparison.td" -include "vpux/compiler/dialect/IE/ops/control_flow.td" -include "vpux/compiler/dialect/IE/ops/convolution.td" -include "vpux/compiler/dialect/IE/ops/data_movement.td" -include "vpux/compiler/dialect/IE/ops/data_type.td" -include "vpux/compiler/dialect/IE/ops/eltwise.td" -include "vpux/compiler/dialect/IE/ops/image.td" -include "vpux/compiler/dialect/IE/ops/logical.td" -include "vpux/compiler/dialect/IE/ops/normalization.td" -include "vpux/compiler/dialect/IE/ops/pooling.td" -include "vpux/compiler/dialect/IE/ops/recurrent.td" -include "vpux/compiler/dialect/IE/ops/reduce.td" -include "vpux/compiler/dialect/IE/ops/resources.td" -include "vpux/compiler/dialect/IE/ops/shape_manipulation.td" -include 
"vpux/compiler/dialect/IE/ops/specialized.td" - -// -// ExternalKernelOp -// - -def IE_ExternalKernelOp : - IE_Op< - "ExternalKernel", - [] - > { - let summary = "Represents a kernel whose details/implementation are defined externally"; - - let arguments = (ins - Variadic:$inputs, - DictionaryAttr:$attrDict, - StrAttr:$unique_id, - StrAttr:$kernel_path - ); - - let results = (outs - Variadic:$outputs - ); - - let assemblyFormat = [{ - $unique_id - `at_path` `(` $kernel_path `)` - (`inputs` `(` $inputs^ `:` type($inputs) `)`)? - `attrs` `(` $attrDict `)` - attr-dict - `->` type(results) - }]; -} - -#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/activation.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/activation.td index d2d343e7dc..d2d9e03652 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/activation.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/activation.td @@ -82,6 +82,7 @@ def IE_HSigmoidOp : IE_LayerOp< "HSigmoid", [ + DeclareOpInterfaceMethods, IE_EltwiseOp ] > { @@ -101,6 +102,7 @@ def IE_HSwishOp : IE_LayerOp< "HSwish", [ + DeclareOpInterfaceMethods, IE_EltwiseOp ] > { diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/arithmetic.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/arithmetic.td index 558d7d2c9c..1084cf5114 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/arithmetic.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/arithmetic.td @@ -18,6 +18,7 @@ def IE_AbsOp : IE_LayerOp< "Abs", [ + DeclareOpInterfaceMethods, IE_EltwiseOp ] > { @@ -342,6 +343,7 @@ def IE_NegativeOp : IE_LayerOp< "Negative", [ + DeclareOpInterfaceMethods, IE_EltwiseOp ] > { @@ -383,6 +385,7 @@ def IE_SignOp : IE_LayerOp< "Sign", [ + DeclareOpInterfaceMethods, IE_EltwiseOp ] > { diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/data_movement.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/data_movement.td index 
cb9f4de2e9..87b0e70935 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/data_movement.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/data_movement.td @@ -116,30 +116,6 @@ def IE_ConcatOp : let checkInferredMemSpace = 1; } -def IE_CopyOp : - IE_LayerOp< - "Copy" - > { - let summary = "InferenceEngine Copy layer"; - - let arguments = (ins - AnyRankedTensor:$input, - - OptionalAttr:$out_mem_space - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasFolder = 1; - let hasCanonicalizer = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_STRICT_EQUAL]; - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; -} - def IE_DepthToSpaceOp : IE_LayerOp< "DepthToSpace" @@ -670,27 +646,27 @@ def IE_SliceOp : let summary = "Extract single slice from tensor"; let arguments = (ins - AnyRankedTensor:$source, + AnyRankedTensor:$input, I64ArrayAttr:$static_offsets, I64ArrayAttr:$static_sizes ); let results = (outs - AnyRankedTensor:$result + AnyRankedTensor:$output ); let assemblyFormat = [{ - $source $static_offsets $static_sizes - attr-dict `:` type($source) `to` type(results) + $input $static_offsets $static_sizes + attr-dict `:` type($input) `to` type(results) }]; let builders = [ OpBuilder< - (ins "mlir::Value":$source, "vpux::ShapeRef":$static_offsets, "vpux::ShapeRef":$static_sizes) + (ins "mlir::Value":$input, "vpux::ShapeRef":$static_offsets, "vpux::ShapeRef":$static_sizes) >, OpBuilder< - (ins "mlir::Value":$source, "vpux::ArrayRef":$static_offsets, "vpux::ArrayRef":$static_sizes) + (ins "mlir::Value":$input, "vpux::ArrayRef":$static_offsets, "vpux::ArrayRef":$static_sizes) > ]; @@ -700,6 +676,11 @@ def IE_SliceOp : let elemComparisonModes = [IE_TypeComparisonMode_STRICT_EQUAL]; let checkInferredDimsOrder = 1; let checkInferredMemSpace = 1; + + let extraClassDeclaration = [{ + // Note: to be compatible with existing code + mlir::Value getSource() { return getInput(); } + }] # baseExtraClassDeclaration; } def 
IE_SpaceToBatch : diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/shape_manipulation.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/shape_manipulation.td index 114c3497ad..3601d61444 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/shape_manipulation.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/shape_manipulation.td @@ -111,22 +111,27 @@ def IE_ShapeCastOp : let summary = "ShapeCast layer"; let arguments = (ins - AnyRankedTensor:$source, + AnyRankedTensor:$input, I64ArrayAttr:$shape ); let results = (outs - AnyRankedTensor:$result + AnyRankedTensor:$output ); let assemblyFormat = [{ attr-dict - `inputs` `(` $source `:` type($source) `)` + `inputs` `(` $input `:` type($input) `)` `->` type(results) }]; let hasCanonicalizer = 1; let hasFolder = 1; + + let extraClassDeclaration = [{ + // Note: to be compatible with existing code + mlir::Value getSource() { return getInput(); } + }] # baseExtraClassDeclaration; } def IE_ShapeOfOp : diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/specialized.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/specialized.td index c008f80f40..17de0f32da 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/specialized.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/ops/specialized.td @@ -133,6 +133,38 @@ def IE_CGCYieldOp : }]; } +// +// ExternalKernelOp +// + +def IE_ExternalKernelOp : + IE_Op< + "ExternalKernel", + [] + > { + let summary = "Represents a kernel whose details/implementation are defined externally"; + + let arguments = (ins + Variadic:$inputs, + DictionaryAttr:$attrDict, + StrAttr:$unique_id, + StrAttr:$kernel_path + ); + + let results = (outs + Variadic:$outputs + ); + + let assemblyFormat = [{ + $unique_id + `at_path` `(` $kernel_path `)` + (`inputs` `(` $inputs^ `:` type($inputs) `)`)? 
+ `attrs` `(` $attrDict `)` + attr-dict + `->` type(results) + }]; +} + def IE_DetectionOutputOp : IE_LayerOp< "DetectionOutput", @@ -689,7 +721,9 @@ def IE_ReadValueOp : let arguments = (ins AnyRankedTensor:$input, - StrAttr:$name + StrAttr:$name, + OptionalAttr:$element_type, + OptionalAttr:$shape ); let results = (outs diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/passes.td index c0a55b531c..086f085743 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/passes.td @@ -12,6 +12,95 @@ include "mlir/Pass/PassBase.td" // Precisions and Layouts //================================================================================= +// +// MapBilinearInterpolateOnDPU +// + +def MapBilinearInterpolateOnDPU : PassBase<"map-bilinear-interpolate-on-dpu", "vpux::FunctionPass"> { + let summary = "Convert bilinear interpolate op to strided concat, AvgPool and some depthwise convolution Ops"; + + let description = [{ + Purpose: + This pass replaces `Bilinear Interpolate` for which tiling is required to fit in CMX + with sequences of operation that can be mapped on DPU and DMA. + + How it works: + The supported interpolation axis currently supported are H and W. + For each of these axis the scaling is happening individually, first perform vertical scaling and after perform horizontal scaling. + On each axis the processing is split in three main regions BEGIN, MIDDLE and END. + These three regions refers to slices from the output tensor and are influenced by the coordinate transformation mode attribute. 
+ * BEGIN - refers to the slice from output for which it is only needed to duplicate the first line/column from input + * MIDDLE - refers to the slice from output where: + * for each output line/column from the output it is required to take two consecutive lines/colums from the input + * based on the coordinate transformation mode attribute compute the weight coefficients each of two lines/columns + has on theresulting output line/column + * each output line/column is computed with a GroupConvolution operation for which the weights are obtained by expanding + the weight coefficients of the input lines/columns + * END - refers to the slice from output for which it is only needed to duplicate the last line/column from input + ``` + Vertical scaling: Horizontal scaling + ________________________ ____________________________ + | BEGIN | | | | | + |______________________| | | | | + | | | B | M | | + | | | E | I | E | + | MIDDLE | | G | D | N | + | | | I | D | D | + |______________________| | N | L | | + | END | | | E | | + |______________________| |___|__________________|___| + ``` + + The rewrite implemented per each region is described below: + BEGIN region: + ``` Input + | + Slice + first line/column + | ... | + Identity Identity + AvgPool AvgPool + + MIDDLE region + Input + ---------|--------- + | | + Slice ... Slice + two lines/colums two lines/colums + | | + GroupConv GroupConv + one output line/colum one output line/colum + + END region: + Input + | + Slice + last line/column + | ... | + Identity Identity + AvgPool AvgPool + ``` + At the end the results of all the operation resulted are concatenated together on the scaling axis. + + In case the `interpolateAsSEOp` option is set to true, only cases that cannot be executed + using the Storage Element hardware feature will be converted to concats. 
+ }]; + + let constructor = "vpux::IE::createMapBilinearInterpolateOnDPUPass()"; + + let dependentDialects = [ + "vpux::IE::IEDialect" + ]; + + let options = [ + Option< + "interpolateAsSEOp", "interpolate-as-se-op", + "bool", "false", + "Flag which identifies whether an Interpolate operation can be executed using the Storage Element hardware feature" + > + ]; +} + // // Outliner // @@ -317,6 +406,20 @@ def FuseRoPE : PassBase<"fuse-rope", "vpux::FunctionPass"> { ]; } +// +// ExpandSoftmaxAxis +// +def ExpandSoftmaxAxis : PassBase<"expand-softmax-axis", "vpux::FunctionPass"> { + let summary = "Fuse Slice and Expand if necessary to ExpandSoftmaxAxis"; + let description = [{ + If certain conditions are met, we can fuse the slice and expand operators around Softmax, inside a padded Softmax Operator. + }]; + let constructor = "vpux::IE::createExpandSoftmaxAxisPass()"; + let dependentDialects = [ + "vpux::IE::IEDialect" + ]; +} + // // FuseSDPA // @@ -2418,6 +2521,25 @@ def AdjustFakeQuantizeParams : PassBase<"adjust-fake-quantize-params", "vpux::Fu ]; } +// +// AdjustFakeQdqParams +// +def AdjustFakeQdqParams : PassBase<"adjust-fake-qdq-params", "vpux::FunctionPass"> { + let summary = "Check QDQ and FQ layers if input or output quantization params exceeds FP16 range and adjust params to be in scale."; + + let description = [{ + The pass checks if the FQ params are in FP16 range. + It updates the parameters of such FQ operations and introduces multiply operators + that are propagated through the graph. 
+ }]; + + let constructor = "vpux::IE::createAdjustFakeQdqParamsPass()"; + + let dependentDialects = [ + "vpux::IE::IEDialect" + ]; +} + // // FuseFQAndMul // @@ -3064,20 +3186,25 @@ def SwapTransposeWithFQ : PassBase<"swap-transpose-with-fq", "vpux::FunctionPass } // -// SwapConvertWithTransposeReshape +// SwapConvertWithReshapeKindOps // -def SwapConvertWithTransposeReshape : PassBase<"swap-convert-with-transpose-reshape", "vpux::FunctionPass"> { - let summary = "Swaps Transpose operation with Convert"; +def SwapConvertWithReshapeKindOps : PassBase<"swap-convert-with-reshape-kind-ops", "vpux::FunctionPass"> { + let summary = "Swaps Reshape kind operations with Convert"; let description = [{ The pass is a part of `HardwareMode` pipeline. + It swaps Reshape kind operations with Convert operation when possible. - It swaps `Transpose` and 'Reshape' operations with Convert operation when possible. - This transormation reduces the number of `MemPermute` operations in resulting graph. + Network input case: + NetworkInput (NCHW) -> Convert -> Transpose-> FQ => NetworkInput (NCHW) -> Transpose -> Convert-> FQ + This transformation reduces the number of `MemPermute` operations in resulting graph. 
+ + Network output case: + Convert -> N reshapeKindOps -> return => N reshapeKindOps -> Convert -> return }]; - let constructor = "vpux::IE::createSwapConvertWithTransposeReshapePass()"; + let constructor = "vpux::IE::createSwapConvertWithReshapeKindOpsPass()"; let dependentDialects = [ "vpux::IE::IEDialect" diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/convert.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/convert.td index 6a6ea89768..73718ec101 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/convert.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/convert.td @@ -6,7 +6,7 @@ #ifndef VPUX_COMPILER_DIALECT_IE_REWRITERS_CONVERT #define VPUX_COMPILER_DIALECT_IE_REWRITERS_CONVERT -include "vpux/compiler/dialect/IE/ops.td" +include "vpux/compiler/dialect/IE/ops/data_type.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/reorder.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/reorder.td index e69f77ef2f..037f8c17dd 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/reorder.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/IE/rewriters/reorder.td @@ -6,7 +6,7 @@ #ifndef VPUX_COMPILER_DIALECT_IE_REWRITERS_REORDER #define VPUX_COMPILER_DIALECT_IE_REWRITERS_REORDER -include "vpux/compiler/dialect/IE/ops.td" +include "vpux/compiler/dialect/IE/ops/data_movement.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/CMakeLists.txt index e2b1976d3f..1c4395892b 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPU) +add_vpux_ops(VPU GENERIC) add_vpux_ops_interface(VPU dialect/VPU/) 
add_vpux_attr_interface(VPU dialect/VPU/) add_vpux_type_interface(VPU dialect/VPU/) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/attributes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/attributes.td index 346b08bbef..0fb23077a3 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/attributes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/attributes.td @@ -36,24 +36,6 @@ class VPU_EnumAttr traits let assemblyFormat = "`<`$value`>`"; } -// -// ArchKind -// - -def VPU_ArchKind : - VPU_I64EnumAttr< - "ArchKind", - "Represents VPU architecture generation", - [ - I64EnumAttrCase<"UNKNOWN", 0>, - I64EnumAttrCase<"NPU37XX", 3>, - I64EnumAttrCase<"NPU40XX", 4>, - ] - > { -} - -def VPU_ArchKindAttr : VPU_EnumAttr; - // // MemoryKind // @@ -115,29 +97,6 @@ def VPU_ExecutorKind : def VPU_ExecutorKindAttr : VPU_EnumAttr; -// -// RevisionID -// - -def VPU_RevisionID : - VPU_I64EnumAttr< - "RevisionID", - "Revision ID", - [ - I64EnumAttrCase<"REVISION_A0", 0>, - I64EnumAttrCase<"REVISION_A1", 1>, - I64EnumAttrCase<"REVISION_A3", 2>, - I64EnumAttrCase<"REVISION_B", 3>, - I64EnumAttrCase<"REVISION_C", 4>, - I64EnumAttrCase<"REVISION_D", 5>, - I64EnumAttrCase<"REVISION_K", 6>, - I64EnumAttrCase<"REVISION_NONE", 7> - ] - > { -} - -def VPU_RevisionIDAttr : VPU_EnumAttr; - // // VFScenario // @@ -1197,4 +1156,18 @@ def VPU_BoundsRepresentation : def VPU_BoundsRepresentationAttr : VPU_EnumAttr; +def VPU_WorkloadManagementStatus : + VPU_I64EnumAttr< + "WorkloadManagementStatus", + "Workload Management Status", + [ + I64EnumAttrCase<"ENABLED", 0>, + I64EnumAttrCase<"DISABLED", 1>, + I64EnumAttrCase<"FAILED", 2> + ] + > { +} + +def VPU_WorkloadManagementStatusAttr : VPU_EnumAttr; + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops.td index 255dab2057..0c58b6e171 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops.td +++ 
b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops.td @@ -6,8701 +6,26 @@ #ifndef VPUX_COMPILER_DIALECT_VPU_OPS #define VPUX_COMPILER_DIALECT_VPU_OPS -include "vpux/compiler/dialect/core/attributes.td" -include "vpux/compiler/dialect/core/ops_interfaces.td" -include "vpux/compiler/dialect/core/types.td" -include "vpux/compiler/dialect/core/constraints.td" -include "vpux/compiler/dialect/const/attributes.td" -include "vpux/compiler/dialect/IE/attributes.td" -include "vpux/compiler/dialect/IE/ops_interfaces.td" -include "vpux/compiler/dialect/VPU/attributes.td" -include "vpux/compiler/dialect/VPU/dialect.td" -include "vpux/compiler/dialect/VPU/ops_interfaces.td" -include "vpux/compiler/dialect/VPU/types.td" +include "vpux/compiler/dialect/VPU/ops/activation.td" +include "vpux/compiler/dialect/VPU/ops/arithmetic.td" +include "vpux/compiler/dialect/VPU/ops/bitwise.td" +include "vpux/compiler/dialect/VPU/ops/comparison.td" +include "vpux/compiler/dialect/VPU/ops/control_flow.td" +include "vpux/compiler/dialect/VPU/ops/convolution.td" +include "vpux/compiler/dialect/VPU/ops/data_movement.td" +include "vpux/compiler/dialect/VPU/ops/data_type.td" +include "vpux/compiler/dialect/VPU/ops/dpu.td" +include "vpux/compiler/dialect/VPU/ops/eltwise.td" +include "vpux/compiler/dialect/VPU/ops/image.td" +include "vpux/compiler/dialect/VPU/ops/internal.td" +include "vpux/compiler/dialect/VPU/ops/logical.td" +include "vpux/compiler/dialect/VPU/ops/m2i.td" +include "vpux/compiler/dialect/VPU/ops/normalization.td" +include "vpux/compiler/dialect/VPU/ops/pooling.td" +include "vpux/compiler/dialect/VPU/ops/recurrent.td" +include "vpux/compiler/dialect/VPU/ops/reduce.td" +include "vpux/compiler/dialect/VPU/ops/shape_manipulation.td" +include "vpux/compiler/dialect/VPU/ops/specialized.td" -include "mlir/Dialect/Quant/QuantOpsBase.td" -include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Interfaces/ControlFlowInterfaces.td" 
-include "mlir/Interfaces/CastInterfaces.td" -include "mlir/Interfaces/CallInterfaces.td" -include "mlir/IR/SymbolInterfaces.td" - -// -// Base classes -// - -class VPU_Op traits = []> : - Op< - VPU_Dialect, - mnemonic, - traits - >; - -class VPU_LayerOp traits = []> : - VPU_Op< - mnemonic, - [ - Pure, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] # traits - > { - list elemComparisonModes = [IE_TypeComparisonMode_STRICT_EQUAL]; - bit checkInferredDimsOrder = 0; - bit checkInferredMemSpace = 0; - - code baseExtraClassDeclaration = [{ - static bool isCompatibleReturnTypes(mlir::TypeRange lhs, mlir::TypeRange rhs) { - return vpux::areTypesCompatible(lhs, rhs, - }] # !interleave(elemComparisonModes, "|") # [{, - static_cast(}] # checkInferredDimsOrder # [{), - static_cast(}] # checkInferredMemSpace # [{) - ); - } - }]; - let extraClassDeclaration = baseExtraClassDeclaration; - - let assemblyFormat = [{ - `(` operands `)` attr-dict `:` type(operands) `->` type(results) - }]; -} - -// -// DPU.Workload -// - -def VPU_DPUWorkloadOp : - VPU_Op< - "DPU.Workload", - [ - ParentOneOf<[ - "vpux::VPU::NCEConvolutionOp", - "vpux::VPU::NCEMatMulOp", - "vpux::VPU::NCEDepthConvolutionOp", - "vpux::VPU::NCEMaxPoolOp", - "vpux::VPU::NCEAveragePoolOp", - "vpux::VPU::NCEEltwiseOp", - "vpux::VPU::NCECompressConvolutionOp", - "vpux::VPU::NCEInterpolateOp", - "vpux::VPU::NCEPermuteOp", - "vpux::VPU::NCEReduceOp" - ]> - ] - > { - let summary = "Workload for a single DPU tile"; - - let arguments = (ins - I64ArrayAttr:$outOffsets, - I64ArrayAttr:$outSizes, - - OptionalAttr:$inOffsets, - OptionalAttr:$inSizes, - - VPU_PaddingAttr:$pad, - VPU_MPEModeAttr:$mpe_mode, - - OptionalAttr:$cluster_id - ); - - let builders = [ - OpBuilder<(ins - "mlir::ArrayAttr":$outOffsets, - "mlir::ArrayAttr":$outSizes, - "vpux::VPU::PaddingAttr":$kernelFunction, - "vpux::VPU::MPEMode":$mpe_mode - )>, - - OpBuilder<(ins - "mlir::ArrayAttr":$outOffsets, - "mlir::ArrayAttr":$outSizes, - 
"vpux::VPU::PaddingAttr":$kernelFunction, - "vpux::VPU::MPEModeAttr":$mpe_mode, - "mlir::IntegerAttr":$cluster_id - )>, - - OpBuilder<(ins - "mlir::ArrayAttr":$outOffsets, - "mlir::ArrayAttr":$outSizes, - "vpux::VPU::PaddingAttr":$kernelFunction, - "vpux::VPU::MPEMode":$mpe_mode, - "mlir::IntegerAttr":$cluster_id - )> - ]; - - let assemblyFormat = [{ - ( `inOffsets` $inOffsets^ )? ( `inSizes` $inSizes^ )? `outOffsets` $outOffsets `outSizes` $outSizes $pad $mpe_mode attr-dict-with-keyword - }]; -} - -// -// NCE.Convolution -// - -def VPU_NCEConvolutionOp : - VPU_LayerOp< - "NCE.Convolution", - [ - AttrSizedOperandSegments, - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Convolution layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, - Optional, VPU_DistributedTensor]>>:$weightsTable, - Optional, VPU_DistributedTensor]>>:$weight_table_data_ptr, - Optional, VPU_DistributedTensor]>>:$weight_table_sp_ptr, - Optional, VPU_DistributedTensor]>>:$weight_table_scale, - Optional, VPU_DistributedTensor]>>:$weight_table_bias, - Optional, VPU_DistributedTensor]>>:$weight_zero_points, - - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - OptionalAttr:$mpe_engine, - - ConfinedAttr]>:$rawFilterShape, - - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` - $input `,` - $filter - (`,` $weightsTable^)? 
- (`,` $weight_table_data_ptr^)? - (`,` $weight_table_sp_ptr^)? - (`,` $weight_table_scale^)? - (`,` $weight_table_bias^)? - (`,` $weight_zero_points^)? - `)` - attr-dict `:` type(operands) `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - - static bool isSupported(vpux::IE::ConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - - vpux::Shape inferAlignedFilterShape(vpux::NDTypeInterface input, vpux::NDTypeInterface output, vpux::NDTypeInterface filter); - - DimArr restrictedFusionAxes(); - - static mlir::LogicalResult verifyKernel(IE::ConvolutionOp origOp, Logger log = Logger::global()); - static mlir::LogicalResult verifyKernel(IE::TransposedConvolutionOp origOp, Logger log = Logger::global()); - - static mlir::LogicalResult verifyConvCMX(mlir::Location loc, mlir::ModuleOp module, vpux::NDTypeInterface inputType, - vpux::NDTypeInterface filterType, vpux::NDTypeInterface outputType, - mlir::ArrayAttr kernelStrides, Logger log = Logger::global()); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.MatMul -// - -def VPU_NCEMatMulOp : - VPU_LayerOp< - "NCE.MatMul", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - SameVariadicOperandSize, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of MatMul executed on DPU"; - - let description = [{ - NCE version of MatMul that will be executed on DPU. 
- - This operation supports 5D operands so that we may encode batch unrolling as part of the shape - and defer this unrolling until later in the compilation pipeline when it's more beneficial. - - Instead of unrolling batch into a chain of discrete slice -> convolution -> concat operations in IE dialect, - we instead encode this as part of the shape and preserve it until later on in multiclustering passes where we - can use split-over-group (in the case we have more groups than clusters or else use another approach). - - We use this split-over-group strategy to solve the problem of having too many small workloads generated - which introduce a lot of DMA overhead. - - With 5D operands, we have the following layouts (where G = Group): - * NCHW -> GNCHW - * NHWC -> GNHWC - * OIYX -> GOIYX - - Before we would convert IE.MatMul like this with repeated chains for the number of batches we need to unroll: - - | - ---/ \---------------------- ... (repeated for number of batches) - / \ \ - | | | - IE.Slice IE.Slice ... - | | | - IE.Reshape IE.Reshape ... - | | | - IE.MatMul => IE.Convolution IE.Convolution ... - | | | - IE.Reshape IE.Reshape ... - | | | - \ / / - ---\ /---------------------- ... (repeated for number of batches) - | - Concat - - Now we keep a single chain of operations (which we later unroll in multiclustering): - - VPU.AffineReshape - | - VPU.PermuteCast - | - IE.MatMul => VPU.NCE.MatMul - | - VPU.PermuteCast - | - VPU.AffineReshape - - See E#125047 for more information. 
- }]; - - let arguments = (ins - AnyTypeOf<[5DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[5DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$weights, - - AnyTypeOf<[5DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, - - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - OptionalAttr:$mpe_engine, - ConfinedAttr]>:$rawFilterShape, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[5DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` - $input `,` - $weights `,` - $weightsTable - `)` - attr-dict - custom(type($input), type($weights), type($weightsTable)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - static bool isSupported(vpux::IE::MatMulOp origOp, vpux::LogCb logCb, - bool checkLayout = false, bool checkChannelAlignment = false); - static bool isSupported(vpux::VPU::NCEMatMulOp origOp, vpux::LogCb logCb, - bool checkLayout = false, bool checkChannelAlignment = false); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - - static mlir::LogicalResult verifyKernel(IE::MatMulOp origOp); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [ - IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, - IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, - IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT - ]; -} - -// -// NCE.DepthConvolution -// - -def VPU_NCEDepthConvolutionOp : - VPU_LayerOp< - "NCE.DepthConvolution", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - 
DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Depthwise Convolution layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, - - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - - ConfinedAttr]>:$rawFilterShape, - - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` - $input `,` - $filter `,` - $weightsTable - `)` - attr-dict - custom(type($input), type($filter), type($weightsTable)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - - static bool isSupported(vpux::IE::GroupConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - - vpux::Shape inferAlignedFilterShape(vpux::NDTypeInterface output, vpux::NDTypeInterface filter); - - static mlir::LogicalResult verifyKernel(IE::GroupConvolutionOp origOp, Logger log = Logger::global()); - - static mlir::LogicalResult verifyGroupConvCMX(mlir::Location loc, mlir::ModuleOp module, - vpux::NDTypeInterface inputType, vpux::NDTypeInterface filterType, - vpux::NDTypeInterface outputType, mlir::ArrayAttr 
kernelStrides, - Logger log = Logger::global()); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.CompressConvolution -// - -def VPU_NCECompressConvolutionOp : - VPU_LayerOp< - "NCE.CompressConvolution", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Compressed Convolution layer"; - - let description = [{ - This operation must have 4 or less input channels, - instead of the usual multiple of 16 as for a normal Convolution op. - }]; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, - - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - - ConfinedAttr]>:$rawFilterShape, - - OptionalAttr:$multiClusterStrategy, - IntAttr:$cm_sp_pattern, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input `,` $filter `,` $weightsTable `)` - attr-dict - custom(type($input), type($filter), type($weightsTable)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input, 
vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - - static bool isSupported(vpux::IE::ConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.MaxPool -// - -def VPU_NCEMaxPoolOp : - VPU_LayerOp< - "NCE.MaxPool", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of MaxPool layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - Optional, VPU_DistributedTensor]>>:$weightsTable, - - ConfinedAttr]>:$kernel_size, - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input - (`,` $weightsTable^ custom(type($weightsTable)) ``)? 
- `)` - attr-dict - custom(type($input)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); - - static bool isSupported(vpux::IE::MaxPoolOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - - static mlir::LogicalResult verifyKernel(IE::MaxPoolOp origOp, Logger log = Logger::global()); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.AveragePool -// - -def VPU_NCEAveragePoolOp : - VPU_LayerOp< - "NCE.AveragePool", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of AveragePool layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - - ConfinedAttr]>:$kernel_size, - ConfinedAttr]>:$strides, - VPU_PaddingAttr:$pad, - - VPU_PPEAttr:$ppe, - - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - - bool 
fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); - - static bool isSupported(vpux::IE::AvgPoolOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - - static mlir::LogicalResult verifyKernel(IE::AvgPoolOp origOp, Logger log = Logger::global()); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.Eltwise -// - -def VPU_NCEEltwiseOp : - VPU_LayerOp< - "NCE.Eltwise", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Eltwise layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input2, - - VPU_EltwiseTypeAttr:$op_type, - - VPU_PPEAttr:$ppe, - - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$is_inplace, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input1 `,` $input2 `)` - attr-dict - custom(type($input1), type($input2)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDTypeInterface input2, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDTypeInterface input2, 
vpux::NDTypeInterface output); - - bool fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDTypeInterface input2, Byte reservedMem); - - static bool isSupported(mlir::Operation* op, bool allowDifferentScales, bool allowDifferentZp, - vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - - static mlir::LogicalResult verifyKernel(IE::AddOp origOp, Logger log = Logger::global()); - static mlir::LogicalResult verifyKernel(IE::MultiplyOp origOp, Logger log = Logger::global()); - static mlir::LogicalResult verifyKernel(IE::SubtractOp origOp, Logger log = Logger::global()); - - static mlir::LogicalResult verifyEltwiseCMX(mlir::Location loc, mlir::ModuleOp module, bool isInplace, - vpux::NDTypeInterface firstInputType, - vpux::NDTypeInterface secondInputType, vpux::NDTypeInterface outputType, - Logger log = Logger::global()); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.Reduce -// - -def VPU_NCEReduceOp : - VPU_LayerOp< - "NCE.Reduce", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Reduce layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_DistributedTensor]>:$input, - I64ArrayAttr:$axes, - - VPU_PPEAttr:$ppe, - VPU_ReduceTypeAttr:$op_type, - OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); - - static bool 
isSupported(mlir::Operation* op, - vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - static mlir::LogicalResult verifyKernel(mlir::Operation* origOp, Logger log = Logger::global()); - - }] # baseExtraClassDeclaration; - - let assemblyFormat = [{ - `(` $input`)` - attr-dict - custom(type($input)) `` - `->` type(results) - custom($workloads) - }]; - - let regions = (region - AnyRegion:$workloads - ); - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; - - let hasVerifier = 1; -} - -// -// NCE.Permute -// - -def VPU_NCEPermuteOp : - VPU_LayerOp< - "NCE.Permute", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "More abstract version of combined NCE Permute and Quantization layers"; - - let description = [{ - Used to perform a datatype conversion, relayout of data and shape expansion, - all using a single NCE HW op. 
- - * expandedChannels - target size of output channels after expansion, usual values are 4 and 16 - * dstElemType - output tensor datatype - * dstOrder - output tensor layout, NCHW input to NHWC output relayout is supported - }]; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - - IntAttr:$expandedChannels, - TypeAttr:$dstElemType, - AffineMapAttr:$dstOrder, - VPU_PPEAttr:$ppe, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); - static bool isSupported(vpux::IE::PermuteQuantizeOp origOp, vpux::LogCb logCb, bool checkLayout = true, bool checkAlignment = true); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NCE.Interpolate -// - -def VPU_NCEInterpolateOp : - VPU_LayerOp< - "NCE.Interpolate", - [ - NoRegionArguments, - NoTerminator, - SingleBlock, - SameVariadicOperandSize, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NCE version of Interpolate layer"; - - let arguments = (ins - AnyTypeOf<[VPU_SparseTensor, VPU_DistributedTensor]>:$input, - Optional>:$weights, - Optional, VPU_DistributedTensor]>>:$weightsTable, - - ConfinedAttr]>:$strides, - - VPU_PPEAttr:$ppe, - ConfinedAttr]>:$rawFilterShape, - 
OptionalAttr:$multiClusterStrategy, - OptionalAttr:$mode - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let regions = (region - AnyRegion:$workloads - ); - - let assemblyFormat = [{ - `(` $input - (`,` $weights^ `` custom(type($weights)))? - (`,` $weightsTable^ `` custom(type($weightsTable)))? - `)` - attr-dict - custom(type($input)) `` - `->` type(results) - custom($workloads) - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - static bool isSupported(vpux::IE::InterpolateOp origOp, vpux::LogCb logCb, - bool checkLayout = false, bool checkChannelAlignment = false, bool checkBatch = false); - static bool isSupported(vpux::VPU::InterpolateOp origOp, vpux::LogCb logCb, - bool checkLayout = false, bool checkChannelAlignment = false, bool checkBatch = false); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// VerticalFusion -// - -def VPU_VerticalFusionOp : - VPU_Op< - "VerticalFusion", - [ - Pure, - IsolatedFromAbove, - DeclareOpInterfaceMethods, - SingleBlockImplicitTerminator<"YieldOp"> - ] - > { - let summary = "Operation that encapsulates details of VF subgraph"; - - let arguments = (ins - Variadic>:$operands, - I64ArrayAttr:$tilingStrategy, - OptionalAttr:$scenario - ); - - let results = (outs - Variadic>:$results - ); - - let regions = (region SizedRegion<1>:$ops); - - let hasVerifier = 1; - - let skipDefaultBuilders = 1; - let builders = [ - OpBuilder<(ins "mlir::TypeRange":$resultTypes, "mlir::ValueRange":$operands, - 
"llvm::function_ref":$bodyBuilder, - "mlir::ArrayAttr":$tilingInfo)>, - ]; - - let extraClassDeclaration = [{ - using BodyBuilderFn = - llvm::function_ref; - - mlir::Operation* getFirstInnerTaskOp(); - - void print(::mlir::OpAsmPrinter& p); - static ::mlir::ParseResult parse(::mlir::OpAsmParser& parser, ::mlir::OperationState& result); - }]; -} - -// -// YieldOp -// - -def VPU_YieldOp : - VPU_Op< - "Yield", - [ - HasParent<"VerticalFusionOp">, - DeclareOpInterfaceMethods, - Pure, - Terminator - ] - > { - let summary = "Terminator for wrapping operation"; - - let arguments = (ins - Variadic>:$operands - ); - - let assemblyFormat = [{ - $operands - custom(type($operands)) `` - attr-dict - }]; - - let hasVerifier = 1; -} - -// -// DistributedCastOp -// - -def VPU_DistributedCastOp : - VPU_Op< - "DistributedCast", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Operation that casts one DistributedTensor type to another."; - - let description = [{ - Used to cast one DistributedTensor type to another and help with NNCMX retention - of data. 
- - Currently following distribution mode pairs are compatible: - - DUPLICATED|SEGMENTED -> DUPLICATED ## needed for K cluster tiling - }]; - - let arguments = (ins - AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$input - ); - - let results = (outs - AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$output - ); - - let assemblyFormat = [{ - `(` $input `:` qualified(type($input)) `)` - attr-dict - `->` qualified(type($output)) - }]; - - let hasFolder = 1; - - let hasVerifier = 1; -} - -// -// GroupSparseTensor -// - -def VPU_GroupSparseTensorOp : - VPU_Op< - "GroupSparseTensor", - [ - Pure, - DeclareOpInterfaceMethods, - AttrSizedOperandSegments, - VPU_GroupedViewLikeOpInterface, - DeclareOpInterfaceMethods - ] - > { - let summary = "Groups sparse data and metadata into a single value"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_DistributedTensor]>:$data, - Optional>:$sparsityMap, - Optional>:$storageElementTable, - - UnitAttr:$is_weights, - OptionalAttr:$sparsity_compression, - - OptionalAttr:$seAttr - ); - - let results = (outs - AnyTypeOf<[VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder< - (ins "mlir::Value":$data, - CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) - >, - OpBuilder< - (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, - CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) - >, - OpBuilder< - (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, "mlir::Value":$storageElementTable, - CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) - >, - OpBuilder< - (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, "mlir::Value":$storageElementTable, - CArg<"VPU::SEAttr", "{}">:$seAttr) - > - ]; - - let assemblyFormat = [{ - `(` $data - (`,` $sparsityMap^ `` custom(type($sparsityMap)))? 
- (`,` $storageElementTable^ `` custom(type($storageElementTable)))? - `)` - attr-dict - `` custom(type($data)) - `->` type(results) - }]; - - let hasCanonicalizer = 1; -} - - -// -// UngroupSparseTensor -// - -def VPU_UngroupSparseTensorOp : - VPU_Op< - "UngroupSparseTensor", - [ - Pure, - AttrSizedResultSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "Ungroups sparse data and metadata into multiple values"; - - let arguments = (ins - VPU_SparseTensor:$input - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$data, - Optional>:$sparsityMap, - Optional>:$storageElementTable - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - `` custom(type($input)) - `->` type(results) - }]; -} - -// -// SliceOp -// - -def VPU_SliceOp : - VPU_LayerOp< - "Slice", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Extract single slice from ranked tensor or distributed tensor"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor]>:$source, - I64ArrayAttr:$static_offsets, - I64ArrayAttr:$static_sizes - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor]>:$result - ); - - let assemblyFormat = [{ - $source $static_offsets $static_sizes - attr-dict `:` type($source) `to` type(results) - }]; - - let builders = [ - OpBuilder< - (ins "mlir::Value":$source, "vpux::ShapeRef":$static_offsets, "vpux::ShapeRef":$static_sizes) - >, - OpBuilder< - (ins "mlir::Value":$source, "vpux::ArrayRef":$static_offsets, "vpux::ArrayRef":$static_sizes) - > - ]; - - let hasFolder = 1; - let hasCanonicalizer = 1; - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ConditionalCopyOp -// - -def VPU_ConditionalCopyOp : - VPU_LayerOp< - "ConditionalCopyOp" - > { - let summary = "Conditional copy VPU layer"; - - let arguments = (ins - 1DTensorOf<[Bool8, SI8]>:$cond, - AnyRankedTensor:$input1, - 
AnyRankedTensor:$input2 - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasVerifier = 1; -} - -// -// LoopSelectOp -// - -def VPU_LoopSelectOp : - VPU_LayerOp< - "LoopSelect" - > { - let summary = "Select a slice of the input according to collected execution conditions, helping implement Loop Op."; - - let arguments = (ins - 1DTensorOf<[Bool8, SI8]>:$initExecCond, - 1DTensorOf<[Bool8, SI8]>:$execConds, - AnyRankedTensor:$input, - - BoolAttr:$do_concat, - IntAttr:$axis, - IntAttr:$stride - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasVerifier = 1; -} - - -// -// ConcatOp -// - -def VPU_ConcatOp : - VPU_LayerOp< - "Concat", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods - ] - > { - let summary = "VPU Concat layer"; - - let arguments = (ins - Variadic>:$inputs, - - OptionalAttr:$per_axis, - OptionalAttr:$static_offsets, - OptionalAttr:$strides, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let assemblyFormat = [{ - `(` operands `)` attr-dict `:` type(operands) `->` type(results) - }]; - - let builders = [ - OpBuilder< - (ins "mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis) - >, - OpBuilder< - (ins "mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis, "mlir::ArrayAttr":$static_offsets) - >, - OpBuilder< - (ins "mlir::ValueRange":$inputs, "mlir::IntegerAttr":$axis, - CArg<"mlir::IntegerAttr", "{}">:$offset, CArg<"mlir::IntegerAttr", "{}">:$stride) - >, - OpBuilder< - (ins "mlir::ValueRange":$inputs, "int64_t":$axis, CArg<"int64_t", "0">:$offset, CArg<"int64_t", "1">:$stride) - >, - OpBuilder< - (ins "mlir::ValueRange":$inputs, "vpux::Dim":$axis, CArg<"int64_t", "0">:$offset, CArg<"int64_t", "1">:$stride) - >, - - OpBuilder< - (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "mlir::ArrayAttr":$static_offsets) - >, - OpBuilder< - (ins "mlir::Type":$outType, 
"mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis, "mlir::ArrayAttr":$static_offsets) - >, - OpBuilder< - (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "vpux::ArrayRef":$static_offsets) - >, - OpBuilder< - (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "vpux::ArrayRef":$static_offsets) - >, - OpBuilder< - (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "mlir::ArrayAttr":$static_offsets, "mlir::ArrayAttr":$strides) - > - ]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - ::mlir::LogicalResult customVerify(); - bool fitIntoCMX(vpux::NDTypeInterface output, Byte reservedMem); - bool fitIntoCMX(vpux::NDTypeInterface output); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; - - let hasCanonicalizer = 1; - let hasFolder = 1; -} - -// -// RollOp -// - -def VPU_RollOp : - VPU_LayerOp< - "Roll", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Roll VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$data, - AnyTypeOf<[RankedTensorOf<[SI32, SI64]>, VPU_DistributedTensor]>:$shift, - AnyTypeOf<[RankedTensorOf<[SI32, SI64]>, VPU_DistributedTensor]>:$axes, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$data, - "::mlir::Value":$shift, - "::mlir::Value":$axes - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Tanh -// - -def VPU_TanhOp : - VPU_LayerOp< - "Tanh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - 
DeclareOpInterfaceMethods - ] - > { - let summary = "Tanh VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Sin -// - -def VPU_SinOp : - VPU_LayerOp< - "Sin", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Sin VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// Cos -// - -def VPU_CosOp : - VPU_LayerOp< - "Cos", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Cos VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} 
- -// -// Tan -// - -def VPU_TanOp : - VPU_LayerOp< - "Tan", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Tan VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Sqrt -// - -def VPU_SqrtOp : - VPU_LayerOp< - "Sqrt", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Sqrt VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Sinh -// - -def VPU_SinhOp : - VPU_LayerOp< - "Sinh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Sinh VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Cosh -// - -def VPU_CoshOp : - VPU_LayerOp< - "Cosh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Cosh VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Asinh -// - -def VPU_AsinhOp : - VPU_LayerOp< - "Asinh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Asinh VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); 
-} - -// -// Acosh -// - -def VPU_AcoshOp : - VPU_LayerOp< - "Acosh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Acosh VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Abs -// - -def VPU_AbsOp : - VPU_LayerOp< - "Abs", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Abs VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Atan -// - -def VPU_AtanOp : - VPU_LayerOp< - "Atan", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Atan VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Asin -// - -def VPU_AsinOp : - VPU_LayerOp< - "Asin", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Asin VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Acos -// - -def VPU_AcosOp : - VPU_LayerOp< - "Acos", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Acos VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, 
F32]>:$output - ); -} - -// -// Atanh -// - -def VPU_AtanhOp : - VPU_LayerOp< - "Atanh", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Atanh VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// HSigmoidOp -// - -def VPU_HSigmoidOp : - VPU_LayerOp< - "HSigmoid", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "HSigmoid VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// GridSampleOp -// - -def VPU_GridSampleOp : - VPU_LayerOp< - "GridSample", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "GridSample VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$grid, - - UnitAttr:$align_corners, - OptionalAttr:$mode, - OptionalAttr:$padding_mode, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$grid, - "::mlir::UnitAttr":$align_corners, - "vpux::IE::GridSampleModeAttr":$mode, - "vpux::IE::GridSamplePaddingModeAttr":$padding_mode - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Log -// - -def VPU_LogOp : - VPU_LayerOp< - "Log", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Log VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, 
VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Gelu -// - -def VPU_GeluOp : - VPU_LayerOp< - "Gelu", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Gelu VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Exp -// - -def VPU_ExpOp : - VPU_LayerOp< - "Exp", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Exp VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// HSwish -// - 
-def VPU_HSwishOp : - VPU_LayerOp< - "HSwish", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "HSwish VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Floor -// - -def VPU_FloorOp : - VPU_LayerOp< - "Floor", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Floor VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Round -// - -def VPU_RoundOp : - VPU_LayerOp< - "Round", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Round VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - IE_RoundModeAttr:$mode, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, 
VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "vpux::IE::RoundModeAttr":$mode - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Mish -// - -def VPU_MishOp : - VPU_LayerOp< - "Mish", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Mish VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; -} - -// -// Erf -// - -def VPU_ErfOp : - VPU_LayerOp< - "Erf", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Erf VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Clamp -// - -def VPU_ClampOp : - VPU_LayerOp< - "Clamp", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Clamp VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I16, I32, SI16, SI32, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$input, - - F64Attr:$min, - F64Attr:$max, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I16, I32, SI16, SI32, F16, F32, quant_QuantizedType]>, 
VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::FloatAttr":$min, - "::mlir::FloatAttr":$max - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Elu -// - -def VPU_EluOp : - VPU_LayerOp< - "Elu", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Elu VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - F64Attr:$x - ); - - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Sigmoid -// - -def VPU_SigmoidOp : - VPU_LayerOp< - "Sigmoid", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp, - DeclareOpInterfaceMethods - ] - > { - let summary = "Sigmoid VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// HardSigmoidOp -// - -def VPU_HardSigmoidOp : - VPU_LayerOp< - "HardSigmoid", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "HardSigmoid VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - F64Attr:$alpha_value, - F64Attr:$beta_value, - - 
OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::FloatAttr":$alpha, - "::mlir::FloatAttr":$beta - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// EmbeddingBagOffsetsSumOp -// - -def VPU_EmbeddingBagOffsetsSumOp : - VPU_LayerOp< - "EmbeddingBagOffsetsSum", - [ - AttrSizedOperandSegments - ] - > { - let summary = "InferenceEngine EmbeddingBagOffsetsSum layer"; - - let arguments = (ins - AnyRankedTensor:$emb_table, - Optional<1DTensorOf<[SI32, SI64]>>:$indices, - Optional<1DTensorOf<[SI32, SI64]>>:$offsets, - Optional<1DTensorOf<[AnyInteger, AnyFloat]>>:$per_sample_weights, - - OptionalAttr:$indices_value, - OptionalAttr:$offsets_value, - IntAttr:$default_index_value, - OptionalAttr:$per_sample_weights_value - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// EmbeddingSegmentsSumOp -// - -def VPU_EmbeddingSegmentsSumOp : - VPU_LayerOp< - "EmbeddingSegmentsSum", - [ - AttrSizedOperandSegments - ] - > { - let summary = "EmbeddingSegmentsSum VPU layer"; - - let arguments = (ins - AnyRankedTensor:$emb_table, - Optional<1DTensorOf<[SI64, SI32]>>:$indices, - Optional<1DTensorOf<[SI64, SI32]>>:$segment_ids, - Optional<1DTensorOf<[AnyInteger, AnyFloat]>>:$per_sample_weights, - - OptionalAttr:$indices_value, - OptionalAttr:$segment_ids_value, - IntAttr:$num_segments_value, - IntAttr:$default_index_value, - OptionalAttr:$per_sample_weights_value - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// EmbeddingBagPackedSumOp -// - -def VPU_EmbeddingBagPackedSumOp : - VPU_LayerOp< - "EmbeddingBagPackedSum" - > { - let summary = "EmbeddingBagPackedSum 
VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$emb_table, - 2DTensorOf<[SI32, SI64]>:$indices, - Optional<2DTensorOf<[F16, F32]>>:$per_sample_weights - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// SeluOp -// - -def VPU_SeluOp : - VPU_LayerOp< - "Selu", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Selu VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - F64Attr:$alpha_value, - F64Attr:$lambda_value - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// NormalizeL2 -// - -def VPU_NormalizeL2Op : - VPU_LayerOp< - "NormalizeL2", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "NormalizeL2 VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$data, - ArrayAttr:$axes_value, - - F64Attr:$eps, - IE_EpsModeAttr:$eps_mode, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let hasVerifier = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// NormalizeIE -// - -def VPU_NormalizeIEOp : - VPU_LayerOp< - "NormalizeIE" - > { - let summary = "NormalizeIE VPU layer"; - - let arguments = (ins - AnyRankedTensor:$data, - AnyRankedTensor:$weights, - - F64Attr:$eps, - BoolAttr:$across_spatial, - BoolAttr:$channel_shared - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// CumSum -// - -def VPU_CumSumOp : - VPU_LayerOp< - "CumSum", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "CumSum VPU layer"; - - let arguments = (ins - 
AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - OptionalAttr:$axis_value, - UnitAttr:$exclusive, - UnitAttr:$reverse, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - DimArr getTileableDims(); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::IntegerAttr":$axis_value, - "::mlir::UnitAttr":$exclusive, - "::mlir::UnitAttr":$reverse - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Eye -// - -def VPU_EyeOp : - VPU_LayerOp< - "Eye" - > { - let summary = "Eye VPU layer"; - - let arguments = (ins - 1DTensorOf<[SI32, SI64]>:$diagonal_index, - - IntAttr:$num_rows_value, - IntAttr:$num_columns_value, - I64ArrayAttr:$batch_shape_value, - - TypeAttr:$outputType - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// Ceiling -// - -def VPU_CeilingOp : - VPU_LayerOp< - "Ceiling", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Ceiling VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// SoftPlus -// - -def VPU_SoftPlusOp : - VPU_LayerOp< - "SoftPlus", - [ - VPU_TilingBuilderOpInterface, 
- VPU_EltwiseOp - ] - > { - let summary = "SoftPlus VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Convert -// - -def VPU_ConvertOp : - VPU_LayerOp< - "Convert", - [ - DeclareOpInterfaceMethods, - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Convert VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - TypeAttr:$dstElemType, - OptionalAttr:$multiClusterStrategy - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::TypeAttr":$dstElemType - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// SoftMax -// - -def VPU_SoftMaxOp : - VPU_LayerOp< - "SoftMax", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "SoftMax VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - IntAttr:$axisInd, - OptionalAttr:$padSize, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - - bool isVFSupported(); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::IntegerAttr":$axisInd, - "::mlir::IntegerAttr":$padSize - )> - ]; 
- - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LogSoftmax -// - -def VPU_LogSoftmaxOp : - VPU_LayerOp< - "LogSoftmax", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "LogSoftmax VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - IntAttr:$axisInd, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::IntegerAttr":$axisInd - )> - ]; -} - -// -// PerAxisTile -// - -def VPU_PerAxisTileOp : - VPU_LayerOp< - "PerAxisTile" - > { - let summary = "Per axis Tile VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - - IntAttr:$axis, - IntAttr:$tiles - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// ReLU -// - -def VPU_ReLUOp : - VPU_LayerOp< - "ReLU", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "ReLU VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// LogicalNot -// - -def VPU_LogicalNotOp : - VPU_LayerOp< - "LogicalNot", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Logical Not VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - 
"::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Convolution -// - -def VPU_ConvolutionOp : - VPU_LayerOp< - "Convolution", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "Convolution VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$filter, - Optional>:$bias, - - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - I64ArrayAttr:$dilations, - - OptionalAttr:$post_op - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Gather -// - -def VPU_GatherOp : - VPU_LayerOp< - "Gather", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Gather VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, - Optional:$axis, - OptionalAttr:$axis_value, - IntAttr:$batch_dims, - - OptionalAttr:$indices_rank, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$indices, - "::mlir::Value":$axis, - "::mlir::IntegerAttr":$axis_value, - "::mlir::IntegerAttr":$batch_dims, - "::mlir::IntegerAttr":$indices_rank - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// GatherDMA -// - -def VPU_GatherDMAOp : - VPU_LayerOp< - "GatherDMA", - [ - 
DeclareOpInterfaceMethods - ] - > { - let summary = "GatherDMA VPU layer which will be lowered to DMA, used for GatherOps which can be lowered to DMA"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[AnyInteger]>:$indices, - Optional:$axis, - OptionalAttr:$axis_value, - IntAttr:$batch_dims - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// GatherNDOp -// - -def VPU_GatherNDOp : - VPU_LayerOp< - "GatherND", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "GatherND VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, - IntAttr:$batch_dims, - - OptionalAttr:$original_shape, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$indices, - "::mlir::IntegerAttr":$batch_dims - )>, - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$indices, - "::mlir::IntegerAttr":$batch_dims, - "::mlir::ArrayAttr":$original_shape - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; - - let hasVerifier = 1; -} - -// -// GatherElements -// - -def VPU_GatherElementsOp : - VPU_LayerOp< - "GatherElements", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "GatherElements VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, - IntAttr:$axis, - OptionalAttr:$multiClusterStrategy - ); - - let results 
= (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$indices, - "::mlir::IntegerAttr":$axis - )> - ]; -} - -// -// GatherTree -// - -def VPU_GatherTreeOp : - VPU_LayerOp< - "GatherTree" - > { - let summary = "GatherTree VPU layer"; - - let arguments = (ins - AnyRankedTensor:$stepIds, - AnyRankedTensor:$parentIds, - AnyRankedTensor:$maxSeqLen, - AnyRankedTensor:$endToken - ); - - let results = (outs - AnyRankedTensor:$finalIds - ); -} - -// -// ScatterNDUpdate -// - -def VPU_ScatterNDUpdateOp : - VPU_LayerOp< - "ScatterNDUpdate" - > { - let summary = "ScatterNDUpdate VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[AnyInteger]>:$indices, - AnyRankedTensor:$updates - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// ScatterUpdate -// - -def VPU_ScatterUpdateOp : - VPU_LayerOp< - "ScatterUpdate" - > { - let summary = "ScatterUpdate VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[AnyInteger]>:$indices, - AnyRankedTensor:$updates, - OptionalAttr:$axis_value - - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// ScatterElementsUpdate -// - -def VPU_ScatterElementsUpdateOp : - VPU_LayerOp< - "ScatterElementsUpdate" - > { - let summary = "ScatterElementsUpdate VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[AnyInteger]>:$indices, - AnyRankedTensor:$updates, - IntAttr:$axis, - IE_ScatterElementsUpdateReductionTypeAttr:$reduction, - BoolAttr:$use_init_val - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// Broadcast -// - -def VPU_BroadcastOp : - VPU_LayerOp< - "Broadcast" - > { - let summary = "Broadcast VPU layer"; - - let arguments = 
(ins - AnyRankedTensor:$input, - 1DTensorOf<[AnyInteger]>:$target_shape, - Optional<1DTensorOf<[AnyInteger]>>:$axes_mapping, - - OptionalAttr:$mode - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// BucketizeOp -// - -def VPU_BucketizeOp : - VPU_LayerOp< - "Bucketize" - > { - let summary = "Bucketize VPU layer"; - - let arguments = (ins - AnyRankedTensor:$data, - 1DTensorOf<[AnyInteger, AnyFloat]>:$buckets, - - TypeAttr:$output_type, - UnitAttr:$with_right_bound - ); - - let results = (outs - RankedTensorOf<[SI32, SI64]>:$output - ); - - let hasVerifier = 1; -} - -// -// FakeQuantize -// - -def VPU_FakeQuantizeOp : - VPU_LayerOp< - "FakeQuantize", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "FakeQuantize VPU layer"; - - let description = [{ - The operation works in two modes: - * integral quantization: specified by the 'levels' attribute - * floating-point quantization: specified by the 'low_fp_type' attribute, [f8E4M3FN | f8E5M2] - - Only one of these attributes should be provided. 
- }]; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_low, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_high, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output_low, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output_high, - - OptionalAttr:$levels, - OptionalAttr:$low_fp_type, - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$input_low, - "::mlir::Value":$input_high, - "::mlir::Value":$output_low, - "::mlir::Value":$output_high, - "::mlir::IntegerAttr":$levels, - "::mlir::TypeAttr":$low_fp_type, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let hasVerifier = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// FakeConvert -// - -def VPU_FakeConvertOp : - VPU_LayerOp< - "FakeConvert" - > { - let summary = "FakeConvert VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$scale, - Optional>:$shift, - - TypeAttr:$dst_type - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); - - let hasVerifier = 1; -} - -// -// Proposal -// - -def VPU_ProposalOp : - VPU_LayerOp< - "Proposal" - > { - let summary = "Proposal VPU layer"; - - let description = [{ - Proposal operation filters bounding boxes and outputs only those with the highest prediction confidence. 
- The auxiliary buffer has the role of storing the intermediate results obtained inside the operation, - then sorting them. Depending on some criteria, it recalculates the results and extracts the output from them. - }]; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$class_probs, - RankedTensorOf<[F16, F32]>:$bbox_deltas, - RankedTensorOf<[F16, F32]>:$image_shape, - Optional<1DTensorOf<[UI8]>>:$auxiliary, - - IE_ProposalAttr:$proposal_attrs - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output, - RankedTensorOf<[F16, F32]>:$probs - ); - -} - -// -// Interpolate -// - -def VPU_InterpolateOp : - VPU_LayerOp< - "Interpolate", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - AttrSizedOperandSegments, - DeclareOpInterfaceMethods - ] - > { - let summary = "Interpolate VPU layer"; - - let description = [{ - The `coordinates` contain byte offsets for the current `input` tensor. - The `lambdas` contain two interleaved values for each coordinate. 
- }]; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[UI8, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$input, - Optional, VPU_DistributedTensor]>>:$sizes, - Optional, VPU_DistributedTensor]>>:$scales, - Optional, VPU_DistributedTensor]>>:$axes, - Optional, VPU_DistributedTensor]>>:$coordinates, - Optional, VPU_DistributedTensor]>>:$lambdas, - - OptionalAttr:$sizes_attr, - OptionalAttr:$scales_attr, - OptionalAttr:$axes_attr, - OptionalAttr:$tile_offset_attr, - OptionalAttr:$initial_input_dims_attr, - OptionalAttr:$initial_output_dims_attr, - OptionalAttr:$initial_input_offset_attr, - OptionalAttr:$initial_output_offset_attr, - OptionalAttr:$multiClusterStrategy, - - IE_InterpolateAttr:$attr, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[UI8, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$sizes, - "::mlir::Value":$scales, - "::mlir::Value":$axes, - "::mlir::Value":$coordinates, - "::mlir::Value":$lambdas, - "::mlir::ArrayAttr":$sizes_attr, - "::mlir::ArrayAttr":$scales_attr, - "::mlir::ArrayAttr":$axes_attr, - "::mlir::ArrayAttr":$tile_offset_attr, - "::mlir::ArrayAttr":$initial_input_dims_attr, - "::mlir::ArrayAttr":$initial_output_dims_attr, - "vpux::IE::InterpolateAttr":$attr, - "::mlir::ArrayAttr":$output_padding, - "::mlir::ArrayAttr":$input_padding - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, - IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// TopK -// - -def VPU_TopKOp : - VPU_LayerOp< - "TopK", - [ - DeclareOpInterfaceMethods, - 
DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - AttrSizedOperandSegments - ] - > { - let summary = "TopK VPU layer"; - - let description = [{ - * lineBuffer - it is an auxiliary buffer, which has the role of storing - some intermediate results in the software kernel of the operation. - }]; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - Optional>:$k, - Optional>:$lineBuffer, - OptionalAttr:$k_value, - - IntAttr:$axis, - IE_TopKModeAttr:$mode, - IE_TopKSortTypeAttr:$sort, - TypeAttr:$element_type, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output_values, - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$target_shape - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// AdaptiveAvgPoolOp -// - -def VPU_AdaptiveAvgPoolOp : - VPU_LayerOp< - "AdaptiveAvgPool" - > { - let summary = "AdaptiveAvgPool VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - 1DTensorOf<[SI32, SI64]>:$pooled_spatial_shape - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// AdaptiveMaxPoolOp -// - -def VPU_AdaptiveMaxPoolOp : - VPU_LayerOp< - "AdaptiveMaxPool" - > { - let summary = "AdaptiveMaxPool VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - 1DTensorOf<[SI32, SI64]>:$pooled_spatial_shape, - TypeAttr:$index_element_type - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output, - RankedTensorOf<[SI32, SI64]>:$output_index - ); -} - -// -// RegionYolo -// - -def VPU_RegionYoloOp : - VPU_LayerOp< - "RegionYolo" - > { - let summary = "RegionYolo VPU layer"; - - let arguments = 
(ins - 4DTensorOf<[AnyFloat]>:$input, - - IntAttr:$coords, - IntAttr:$classes, - IntAttr:$num_regions, - BoolAttr:$do_softmax, - I64ArrayAttr:$mask, - IntAttr:$axis, - IntAttr:$end_axis, - F64ArrayAttr:$anchors - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// ReorgYolo -// - -def VPU_ReorgYoloOp : - VPU_LayerOp< - "ReorgYolo" - > { - let summary = "ReorgYolo VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyInteger, AnyFloat]>:$input, - - IntAttr:$stride - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasVerifier = 1; -} - -// -// DetectionOutput -// - -def VPU_DetectionOutputOp : - VPU_LayerOp< - "DetectionOutput", - [ - AttrSizedOperandSegments - ] - > { - let summary = "DetectionOutput VPU layer"; - - let arguments = (ins - 2DTensorOf<[AnyFloat]>:$in_box_logits, - 2DTensorOf<[AnyFloat]>:$in_class_preds, - 3DTensorOf<[AnyFloat]>:$in_proposals, - Optional<2DTensorOf<[AnyFloat]>>:$in_additional_preds, - Optional<2DTensorOf<[AnyFloat]>>:$in_additional_proposals, - - IE_DetectionOutputAttr:$attr - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// DetectionOutputNormalize -// - -def VPU_DetectionOutputNormalizeOp: - VPU_LayerOp< - "DetectionOutputNormalize" - > { - let summary = "DetectionOutputNormalize VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyFloat]>:$prior_boxes, - - IntAttr:$input_width, - IntAttr:$input_height - ); - - let results = (outs - 4DTensorOf<[AnyFloat]>:$out_prior_boxes - ); - - let hasVerifier = 1; -} - -// -// DetectionOutputDecodeBoxes -// - -def VPU_DetectionOutputDecodeBoxesOp: - VPU_LayerOp< - "DetectionOutputDecodeBoxes", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "DetectionOutputDecodeBoxes VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyFloat]>:$box_logits, - 4DTensorOf<[AnyFloat]>:$prior_boxes, - - IE_DetectionOutputCodeTypeAttr:$code_type, - BoolAttr:$clip_before_nms - ); - - let results = (outs - 4DTensorOf<[AnyFloat]>:$out_decoded_boxes - 
); -} - -// -// DetectionOutputSort -// - -def VPU_DetectionOutputSortOp: - VPU_LayerOp< - "DetectionOutputSort", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "DetectionOutputSort VPU layer"; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[AnyFloat]>, VPU_DistributedTensor]>:$confidence, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$indicesBuffer, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$sortingBuffer, - - F64Attr:$confidence_threshold, - IntAttr:$top_k, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[AnyFloat]>, VPU_DistributedTensor]>:$out_confidence, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$out_indices, - AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$out_sizes - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$confidence, - "::mlir::FloatAttr":$confidence_threshold, - "::mlir::IntegerAttr":$top_k - )> - ]; - - let extraClassDeclaration = [{ - OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// DetectionOutputNmsCaffe -// - -def VPU_DetectionOutputNmsCaffeOp: - VPU_LayerOp< - "DetectionOutputNmsCaffe", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "DetectionOutputNmsCaffe VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyFloat]>:$confidence, - 4DTensorOf<[AnyFloat]>:$boxes, - 4DTensorOf<[SI32]>:$indices, - 4DTensorOf<[SI32]>:$sizes, - - IntAttr:$top_k, - F64Attr:$nms_threshold, - IntAttr:$background_id - ); - - let results = (outs - 4DTensorOf<[AnyFloat]>:$out_confidence, - 4DTensorOf<[AnyFloat]>:$out_boxes, - 4DTensorOf<[SI32]>:$out_sizes - ); - - let extraClassDeclaration = [{ - OutputTiling 
getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); - }] # baseExtraClassDeclaration; -} - -// -// DetectionOutputCollectResults -// - -def VPU_DetectionOutputCollectResultsOp: - VPU_LayerOp< - "DetectionOutputCollectResults" - > { - let summary = "DetectionOutputCollectResults VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyFloat]>:$confidence, - 4DTensorOf<[AnyFloat]>:$boxes, - 4DTensorOf<[SI32]>:$sizes, - - IntAttr:$keep_top_k, - BoolAttr:$clip_after_nms - ); - - let results = (outs - 4DTensorOf<[AnyFloat]>:$out_detections - ); -} - -// -// MVN -// - -def VPU_MVNOp : - VPU_LayerOp< - "MVN", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - - > { - let summary = "MVN1 VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - BoolAttr:$across_channels, - BoolAttr:$normalize_variance, - F64Attr:$eps, - OptionalAttr:$internal_reshape, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - - DimArr getNonNormDims(); - }] # baseExtraClassDeclaration; - - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::BoolAttr":$across_channels, - "::mlir::BoolAttr":$normalize_variance, - "::mlir::FloatAttr":$eps - )>, - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::BoolAttr":$across_channels, - "::mlir::BoolAttr":$normalize_variance, - "::mlir::FloatAttr":$eps, - "::mlir::ArrayAttr":$internal_reshape - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// MVN1Sum -// - -def VPU_MVN1SumOp : - VPU_LayerOp< - "MVN1SumOp", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let 
summary = "MVN1SumOp VPU layer (step 1/3 in MVN1 decomposition)"; - - let description = [{ - Overview: - - Large MVN1 tensors that cannot be tiled, are decomposed into 3 tileable sub-ops: - 1. **MVN1SumOp** : computes partial sums on input tiles - 2. **MVN1MeanVarOp** : sum-reduces concatenated partial sums from previous step and computes _mean_, _variance_ - 3. **MVN1Normalize**: applies normalization on input tiles - - Details: - - **input** - tile of original _MVN1Op_ input tensor - - **sum** - output tensor of shape **[N, C, H, W]**, with (0,1,2,3)->(0,2,3,1) layout (irrespective of input layout) - - N = input N - - C = input C if _across_channels_ = false, or 1 if _across_channels_ = true - - H = number of clusters (output_height) - - W = 2 if _normalize_variance_ = true (compute _sum_ and _sumOfSquares_ terms), else 1 (compute just _sum_ term) - }]; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - BoolAttr:$across_channels, - BoolAttr:$normalize_variance, - IntAttr:$output_height, - OptionalAttr:$multiClusterStrategy - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - static bool buffsFitIntoCMX(mlir::ModuleOp module, vpux::NDTypeInterface in, vpux::NDTypeInterface out); - }] # baseExtraClassDeclaration; - - let results = (outs - AnyTypeOf<[4DTensorOf<[F32]>, VPU_DistributedTensor]>:$sum - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "bool":$across_channels, - "bool":$normalize_variance, - "int64_t":$output_height - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// MVN1MeanVar -// - -def VPU_MVN1MeanVarOp : - VPU_LayerOp< - "MVN1MeanVar", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "MVN1MeanVar VPU layer (step 2/3 in MVN1 decomposition)"; - - let description = [{ - 
Background: see _MVN1SumOp_ description - - Details: - Accumulates partial _sum_ (and optionally _sumOfSquares_) of concatenated input and computes _mean_ (and optionally _1/variance_ ) required for normalization. - - - **sum** - input is a concatenation (over W) of _MVN1SumOp_ outputs, shape = [N,C,W x num_parts] - - W = 2 if _normalize_variance_ = true (input _sum_ and _sumOfSquares_), else 1 (input _sum_) - - **meanVar** - output shape = [N,C,W], where - - W = 2 if _normalize_variance_ = true (output _mean_ and _1/variance_ terms), else 1 (output just _mean_) - }]; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F32]>, VPU_DistributedTensor]>:$sum, - - I64ArrayAttr:$orig_shape, - BoolAttr:$across_channels, - BoolAttr:$normalize_variance, - F64Attr:$eps, - TypeAttr:$output_type, - OptionalAttr:$internal_reshape, - OptionalAttr:$multiClusterStrategy - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$meanVar - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$sum, - "::mlir::ArrayAttr":$orig_shape, - "bool":$across_channels, - "bool":$normalize_variance, - "::mlir::APFloat":$eps, - "::mlir::Type":$output_type - )>, - OpBuilder<(ins - "::mlir::Value":$sum, - "::mlir::ArrayAttr":$orig_shape, - "bool":$across_channels, - "bool":$normalize_variance, - "::mlir::APFloat":$eps, - "::mlir::Type":$output_type, - "::mlir::ArrayAttr":$internal_reshape - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// MVN1Normalize -// - -def VPU_MVN1NormalizeOp : - VPU_LayerOp< - "MVN1Normalize", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "MVN1Normalize VPU layer (step 3/3 in MVN1 decomposition)"; - - let description = [{ - Background: see 
_MVN1SumOp_ description - - Applies normalization on a tile of input tensor. - - Details: - - **input** - input tile of original _MVN1Op_ input tensor - - **meanVar** - this input is the output of _MVN1MeanVarOp_ - - **output** - output tile of final result - }]; - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$meanVar, - - BoolAttr:$across_channels, - BoolAttr:$normalize_variance, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$meanVar, - "::mlir::BoolAttr":$across_channels, - "::mlir::BoolAttr":$normalize_variance - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// MVN6 -// - -def VPU_MVN6Op : - VPU_LayerOp< - "MVN6", - [ - AttrSizedOperandSegments, - VPU_EltwiseOp, - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "MVN6 VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - Optional>:$scale, - Optional>:$bias, - I64ArrayAttr:$axes, - BoolAttr:$normalize_variance, - F64Attr:$eps, - IE_MvnEpsModeAttr:$eps_mode, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - DimArr getNonNormDims(); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes, - 
"::mlir::BoolAttr":$normalize_variance, - "::mlir::FloatAttr":$eps, - "vpux::IE::MvnEpsModeAttr":$eps_mode - )>, - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$scale, - "::mlir::Value":$bias, - "::mlir::ArrayAttr":$axes, - "::mlir::BoolAttr":$normalize_variance, - "::mlir::FloatAttr":$eps, - "vpux::IE::MvnEpsModeAttr":$eps_mode - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ROIPooling -// - -def VPU_ROIPoolingOp : - VPU_LayerOp< - "ROIPooling" - > { - let summary = "ROIPooling VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$coords, - - I64ArrayAttr:$output_size, - F64Attr:$spatial_scale, - IE_ROIPoolingMethodAttr:$method - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// PSROIPooling -// - -def VPU_PSROIPoolingOp : - VPU_LayerOp< - "PSROIPooling" - > { - let summary = "PSROIPooling VPU layer"; - - let arguments = (ins - 4DTensorOf<[F16, F32]>:$input, - 2DTensorOf<[F16, F32]>:$coords, - - IntAttr:$output_dim, - F64Attr:$spatial_scale, - IntAttr:$group_size, - OptionalAttr:$spatial_bins_x, - OptionalAttr:$spatial_bins_y, - OptionalAttr:$mode - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// ROIAlign -// - -def VPU_ROIAlignOp : - VPU_LayerOp< - "ROIAlign", - [ - ResultsAreFloatLike - ] - > { - let summary = "ROIAlign VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$coords, - 1DTensorOf<[AnyInteger]>:$roisIdx, - - IntAttr:$pooled_h, - IntAttr:$pooled_w, - IntAttr:$sampling_ratio, - F64Attr:$spatial_scale, - IE_ROIAlignMethodAttr:$poolingMode, - IE_ROIAlignAlignedMethodAttr:$alignedMode - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// ExperimentalDetectronROIFeatureExtractor -// - -def VPU_ExperimentalDetectronROIFeatureExtractorOp : - VPU_LayerOp< - 
"ExperimentalDetectronROIFeatureExtractor", [ - AttrSizedOperandSegments - ] - > { - let summary = "ExperimentalDetectronROIFeatureExtractor VPU layer"; - - let arguments = (ins - Variadic:$inputs, - Optional<1DTensorOf<[F16, F32]>>:$reorderedRois, - Optional<1DTensorOf<[UI32]>>:$originalRoiMap, - Optional<1DTensorOf<[F16, F32]>>:$outputRoisFeaturesTemp, - Optional<1DTensorOf<[UI32]>>:$levels, - - IE_ExperimentalDetectronROIFeatureExtractorAttr:$attr - - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output, - RankedTensorOf<[F16, F32]>:$outputROIs - ); -} - -// -// StridedSlice -// - -def VPU_StridedSliceOp : - VPU_LayerOp< - "StridedSlice", - [ - AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - VPU_BoundsRepresentationInterface - ] - > { - let summary = "StridedSlice VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - Optional<1DTensorOf<[AnyInteger]>>:$begins, - Optional<1DTensorOf<[AnyInteger]>>:$ends, - Optional<1DTensorOf<[AnyInteger]>>:$strides, - - OptionalAttr:$begins_attr, - OptionalAttr:$ends_attr, - OptionalAttr:$strides_attr, - - I64ArrayAttr:$begin_mask, - I64ArrayAttr:$end_mask, - I64ArrayAttr:$new_axis_mask, - I64ArrayAttr:$shrink_axis_mask, - I64ArrayAttr:$ellipsis_mask, - - DefaultValuedAttr:$bounds_representation - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let extraClassDeclaration = [{ - bool isSimplified(); - }]; -} - -// -// PRelu -// - -def VPU_PReluOp : - VPU_LayerOp< - "PRelu", - [ - DeclareOpInterfaceMethods, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "PRelu VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$negative_slope, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let 
extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2 - )> - ]; - - let hasVerifier = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LeakyRelu -// - -def VPU_LeakyReluOp : - VPU_LayerOp< - "LeakyRelu", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "LeakyRelu VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - F64Attr:$negative_slope - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Swish -// - -def VPU_SwishOp : - VPU_LayerOp< - "Swish", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Swish VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - Optional, VPU_DistributedTensor]>>:$beta, - - OptionalAttr:$beta_value, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$beta, - "::mlir::FloatAttr":$beta_value - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ScaleShift -// - -def VPU_ScaleShiftOp : - VPU_LayerOp< - "ScaleShift", - [ - VPU_TilingBuilderOpInterface, - AttrSizedOperandSegments, - VPU_EltwiseOp - ] - > { - let summary = "ScaleShift VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$weights, - Optional>:$biases 
- ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Upsampling -// - -def VPU_UpsamplingOp : - VPU_LayerOp< - "Upsampling" - > { - let summary = "Upsampling VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, quant_QuantizedType]>:$input, - I64ArrayAttr:$upsampling_factor, - OptionalAttr:$pad - ); - - let results = (outs - RankedTensorOf<[F16, F32, quant_QuantizedType]>:$output - ); -} - -// -// GRN -// - -def VPU_GRNOp : - VPU_LayerOp< - "GRN" - > { - let summary = "GRN VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - F64Attr:$bias - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Negative -// - -def VPU_NegativeOp : - VPU_LayerOp< - "Negative", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Negative VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Sign -// - -def VPU_SignOp : - VPU_LayerOp< - "Sign", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Sign VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - 
OpBuilder<(ins - "::mlir::Value":$input - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// FullyConnected -// - -def VPU_FullyConnectedOp: - VPU_LayerOp< - "FullyConnected" - > { - let summary = "FullyConnected VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$weights, - Optional>:$bias - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// MatMul -// - -def VPU_MatMulOp: - VPU_LayerOp< - "MatMul", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "MatMul VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - UnitAttr:$transpose_a, - UnitAttr:$transpose_b, - OptionalAttr:$post_op, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - - static bool isSupported(vpux::IE::MatMulOp matmulOp); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "::mlir::UnitAttr":$transpose_a, - "::mlir::UnitAttr":$transpose_b, - "vpux::IE::PostOpAttr":$post_op - )> - ]; - - let hasVerifier = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// CTCGreedyDecoder -// - -def VPU_CTCGreedyDecoderOp : - VPU_LayerOp< - "CTCGreedyDecoder" - > { - let summary = "CTCGreedyDecoder VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$sequenceLengths, - - 
UnitAttr:$mergeRepeated - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// Reverse -// - -def VPU_ReverseOp : - VPU_LayerOp< - "Reverse", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Reverse VPU operation"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axis_value, - IE_ReverseModeAttr:$mode, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - DimArr getTileableDims(); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axis_value, - "vpux::IE::ReverseModeAttr":$mode - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// CTCGreedyDecoderSeqLen -// - -def VPU_CTCGreedyDecoderSeqLenOp : - VPU_LayerOp< - "CTCGreedyDecoderSeqLen" - > { - let summary = "CTCGreedyDecoderSeqLen VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[SI32]>:$sequenceLength, - Optional>:$blankIndex, - - UnitAttr:$mergeRepeated - ); - - let results = (outs - RankedTensorOf<[SI32]>:$output, - RankedTensorOf<[SI32]>:$outputLength - ); -} - -// -// Pad -// - -def VPU_PadOp : - VPU_LayerOp< - "Pad", - [ - AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Pad VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - Optional>:$pads_begin, - Optional>:$pads_end, - Optional>:$pad_value, - - OptionalAttr:$pads_begin_attr, - OptionalAttr:$pads_end_attr, - OptionalAttr:$pad_value_attr, - - IE_PadModeAttr:$mode, - 
OptionalAttr:$multiClusterStrategy, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let assemblyFormat = [{ - `(` $input `)` (`[` $pads_begin^ `,` $pads_end (`,` $pad_value^)? `]`)? attr-dict `:` type(operands) `->` type(results) - }]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$pads_begin, - "::mlir::Value":$pads_end, - "::mlir::Value":$pad_value, - "::mlir::ArrayAttr":$pads_begin_attr, - "::mlir::ArrayAttr":$pads_end_attr, - "::mlir::FloatAttr":$pad_value_attr, - "vpux::IE::PadModeAttr":$mode, - "::mlir::ArrayAttr":$output_padding, - "::mlir::ArrayAttr":$input_padding - )>, - OpBuilder<(ins - "vpux::NDTypeInterface&":$input_type, - "::mlir::Value":$input, - "::mlir::Value":$pads_begin, - "::mlir::Value":$pads_end, - "::mlir::Value":$pad_value, - "::mlir::ArrayAttr":$pads_begin_attr, - "::mlir::ArrayAttr":$pads_end_attr, - "::mlir::FloatAttr":$pad_value_attr, - "vpux::IE::PadMode":$mode, - "::mlir::ArrayAttr":$output_padding, - "::mlir::ArrayAttr":$input_padding - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LSTMCell -// - -def VPU_LSTMCellOp : - VPU_LayerOp< - "LSTMCell" - > { - let summary = "LSTMCell VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$inputData, - RankedTensorOf<[F16, F32]>:$initialHiddenState, - RankedTensorOf<[F16, F32]>:$initialCellState, - RankedTensorOf<[F16, F32]>:$weights, - RankedTensorOf<[F16, F32]>:$recurrenceWeights, - RankedTensorOf<[F16, F32]>:$biases, - - IntAttr:$hiddenSize - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$outputHiddenState, - RankedTensorOf<[F16, F32]>:$outputCellState - ); - - let 
extraClassDeclaration = [{ - static bool isSupported(vpux::IE::LSTMCellOp op); - }] # baseExtraClassDeclaration; - - -} - -// -// LSTMGatesOp -// - -def VPU_LSTMGatesOp : - VPU_LayerOp< - "LSTMGates", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Computes LSTM activation functions"; - - let description = [{ - This operation is intended to be run as a software stage after computing and adding LSTM matrix multiplications. - - - **gatesInput** - tensor of shape **[batchSize, 4 * hiddenSize]** or **[1, 1, batchSize, 4 * hiddenSize]**. Formula: - ``` - gatesInput = (inputData * weights) + (initialHiddenState * recurrenceWeights) + biases - * - Matrix multiplication - + - Element-wise add - ``` - - The meaning of other operands are identical to those in LSTMCell operation. - }]; - - let arguments = (ins - AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$gatesInput, - AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialCellState, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState, - AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputCellState - ); - - let hasVerifier = 1; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$gatesInput, - "::mlir::Value":$initialCellState - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LSTMSequence -// - -def VPU_LSTMSequenceOp : - VPU_LayerOp< - "LSTMSequence", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "LSTMSequence VPU layer"; - - let arguments = (ins - 
AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputData, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialHiddenState, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialCellState, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$reccurenceWeights, - Optional, VPU_DistributedTensor]>>:$biases, - AnyTypeOf<[RankedTensorOf<[SI32]>, VPU_DistributedTensor]>:$syncBuffer, - - OptionalAttr:$sequenceLength, - IE_RNNSequenceDirectionAttr:$direction, - OptionalAttr:$useDpu, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenValues, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputCellState - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$inputData, - "::mlir::Value":$initialHiddenState, - "::mlir::Value":$initialCellState, - "::mlir::Value":$reccurenceWeights, - "::mlir::Value":$biases, - "::mlir::IntegerAttr":$sequenceLength, - "vpux::IE::RNNSequenceDirectionAttr":$direction, - "::mlir::BoolAttr":$useDpu, - "vpux::VPU::MultiClusterStrategyAttr":$multiClusterStrategy - )>, - OpBuilder<(ins - "::mlir::Value":$inputData, - "::mlir::Value":$initialHiddenState, - "::mlir::Value":$initialCellState, - "::mlir::Value":$reccurenceWeights, - "::mlir::Value":$biases, - "::mlir::IntegerAttr":$sequenceLength, - "vpux::IE::RNNSequenceDirectionAttr":$direction, - "vpux::VPU::MultiClusterStrategyAttr":$multiClusterStrategy - )> - ]; - - let extraClassDeclaration = [{ - static bool isSupported(vpux::IE::LSTMSequenceOp origOp, bool useDpu=false); - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Select -// - -def 
VPU_SelectOp : - VPU_LayerOp< - "Select", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "Select VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[Bool8, SI32, F16]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$input2, - AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$input3, - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "::mlir::Value":$input3, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// SpaceToDepth -// - -def VPU_SpaceToDepthOp : - VPU_LayerOp< - "SpaceToDepthOp", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "SpaceToDepthOp VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - - DefaultValuedAttr:$block_size, - IE_SpaceToDepthModeAttr:$mode - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasVerifier = 1; -} - -// -// SpaceToBatch -// - -def VPU_SpaceToBatch : - VPU_LayerOp< - "SpaceToBatch" - > { - let summary = "SpaceToBatch VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - - OptionalAttr:$block_shape_value, - OptionalAttr:$pads_begin_value, - OptionalAttr:$pads_end_value - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// BatchToSpace -// - -def VPU_BatchToSpace : - VPU_LayerOp< - "BatchToSpace" - > { - let summary = "BatchToSpace VPU layer"; - - let arguments = (ins - 
AnyRankedTensor:$input, - - OptionalAttr:$block_shape_value, - OptionalAttr:$crops_begin_value, - OptionalAttr:$crops_end_value - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// ReverseSequence -// - -def VPU_ReverseSequenceOp : - VPU_LayerOp< - "ReverseSequence" - > { - let summary = "Reverse variable length sequence VPU operation"; - - let arguments = (ins - AnyRankedTensor:$data, - 1DTensorOf<[AnyInteger]>:$seq_length, - - IntAttr:$seq_axis, - IntAttr:$batch_axis - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// DepthToSpace -// - -def VPU_DepthToSpaceOp : - VPU_LayerOp< - "DepthToSpace", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "DepthToSpace VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - IntAttr:$block_size, - IE_DepthToSpaceModeAttr:$mode, - OptionalAttr:$padded_channels, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - - bool isVFSupported(); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::IntegerAttr":$block_size, - "vpux::IE::DepthToSpaceModeAttr":$mode, - "vpux::IE::ChannelPaddingAttr":$padded_channels - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; - - let hasVerifier = 1; -} - -// -// ExtractImagePatches -// - -def VPU_ExtractImagePatchesOp : - VPU_LayerOp< - "ExtractImagePatches" - > { - let summary = "InferenceEngine ExtractImagePatches layer"; - - let arguments = (ins - 4DTensorOf<[AnyType]>:$data, - - I64ArrayAttr:$sizes, - I64ArrayAttr:$strides, - I64ArrayAttr:$rates, - IE_PadTypeAttr:$autoPad - ); - - let results = (outs 
- 4DTensorOf<[AnyType]>:$output - ); -} - -// -// YuvToRgb -// Conversions: -// NV12toRGB, NV12toBGR, -// I420toRGB, I420toBGR -// - -def VPU_YuvToRgbOp : - VPU_LayerOp< - "YuvToRgb", - [ - DeclareOpInterfaceMethods, - AttrSizedOperandSegments - ] - > { - let summary = "InferenceEngine NV12/I420 to RGB/BGR layer"; - - let arguments = (ins - 4DTensorOf<[UI8, F16, F32]> :$input1, - Optional<4DTensorOf<[UI8, F16, F32]>>:$input2, - Optional<4DTensorOf<[UI8, F16, F32]>>:$input3, - - IE_ColorFmtAttr:$inFmt, - IE_ColorFmtAttr:$outFmt - ); - - let results = (outs - 4DTensorOf<[UI8, F16, F32]>:$output - ); -} - -// -// RandomUniform -// - -def VPU_RandomUniformOp : - VPU_LayerOp< - "RandomUniform", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "RandomUniform VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$min, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$max, - - I64ArrayAttr:$output_shape, - TypeAttr:$outputType, - IntAttr:$global_seed, - IntAttr:$op_seed, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$min, - "::mlir::Value":$max, - - "::mlir::ArrayAttr":$output_shape, - "::mlir::TypeAttr":$outputType, - "::mlir::IntegerAttr":$global_seed, - "::mlir::IntegerAttr":$op_seed - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// OneHot -// - -def VPU_OneHotOp : - VPU_LayerOp< - "OneHot" - > { - let summary = "InferenceEngine OneHot layer"; - - let arguments = (ins - RankedTensorOf<[SI32, SI64]> :$input, - - IntAttr:$depth, - 
F64Attr:$on_value, - F64Attr:$off_value, - IntAttr:$axis, - - TypeAttr:$outputType - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// M2I.ColorConvert -// - -def VPU_M2IColorConvertOp : - VPU_LayerOp< - "M2I.ColorConvert" - > { - let summary = "M2I version for color-convert operations"; - - let arguments = (ins - 4DTensorOf<[UI8, F16]>:$input, - IE_ColorFmtAttr:$inFmt, - IE_ColorFmtAttr:$outFmt - ); - - let results = (outs - 4DTensorOf<[UI8, F16]>:$output - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - }]; - - let extraClassDeclaration = [{ - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); - static bool isSupported(vpux::IE::YuvToRgbOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - }] # baseExtraClassDeclaration; -} - -// -// M2I.Resize -// - -def VPU_M2IResizeOp : - VPU_LayerOp< - "M2I.Resize" - > { - let summary = "M2I version for resize operations"; - - let arguments = (ins - 4DTensorOf<[UI8, F16]>:$input, - - I64ArrayAttr:$sizes, - I64ArrayAttr:$axes, - VPU_M2iInterpAttr:$interp - ); - - let results = (outs - 4DTensorOf<[UI8, F16]>:$output - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - }]; - - let extraClassDeclaration = [{ - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); - static bool isSupported(vpux::IE::InterpolateOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - }] # baseExtraClassDeclaration; -} - -// -// M2I.Norm -// - -def VPU_M2INormOp : - VPU_LayerOp< - 
"M2I.Norm" - > { - let summary = "M2I version for BatchNormInference"; - - let arguments = (ins - 4DTensorOf<[F16]>:$input, - - F64ArrayAttr:$gamma_value, - F64ArrayAttr:$beta_value, - F64ArrayAttr:$mean_value, - F64ArrayAttr:$variance_value, - - F64Attr:$eps - ); - - let results = (outs - 4DTensorOf<[F16]>:$output - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - }]; - - let extraClassDeclaration = [{ - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); - static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); - static bool isSupported(vpux::IE::BatchNormInferenceOp origOp, vpux::LogCb logCb, bool checkLayout = false, - bool checkChannelAlignment = false); - }] # baseExtraClassDeclaration; -} - -// -// M2I.Task -// - -def VPU_M2ITaskOp : - VPU_LayerOp< - "M2I.Task" - > { - let summary = "M2I full task op"; - - let arguments = (ins - 4DTensorOf<[UI8, F16]>:$input, - - BoolAttr:$do_csc, - BoolAttr:$do_norm, - VPU_M2iColorFmtAttr:$inFmt, - VPU_M2iColorFmtAttr:$outFmt, - UnitAttr:$chroma_in_reverse_channels, - UnitAttr:$chroma_out_reverse_channels, - UnitAttr:$luma_in_reverse_channels, - UnitAttr:$luma_out_reverse_channels, - OptionalAttr:$sizes, - OptionalAttr:$axes, - OptionalAttr:$norm, - DefaultValuedAttr:$interp - ); - - let results = (outs - 4DTensorOf<[UI8, F16]>:$output - ); - - let assemblyFormat = [{ - `(` $input `)` - attr-dict - custom(type($input)) `` - `->` type(results) - }]; -} - -// -// Tile -// - -def VPU_TileOp : - VPU_LayerOp< - "Tile", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "Tile VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - I64ArrayAttr:$repeats_values - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasFolder = 1; -} - -// -// DynamicTile -// - -def VPU_DynamicTileOp : - VPU_LayerOp< - "DynamicTile", - [ - 
DeclareOpInterfaceMethods, - VPU_BoundsRepresentationInterface - ] - > { - let summary = "DynamicTile VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[AnyInteger]>:$target_shape, - - Optional>:$repeats, - OptionalAttr:$repeats_values, - - I64ArrayAttr:$output_shape, - I64ArrayAttr:$output_bounds, - - DefaultValuedAttr:$bounds_representation - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// Split -// - -def VPU_SplitOp : - VPU_LayerOp< - "Split", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Split VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, - Optional:$axis, - - IntAttr:$num_splits, - OptionalAttr:$axis_value - ); - - let results = (outs - Variadic>:$outputs - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; - let hasVerifier = 1; -} - -// -// Power -// - -def VPU_PowerOp : - VPU_LayerOp< - "Power", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Power VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool 
fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Add -// - -def VPU_AddOp : - VPU_LayerOp< - "Add", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Add VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$post_op, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, - "vpux::IE::PostOpAttr":$post_op - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; - let hasVerifier = 1; -} - -// -// Divide -// - -def VPU_DivideOp : - VPU_LayerOp< - "Divide", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Divide VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output - 
); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// SquaredDiff -// - -def VPU_SquaredDifferenceOp : - VPU_LayerOp< - "SquaredDiff", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "SquaredDiff VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, SI32]>:$input1, - RankedTensorOf<[F16, F32, SI32]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - RankedTensorOf<[F16, F32, SI32]>:$output - ); -} - -// -// FloorMod -// - -def VPU_FloorModOp : - VPU_LayerOp< - "FloorMod", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "FloorMod VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input1, - AnyRankedTensor:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// Mod -// - -def VPU_ModOp : - VPU_LayerOp< - "Mod", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Mod VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input1, - AnyRankedTensor:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// Less -// - -def VPU_LessOp : - VPU_LayerOp< - "Less", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Less VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - 
IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LessEqual -// - -def VPU_LessEqualOp : - VPU_LayerOp< - "LessEqual", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "LessEqual VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, SI32]>:$input1, - RankedTensorOf<[F16, F32, SI32]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - RankedTensorOf<[Bool8]>:$output - ); -} - -// -// Greater -// - -def VPU_GreaterOp : - VPU_LayerOp< - "Greater", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Greater VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = 
[IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// GreaterEqual -// - -def VPU_GreaterEqualOp : - VPU_LayerOp< - "GreaterEqual", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "GreaterEqual VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// LogicalOr -// - -def VPU_LogicalOrOp : - VPU_LayerOp< - "LogicalOr", - [ - VPU_TilingBuilderOpInterface, - Commutative, - VPU_EltwiseOp - ] - > { - let summary = "LogicalOr VPU layer"; - - let arguments = (ins - RankedTensorOf<[I8, F16, F32, SI32]>:$input1, - RankedTensorOf<[I8, F16, F32, SI32]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - RankedTensorOf<[I8, F16, F32, SI32]>:$output - ); -} - -// -// LogicalXor -// - -def VPU_LogicalXorOp : - VPU_LayerOp< - "LogicalXor", - [ - VPU_TilingBuilderOpInterface, - Commutative, - VPU_EltwiseOp - ] - > { - let summary = "LogicalXor VPU layer"; - - let arguments = (ins - RankedTensorOf<[I8, F16, F32, SI32]>:$input1, - RankedTensorOf<[I8, F16, F32, SI32]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast - ); - - let results = (outs - RankedTensorOf<[I8, F16, F32, SI32]>:$output - ); -} - 
-// -// Multiply -// - -def VPU_MultiplyOp : - VPU_LayerOp< - "Multiply", - [ - VPU_TilingBuilderOpInterface, - Commutative, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Multiply VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$post_op, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, - "vpux::IE::PostOpAttr":$post_op - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// And -// - -def VPU_AndOp : - VPU_LayerOp< - "And", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "And VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - 
bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// BitwiseAnd -// - -def VPU_BitwiseAndOp : - VPU_LayerOp< - "BitwiseAnd", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "BitwiseAnd VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// BitwiseOr -// - -def VPU_BitwiseOrOp : - VPU_LayerOp< - "BitwiseOr", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "BitwiseOr VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - 
"::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// BitwiseXor -// - -def VPU_BitwiseXorOp : - VPU_LayerOp< - "BitwiseXor", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "BitwiseXor VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// BitwiseNot -// - -def VPU_BitwiseNotOp : - VPU_LayerOp< - "BitwiseNot", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "BitwiseNot VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1 - )> - ]; - 
- let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// GroupConvolution -// - -def VPU_GroupConvolutionOp : - VPU_LayerOp< - "GroupConvolution", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "GroupConvolution VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, quant_QuantizedType]>:$input, - RankedTensorOf<[F16, F32, quant_QuantizedType]>:$filter, - Optional>:$bias, - - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - I64ArrayAttr:$dilations, - OptionalAttr:$groups, - - OptionalAttr:$post_op, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - RankedTensorOf<[F16, F32, quant_QuantizedType]>:$output - ); - - list elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); - - bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); - }] # baseExtraClassDeclaration; -} - -// -// GroupNormalizationOp -// - -def VPU_GroupNormalizationOp : - VPU_LayerOp< - "GroupNormalization", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "GroupNormalization VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$scale, - RankedTensorOf<[F16, F32]>:$bias, - - I32Attr:$num_groups, - F32Attr: $epsilon - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// AvgPool -// - -def VPU_AvgPoolOp : - VPU_LayerOp< - "AvgPool", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "AvgPool VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, - - 
I64ArrayAttr:$kernel_size, - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - IE_RoundingTypeAttr:$rounding_type, - UnitAttr:$exclude_pads - ); - - let results = (outs - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output - ); -} - -// -// MaxPool -// - -def VPU_MaxPoolOp : - VPU_LayerOp< - "MaxPool", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "MaxPool VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, - - I64ArrayAttr:$kernel_size, - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - IE_RoundingTypeAttr:$rounding_type, - - OptionalAttr:$post_op - ); - - let results = (outs - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output - ); -} - -// -// MaxPool8 -// - -def VPU_MaxPool8Op : - VPU_LayerOp< - "MaxPool8" - > { - let summary = "MaxPool8 VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, - - I64ArrayAttr:$kernel_size, - I64ArrayAttr:$strides, - I64ArrayAttr:$dilations, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - IE_RoundingTypeAttr:$rounding_type, - TypeAttr:$index_element_type, - - IntAttr:$axis - ); - - let results = (outs - RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output, - RankedTensorOf<[SI32, SI64]>:$output_index - ); -} - -// -// Reshape -// - -def VPU_ReshapeOp : - VPU_LayerOp< - "Reshape", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Reshape VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, - Optional>:$shape, - - UnitAttr:$special_zero, - OptionalAttr:$shape_value - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; - - let hasFolder = 1; - let hasCanonicalizer = 1; -} - -// -// Squeeze -// - -def VPU_SqueezeOp : - VPU_LayerOp< - "Squeeze", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Squeeze VPU layer"; - - let 
arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, - Optional>:$axes, - - OptionalAttr:$axes_value - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; -} - -// -// Unsqueeze -// - -def VPU_UnsqueezeOp : - VPU_LayerOp< - "Unsqueeze", - [ - VPU_ViewLikeOpInterface - ] - > { - - let summary = "Unsqueeze VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, - Optional>:$axes, - - OptionalAttr:$axes_value - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; -} - -// -// LRN -// - -def VPU_LRNOp : - VPU_LayerOp< - "LRN" - > { - let summary = "LRN VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - I64ArrayAttr:$axes, - - F64Attr:$alpha, - F64Attr:$beta, - F64Attr:$bias, - IntAttr:$size - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// LRN_IE -// - -def VPU_LRN_IEOp : - VPU_LayerOp< - "LRN_IE" - > { - let summary = "LRN_IE VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - F64Attr:$alpha, - F64Attr:$beta, - F64Attr:$bias, - IntAttr:$size, - IE_LRN_IERegionAttr:$region - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// ReduceMax -// - -def VPU_ReduceMaxOp : - VPU_LayerOp< - "ReduceMax", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceMax VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte 
reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceMean -// - -def VPU_ReduceMeanOp : - VPU_LayerOp< - "ReduceMean", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceMean VPU Layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceSum -// - -def VPU_ReduceSumOp : - VPU_LayerOp< - "ReduceSum", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceSum VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - 
"::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceL1 -// - -def VPU_ReduceL1Op : - VPU_LayerOp< - "ReduceL1", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceL1 VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceLogicalOr -// - -def VPU_ReduceLogicalOrOp : - VPU_LayerOp< - "ReduceLogicalOr", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceLogicalOr VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = 
[IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceLogicalAnd -// - -def VPU_ReduceLogicalAndOp : - VPU_LayerOp< - "ReduceLogicalAnd", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceLogicalAnd VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceProd -// - -def VPU_ReduceProdOp : - VPU_LayerOp< - "ReduceProd", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceProd VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ReduceMin -// - -def VPU_ReduceMinOp : - VPU_LayerOp< - "ReduceMin", - [ - 
DeclareOpInterfaceMethods, - ] - > { - let summary = "ReduceMin VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let hasFolder = 1; -} - -// -// ReduceL2 -// - -def VPU_ReduceL2Op : - VPU_LayerOp< - "ReduceL2", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ReduceL2 VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - I64ArrayAttr:$axes_value, - UnitAttr:$keep_dims, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::ArrayAttr":$axes_value, - "::mlir::UnitAttr":$keep_dims - )> - ]; - - let hasFolder = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Minimum -// - -def VPU_MinimumOp : - VPU_LayerOp< - "Minimum", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "InferenceEngine Minimum layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - 
"::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Maximum -// - -def VPU_MaximumOp : - VPU_LayerOp< - "Maximum", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Maximum VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input2, - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Sparsify -// - -def VPU_SparsifyOp : - VPU_LayerOp<"Sparsify", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Sparsify VPU layer"; - - let arguments = (ins - 4DTensorOf<[quant_QuantizedType, F16, BF16]>:$input - ); - - let results = (outs - VPU_SparseTensor:$output - ); -} - -// -// Desparsify -// - -def VPU_DesparsifyOp : - VPU_LayerOp<"Desparsify", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Desparsify VPU layer"; - - let arguments = (ins - VPU_SparseTensor:$input - ); - - let results = (outs - 4DTensorOf<[quant_QuantizedType, F16, BF16]>:$output - ); -} - -// -// Quantize -// - -def VPU_QuantizeOp : - VPU_LayerOp<"Quantize", - [ - VPU_TilingBuilderOpInterface, - VPU_EltwiseOp - ] - > { - let summary = "Quantize VPU layer"; - - let arguments = (ins - 
RankedTensorOf<[F16, F32]>:$input, - - TypeAttr:$dstElemType - ); - - let results = (outs - RankedTensorOf<[quant_QuantizedType]>:$output - ); -} - -// -// Dequantize -// - -def VPU_DequantizeOp : - VPU_LayerOp<"Dequantize", - [ - DeclareOpInterfaceMethods, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Dequantize VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[quant_QuantizedType]>, VPU_DistributedTensor]>:$input, - TypeAttr:$dstElemType, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32,]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::TypeAttr":$dstElemType - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - bool isVFSupported(); - }] # baseExtraClassDeclaration; -} - -// -// DynamicQuantizeOp -// - -def VPU_DynamicQuantizeOp : - VPU_LayerOp<"DynamicQuantize", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Dynamic-Quantize VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$min, - AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$max, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[UI8]>, VPU_DistributedTensor]>:$output, - AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$scale, - AnyTypeOf<[RankedTensorOf<[UI8]>, VPU_DistributedTensor]>:$zero_point - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$min, - "::mlir::Value":$max - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool 
fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// QuantizeCast -// - -def VPU_QuantizeCastOp : - VPU_LayerOp< - "QuantizeCast", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Quantize Cast VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - - TypeAttr:$dstElemType - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let hasVerifier = 1; - let hasFolder = 1; -} - -// -// TransposedConvolution -// - -def VPU_TransposedConvolutionOp: - VPU_LayerOp< - "TransposedConvolution", - [ - AttrSizedOperandSegments - ] - > { - let summary = "TransposedConvolution VPU layer"; - - let description = [{ - Represents a transposed convolution, which consumes an input and filter tensor - and generates an output larger than the input. - - Operands: - - `input`: the input tensor of the operation; 4D layout [N, C_IN, Y, X] - - `filter`: the convolutional kernel tensor; expected layout [C_OUT, C_IN, KY, KX] - - (optional) `output_shape`: specifies the spatial shape of the output; - expected values `[Y, X]` - - Attributes: - - `strides`: represents the distance in pixels to slide the filter on the output - tensor; expected values `[SY, SX]` - - `pads_begin`: represents the number of pixels to remove from the beginning of - each axis in the output; expected values `[PAD_TOP, PAD_LEFT]` - - `pads_end`: represents the number of pixels to remove from the end of each axis - in the output; expected values `[PAD_BOTTOM, PAD_RIGHT]` - - `dilations`: has the same definition as dilations for a regular Convolution but - applied in the backward way, for the output tensor; expected values `[DY, DX]` - - `spatial_output_padding`: adds additional amount of paddings per each spatial axis in - the output tensor; expected values `[PY; PX]` - - Results: - - `output`: the output 
tensor of the operation; 4D layout [N, C_OUT, Y, X] - }]; - - let arguments = (ins - AnyRankedTensor:$input, - AnyRankedTensor:$filter, - Optional<1DTensorOf<[AnyInteger]>>:$output_shape, - Optional>:$bias, - - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - I64ArrayAttr:$dilations, - I64ArrayAttr:$spatial_output_padding, - - OptionalAttr:$post_op, - OptionalAttr:$clamp, - OptionalAttr:$output_padding, - OptionalAttr:$input_padding - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT]; -} - -// -// Expand -// - -def VPU_ExpandOp : - VPU_LayerOp< - "Expand" - > { - let summary = "Expand tensor with uninitialized values"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, - - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output - ); - - let builders = [ - OpBuilder< - (ins "mlir::Value":$input, "std::optional":$pads_begin, "std::optional":$pads_end) - > - ]; - - let hasFolder = 1; - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; -} - -// -// Subtract -// - -def VPU_SubtractOp : - VPU_LayerOp< - "Subtract", - [ - VPU_TilingBuilderOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Subtract VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$post_op, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - 
"::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, - "vpux::IE::PostOpAttr":$post_op - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// MemPermute -// - -def VPU_MemPermuteOp : - VPU_LayerOp< - "MemPermute", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "MemPermute VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - AffineMapAttr:$dst_order, - AffineMapAttr:$mem_perm, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::AffineMapAttr":$dst_order, - "::mlir::AffineMapAttr":$mem_perm - )> - ]; - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 0; - let hasCanonicalizer = 1; -} - -// -// PermuteCast -// - -def VPU_PermuteCastOp : - VPU_LayerOp< - "PermuteCast", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods - ] - > { - let summary = "PermuteCast VPU layer"; - - let description = [{ - The op changes layout information in the following way: - * dst_order: layout attribute of result is set to value of this arg - * mem_perm: describes the permutation applied on the input value's memory shape - to obtain the memory shape of the output value. 
- }]; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, - - AffineMapAttr:$dst_order, - AffineMapAttr:$mem_perm - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; - let hasCanonicalizer = 1; - let hasFolder = 1; -} - -// -// Equal -// - -def VPU_EqualOp : - VPU_LayerOp< - "Equal", - [ - VPU_TilingBuilderOpInterface, - Commutative, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - VPU_EltwiseOp - ] - > { - let summary = "Equal VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// AffineReshape -// - -def VPU_AffineReshapeOp : - VPU_LayerOp< - "AffineReshape", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods - ] - > { - let summary = "AffineReshape VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - - I64ArrayOfArraysAttr:$dim_mapping, - I64ArrayAttr:$shape_value - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; - let hasFolder = 1; -} - -// -// NotEqual -// - 
-def VPU_NotEqualOp : - VPU_LayerOp< - "NotEqual", - [ - VPU_TilingBuilderOpInterface, - Commutative, - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "NotEqual VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, - AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, - - IE_AutoBroadcastTypeAttr:$auto_broadcast, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output - ); - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input1, - "::mlir::Value":$input2, - "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Copy -// - -def VPU_CopyOp : - VPU_LayerOp< - "Copy" - > { - let summary = "Copy VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - - OptionalAttr:$out_mem_space - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let hasFolder = 1; - let hasCanonicalizer = 1; - - let checkInferredDimsOrder = 1; - let checkInferredMemSpace = 1; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ExpandDilated -// - -def VPU_ExpandDilatedOp : - VPU_LayerOp< - "ExpandDilated" - > { - let summary = "Expand tensor with uninitialized values according to dilations"; - - let arguments = (ins - AnyRankedTensor:$input, - - I64ArrayAttr:$dilations - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// StorageElementTable -// - -def VPU_StorageElementTableOp : - VPU_Op< - "StorageElementTable", - [ - 
Pure, - DeclareOpInterfaceMethods - ] - > { - let summary = "Declares a Storage Element Pointers table"; - - let description = [{ - A Storage Element represents a 1x1xN volume that contains sparse data, where N - represents the number of channels stored. The Storage Element Table is comprised - of pointers to these Storage Elements, which have the following structure: - - 31-29 28 9 8 0 - ------------------------------------------------- - | xx | DATA_PTR | BASE_PTR | - ------------------------------------------------- - - The DATA_PTR represents the offset to a Storage Element in relation to the start of - the input data. BASE_PTR is used to decide what base address is added to DATA_PTR - in order to find the location of the Storage Element in memory during inference. - - This operation represents the Storage Element Table in relation to the input data, - on top of which transformations can be applied. This operation will later get - converted to a constant, where the pointers are generated based on the information - contained in this operation. - - The following information is contained: - - dataShape, dataElemType, dataStrides: information about the input data that - is associated with this Storage Element Table - - seSize: the size of a Storage Element; seSize can be either: - * an integer, in which case every seDepth is the same size, equal to the seSize value; - * an array of integers, in which case each seDepth might have a different size; currently - the only use case is having different seSize per cluster, therefore the seSize array should - ultimately have as many values as there are clusters used for the DPU op this SETable belongs to. 
- - seDepth: the number of Storage Elements per depth - - seAttr: information on how the input data is transformed - - basePtrs: base pointers associated with each Storage Element pointer - }]; - - let arguments = (ins - I64ArrayAttr:$dataShape, - TypeAttr:$dataElemType, - I64ArrayAttr:$seSize, - IntAttr:$seDepth, - OptionalAttr:$seAttr, - OptionalAttr:$dataStrides, - OptionalAttr:$basePtrs - ); - - let results = (outs - RankedTensorOf<[I32]>:$output - ); - - let hasVerifier = 1; - - let assemblyFormat = [{ - attr-dict `->` type(results) - }]; - - let builders = [ - OpBuilder<(ins - CArg<"llvm::ArrayRef">:$dataShape, - CArg<"mlir::Type">:$dataElemType, - CArg<"llvm::ArrayRef">:$seSize, - CArg<"int64_t">:$seDepth, - CArg<"VPU::SEAttr">:$seAttr - )> - ]; - - let hasCanonicalizer = 1; -} - -// -// NonMaxSuppression -// - -def VPU_NonMaxSuppressionOp : - VPU_LayerOp< - "NonMaxSuppression" - > { - let summary = "NonMaxSuppression VPU layer"; - - let arguments = (ins - 3DTensorOf<[F16, F32]>:$in_box_coords, - 3DTensorOf<[F16, F32]>:$in_box_scores, - Optional>:$dataBuffer, - - IE_BoxEncodingTypeAttr:$box_encoding, - UnitAttr:$sort_result_descending, - - OptionalAttr:$max_output_boxes_per_class_value, - OptionalAttr:$iou_threshold_value, - OptionalAttr:$score_threshold_value, - OptionalAttr:$soft_nms_sigma_value - ); - - let results = (outs - 2DTensorOf<[SI32]>:$out_selected_indices, - 2DTensorOf<[F16, F32]>:$out_selected_scores, - 1DTensorOf<[SI32]>:$out_valid_outputs - ); -} - - -// -// StubOp -// - -def VPU_StubOp : - VPU_Op< - "Stub", - [ - Pure - ] - > { - let summary = "Substitute operation for stubbing."; - - let arguments = (ins - Variadic:$inputs - ); - - let results = (outs - Variadic:$outputs - ); - - let assemblyFormat = [{ - `(` operands `)` attr-dict `:` type(operands) `->` type(results) - }]; -} - - -// -// GRUGatesOp -// - -def VPU_GRUGatesOp : - VPU_LayerOp< - "GRUGates", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - 
DeclareOpInterfaceMethods - ] - > { - let summary = "Computes GRU activation functions"; - - let description = [{ - This operation is intended to be run as a software stage after computing and adding GRU matrix multiplications. - Formula is just for Gru with should_linear_before_reset true. - ``` - -**input_data** = (inputData * weights + bias) [batchSize, 3 * hiddenSize]** or **[1, 1, batchSize, 3 * hiddenSize]** - -**initial_hidden_state** = [batchSize, hiddenSize]** or **[1, 1, batchSize, hiddenSize]** - -**hidden_data** = (initialHiddenState * recurrenceWeights) [batchSize, 3 * hiddenSize]** or **[1, 1, batchSize, 3 * hiddenSize]** - -**biases** = Rbh, 4't bias plan if exist (optional) [1, hiddenSize]** or **[1, 1, 1, hiddenSize]** - * - Matrix multiplication - + - Element-wise add - ``` - - The meaning of other operands are identical to those in GRUCell operation. - }]; - - - let arguments = (ins - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_data, - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initial_hidden_state, - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$hidden_data, - Optional, VPU_DistributedTensor]>>:$biases, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState - ); - - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input_data, - "::mlir::Value":$initial_hidden_state, - "::mlir::Value":$hidden_data, - "::mlir::Value":$biases - )> - ]; - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - - -// -// GRUSequence -// - -def VPU_GRUSequenceOp : - VPU_LayerOp< - "GRUSequence", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "GRUSequence VPU layer"; - - let arguments = (ins - 
3DTensorOf<[F16, F32]>:$input_data, - 3DTensorOf<[F16, F32]>:$initial_hidden_state, - 3DTensorOf<[F16, F32]>:$weights, - 3DTensorOf<[F16, F32]>:$recurrence_weights, - 2DTensorOf<[F16, F32]>:$biases, - - IntAttr:$hidden_size, - IntAttr:$seq_length, - IE_RNNSequenceDirectionAttr:$direction, - UnitAttr:$should_linear_before_reset, - F64Attr:$clip - ); - - let results = (outs - 4DTensorOf<[F16, F32]>:$middle_hidden_state, - 3DTensorOf<[F16, F32]>:$output_hidden_state - ); - - let extraClassDeclaration = [{ - OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); - }] # baseExtraClassDeclaration; -} - -// -// GRUSequenceFirstPart -// - -def VPU_GRUSequenceFirstPartOp : - VPU_LayerOp< - "GRUSequenceFirstPart" - > { - let summary = "GRUSequenceFirstPart VPU layer"; - - let arguments = (ins - 3DTensorOf<[F16, F32]>:$input_data, - 3DTensorOf<[F16, F32]>:$weights, - - IntAttr:$hidden_size, - IntAttr:$seq_length, - F64Attr:$clip - ); - - let results = (outs - 4DTensorOf<[F16, F32]>:$output - ); -} - -// -// GRUSequenceLastPart -// - -def VPU_GRUSequenceLastPartOp : - VPU_LayerOp< - "GRUSequenceLastPart", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "GRUSequenceLastPart VPU layer"; - - let arguments = (ins - 4DTensorOf<[F16, F32]>:$first_part_output, - 3DTensorOf<[F16, F32]>:$initial_hidden_state, - 3DTensorOf<[F16, F32]>:$recurrence_weights, - 2DTensorOf<[F16, F32]>:$biases, - - IntAttr:$hidden_size, - IntAttr:$seq_length, - IE_RNNSequenceDirectionAttr:$direction, - UnitAttr:$should_linear_before_reset, - F64Attr:$clip - ); - - let results = (outs - 4DTensorOf<[F16, F32]>:$middle_hidden_state, - 3DTensorOf<[F16, F32]>:$output_hidden_state - ); - - let extraClassDeclaration = [{ - OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); - }] # baseExtraClassDeclaration; -} - -// -// DeformablePSROIPoolingOp -// - -def VPU_DeformablePSROIPoolingOp : - VPU_LayerOp< - "DeformablePSROIPooling" - > { - let summary = 
"DeformablePSROIPooling VPU layer"; - - let arguments = (ins - 4DTensorOf<[AnyFloat]>:$input_score_maps, - 2DTensorOf<[AnyFloat]>:$input_rois, - Optional<4DTensorOf<[AnyFloat]>>:$input_transformations, - - IntAttr:$output_dim, - F64Attr:$spatial_scale, - OptionalAttr:$group_size, - OptionalAttr:$spatial_bins_x, - OptionalAttr:$spatial_bins_y, - OptionalAttr:$trans_std, - OptionalAttr:$part_size, - OptionalAttr:$mode - ); - - let results = (outs - 4DTensorOf<[AnyFloat]>:$output - ); -} - -// -// PermuteQuantize -// - -def VPU_PermuteQuantizeOp : - VPU_LayerOp< - "PermuteQuantize", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "PermuteQuantize VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - AffineMapAttr:$dst_order, - AffineMapAttr:$mem_perm, - TypeAttr:$dstElemType, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end - ); - - let results = (outs - RankedTensorOf<[quant_QuantizedType]>:$output - ); -} - -// -// DFTOp -// - -def VPU_DFTOp : - VPU_LayerOp< - "DFT", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "InferenceEngine DFT layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// RDFTOp -// - -def VPU_RDFTOp : - VPU_LayerOp< - "RDFT" - > { - let summary = "InferenceEngine RDFT layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// IDFTOp -// - -def VPU_IDFTOp : - VPU_LayerOp< - "IDFT", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "InferenceEngine IDFT layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - - ); - - let 
results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// IRDFTOp -// - -def VPU_IRDFTOp : - VPU_LayerOp< - "IRDFT" - > { - let summary = "InferenceEngine IRDFT layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// RDFTUncutOp -// - -def VPU_RDFTUncutOp : - VPU_LayerOp< - "RDFTUncut", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "RDFTUncut VPU layer"; - - let description = [{ - Operation apply RDFT transformation but not cut symmetric part on last axis width value from axes_attr. - }]; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// IRDFTLastAxisOp -// - -def VPU_IRDFTLastAxisOp : - VPU_LayerOp< - "IRDFTLastAxis", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "IRDFTLastAxis VPU layer"; - - let description = [{ - Operation apply IRDFT transformation but just on last axis from standard IRDFT operation. - Used to produce full IRDFT capability (in combination with IDFT) without computation - and the data movement unnecessary for the direction of the last transform axis. 
- }]; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - Optional>:$twiddle_factors, - I64ArrayAttr:$axes_attr, - I64ArrayAttr:$signal_size_attr - - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// ShapeCastOp -// - -def VPU_ShapeCastOp : - VPU_LayerOp< - "ShapeCast", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "ShapeCast VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$source, - I64ArrayAttr:$shape - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$result - ); - - let extraClassDeclaration = [{ - bool isSupportedTilingDim(DimArrRef tilingDims); - bool isSupportedOutTile(const TileInfo& outTile); - }] # baseExtraClassDeclaration; - - let assemblyFormat = [{ - attr-dict - `inputs` `(` $source `:` type($source) `)` - `->` type(results) - }]; - - let hasFolder = 1; - - let hasCanonicalizer = 1; -} - -// -// LayoutCastOp -// - -def VPU_LayoutCastOp : - VPU_LayerOp< - "LayoutCast", - [ - VPU_ViewLikeOpInterface, - DeclareOpInterfaceMethods - ] - > { - let summary = "This layer overrides layout of a given tensor."; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, - AffineMapAttr:$dst_order - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let hasFolder = 1; - let hasVerifier = 1; -} - -// -// UnrolledTypeOp -// - -def VPU_UnrolledTypeOp : - VPU_Op< - "UnrolledType", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "This layer mediate between unrolled distributed tensor type and usual type"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input - ); - - let results = (outs - AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output - ); - - let 
assemblyFormat = [{ - `(` $input `:` qualified(type($input)) `)` - attr-dict - `->` qualified(type($output)) - }]; - - let hasCanonicalizer = 1; - let hasFolder = 1; -} - -// -// WorkloadCastOp -// - -def VPU_WorkloadCastOp : - VPU_Op< - "WorkloadCast", - [ - VPU_ViewLikeOpInterface - ] - > { - let summary = "Operation that casts one DistributedTensor type to another."; - - let description = [{ - This operation is required in order to support cluster tiling for VPU.NCE.PermuteQuantize. - PermuteQuantize operates on workloads split over width, while input DMA must tile over height. - For example consider the following chain of operations: - ``` - Input 1x3x16x32 -> Reshape 1x32x3x16 -> VPU.NCE.PermuteQuantize -> Reshape 1x3x16x32 - ``` - VPU.NCE.PermuteQuantize operates with 1x32x3x16 workload while original input has 1x3x16x32 shape. - Original input must be split over height, PermuteQuantize workload must be split over width: - ``` - Tile 1: Copy 1x3x8x32 -> Reshape 1x32x3x8 -> VPU.NCE.PermuteQuantize -> Reshape 1x3x8x32 - Tile 2: Copy 1x3x8x32 -> Reshape 1x32x3x8 -> VPU.NCE.PermuteQuantize -> Reshape 1x3x8x32 - ``` - However, Reshape prohibits such tiling because split axis differs in input and output. 
- VPU.WorkloadCast solves this problem because it doesn't have strict checks: - ``` - Copy 1x3x16x32, SOH -> WorkloadCast 1x32x3x8, SOW -> PermuteQuantize -> WorkloadCast 1x32x3x8, SOH - SOH = [1, 1, 2, 1] - SOW = [1, 1, 1, 2] - ``` - }]; - - let arguments = (ins - AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$input - ); - - let results = (outs - AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$output - ); - - let assemblyFormat = [{ - `(` $input `:` qualified(type($input)) `)` - attr-dict - `->` qualified(type($output)) - }]; -} - -// -// Accumulate -// - -def VPU_AccumulateOp : - VPU_LayerOp< - "Accumulate", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "Accumulate VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$lhs, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$rhs, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$lhsScale, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$rhsScale, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// ShapeOf -// - -def VPU_ShapeOfOp : - VPU_LayerOp< - "ShapeOf", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "ShapeOf VPU layer"; - - let arguments = (ins - AnyTypeOf<[AnyRankedTensor]>:$input - ); - - let results = (outs - 1DTensorOf<[SI32]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// Range -// - -def VPU_RangeOp : - 
VPU_LayerOp< - "Range", - [ - VPU_BoundsRepresentationInterface - ] - > { - let summary = "Range VPU layer"; - - let arguments = (ins - 1DTensorOf<[AnyInteger, AnyFloat]>:$start, - 1DTensorOf<[AnyInteger, AnyFloat]>:$stop, - 1DTensorOf<[AnyInteger, AnyFloat]>:$step, - TypeAttr:$dstElemType, - - DefaultValuedAttr:$bounds_representation - ); - - let results = (outs - 1DTensorOf<[AnyInteger, AnyFloat]>:$output - ); - - let hasVerifier = 1; -} - -// -// NonZero -// - -def VPU_NonZeroOp : - VPU_LayerOp< - "NonZero", - [ - VPU_BoundsRepresentationInterface - ] - > { - let summary = "NonZero VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, SI32, Bool8]>:$input, - - DefaultValuedAttr:$bounds_representation - ); - - let results = (outs - 2DTensorOf<[SI32]>:$output - ); - - let hasVerifier = 1; -} - -// -// DynamicReshape -// - -def VPU_DynamicReshapeOp : - VPU_LayerOp< - "DynamicReshape", - [ - DeclareOpInterfaceMethods, - VPU_BoundsRepresentationInterface - ] - > { - let summary = "DynamicReshape VPU layer"; - - let arguments = (ins - AnyRankedTensor:$input, - RankedTensorOf<[SI32]>:$shape, - - I64ArrayAttr:$output_shape, - I64ArrayAttr:$output_bounds, - - UnitAttr:$only_set_shape, - - DefaultValuedAttr:$bounds_representation - ); - - let results = (outs - AnyRankedTensor:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; -} - -// -// DynamicDequantizeOp -// - -def VPU_DynamicDequantizeOp : - VPU_LayerOp<"DynamicDequantize", - [ - VPU_EltwiseOp, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "InferenceEngine Dynamic Dequantize layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[quant_QuantizedType]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$scale, - Optional, VPU_DistributedTensor]>>:$zp, - - 
TypeAttr:$dstElemType, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$scale, - "::mlir::Value":$zp, - "::mlir::TypeAttr":$dstElemType - )> - ]; - - let hasVerifier = 1; -} - -// -// DeformableConvolution -// - -def VPU_DeformableConvolutionOp : - VPU_LayerOp< - "DeformableConvolution" - > { - let summary = "DeformableConvolution VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - RankedTensorOf<[F16, F32]>:$offset, - RankedTensorOf<[F16, F32]>:$kernel, - Optional>:$mask, - - I64ArrayAttr:$strides, - I64ArrayAttr:$pads_begin, - I64ArrayAttr:$pads_end, - I64ArrayAttr:$dilations, - - IntAttr:$group, - IntAttr:$deformable_group, - UnitAttr:$biliniar_interpolate_pad - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// PopulateWeightTable -// - -def VPU_PopulateWeightTableOp : - VPU_LayerOp< - "PopulateWeightTable", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - - > { - let summary = "Populate weight table VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>,VPU_DistributedTensor]>:$scale, - - IntAttr:$base, - IntAttr:$step, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[4DTensorOf<[SI32]>,VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$scale, - CArg<"int64_t">:$base, - CArg<"int64_t">:$step - )> - ]; - let elemComparisonModes = 
[IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// RMS -// - -def VPU_RMSOp : - VPU_LayerOp< - "RMS", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "RMS VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$gamma, - - F64Attr:$epsilon, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$gamma, - "::mlir::FloatAttr":$epsilon - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// Inverse -// - -def VPU_InverseOp : - VPU_LayerOp< - "Inverse" - > { - let summary = "Inverse VPU layer"; - - let arguments = (ins - RankedTensorOf<[F16, F32]>:$input, - - UnitAttr:$adjoint - ); - - let results = (outs - RankedTensorOf<[F16, F32]>:$output - ); -} - -// -// DynamicExpandOp -// - -def VPU_DynamicExpandOp : - VPU_LayerOp< - "DynamicExpand" - > { - let summary = "DynamicExpand operation"; - - let description = [{ - DynamicExpand operation is designed to take an input tensor with dynamic shapes and perform zero-padding to - match the dimensions of an upper bound. 
- This operation is specifically intended for converting a subgraph of a dynamic network into a static one - Example: - Input tensor<1x1x?x?xf16, {bounds = [1, 1, 5, 5]> - Input dynamic shape: 1x1x3x3 - 1, 1, 1 - 1, 1, 1 - 1, 1, 1 - In memory: 1, 1, 1, 1, 1, 1, 1, 1, 1 - Output tensor<1x1x5x5xf16> - 1, 1, 1, 0, 0 - 1, 1, 1, 0, 0 - 1, 1, 1, 0, 0 - 0, 0, 0, 0, 0 - 0, 0, 0, 0, 0 - In memory: 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }]; - - let arguments = (ins - AnyRankedTensor:$input - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// GenericSwLayerOp -// - -def VPU_GenericSwLayerOp : - VPU_Op< - "GenericSwLayer", - [ - Pure, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods - ] - > { - let summary = "VPU Op that describes a generic SW Layer"; - - let arguments = (ins - SymbolRefAttr:$callee, // Generated sw layer function - - Variadic:$inputs - ); - - let results = (outs - Variadic:$outputs - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - bool fitIntoCMX(::llvm::ArrayRef buffers); - }]; - - let assemblyFormat = [{ - `(` operands `)` attr-dict `:` type(operands) `->` type(results) - }]; -} - -// -// ExternalKernelOp -// - -def VPU_ExternalKernelOp : - VPU_Op< - "ExternalKernel", - [ - DeclareOpInterfaceMethods - ] - > { - let summary = "Represents a kernel whose details/implementation are defined externally"; - - let arguments = (ins - Variadic:$inputs, - DictionaryAttr:$attrDict, - StrAttr:$unique_id - ); - - let results = (outs - Variadic:$outputs - ); - - let assemblyFormat = [{ - $unique_id - (`inputs` `(` $inputs^ `:` type($inputs) `)`)? 
- `attrs` `(` $attrDict `)` - attr-dict - `->` type(results) - }]; -} - -// -// RoPE -// - -def VPU_RoPEOp : - VPU_LayerOp< - "RoPE", - [ - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "RoPE VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_cos, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_sin, - - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$input, - "::mlir::Value":$input_cos, - "::mlir::Value":$input_sin - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} - -// -// DynamicDataMaskOp -// - -def VPU_DynamicDataMaskOp : - VPU_LayerOp< - "DynamicDataMask" - > { - let summary = "Create mask to clear garbage from pad area, added by processing dynamic tensors by upper bounds"; - - let arguments = (ins - Optional<1DTensorOf<[SI64, SI32]>>:$realShape, - TypeAttr:$outputTensorType - ); - - let results = (outs - AnyRankedTensor:$output - ); -} - -// -// SDPA -// - -def VPU_SDPAOp : - VPU_LayerOp< - "SDPA", - [ - AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - ] - > { - let summary = "SDPA VPU layer"; - - let arguments = (ins - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputQ, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputK, - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputV, - Optional, VPU_DistributedTensor]>>:$inputMask, - Optional, 
VPU_DistributedTensor]>>:$inputScale, - Optional, VPU_DistributedTensor]>>:$inputBias, - Optional>:$dataStorage, - OptionalAttr:$multiClusterStrategy - ); - - let results = (outs - AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output - ); - - let extraClassDeclaration = [{ - bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); - - bool fitIntoCMX(::llvm::ArrayRef buffers); - }] # baseExtraClassDeclaration; - - let builders = [ - OpBuilder<(ins - "::mlir::Value":$inputQ, - "::mlir::Value":$inputK, - "::mlir::Value":$inputV, - "::mlir::Value":$inputMask, - "::mlir::Value":$inputScale, - "::mlir::Value":$inputBias, - "::mlir::Value":$dataStorage - )> - ]; - - let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; -} #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/activation.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/activation.td new file mode 100644 index 0000000000..6cbe032b67 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/activation.td @@ -0,0 +1,470 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_ACTIVATION +#define VPUX_COMPILER_DIALECT_VPU_OPS_ACTIVATION + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +def VPU_EluOp : + VPU_LayerOp< + "Elu", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Elu VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + F64Attr:$x + ); + + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_GeluOp : + VPU_LayerOp< + "Gelu", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Gelu VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_HardSigmoidOp : + VPU_LayerOp< + "HardSigmoid", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "HardSigmoid VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + F64Attr:$alpha_value, + F64Attr:$beta_value, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool 
fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::FloatAttr":$alpha, + "::mlir::FloatAttr":$beta + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_HSigmoidOp : + VPU_LayerOp< + "HSigmoid", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "HSigmoid VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_HSwishOp : + VPU_LayerOp< + "HSwish", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "HSwish VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_LeakyReluOp : + VPU_LayerOp< + "LeakyRelu", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "LeakyRelu VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + F64Attr:$negative_slope + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_LogSoftmaxOp : + VPU_LayerOp< + "LogSoftmax", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "LogSoftmax VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + IntAttr:$axisInd, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + 
AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::IntegerAttr":$axisInd + )> + ]; +} + +def VPU_MishOp : + VPU_LayerOp< + "Mish", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Mish VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; +} + +def VPU_PReluOp : + VPU_LayerOp< + "PRelu", + [ + DeclareOpInterfaceMethods, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "PRelu VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$negative_slope, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2 + )> + ]; + + let hasVerifier = 1; + + let elemComparisonModes = 
[IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_ReLUOp : + VPU_LayerOp< + "ReLU", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "ReLU VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_SeluOp : + VPU_LayerOp< + "Selu", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Selu VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + F64Attr:$alpha_value, + F64Attr:$lambda_value + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_SigmoidOp : + VPU_LayerOp< + "Sigmoid", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp, + DeclareOpInterfaceMethods + ] + > { + let summary = "Sigmoid VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_SoftMaxOp : + VPU_LayerOp< + "SoftMax", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "SoftMax VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + IntAttr:$axisInd, + OptionalAttr:$padSize, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let 
extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + + bool isVFSupported(); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::IntegerAttr":$axisInd, + "::mlir::IntegerAttr":$padSize + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_SoftPlusOp : + VPU_LayerOp< + "SoftPlus", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "SoftPlus VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_SwishOp : + VPU_LayerOp< + "Swish", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Swish VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + Optional, VPU_DistributedTensor]>>:$beta, + + OptionalAttr:$beta_value, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$beta, + "::mlir::FloatAttr":$beta_value + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/arithmetic.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/arithmetic.td new file mode 100644 index 0000000000..1a174ac631 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/arithmetic.td @@ -0,0 +1,742 @@ +// +// Copyright (C) 2025 
Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_ARITHMETIC +#define VPUX_COMPILER_DIALECT_VPU_OPS_ARITHMETIC + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" + + +def VPU_AbsOp : + VPU_LayerOp< + "Abs", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Abs VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_AcosOp : + VPU_LayerOp< + "Acos", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Acos VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_AcoshOp : + VPU_LayerOp< + "Acosh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Acosh VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_AsinOp : + VPU_LayerOp< + "Asin", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Asin VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let 
results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_AsinhOp : + VPU_LayerOp< + "Asinh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Asinh VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_AtanOp : + VPU_LayerOp< + "Atan", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Atan VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_AtanhOp : + VPU_LayerOp< + "Atanh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Atanh VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_CeilingOp : + VPU_LayerOp< + "Ceiling", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Ceiling VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_ClampOp : + VPU_LayerOp< + "Clamp", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Clamp VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I16, I32, SI16, SI32, F16, F32, quant_QuantizedType]>, 
VPU_DistributedTensor]>:$input, + + F64Attr:$min, + F64Attr:$max, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I16, I32, SI16, SI32, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::FloatAttr":$min, + "::mlir::FloatAttr":$max + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_CosOp : + VPU_LayerOp< + "Cos", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Cos VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + +def VPU_CoshOp : + VPU_LayerOp< + "Cosh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Cosh VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_CumSumOp : + VPU_LayerOp< + "CumSum", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "CumSum VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + OptionalAttr:$axis_value, + UnitAttr:$exclusive, + UnitAttr:$reverse, + OptionalAttr:$multiClusterStrategy 
+ ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + DimArr getTileableDims(); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::IntegerAttr":$axis_value, + "::mlir::UnitAttr":$exclusive, + "::mlir::UnitAttr":$reverse + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_ErfOp : + VPU_LayerOp< + "Erf", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Erf VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_ExpOp : + VPU_LayerOp< + "Exp", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Exp VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + +def VPU_FloorOp : + VPU_LayerOp< + "Floor", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Floor VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let 
builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_InverseOp : + VPU_LayerOp< + "Inverse" + > { + let summary = "Inverse VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + UnitAttr:$adjoint + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_LogOp : + VPU_LayerOp< + "Log", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Log VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_NegativeOp : + VPU_LayerOp< + "Negative", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Negative VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool 
fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_RoundOp : + VPU_LayerOp< + "Round", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Round VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + IE_RoundModeAttr:$mode, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "vpux::IE::RoundModeAttr":$mode + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_SignOp : + VPU_LayerOp< + "Sign", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Sign VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + +def VPU_SinOp : + VPU_LayerOp< + "Sin", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Sin VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + 
OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + +def VPU_SinhOp : + VPU_LayerOp< + "Sinh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Sinh VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_SqrtOp : + VPU_LayerOp< + "Sqrt", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Sqrt VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_TanOp : + VPU_LayerOp< + "Tan", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Tan VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + +def VPU_TanhOp : + VPU_LayerOp< + "Tanh", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Tanh VPU layer"; + + let arguments = 
(ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/base.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/base.td new file mode 100644 index 0000000000..9e2f6e42b3 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/base.td @@ -0,0 +1,53 @@ + +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_BASE +#define VPUX_COMPILER_DIALECT_VPU_OPS_BASE + +include "vpux/compiler/dialect/IE/attributes.td" +include "vpux/compiler/dialect/VPU/dialect.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +class VPU_Op traits = []> : + Op< + VPU_Dialect, + mnemonic, + traits + >; + +class VPU_LayerOp traits = []> : + VPU_Op< + mnemonic, + [ + Pure, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] # traits + > { + list elemComparisonModes = [IE_TypeComparisonMode_STRICT_EQUAL]; + bit checkInferredDimsOrder = 0; + bit checkInferredMemSpace = 0; + + code baseExtraClassDeclaration = [{ + static bool isCompatibleReturnTypes(mlir::TypeRange lhs, mlir::TypeRange rhs) { + return vpux::areTypesCompatible(lhs, rhs, + }] # !interleave(elemComparisonModes, "|") # [{, + static_cast(}] # checkInferredDimsOrder # [{), + static_cast(}] # checkInferredMemSpace # [{) + ); + } + }]; + let extraClassDeclaration = baseExtraClassDeclaration; + + let assemblyFormat = [{ + 
`(` operands `)` attr-dict `:` type(operands) `->` type(results) + }]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/bitwise.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/bitwise.td new file mode 100644 index 0000000000..baa0a73326 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/bitwise.td @@ -0,0 +1,185 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_BITWISE +#define VPUX_COMPILER_DIALECT_VPU_OPS_BITWISE + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_BitwiseAndOp : + VPU_LayerOp< + "BitwiseAnd", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "BitwiseAnd VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_BitwiseOrOp : + VPU_LayerOp< + "BitwiseOr", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + 
VPU_EltwiseOp + ] + > { + let summary = "BitwiseOr VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_BitwiseXorOp : + VPU_LayerOp< + "BitwiseXor", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "BitwiseXor VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_BitwiseNotOp : + VPU_LayerOp< + "BitwiseNot", + [ + VPU_TilingBuilderOpInterface, + 
DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "BitwiseNot VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$input1, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I8, SI8, I32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1 + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/comparison.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/comparison.td new file mode 100644 index 0000000000..818c7f60b9 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/comparison.td @@ -0,0 +1,250 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_COMPARISON +#define VPUX_COMPILER_DIALECT_VPU_OPS_COMPARISON + +include "vpux/compiler/dialect/core/types.td" +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_EqualOp : + VPU_LayerOp< + "Equal", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Equal VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_GreaterOp : + VPU_LayerOp< + "Greater", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Greater VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output + ); + + let 
builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_GreaterEqualOp : + VPU_LayerOp< + "GreaterEqual", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "GreaterEqual VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_LessOp : + VPU_LayerOp< + "Less", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Less VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output 
+ ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_LessEqualOp : + VPU_LayerOp< + "LessEqual", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "LessEqual VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, SI32]>:$input1, + RankedTensorOf<[F16, F32, SI32]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + RankedTensorOf<[Bool8]>:$output + ); +} + + +def VPU_NotEqualOp : + VPU_LayerOp< + "NotEqual", + [ + VPU_TilingBuilderOpInterface, + Commutative, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "NotEqual VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[Bool8]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/control_flow.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/control_flow.td new 
file mode 100644 index 0000000000..eb3333bf81 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/control_flow.td @@ -0,0 +1,67 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_CONTROL_FLOW +#define VPUX_COMPILER_DIALECT_VPU_OPS_CONTROL_FLOW + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Interfaces/ControlFlowInterfaces.td" + + +def VPU_LoopSelectOp : + VPU_LayerOp< + "LoopSelect" + > { + let summary = "Select a slice of the input according to collected execution conditions, helping implement Loop Op."; + + let arguments = (ins + 1DTensorOf<[Bool8, SI8]>:$initExecCond, + 1DTensorOf<[Bool8, SI8]>:$execConds, + AnyRankedTensor:$input, + + BoolAttr:$do_concat, + IntAttr:$axis, + IntAttr:$stride + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasVerifier = 1; +} + + +def VPU_YieldOp : + VPU_Op< + "Yield", + [ + HasParent<"VerticalFusionOp">, + DeclareOpInterfaceMethods, + Pure, + Terminator + ] + > { + let summary = "Terminator for wrapping operation"; + + let arguments = (ins + Variadic>:$operands + ); + + let assemblyFormat = [{ + $operands + custom(type($operands)) `` + attr-dict + }]; + + let hasVerifier = 1; +} + + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/convolution.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/convolution.td new file mode 100644 index 0000000000..058a96d7d6 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/convolution.td @@ -0,0 +1,238 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_CONVOLUTION +#define VPUX_COMPILER_DIALECT_VPU_OPS_CONVOLUTION + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" + + +def VPU_ConvolutionOp : + VPU_LayerOp< + "Convolution", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "Convolution VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$filter, + Optional>:$bias, + + I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + I64ArrayAttr:$dilations, + + OptionalAttr:$post_op + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_DeformableConvolutionOp : + VPU_LayerOp< + "DeformableConvolution" + > { + let summary = "DeformableConvolution VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$offset, + RankedTensorOf<[F16, F32]>:$kernel, + Optional>:$mask, + + I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + I64ArrayAttr:$dilations, + + IntAttr:$group, + IntAttr:$deformable_group, + UnitAttr:$biliniar_interpolate_pad + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_FullyConnectedOp: + VPU_LayerOp< + "FullyConnected" + > { + let summary = "FullyConnected VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$weights, + Optional>:$bias + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_GroupConvolutionOp : + VPU_LayerOp< + "GroupConvolution", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "GroupConvolution VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, quant_QuantizedType]>:$input, + 
RankedTensorOf<[F16, F32, quant_QuantizedType]>:$filter, + Optional>:$bias, + + I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + I64ArrayAttr:$dilations, + OptionalAttr:$groups, + + OptionalAttr:$post_op, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + RankedTensorOf<[F16, F32, quant_QuantizedType]>:$output + ); + + list elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + }] # baseExtraClassDeclaration; +} + + +def VPU_MatMulOp: + VPU_LayerOp< + "MatMul", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MatMul VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + UnitAttr:$transpose_a, + UnitAttr:$transpose_b, + OptionalAttr:$post_op, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + + static bool isSupported(vpux::IE::MatMulOp matmulOp); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "::mlir::UnitAttr":$transpose_a, + "::mlir::UnitAttr":$transpose_b, + "vpux::IE::PostOpAttr":$post_op + )> + ]; + + let hasVerifier = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_TransposedConvolutionOp: + VPU_LayerOp< + 
"TransposedConvolution", + [ + AttrSizedOperandSegments + ] + > { + let summary = "TransposedConvolution VPU layer"; + + let description = [{ + Represents a transposed convolution, which consumes an input and filter tensor + and generates an output larger than the input. + + Operands: + - `input`: the input tensor of the operation; 4D layout [N, C_IN, Y, X] + - `filter`: the convolutional kernel tensor; expected layout [C_OUT, C_IN, KY, KX] + - (optional) `output_shape`: specifies the spatial shape of the output; + expected values `[Y, X]` + + Attributes: + - `strides`: represents the distance in pixels to slide the filter on the output + tensor; expected values `[SY, SX]` + - `pads_begin`: represents the number of pixels to remove from the beginning of + each axis in the output; expected values `[PAD_TOP, PAD_LEFT]` + - `pads_end`: represents the number of pixels to remove from the end of each axis + in the output; expected values `[PAD_BOTTOM, PAD_RIGHT]` + - `dilations`: has the same definition as dilations for a regular Convolution but + applied in the backward way, for the output tensor; expected values `[DY, DX]` + - `spatial_output_padding`: adds additional amount of paddings per each spatial axis in + the output tensor; expected values `[PY; PX]` + + Results: + - `output`: the output tensor of the operation; 4D layout [N, C_OUT, Y, X] + }]; + + let arguments = (ins + AnyRankedTensor:$input, + AnyRankedTensor:$filter, + Optional<1DTensorOf<[AnyInteger]>>:$output_shape, + Optional>:$bias, + + I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + I64ArrayAttr:$dilations, + I64ArrayAttr:$spatial_output_padding, + + OptionalAttr:$post_op, + OptionalAttr:$clamp, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT]; +} + +#endif diff --git 
a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_movement.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_movement.td new file mode 100644 index 0000000000..184d6960a9 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_movement.td @@ -0,0 +1,1053 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_DATA_MOVEMENT +#define VPUX_COMPILER_DIALECT_VPU_OPS_DATA_MOVEMENT + +include "vpux/compiler/dialect/core/constraints.td" +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" + + +def VPU_BatchToSpace : + VPU_LayerOp< + "BatchToSpace" + > { + let summary = "BatchToSpace VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + + OptionalAttr:$block_shape_value, + OptionalAttr:$crops_begin_value, + OptionalAttr:$crops_end_value + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_BroadcastOp : + VPU_LayerOp< + "Broadcast" + > { + let summary = "Broadcast VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + 1DTensorOf<[AnyInteger]>:$target_shape, + Optional<1DTensorOf<[AnyInteger]>>:$axes_mapping, + + OptionalAttr:$mode + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ConcatOp : + VPU_LayerOp< + "Concat", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods + ] + > { + let summary = "VPU Concat layer"; + + let arguments = (ins + Variadic>:$inputs, + + OptionalAttr:$per_axis, + OptionalAttr:$static_offsets, + OptionalAttr:$strides, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let assemblyFormat = [{ + `(` operands `)` attr-dict `:` 
type(operands) `->` type(results) + }]; + + let builders = [ + OpBuilder< + (ins "mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis) + >, + OpBuilder< + (ins "mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis, "mlir::ArrayAttr":$static_offsets) + >, + OpBuilder< + (ins "mlir::ValueRange":$inputs, "mlir::IntegerAttr":$axis, + CArg<"mlir::IntegerAttr", "{}">:$offset, CArg<"mlir::IntegerAttr", "{}">:$stride) + >, + OpBuilder< + (ins "mlir::ValueRange":$inputs, "int64_t":$axis, CArg<"int64_t", "0">:$offset, CArg<"int64_t", "1">:$stride) + >, + OpBuilder< + (ins "mlir::ValueRange":$inputs, "vpux::Dim":$axis, CArg<"int64_t", "0">:$offset, CArg<"int64_t", "1">:$stride) + >, + + OpBuilder< + (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "mlir::ArrayAttr":$static_offsets) + >, + OpBuilder< + (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "vpux::IE::ConcatAttr":$per_axis, "mlir::ArrayAttr":$static_offsets) + >, + OpBuilder< + (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "vpux::ArrayRef":$static_offsets) + >, + OpBuilder< + (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "vpux::ArrayRef":$static_offsets) + >, + OpBuilder< + (ins "mlir::Type":$outType, "mlir::ValueRange":$inputs, "mlir::ArrayAttr":$static_offsets, "mlir::ArrayAttr":$strides) + > + ]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ::mlir::LogicalResult customVerify(); + bool fitIntoCMX(vpux::NDTypeInterface output, Byte reservedMem); + bool fitIntoCMX(vpux::NDTypeInterface output); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + + + +def VPU_CopyOp : + VPU_LayerOp< + "Copy" + > { + let summary = "Copy VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + + OptionalAttr:$out_mem_space + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, 
VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let hasFolder = 1; + let hasCanonicalizer = 1; + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ConditionalCopyOp : + VPU_LayerOp< + "ConditionalCopyOp" + > { + let summary = "Conditional copy VPU layer"; + + let arguments = (ins + 1DTensorOf<[Bool8, SI8]>:$cond, + AnyRankedTensor:$input1, + AnyRankedTensor:$input2 + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasVerifier = 1; +} + + + +def VPU_DepthToSpaceOp : + VPU_LayerOp< + "DepthToSpace", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "DepthToSpace VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + IntAttr:$block_size, + IE_DepthToSpaceModeAttr:$mode, + OptionalAttr:$padded_channels, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + + bool isVFSupported(); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::IntegerAttr":$block_size, + "vpux::IE::DepthToSpaceModeAttr":$mode, + "vpux::IE::ChannelPaddingAttr":$padded_channels + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + + let hasVerifier = 1; +} + + +def VPU_DynamicExpandOp : + VPU_LayerOp< + "DynamicExpand" + > { + let summary = "DynamicExpand operation"; + + let description = [{ + DynamicExpand operation is designed to take an input tensor with dynamic shapes and perform zero-padding to + match the dimensions of an upper bound. 
+ This operation is specifically intended for converting a subgraph of a dynamic network into a static one + Example: + Input tensor<1x1x?x?xf16, {bounds = [1, 1, 5, 5]> + Input dynamic shape: 1x1x3x3 + 1, 1, 1 + 1, 1, 1 + 1, 1, 1 + In memory: 1, 1, 1, 1, 1, 1, 1, 1, 1 + Output tensor<1x1x5x5xf16> + 1, 1, 1, 0, 0 + 1, 1, 1, 0, 0 + 1, 1, 1, 0, 0 + 0, 0, 0, 0, 0 + 0, 0, 0, 0, 0 + In memory: 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }]; + + let arguments = (ins + AnyRankedTensor:$input + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_DynamicTileOp : + VPU_LayerOp< + "DynamicTile", + [ + DeclareOpInterfaceMethods, + VPU_BoundsRepresentationInterface + ] + > { + let summary = "DynamicTile VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + RankedTensorOf<[AnyInteger]>:$target_shape, + + Optional>:$repeats, + OptionalAttr:$repeats_values, + + I64ArrayAttr:$output_shape, + I64ArrayAttr:$output_bounds, + + DefaultValuedAttr:$bounds_representation + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + + +def VPU_ExpandOp : + VPU_LayerOp< + "Expand" + > { + let summary = "Expand tensor with uninitialized values"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, + + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output + ); + + let builders = [ + OpBuilder< + (ins "mlir::Value":$input, "std::optional":$pads_begin, "std::optional":$pads_end) + > + ]; + + let hasFolder = 1; + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; +} + + +def VPU_ExpandDilatedOp : + VPU_LayerOp< + "ExpandDilated" + > { + let summary = "Expand tensor with uninitialized values according to dilations"; + + let 
arguments = (ins + AnyRankedTensor:$input, + + I64ArrayAttr:$dilations + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ExtractImagePatchesOp : + VPU_LayerOp< + "ExtractImagePatches" + > { + let summary = "InferenceEngine ExtractImagePatches layer"; + + let arguments = (ins + 4DTensorOf<[AnyType]>:$data, + + I64ArrayAttr:$sizes, + I64ArrayAttr:$strides, + I64ArrayAttr:$rates, + IE_PadTypeAttr:$autoPad + ); + + let results = (outs + 4DTensorOf<[AnyType]>:$output + ); +} + +def VPU_GatherOp : + VPU_LayerOp< + "Gather", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Gather VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, + Optional:$axis, + OptionalAttr:$axis_value, + IntAttr:$batch_dims, + + OptionalAttr:$indices_rank, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$indices, + "::mlir::Value":$axis, + "::mlir::IntegerAttr":$axis_value, + "::mlir::IntegerAttr":$batch_dims, + "::mlir::IntegerAttr":$indices_rank + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_GatherDMAOp : + VPU_LayerOp< + "GatherDMA", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "GatherDMA VPU layer which will be lowered to DMA, used for GatherOps which can be lowered to DMA"; + + let arguments = (ins + AnyRankedTensor:$input, + AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, + Optional:$axis, + 
OptionalAttr:$axis_value, + IntAttr:$batch_dims, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface indices,vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface indices,vpux::NDTypeInterface output); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_GatherNDOp : + VPU_LayerOp< + "GatherND", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "GatherND VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, + IntAttr:$batch_dims, + + OptionalAttr:$original_shape, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$indices, + "::mlir::IntegerAttr":$batch_dims + )>, + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$indices, + "::mlir::IntegerAttr":$batch_dims, + "::mlir::ArrayAttr":$original_shape + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + + let hasVerifier = 1; +} + + +def VPU_GatherElementsOp : + VPU_LayerOp< + "GatherElements", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "GatherElements VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, 
VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[AnyInteger]>, VPU_DistributedTensor]>:$indices, + IntAttr:$axis, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$indices, + "::mlir::IntegerAttr":$axis + )> + ]; +} + + +def VPU_GatherTreeOp : + VPU_LayerOp< + "GatherTree" + > { + let summary = "GatherTree VPU layer"; + + let arguments = (ins + AnyRankedTensor:$stepIds, + AnyRankedTensor:$parentIds, + AnyRankedTensor:$maxSeqLen, + AnyRankedTensor:$endToken + ); + + let results = (outs + AnyRankedTensor:$finalIds + ); +} + + +def VPU_MemPermuteOp : + VPU_LayerOp< + "MemPermute", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MemPermute VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + AffineMapAttr:$dst_order, + AffineMapAttr:$mem_perm, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::AffineMapAttr":$dst_order, + "::mlir::AffineMapAttr":$mem_perm + )> + ]; + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 0; + let hasCanonicalizer = 1; +} + + +def VPU_PadOp : + VPU_LayerOp< + "Pad", + [ + AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let 
summary = "Pad VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + Optional>:$pads_begin, + Optional>:$pads_end, + Optional>:$pad_value, + + OptionalAttr:$pads_begin_attr, + OptionalAttr:$pads_end_attr, + OptionalAttr:$pad_value_attr, + + IE_PadModeAttr:$mode, + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let assemblyFormat = [{ + `(` $input `)` (`[` $pads_begin^ `,` $pads_end (`,` $pad_value^)? `]`)? attr-dict `:` type(operands) `->` type(results) + }]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$pads_begin, + "::mlir::Value":$pads_end, + "::mlir::Value":$pad_value, + "::mlir::ArrayAttr":$pads_begin_attr, + "::mlir::ArrayAttr":$pads_end_attr, + "::mlir::FloatAttr":$pad_value_attr, + "vpux::IE::PadModeAttr":$mode, + "::mlir::ArrayAttr":$output_padding, + "::mlir::ArrayAttr":$input_padding + )>, + OpBuilder<(ins + "vpux::NDTypeInterface&":$input_type, + "::mlir::Value":$input, + "::mlir::Value":$pads_begin, + "::mlir::Value":$pads_end, + "::mlir::Value":$pad_value, + "::mlir::ArrayAttr":$pads_begin_attr, + "::mlir::ArrayAttr":$pads_end_attr, + "::mlir::FloatAttr":$pad_value_attr, + "vpux::IE::PadMode":$mode, + "::mlir::ArrayAttr":$output_padding, + "::mlir::ArrayAttr":$input_padding + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_PerAxisTileOp : + VPU_LayerOp< + "PerAxisTile" + > { + let summary = "Per axis Tile VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + + IntAttr:$axis, + IntAttr:$tiles + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + 
+def VPU_ReverseOp : + VPU_LayerOp< + "Reverse", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Reverse VPU operation"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axis_value, + IE_ReverseModeAttr:$mode, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + DimArr getTileableDims(); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axis_value, + "vpux::IE::ReverseModeAttr":$mode + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReverseSequenceOp : + VPU_LayerOp< + "ReverseSequence" + > { + let summary = "Reverse variable length sequence VPU operation"; + + let arguments = (ins + AnyRankedTensor:$data, + 1DTensorOf<[AnyInteger]>:$seq_length, + + IntAttr:$seq_axis, + IntAttr:$batch_axis + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_RollOp : + VPU_LayerOp< + "Roll", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Roll VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$data, + AnyTypeOf<[RankedTensorOf<[SI32, SI64]>, VPU_DistributedTensor]>:$shift, + AnyTypeOf<[RankedTensorOf<[SI32, SI64]>, VPU_DistributedTensor]>:$axes, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + 
OpBuilder<(ins + "::mlir::Value":$data, + "::mlir::Value":$shift, + "::mlir::Value":$axes + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ScatterElementsUpdateOp : + VPU_LayerOp< + "ScatterElementsUpdate" + > { + let summary = "ScatterElementsUpdate VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + RankedTensorOf<[AnyInteger]>:$indices, + AnyRankedTensor:$updates, + IntAttr:$axis, + IE_ScatterElementsUpdateReductionTypeAttr:$reduction, + BoolAttr:$use_init_val + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ScatterNDUpdateOp : + VPU_LayerOp< + "ScatterNDUpdate" + > { + let summary = "ScatterNDUpdate VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + RankedTensorOf<[AnyInteger]>:$indices, + AnyRankedTensor:$updates + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ScatterUpdateOp : + VPU_LayerOp< + "ScatterUpdate" + > { + let summary = "ScatterUpdate VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + RankedTensorOf<[AnyInteger]>:$indices, + AnyRankedTensor:$updates, + OptionalAttr:$axis_value + + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_SliceOp : + VPU_LayerOp< + "Slice", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "Extract single slice from ranked tensor or distributed tensor"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor]>:$input, + I64ArrayAttr:$static_offsets, + I64ArrayAttr:$static_sizes + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor, VPU_SparseTensor]>:$output + ); + + let assemblyFormat = [{ + $input $static_offsets $static_sizes + attr-dict `:` type($input) `to` type(results) + }]; + + let builders = [ + OpBuilder< + (ins "mlir::Value":$input, "vpux::ShapeRef":$static_offsets, "vpux::ShapeRef":$static_sizes) + >, + OpBuilder< + (ins "mlir::Value":$input, 
"vpux::ArrayRef":$static_offsets, "vpux::ArrayRef":$static_sizes) + > + ]; + + let hasFolder = 1; + let hasCanonicalizer = 1; + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + + let extraClassDeclaration = [{ + mlir::Value getSource() { return getInput(); } + }] # baseExtraClassDeclaration; +} + + +def VPU_SpaceToDepthOp : + VPU_LayerOp< + "SpaceToDepthOp", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "SpaceToDepthOp VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + + DefaultValuedAttr:$block_size, + IE_SpaceToDepthModeAttr:$mode + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasVerifier = 1; +} + + +def VPU_SpaceToBatch : + VPU_LayerOp< + "SpaceToBatch" + > { + let summary = "SpaceToBatch VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + + OptionalAttr:$block_shape_value, + OptionalAttr:$pads_begin_value, + OptionalAttr:$pads_end_value + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_SplitOp : + VPU_LayerOp< + "Split", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "Split VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, + Optional:$axis, + + IntAttr:$num_splits, + OptionalAttr:$axis_value + ); + + let results = (outs + Variadic>:$outputs + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; + let hasVerifier = 1; +} + + +def VPU_StridedSliceOp : + VPU_LayerOp< + "StridedSlice", + [ + AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + VPU_BoundsRepresentationInterface + ] + > { + let summary = "StridedSlice VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + Optional<1DTensorOf<[AnyInteger]>>:$begins, + Optional<1DTensorOf<[AnyInteger]>>:$ends, + Optional<1DTensorOf<[AnyInteger]>>:$strides, + + OptionalAttr:$begins_attr, + OptionalAttr:$ends_attr, + OptionalAttr:$strides_attr, + + I64ArrayAttr:$begin_mask, + I64ArrayAttr:$end_mask, + 
I64ArrayAttr:$new_axis_mask, + I64ArrayAttr:$shrink_axis_mask, + I64ArrayAttr:$ellipsis_mask, + + DefaultValuedAttr:$bounds_representation + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let extraClassDeclaration = [{ + bool isSimplified(); + }]; +} + + +def VPU_TileOp : + VPU_LayerOp< + "Tile", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "Tile VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + I64ArrayAttr:$repeats_values + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasFolder = 1; +} + + +def VPU_UpsamplingOp : + VPU_LayerOp< + "Upsampling" + > { + let summary = "Upsampling VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, quant_QuantizedType]>:$input, + I64ArrayAttr:$upsampling_factor, + OptionalAttr:$pad + ); + + let results = (outs + RankedTensorOf<[F16, F32, quant_QuantizedType]>:$output + ); +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_type.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_type.td new file mode 100644 index 0000000000..75958ed373 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/data_type.td @@ -0,0 +1,352 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_DATA_TYPE +#define VPUX_COMPILER_DIALECT_VPU_OPS_DATA_TYPE + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" +include "mlir/Interfaces/CastInterfaces.td" + + +def VPU_ConvertOp : + VPU_LayerOp< + "Convert", + [ + DeclareOpInterfaceMethods, + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Convert VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + TypeAttr:$dstElemType, + OptionalAttr:$multiClusterStrategy + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::TypeAttr":$dstElemType + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_DequantizeOp : + VPU_LayerOp<"Dequantize", + [ + DeclareOpInterfaceMethods, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Dequantize VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[quant_QuantizedType]>, VPU_DistributedTensor]>:$input, + TypeAttr:$dstElemType, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32,]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::TypeAttr":$dstElemType + )> + ]; + + let 
extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + bool isVFSupported(); + }] # baseExtraClassDeclaration; +} + + +def VPU_DynamicQuantizeOp : + VPU_LayerOp<"DynamicQuantize", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Dynamic-Quantize VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$min, + AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$max, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[UI8]>, VPU_DistributedTensor]>:$output, + AnyTypeOf<[RankedTensorOf<[F32]>, VPU_DistributedTensor]>:$scale, + AnyTypeOf<[RankedTensorOf<[UI8]>, VPU_DistributedTensor]>:$zero_point + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$min, + "::mlir::Value":$max + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + + +def VPU_SparsifyOp : + VPU_LayerOp<"Sparsify", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Sparsify VPU layer"; + + let arguments = (ins + 4DTensorOf<[quant_QuantizedType, F16, BF16]>:$input + ); + + let results = (outs + VPU_SparseTensor:$output + ); +} + + +def VPU_DesparsifyOp : + VPU_LayerOp<"Desparsify", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Desparsify VPU layer"; + + let arguments = (ins + VPU_SparseTensor:$input + ); + + let results = (outs + 4DTensorOf<[quant_QuantizedType, F16, BF16]>:$output + ); +} + + +def VPU_QuantizeOp : + VPU_LayerOp<"Quantize", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Quantize VPU layer"; + + let arguments = (ins + 
RankedTensorOf<[F16, F32]>:$input, + + TypeAttr:$dstElemType + ); + + let results = (outs + RankedTensorOf<[quant_QuantizedType]>:$output + ); +} + + + + +def VPU_DynamicDequantizeOp : + VPU_LayerOp<"DynamicDequantize", + [ + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "InferenceEngine Dynamic Dequantize layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[quant_QuantizedType]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$scale, + Optional, VPU_DistributedTensor]>>:$zp, + + TypeAttr:$dstElemType, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$scale, + "::mlir::Value":$zp, + "::mlir::TypeAttr":$dstElemType + )> + ]; + + let hasVerifier = 1; +} + + +def VPU_FakeConvertOp : + VPU_LayerOp< + "FakeConvert" + > { + let summary = "FakeConvert VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$scale, + Optional>:$shift, + + TypeAttr:$dst_type + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); + + let hasVerifier = 1; +} + + +def VPU_FakeQuantizeOp : + VPU_LayerOp< + "FakeQuantize", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "FakeQuantize VPU layer"; + + let description = [{ + The operation works in two modes: + * integral quantization: specified by the 'levels' attribute + * floating-point quantization: specified by the 'low_fp_type' attribute, [f8E4M3FN | f8E5M2] + + Only one of these attributes should be provided. 
+ }]; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_low, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_high, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output_low, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output_high, + + OptionalAttr:$levels, + OptionalAttr:$low_fp_type, + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$input_low, + "::mlir::Value":$input_high, + "::mlir::Value":$output_low, + "::mlir::Value":$output_high, + "::mlir::IntegerAttr":$levels, + "::mlir::TypeAttr":$low_fp_type, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let hasVerifier = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_QuantizeCastOp : + VPU_LayerOp< + "QuantizeCast", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Quantize Cast VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + + TypeAttr:$dstElemType + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let hasVerifier = 1; + let hasFolder = 1; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td new file mode 100644 index 0000000000..626c5b0532 --- /dev/null +++ 
b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td @@ -0,0 +1,997 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_DPU +#define VPUX_COMPILER_DIALECT_VPU_OPS_DPU + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" + + +def VPU_DPUWorkloadOp : + VPU_Op< + "DPU.Workload", + [ + ParentOneOf<[ + "vpux::VPU::NCEConvolutionOp", + "vpux::VPU::NCEMatMulOp", + "vpux::VPU::NCEDepthConvolutionOp", + "vpux::VPU::NCEMaxPoolOp", + "vpux::VPU::NCEAveragePoolOp", + "vpux::VPU::NCEEltwiseOp", + "vpux::VPU::NCECompressConvolutionOp", + "vpux::VPU::NCEInterpolateOp", + "vpux::VPU::NCEPermuteOp", + "vpux::VPU::NCEReduceOp" + ]> + ] + > { + let summary = "Workload for a single DPU tile"; + + let arguments = (ins + I64ArrayAttr:$outOffsets, + I64ArrayAttr:$outSizes, + + OptionalAttr:$inOffsets, + OptionalAttr:$inSizes, + + VPU_PaddingAttr:$pad, + VPU_MPEModeAttr:$mpe_mode, + + OptionalAttr:$cluster_id + ); + + let builders = [ + OpBuilder<(ins + "mlir::ArrayAttr":$outOffsets, + "mlir::ArrayAttr":$outSizes, + "vpux::VPU::PaddingAttr":$kernelFunction, + "vpux::VPU::MPEMode":$mpe_mode + )>, + + OpBuilder<(ins + "mlir::ArrayAttr":$outOffsets, + "mlir::ArrayAttr":$outSizes, + "vpux::VPU::PaddingAttr":$kernelFunction, + "vpux::VPU::MPEModeAttr":$mpe_mode, + "mlir::IntegerAttr":$cluster_id + )>, + + OpBuilder<(ins + "mlir::ArrayAttr":$outOffsets, + "mlir::ArrayAttr":$outSizes, + "vpux::VPU::PaddingAttr":$kernelFunction, + "vpux::VPU::MPEMode":$mpe_mode, + "mlir::IntegerAttr":$cluster_id + )> + ]; + + let assemblyFormat = [{ + ( `inOffsets` $inOffsets^ )? ( `inSizes` $inSizes^ )? 
`outOffsets` $outOffsets `outSizes` $outSizes $pad $mpe_mode attr-dict-with-keyword + }]; +} + + +def VPU_NCEConvolutionOp : + VPU_LayerOp< + "NCE.Convolution", + [ + AttrSizedOperandSegments, + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Convolution layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, + Optional, VPU_DistributedTensor]>>:$weightsTable, + Optional, VPU_DistributedTensor]>>:$weight_table_data_ptr, + Optional, VPU_DistributedTensor]>>:$weight_table_sp_ptr, + Optional, VPU_DistributedTensor]>>:$weight_table_scale, + Optional, VPU_DistributedTensor]>>:$weight_table_bias, + Optional, VPU_DistributedTensor]>>:$weight_zero_points, + + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + OptionalAttr:$mpe_engine, + + ConfinedAttr]>:$rawFilterShape, + + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` + $input `,` + $filter + (`,` $weightsTable^)? + (`,` $weight_table_data_ptr^)? + (`,` $weight_table_sp_ptr^)? + (`,` $weight_table_scale^)? + (`,` $weight_table_bias^)? + (`,` $weight_zero_points^)? 
+ `)` + attr-dict `:` type(operands) `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + + static bool isSupported(vpux::IE::ConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + + vpux::Shape inferAlignedFilterShape(vpux::NDTypeInterface input, vpux::NDTypeInterface output, vpux::NDTypeInterface filter); + + DimArr restrictedFusionAxes(); + + static mlir::LogicalResult verifyKernel(IE::ConvolutionOp origOp, Logger log = Logger::global()); + static mlir::LogicalResult verifyKernel(IE::TransposedConvolutionOp origOp, Logger log = Logger::global()); + + static mlir::LogicalResult verifyConvCMX(mlir::Location loc, mlir::ModuleOp module, vpux::NDTypeInterface inputType, + vpux::NDTypeInterface filterType, vpux::NDTypeInterface outputType, + mlir::ArrayAttr kernelStrides, Logger log = Logger::global()); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEMatMulOp : + VPU_LayerOp< + "NCE.MatMul", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + SameVariadicOperandSize, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of MatMul executed on DPU"; + + let description = [{ + NCE version of MatMul that will be executed on DPU. + + This operation supports 5D operands so that we may encode batch unrolling as part of the shape + and defer this unrolling until later in the compilation pipeline when it's more beneficial. 
+ + Instead of unrolling batch into a chain of discrete slice -> convolution -> concat operations in IE dialect, + we instead encode this as part of the shape and preserve it until later on in multiclustering passes where we + can use split-over-group (in the case we have more groups than clusters or else use another approach). + + We use this split-over-group strategy to solve the problem of having too many small workloads generated + which introduce a lot of DMA overhead. + + With 5D operands, we have the following layouts (where G = Group): + * NCHW -> GNCHW + * NHWC -> GNHWC + * OIYX -> GOIYX + + Before we would convert IE.MatMul like this with repeated chains for the number of batches we need to unroll: + + | + ---/ \---------------------- ... (repeated for number of batches) + / \ \ + | | | + IE.Slice IE.Slice ... + | | | + IE.Reshape IE.Reshape ... + | | | + IE.MatMul => IE.Convolution IE.Convolution ... + | | | + IE.Reshape IE.Reshape ... + | | | + \ / / + ---\ /---------------------- ... (repeated for number of batches) + | + Concat + + Now we keep a single chain of operations (which we later unroll in multiclustering): + + VPU.AffineReshape + | + VPU.PermuteCast + | + IE.MatMul => VPU.NCE.MatMul + | + VPU.PermuteCast + | + VPU.AffineReshape + + See E#125047 for more information. 
+ }]; + + let arguments = (ins + AnyTypeOf<[5DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[5DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$weights, + + AnyTypeOf<[5DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, + + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + OptionalAttr:$mpe_engine, + ConfinedAttr]>:$rawFilterShape, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[5DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` + $input `,` + $weights `,` + $weightsTable + `)` + attr-dict + custom(type($input), type($weights), type($weightsTable)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static bool isSupported(vpux::IE::MatMulOp origOp, vpux::LogCb logCb, + bool checkLayout = false, bool checkChannelAlignment = false); + static bool isSupported(vpux::VPU::NCEMatMulOp origOp, vpux::LogCb logCb, + bool checkLayout = false, bool checkChannelAlignment = false); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + + static mlir::LogicalResult verifyKernel(IE::MatMulOp origOp); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [ + IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, + IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, + IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT + ]; +} + + +def VPU_NCEDepthConvolutionOp : + VPU_LayerOp< + "NCE.DepthConvolution", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + 
DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Depthwise Convolution layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, + + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + + ConfinedAttr]>:$rawFilterShape, + + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` + $input `,` + $filter `,` + $weightsTable + `)` + attr-dict + custom(type($input), type($filter), type($weightsTable)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + + static bool isSupported(vpux::IE::GroupConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + + vpux::Shape inferAlignedFilterShape(vpux::NDTypeInterface output, vpux::NDTypeInterface filter); + + static mlir::LogicalResult verifyKernel(IE::GroupConvolutionOp origOp, Logger log = Logger::global()); + + static mlir::LogicalResult verifyGroupConvCMX(mlir::Location loc, mlir::ModuleOp module, + vpux::NDTypeInterface inputType, vpux::NDTypeInterface filterType, + vpux::NDTypeInterface outputType, mlir::ArrayAttr kernelStrides, + Logger log = 
Logger::global()); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCECompressConvolutionOp : + VPU_LayerOp< + "NCE.CompressConvolution", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Compressed Convolution layer"; + + let description = [{ + This operation must have 4 or less input channels, + instead of the usual multiple of 16 as for a normal Convolution op. + }]; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$filter, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$weightsTable, + + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + + ConfinedAttr]>:$rawFilterShape, + + OptionalAttr:$multiClusterStrategy, + IntAttr:$cm_sp_pattern, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input `,` $filter `,` $weightsTable `)` + attr-dict + custom(type($input), type($filter), type($weightsTable)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + + static bool 
isSupported(vpux::IE::ConvolutionOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEMaxPoolOp : + VPU_LayerOp< + "NCE.MaxPool", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of MaxPool layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + Optional, VPU_DistributedTensor]>>:$weightsTable, + + ConfinedAttr]>:$kernel_size, + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input + (`,` $weightsTable^ custom(type($weightsTable)) ``)? 
+ `)` + attr-dict + custom(type($input)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); + + static bool isSupported(vpux::IE::MaxPoolOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + + static mlir::LogicalResult verifyKernel(IE::MaxPoolOp origOp, Logger log = Logger::global()); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEAveragePoolOp : + VPU_LayerOp< + "NCE.AveragePool", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of AveragePool layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + + ConfinedAttr]>:$kernel_size, + ConfinedAttr]>:$strides, + VPU_PaddingAttr:$pad, + + VPU_PPEAttr:$ppe, + + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input, 
vpux::NDTypeInterface output); + + static bool isSupported(vpux::IE::AvgPoolOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + + static mlir::LogicalResult verifyKernel(IE::AvgPoolOp origOp, Logger log = Logger::global()); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEEltwiseOp : + VPU_LayerOp< + "NCE.Eltwise", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Eltwise layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input2, + + VPU_EltwiseTypeAttr:$op_type, + + VPU_PPEAttr:$ppe, + + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$is_inplace, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input1 `,` $input2 `)` + attr-dict + custom(type($input1), type($input2)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDTypeInterface input2, vpux::NDTypeInterface output, Byte reservedMem); + + bool fitIntoCMX(vpux::NDTypeInterface input1, vpux::NDTypeInterface input2, vpux::NDTypeInterface output); + + bool fitIntoCMX(vpux::NDTypeInterface input1, 
vpux::NDTypeInterface input2, Byte reservedMem); + + static bool isSupported(mlir::Operation* op, bool allowDifferentScales, bool allowDifferentZp, + vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + + static mlir::LogicalResult verifyKernel(IE::AddOp origOp, Logger log = Logger::global()); + static mlir::LogicalResult verifyKernel(IE::MultiplyOp origOp, Logger log = Logger::global()); + static mlir::LogicalResult verifyKernel(IE::SubtractOp origOp, Logger log = Logger::global()); + + static mlir::LogicalResult verifyEltwiseCMX(mlir::Location loc, mlir::ModuleOp module, bool isInplace, + vpux::NDTypeInterface firstInputType, + vpux::NDTypeInterface secondInputType, vpux::NDTypeInterface outputType, + Logger log = Logger::global()); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEReduceOp : + VPU_LayerOp< + "NCE.Reduce", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Reduce layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_DistributedTensor]>:$input, + I64ArrayAttr:$axes, + + VPU_PPEAttr:$ppe, + VPU_ReduceTypeAttr:$op_type, + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); + + static bool isSupported(mlir::Operation* op, + vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = 
false); + static mlir::LogicalResult verifyKernel(mlir::Operation* origOp, Logger log = Logger::global()); + + }] # baseExtraClassDeclaration; + + let assemblyFormat = [{ + `(` $input`)` + attr-dict + custom(type($input)) `` + `->` type(results) + custom($workloads) + }]; + + let regions = (region + AnyRegion:$workloads + ); + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + + let hasVerifier = 1; +} + + +def VPU_NCEPermuteOp : + VPU_LayerOp< + "NCE.Permute", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "More abstract version of combined NCE Permute and Quantization layers"; + + let description = [{ + Used to perform a datatype conversion, relayout of data and shape expansion, + all using a single NCE HW op. + + * expandedChannels - target size of output channels after expansion, usual values are 4 and 16 + * dstElemType - output tensor datatype + * dstOrder - output tensor layout, NCHW input to NHWC output relayout is supported + }]; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + + IntAttr:$expandedChannels, + TypeAttr:$dstElemType, + AffineMapAttr:$dstOrder, + VPU_PPEAttr:$ppe, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface output); + static bool 
isSupported(vpux::IE::PermuteQuantizeOp origOp, vpux::LogCb logCb, bool checkLayout = true, bool checkAlignment = true); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NCEInterpolateOp : + VPU_LayerOp< + "NCE.Interpolate", + [ + NoRegionArguments, + NoTerminator, + SingleBlock, + SameVariadicOperandSize, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NCE version of Interpolate layer"; + + let arguments = (ins + AnyTypeOf<[VPU_SparseTensor, VPU_DistributedTensor]>:$input, + Optional>:$weights, + Optional, VPU_DistributedTensor]>>:$weightsTable, + + ConfinedAttr]>:$strides, + + VPU_PPEAttr:$ppe, + ConfinedAttr]>:$rawFilterShape, + OptionalAttr:$multiClusterStrategy, + OptionalAttr:$mode + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let regions = (region + AnyRegion:$workloads + ); + + let assemblyFormat = [{ + `(` $input + (`,` $weights^ `` custom(type($weights)))? + (`,` $weightsTable^ `` custom(type($weightsTable)))? 
+ `)` + attr-dict + custom(type($input)) `` + `->` type(results) + custom($workloads) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + static bool isSupported(vpux::IE::InterpolateOp origOp, vpux::LogCb logCb, + bool checkLayout = false, bool checkChannelAlignment = false, bool checkBatch = false); + static bool isSupported(vpux::VPU::InterpolateOp origOp, vpux::LogCb logCb, + bool checkLayout = false, bool checkChannelAlignment = false, bool checkBatch = false); + + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output, Byte reservedMem); + bool fitIntoCMX(vpux::NDTypeInterface input, vpux::NDTypeInterface filter, vpux::NDTypeInterface output); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_StorageElementTableOp : + VPU_Op< + "StorageElementTable", + [ + Pure, + DeclareOpInterfaceMethods + ] + > { + let summary = "Declares a Storage Element Pointers table"; + + let description = [{ + A Storage Element represents a 1x1xN volume that contains sparse data, where N + represents the number of channels stored. The Storage Element Table is comprised + of pointers to these Storage Elements, which have the following structure: + + 31-29 28 9 8 0 + ------------------------------------------------- + | xx | DATA_PTR | BASE_PTR | + ------------------------------------------------- + + The DATA_PTR represents the offset to a Storage Element in relation to the start of + the input data. BASE_PTR is used to decide what base address is added to DATA_PTR + in order to find the location of the Storage Element in memory during inference. + + This operation represents the Storage Element Table in relation to the input data, + on top of which transformations can be applied. 
This operation will later get + converted to a constant, where the pointers are generated based on the information + contained in this operation. + + The following information is contained: + - dataShape, dataElemType, dataStrides: information about the input data that + is associated with this Storage Element Table + - seSize: the size of a Storage Element; seSize can be either: + * an integer, in which case every seDepth is the same size, equal to the seSize value; + * an array of integers, in which case each seDepth might have a different size; currently + the only use case is having different seSize per cluster, therefore the seSize array should + ultimately have as many values as there are clusters used for the DPU op this SETable belongs to. + - seDepth: the number of Storage Elements per depth + - seAttr: information on how the input data is transformed + - basePtrs: base pointers associated with each Storage Element pointer + }]; + + let arguments = (ins + I64ArrayAttr:$dataShape, + TypeAttr:$dataElemType, + I64ArrayAttr:$seSize, + IntAttr:$seDepth, + OptionalAttr:$seAttr, + OptionalAttr:$dataStrides, + OptionalAttr:$basePtrs + ); + + let results = (outs + RankedTensorOf<[I32]>:$output + ); + + let hasVerifier = 1; + + let assemblyFormat = [{ + attr-dict `->` type(results) + }]; + + let builders = [ + OpBuilder<(ins + CArg<"llvm::ArrayRef">:$dataShape, + CArg<"mlir::Type">:$dataElemType, + CArg<"llvm::ArrayRef">:$seSize, + CArg<"int64_t">:$seDepth, + CArg<"VPU::SEAttr">:$seAttr + )> + ]; + + let hasCanonicalizer = 1; +} + + +def VPU_PopulateWeightTableOp : + VPU_LayerOp< + "PopulateWeightTable", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + + > { + let summary = "Populate weight table VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>,VPU_DistributedTensor]>:$scale, + + IntAttr:$base, + IntAttr:$step, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + 
AnyTypeOf<[4DTensorOf<[SI32]>,VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$scale, + CArg<"int64_t">:$base, + CArg<"int64_t">:$step + )> + ]; + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/eltwise.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/eltwise.td new file mode 100644 index 0000000000..8410e9af6b --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/eltwise.td @@ -0,0 +1,442 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_ELTWISE +#define VPUX_COMPILER_DIALECT_VPU_OPS_ELTWISE + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + + +def VPU_AccumulateOp : + VPU_LayerOp< + "Accumulate", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Accumulate VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$lhs, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$rhs, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$lhsScale, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$rhsScale, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + 
let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_AddOp : + VPU_LayerOp< + "Add", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Add VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$post_op, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, + "vpux::IE::PostOpAttr":$post_op + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; + let hasVerifier = 1; +} + + +def VPU_DivideOp : + VPU_LayerOp< + "Divide", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Divide VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + 
"vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_FloorModOp : + VPU_LayerOp< + "FloorMod", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "FloorMod VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input1, + AnyRankedTensor:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_MinimumOp : + VPU_LayerOp< + "Minimum", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "InferenceEngine Minimum layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_MaximumOp : + VPU_LayerOp< + "Maximum", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Maximum VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input2, + 
IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ModOp : + VPU_LayerOp< + "Mod", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "Mod VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input1, + AnyRankedTensor:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_MultiplyOp : + VPU_LayerOp< + "Multiply", + [ + VPU_TilingBuilderOpInterface, + Commutative, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Multiply VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$post_op, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, + "vpux::IE::PostOpAttr":$post_op + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + 
+ let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_PowerOp : + VPU_LayerOp< + "Power", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Power VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ScaleShiftOp : + VPU_LayerOp< + "ScaleShift", + [ + VPU_TilingBuilderOpInterface, + AttrSizedOperandSegments, + VPU_EltwiseOp + ] + > { + let summary = "ScaleShift VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$weights, + Optional>:$biases + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_SquaredDifferenceOp : + VPU_LayerOp< + "SquaredDiff", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp + ] + > { + let summary = "SquaredDiff VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, SI32]>:$input1, + RankedTensorOf<[F16, F32, SI32]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + RankedTensorOf<[F16, F32, SI32]>:$output + ); +} + + +def VPU_SubtractOp : + VPU_LayerOp< + "Subtract", + [ + VPU_TilingBuilderOpInterface, 
+ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Subtract VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + OptionalAttr:$post_op, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32, UI8, UI16, UI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast, + "vpux::IE::PostOpAttr":$post_op + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/image.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/image.td new file mode 100644 index 0000000000..af541fdc7d --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/image.td @@ -0,0 +1,289 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_IMAGE +#define VPUX_COMPILER_DIALECT_VPU_OPS_IMAGE + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" + + +def VPU_DeformablePSROIPoolingOp : + VPU_LayerOp< + "DeformablePSROIPooling" + > { + let summary = "DeformablePSROIPooling VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$input_score_maps, + 2DTensorOf<[AnyFloat]>:$input_rois, + Optional<4DTensorOf<[AnyFloat]>>:$input_transformations, + + IntAttr:$output_dim, + F64Attr:$spatial_scale, + OptionalAttr:$group_size, + OptionalAttr:$spatial_bins_x, + OptionalAttr:$spatial_bins_y, + OptionalAttr:$trans_std, + OptionalAttr:$part_size, + OptionalAttr:$mode + ); + + let results = (outs + 4DTensorOf<[AnyFloat]>:$output + ); +} + + +def VPU_ExperimentalDetectronROIFeatureExtractorOp : + VPU_LayerOp< + "ExperimentalDetectronROIFeatureExtractor", [ + AttrSizedOperandSegments + ] + > { + let summary = "ExperimentalDetectronROIFeatureExtractor VPU layer"; + + let arguments = (ins + Variadic:$inputs, + Optional<1DTensorOf<[F16, F32]>>:$reorderedRois, + Optional<1DTensorOf<[UI32]>>:$originalRoiMap, + Optional<1DTensorOf<[F16, F32]>>:$outputRoisFeaturesTemp, + Optional<1DTensorOf<[UI32]>>:$levels, + + IE_ExperimentalDetectronROIFeatureExtractorAttr:$attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output, + RankedTensorOf<[F16, F32]>:$outputROIs + ); +} + + +def VPU_GridSampleOp : + VPU_LayerOp< + "GridSample", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "GridSample VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$grid, + + 
UnitAttr:$align_corners, + OptionalAttr:$mode, + OptionalAttr:$padding_mode, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$grid, + "::mlir::UnitAttr":$align_corners, + "vpux::IE::GridSampleModeAttr":$mode, + "vpux::IE::GridSamplePaddingModeAttr":$padding_mode + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_InterpolateOp : + VPU_LayerOp< + "Interpolate", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + AttrSizedOperandSegments, + DeclareOpInterfaceMethods + ] + > { + let summary = "Interpolate VPU layer"; + + let description = [{ + The `coordinates` contain byte offsets for the current `input` tensor. + The `lambdas` contain two interleaved values for each coordinate. 
+ }]; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[UI8, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$input, + Optional, VPU_DistributedTensor]>>:$sizes, + Optional, VPU_DistributedTensor]>>:$scales, + Optional, VPU_DistributedTensor]>>:$axes, + Optional, VPU_DistributedTensor]>>:$coordinates, + Optional, VPU_DistributedTensor]>>:$lambdas, + + OptionalAttr:$sizes_attr, + OptionalAttr:$scales_attr, + OptionalAttr:$axes_attr, + OptionalAttr:$tile_offset_attr, + OptionalAttr:$initial_input_dims_attr, + OptionalAttr:$initial_output_dims_attr, + OptionalAttr:$initial_input_offset_attr, + OptionalAttr:$initial_output_offset_attr, + OptionalAttr:$multiClusterStrategy, + + IE_InterpolateAttr:$attr, + OptionalAttr:$output_padding, + OptionalAttr:$input_padding + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[UI8, F16, F32, quant_QuantizedType]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$sizes, + "::mlir::Value":$scales, + "::mlir::Value":$axes, + "::mlir::Value":$coordinates, + "::mlir::Value":$lambdas, + "::mlir::ArrayAttr":$sizes_attr, + "::mlir::ArrayAttr":$scales_attr, + "::mlir::ArrayAttr":$axes_attr, + "::mlir::ArrayAttr":$tile_offset_attr, + "::mlir::ArrayAttr":$initial_input_dims_attr, + "::mlir::ArrayAttr":$initial_output_dims_attr, + "vpux::IE::InterpolateAttr":$attr, + "::mlir::ArrayAttr":$output_padding, + "::mlir::ArrayAttr":$input_padding + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_MIXED_PRECISION, IE_TypeComparisonMode_ALLOW_DIFFERENT_QUANT, + IE_TypeComparisonMode_ALLOW_GROUPED_OUTPUT, IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_PSROIPoolingOp : + VPU_LayerOp< + "PSROIPooling" + > { + let summary = "PSROIPooling VPU layer"; + + let 
arguments = (ins + 4DTensorOf<[F16, F32]>:$input, + 2DTensorOf<[F16, F32]>:$coords, + + IntAttr:$output_dim, + F64Attr:$spatial_scale, + IntAttr:$group_size, + OptionalAttr:$spatial_bins_x, + OptionalAttr:$spatial_bins_y, + OptionalAttr:$mode + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_ROIAlignOp : + VPU_LayerOp< + "ROIAlign", + [ + ResultsAreFloatLike + ] + > { + let summary = "ROIAlign VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$coords, + 1DTensorOf<[AnyInteger]>:$roisIdx, + + IntAttr:$pooled_h, + IntAttr:$pooled_w, + IntAttr:$sampling_ratio, + F64Attr:$spatial_scale, + IE_ROIAlignMethodAttr:$poolingMode, + IE_ROIAlignAlignedMethodAttr:$alignedMode + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_ROIPoolingOp : + VPU_LayerOp< + "ROIPooling" + > { + let summary = "ROIPooling VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$coords, + + I64ArrayAttr:$output_size, + F64Attr:$spatial_scale, + IE_ROIPoolingMethodAttr:$method + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_YuvToRgbOp : + VPU_LayerOp< + "YuvToRgb", + [ + DeclareOpInterfaceMethods, + AttrSizedOperandSegments + ] + > { + let summary = "InferenceEngine NV12/I420 to RGB/BGR layer"; + + let arguments = (ins + 4DTensorOf<[UI8, F16, F32]> :$input1, + Optional<4DTensorOf<[UI8, F16, F32]>>:$input2, + Optional<4DTensorOf<[UI8, F16, F32]>>:$input3, + + IE_ColorFmtAttr:$inFmt, + IE_ColorFmtAttr:$outFmt + ); + + let results = (outs + 4DTensorOf<[UI8, F16, F32]>:$output + ); +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/internal.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/internal.td new file mode 100644 index 0000000000..5caa0480f0 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/internal.td @@ -0,0 +1,284 @@ 
+// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_INTERNAL +#define VPUX_COMPILER_DIALECT_VPU_OPS_INTERNAL + +include "vpux/compiler/dialect/core/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +include "mlir/Dialect/Quant/QuantOpsBase.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/IR/SymbolInterfaces.td" + + + +def VPU_GenericSwLayerOp : + VPU_Op< + "GenericSwLayer", + [ + Pure, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "VPU Op that describes a generic SW Layer"; + + let arguments = (ins + SymbolRefAttr:$callee, // Generated sw layer function + + Variadic:$inputs + ); + + let results = (outs + Variadic:$outputs + ); + + let assemblyFormat = [{ + `(` operands `)` attr-dict `:` type(operands) `->` type(results) + }]; +} + + +def VPU_ExternalKernelOp : + VPU_Op< + "ExternalKernel", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "Represents a kernel whose details/implementation are defined externally"; + + let arguments = (ins + Variadic:$inputs, + DictionaryAttr:$attrDict, + StrAttr:$unique_id + ); + + let results = (outs + Variadic:$outputs + ); + + let assemblyFormat = [{ + $unique_id + (`inputs` `(` $inputs^ `:` type($inputs) `)`)? 
+ `attrs` `(` $attrDict `)` + attr-dict + `->` type(results) + }]; +} + + +def VPU_UnrolledTypeOp : + VPU_Op< + "UnrolledType", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "This layer mediate between unrolled distributed tensor type and usual type"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let assemblyFormat = [{ + `(` $input `:` qualified(type($input)) `)` + attr-dict + `->` qualified(type($output)) + }]; + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + + + + +def VPU_VerticalFusionOp : + VPU_Op< + "VerticalFusion", + [ + Pure, + IsolatedFromAbove, + DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"YieldOp"> + ] + > { + let summary = "Operation that encapsulates details of VF subgraph"; + + let arguments = (ins + Variadic>:$operands, + I64ArrayAttr:$tilingStrategy, + OptionalAttr:$scenario + ); + + let results = (outs + Variadic>:$results + ); + + let regions = (region SizedRegion<1>:$ops); + + let hasVerifier = 1; + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder<(ins "mlir::TypeRange":$resultTypes, "mlir::ValueRange":$operands, + "llvm::function_ref":$bodyBuilder, + "mlir::ArrayAttr":$tilingInfo)>, + ]; + + let extraClassDeclaration = [{ + using BodyBuilderFn = + llvm::function_ref; + + mlir::Operation* getFirstInnerTaskOp(); + + void print(::mlir::OpAsmPrinter& p); + static ::mlir::ParseResult parse(::mlir::OpAsmParser& parser, ::mlir::OperationState& result); + }]; +} + + +def VPU_DistributedCastOp : + VPU_Op< + "DistributedCast", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "Operation that casts one DistributedTensor type to another."; + + let description = [{ + Used to cast one DistributedTensor type to another and help with NNCMX retention + of data. 
+ + Currently following distribution mode pairs are compatible: + + DUPLICATED|SEGMENTED -> DUPLICATED ## needed for K cluster tiling + }]; + + let arguments = (ins + AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$input + ); + + let results = (outs + AnyTypeOf<[VPU_DistributedTensor, VPU_SparseTensor]>:$output + ); + + let assemblyFormat = [{ + `(` $input `:` qualified(type($input)) `)` + attr-dict + `->` qualified(type($output)) + }]; + + let hasFolder = 1; + + let hasVerifier = 1; +} + + +def VPU_GroupSparseTensorOp : + VPU_Op< + "GroupSparseTensor", + [ + Pure, + DeclareOpInterfaceMethods, + AttrSizedOperandSegments, + VPU_GroupedViewLikeOpInterface, + DeclareOpInterfaceMethods + ] + > { + let summary = "Groups sparse data and metadata into a single value"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, BF16, quant_QuantizedType]>, VPU_DistributedTensor]>:$data, + Optional>:$sparsityMap, + Optional>:$storageElementTable, + + UnitAttr:$is_weights, + OptionalAttr:$sparsity_compression, + + OptionalAttr:$seAttr + ); + + let results = (outs + AnyTypeOf<[VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder< + (ins "mlir::Value":$data, + CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) + >, + OpBuilder< + (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, + CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) + >, + OpBuilder< + (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, "mlir::Value":$storageElementTable, + CArg<"bool", "{}">:$is_weights, CArg<"VPU::SparsityCompressionAttr", "{}">:$sparsity_compression) + >, + OpBuilder< + (ins "mlir::Value":$data, "mlir::Value":$sparsityMap, "mlir::Value":$storageElementTable, + CArg<"VPU::SEAttr", "{}">:$seAttr) + > + ]; + + let assemblyFormat = [{ + `(` $data + (`,` $sparsityMap^ `` custom(type($sparsityMap)))? 
+ (`,` $storageElementTable^ `` custom(type($storageElementTable)))? + `)` + attr-dict + `` custom(type($data)) + `->` type(results) + }]; + + let hasCanonicalizer = 1; +} + + + +def VPU_UngroupSparseTensorOp : + VPU_Op< + "UngroupSparseTensor", + [ + Pure, + AttrSizedResultSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "Ungroups sparse data and metadata into multiple values"; + + let arguments = (ins + VPU_SparseTensor:$input + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$data, + Optional>:$sparsityMap, + Optional>:$storageElementTable + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + `` custom(type($input)) + `->` type(results) + }]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/logical.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/logical.td new file mode 100644 index 0000000000..b98f369510 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/logical.td @@ -0,0 +1,142 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_LOGICAL +#define VPUX_COMPILER_DIALECT_VPU_OPS_LOGICAL + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +def VPU_AndOp : + VPU_LayerOp< + "And", + [ + VPU_TilingBuilderOpInterface, + Commutative, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "And VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_LogicalNotOp : + VPU_LayerOp< + "LogicalNot", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + > { + let summary = "Logical Not VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$input, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[I8, F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, 
Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_LogicalOrOp : + VPU_LayerOp< + "LogicalOr", + [ + VPU_TilingBuilderOpInterface, + Commutative, + VPU_EltwiseOp + ] + > { + let summary = "LogicalOr VPU layer"; + + let arguments = (ins + RankedTensorOf<[I8, F16, F32, SI32]>:$input1, + RankedTensorOf<[I8, F16, F32, SI32]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + RankedTensorOf<[I8, F16, F32, SI32]>:$output + ); +} + + +def VPU_LogicalXorOp : + VPU_LayerOp< + "LogicalXor", + [ + VPU_TilingBuilderOpInterface, + Commutative, + VPU_EltwiseOp + ] + > { + let summary = "LogicalXor VPU layer"; + + let arguments = (ins + RankedTensorOf<[I8, F16, F32, SI32]>:$input1, + RankedTensorOf<[I8, F16, F32, SI32]>:$input2, + + IE_AutoBroadcastTypeAttr:$auto_broadcast + ); + + let results = (outs + RankedTensorOf<[I8, F16, F32, SI32]>:$output + ); +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/m2i.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/m2i.td new file mode 100644 index 0000000000..52466032f1 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/m2i.td @@ -0,0 +1,152 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_M2I +#define VPUX_COMPILER_DIALECT_VPU_OPS_M2I + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + +def VPU_M2IColorConvertOp : + VPU_LayerOp< + "M2I.ColorConvert" + > { + let summary = "M2I version for color-convert operations"; + + let arguments = (ins + 4DTensorOf<[UI8, F16]>:$input, + IE_ColorFmtAttr:$inFmt, + IE_ColorFmtAttr:$outFmt + ); + + let results = (outs + 4DTensorOf<[UI8, F16]>:$output + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + }]; + + let extraClassDeclaration = [{ + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); + static bool isSupported(vpux::IE::YuvToRgbOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + }] # baseExtraClassDeclaration; +} + + +def VPU_M2IResizeOp : + VPU_LayerOp< + "M2I.Resize" + > { + let summary = "M2I version for resize operations"; + + let arguments = (ins + 4DTensorOf<[UI8, F16]>:$input, + + I64ArrayAttr:$sizes, + I64ArrayAttr:$axes, + VPU_M2iInterpAttr:$interp + ); + + let results = (outs + 4DTensorOf<[UI8, F16]>:$output + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + }]; + + let extraClassDeclaration = [{ + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); + static bool isSupported(vpux::IE::InterpolateOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool 
checkChannelAlignment = false); + }] # baseExtraClassDeclaration; +} + + +def VPU_M2INormOp : + VPU_LayerOp< + "M2I.Norm" + > { + let summary = "M2I version for BatchNormInference"; + + let arguments = (ins + 4DTensorOf<[F16]>:$input, + + F64ArrayAttr:$gamma_value, + F64ArrayAttr:$beta_value, + F64ArrayAttr:$mean_value, + F64ArrayAttr:$variance_value, + + F64Attr:$eps + ); + + let results = (outs + 4DTensorOf<[F16]>:$output + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + }]; + + let extraClassDeclaration = [{ + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output, Byte reservedMem); + static bool fitIntoCMX(mlir::Operation* op, vpux::NDTypeInterface input, vpux::NDTypeInterface output); + static bool isSupported(vpux::IE::BatchNormInferenceOp origOp, vpux::LogCb logCb, bool checkLayout = false, + bool checkChannelAlignment = false); + }] # baseExtraClassDeclaration; +} + + +def VPU_M2ITaskOp : + VPU_LayerOp< + "M2I.Task" + > { + let summary = "M2I full task op"; + + let arguments = (ins + 4DTensorOf<[UI8, F16]>:$input, + + BoolAttr:$do_csc, + BoolAttr:$do_norm, + VPU_M2iColorFmtAttr:$inFmt, + VPU_M2iColorFmtAttr:$outFmt, + UnitAttr:$chroma_in_reverse_channels, + UnitAttr:$chroma_out_reverse_channels, + UnitAttr:$luma_in_reverse_channels, + UnitAttr:$luma_out_reverse_channels, + OptionalAttr:$sizes, + OptionalAttr:$axes, + OptionalAttr:$norm, + DefaultValuedAttr:$interp + ); + + let results = (outs + 4DTensorOf<[UI8, F16]>:$output + ); + + let assemblyFormat = [{ + `(` $input `)` + attr-dict + custom(type($input)) `` + `->` type(results) + }]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/normalization.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/normalization.td new file mode 100644 index 0000000000..b994b28fa2 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/normalization.td @@ 
-0,0 +1,493 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_NORMALIZATION +#define VPUX_COMPILER_DIALECT_VPU_OPS_NORMALIZATION + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + + +def VPU_GRNOp : + VPU_LayerOp< + "GRN" + > { + let summary = "GRN VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + F64Attr:$bias + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_GroupNormalizationOp : + VPU_LayerOp< + "GroupNormalization", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "GroupNormalization VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$scale, + RankedTensorOf<[F16, F32]>:$bias, + + I32Attr:$num_groups, + F32Attr: $epsilon + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_LRNOp : + VPU_LayerOp< + "LRN" + > { + let summary = "LRN VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + I64ArrayAttr:$axes, + + F64Attr:$alpha, + F64Attr:$beta, + F64Attr:$bias, + IntAttr:$size + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_LRN_IEOp : + VPU_LayerOp< + "LRN_IE" + > { + let summary = "LRN_IE VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + F64Attr:$alpha, + F64Attr:$beta, + F64Attr:$bias, + IntAttr:$size, + IE_LRN_IERegionAttr:$region + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_MVNOp : + VPU_LayerOp< + "MVN", + [ + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + VPU_EltwiseOp + ] + + > { + let summary = "MVN1 VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, 
VPU_DistributedTensor]>:$input, + BoolAttr:$across_channels, + BoolAttr:$normalize_variance, + F64Attr:$eps, + OptionalAttr:$internal_reshape, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + + DimArr getNonNormDims(); + }] # baseExtraClassDeclaration; + + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::BoolAttr":$across_channels, + "::mlir::BoolAttr":$normalize_variance, + "::mlir::FloatAttr":$eps + )>, + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::BoolAttr":$across_channels, + "::mlir::BoolAttr":$normalize_variance, + "::mlir::FloatAttr":$eps, + "::mlir::ArrayAttr":$internal_reshape + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_MVN1SumOp : + VPU_LayerOp< + "MVN1SumOp", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MVN1SumOp VPU layer (step 1/3 in MVN1 decomposition)"; + + let description = [{ + Overview: + + Large MVN1 tensors that cannot be tiled, are decomposed into 3 tileable sub-ops: + 1. **MVN1SumOp** : computes partial sums on input tiles + 2. **MVN1MeanVarOp** : sum-reduces concatenated partial sums from previous step and computes _mean_, _variance_ + 3. 
**MVN1Normalize**: applies normalization on input tiles + + Details: + - **input** - tile of original _MVN1Op_ input tensor + - **sum** - output tensor of shape **[N, C, H, W]**, with (0,1,2,3)->(0,2,3,1) layout (irrespective of input layout) + - N = input N + - C = input C if _across_channels_ = false, or 1 if _across_channels_ = true + - H = number of clusters (output_height) + - W = 2 if _normalize_variance_ = true (compute _sum_ and _sumOfSquares_ terms), else 1 (compute just _sum_ term) + }]; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + + BoolAttr:$across_channels, + BoolAttr:$normalize_variance, + IntAttr:$output_height, + OptionalAttr:$multiClusterStrategy + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + static bool buffsFitIntoCMX(mlir::ModuleOp module, vpux::NDTypeInterface in, vpux::NDTypeInterface out); + }] # baseExtraClassDeclaration; + + let results = (outs + AnyTypeOf<[4DTensorOf<[F32]>, VPU_DistributedTensor]>:$sum + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "bool":$across_channels, + "bool":$normalize_variance, + "int64_t":$output_height + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_MVN1MeanVarOp : + VPU_LayerOp< + "MVN1MeanVar", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MVN1MeanVar VPU layer (step 2/3 in MVN1 decomposition)"; + + let description = [{ + Background: see _MVN1SumOp_ description + + Details: + Accumulates partial _sum_ (and optionally _sumOfSquares_) of concatenated input and computes _mean_ (and optionally _1/variance_ ) required for normalization. 
+ + - **sum** - input is a concatenation (over W) of _MVN1SumOp_ outputs, shape = [N,C,W x num_parts] + - W = 2 if _normalize_variance_ = true (input _sum_ and _sumOfSquares_), else 1 (input _sum_) + - **meanVar** - output shape = [N,C,W], where + - W = 2 if _normalize_variance_ = true (output _mean_ and _1/variance_ terms), else 1 (output just _mean_) + }]; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F32]>, VPU_DistributedTensor]>:$sum, + + I64ArrayAttr:$orig_shape, + BoolAttr:$across_channels, + BoolAttr:$normalize_variance, + F64Attr:$eps, + TypeAttr:$output_type, + OptionalAttr:$internal_reshape, + OptionalAttr:$multiClusterStrategy + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$meanVar + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$sum, + "::mlir::ArrayAttr":$orig_shape, + "bool":$across_channels, + "bool":$normalize_variance, + "::mlir::APFloat":$eps, + "::mlir::Type":$output_type + )>, + OpBuilder<(ins + "::mlir::Value":$sum, + "::mlir::ArrayAttr":$orig_shape, + "bool":$across_channels, + "bool":$normalize_variance, + "::mlir::APFloat":$eps, + "::mlir::Type":$output_type, + "::mlir::ArrayAttr":$internal_reshape + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_MVN1NormalizeOp : + VPU_LayerOp< + "MVN1Normalize", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MVN1Normalize VPU layer (step 3/3 in MVN1 decomposition)"; + + let description = [{ + Background: see _MVN1SumOp_ description + + Applies normalization on a tile of input tensor. 
+ + Details: + - **input** - input tile of original _MVN1Op_ input tensor + - **meanVar** - this input is the output of _MVN1MeanVarOp_ + - **output** - output tile of final result + }]; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$meanVar, + + BoolAttr:$across_channels, + BoolAttr:$normalize_variance, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$meanVar, + "::mlir::BoolAttr":$across_channels, + "::mlir::BoolAttr":$normalize_variance + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_MVN6Op : + VPU_LayerOp< + "MVN6", + [ + AttrSizedOperandSegments, + VPU_EltwiseOp, + VPU_TilingBuilderOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "MVN6 VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + Optional>:$scale, + Optional>:$bias, + I64ArrayAttr:$axes, + BoolAttr:$normalize_variance, + F64Attr:$eps, + IE_MvnEpsModeAttr:$eps_mode, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + DimArr getNonNormDims(); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes, + "::mlir::BoolAttr":$normalize_variance, + "::mlir::FloatAttr":$eps, + "vpux::IE::MvnEpsModeAttr":$eps_mode + )>, 
+ OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$scale, + "::mlir::Value":$bias, + "::mlir::ArrayAttr":$axes, + "::mlir::BoolAttr":$normalize_variance, + "::mlir::FloatAttr":$eps, + "vpux::IE::MvnEpsModeAttr":$eps_mode + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NormalizeL2Op : + VPU_LayerOp< + "NormalizeL2", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "NormalizeL2 VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$data, + ArrayAttr:$axes_value, + + F64Attr:$eps, + IE_EpsModeAttr:$eps_mode, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let hasVerifier = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NormalizeIEOp : + VPU_LayerOp< + "NormalizeIE" + > { + let summary = "NormalizeIE VPU layer"; + + let arguments = (ins + AnyRankedTensor:$data, + AnyRankedTensor:$weights, + + F64Attr:$eps, + BoolAttr:$across_spatial, + BoolAttr:$channel_shared + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_RMSOp : + VPU_LayerOp< + "RMS", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "RMS VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$gamma, + + F64Attr:$epsilon, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + 
bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$gamma, + "::mlir::FloatAttr":$epsilon + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/pooling.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/pooling.td new file mode 100644 index 0000000000..f95f84966e --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/pooling.td @@ -0,0 +1,130 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_POOLING +#define VPUX_COMPILER_DIALECT_VPU_OPS_POOLING + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_AdaptiveAvgPoolOp : + VPU_LayerOp< + "AdaptiveAvgPool" + > { + let summary = "AdaptiveAvgPool VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + 1DTensorOf<[SI32, SI64]>:$pooled_spatial_shape + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_AdaptiveMaxPoolOp : + VPU_LayerOp< + "AdaptiveMaxPool" + > { + let summary = "AdaptiveMaxPool VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + 1DTensorOf<[SI32, SI64]>:$pooled_spatial_shape, + TypeAttr:$index_element_type + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output, + RankedTensorOf<[SI32, SI64]>:$output_index + ); +} + + +def VPU_AvgPoolOp : + VPU_LayerOp< + "AvgPool", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "AvgPool VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, + + I64ArrayAttr:$kernel_size, + 
I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + IE_RoundingTypeAttr:$rounding_type, + UnitAttr:$exclude_pads + ); + + let results = (outs + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output + ); +} + + +def VPU_MaxPoolOp : + VPU_LayerOp< + "MaxPool", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "MaxPool VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, + + I64ArrayAttr:$kernel_size, + I64ArrayAttr:$strides, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + IE_RoundingTypeAttr:$rounding_type, + + OptionalAttr:$post_op + ); + + let results = (outs + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output + ); +} + + +def VPU_MaxPool8Op : + VPU_LayerOp< + "MaxPool8" + > { + let summary = "MaxPool8 VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$input, + + I64ArrayAttr:$kernel_size, + I64ArrayAttr:$strides, + I64ArrayAttr:$dilations, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end, + IE_RoundingTypeAttr:$rounding_type, + TypeAttr:$index_element_type, + + IntAttr:$axis + ); + + let results = (outs + RankedTensorOf<[F16, F32, SI32, SI8, UI8]>:$output, + RankedTensorOf<[SI32, SI64]>:$output_index + ); +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/recurrent.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/recurrent.td new file mode 100644 index 0000000000..7bdf85a9c6 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/recurrent.td @@ -0,0 +1,352 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_RECURRENT +#define VPUX_COMPILER_DIALECT_VPU_OPS_RECURRENT + +include "vpux/compiler/dialect/core/attributes.td" +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + + +def VPU_CTCGreedyDecoderOp : + VPU_LayerOp< + "CTCGreedyDecoder" + > { + let summary = "CTCGreedyDecoder VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[F16, F32]>:$sequenceLengths, + + UnitAttr:$mergeRepeated + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_CTCGreedyDecoderSeqLenOp : + VPU_LayerOp< + "CTCGreedyDecoderSeqLen" + > { + let summary = "CTCGreedyDecoderSeqLen VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + RankedTensorOf<[SI32]>:$sequenceLength, + Optional>:$blankIndex, + + UnitAttr:$mergeRepeated + ); + + let results = (outs + RankedTensorOf<[SI32]>:$output, + RankedTensorOf<[SI32]>:$outputLength + ); +} + + +def VPU_GRUGatesOp : + VPU_LayerOp< + "GRUGates", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Computes GRU activation functions"; + + let description = [{ + This operation is intended to be run as a software stage after computing and adding GRU matrix multiplications. + Formula is just for Gru with should_linear_before_reset true. 
+ ``` + -**input_data** = (inputData * weights + bias) [batchSize, 3 * hiddenSize]** or **[1, 1, batchSize, 3 * hiddenSize]** + -**initial_hidden_state** = [batchSize, hiddenSize]** or **[1, 1, batchSize, hiddenSize]** + -**hidden_data** = (initialHiddenState * recurrenceWeights) [batchSize, 3 * hiddenSize]** or **[1, 1, batchSize, 3 * hiddenSize]** + -**biases** = Rbh, the 4th bias term, if it exists (optional) [1, hiddenSize]** or **[1, 1, 1, hiddenSize]** + * - Matrix multiplication + + - Element-wise add + ``` + - The meanings of the other operands are identical to those in the GRUCell operation. + }]; + + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_data, + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initial_hidden_state, + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$hidden_data, + Optional, VPU_DistributedTensor]>>:$biases, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState + ); + + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input_data, + "::mlir::Value":$initial_hidden_state, + "::mlir::Value":$hidden_data, + "::mlir::Value":$biases + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_GRUSequenceOp : + VPU_LayerOp< + "GRUSequence", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "GRUSequence VPU layer"; + + let arguments = (ins + 3DTensorOf<[F16, F32]>:$input_data, + 3DTensorOf<[F16, F32]>:$initial_hidden_state, + 3DTensorOf<[F16, F32]>:$weights, + 3DTensorOf<[F16, F32]>:$recurrence_weights, + 2DTensorOf<[F16, F32]>:$biases, + + IntAttr:$hidden_size, + IntAttr:$seq_length, + IE_RNNSequenceDirectionAttr:$direction, + UnitAttr:$should_linear_before_reset, + 
F64Attr:$clip + ); + + let results = (outs + 4DTensorOf<[F16, F32]>:$middle_hidden_state, + 3DTensorOf<[F16, F32]>:$output_hidden_state + ); + + let extraClassDeclaration = [{ + OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); + }] # baseExtraClassDeclaration; +} + + +def VPU_GRUSequenceFirstPartOp : + VPU_LayerOp< + "GRUSequenceFirstPart" + > { + let summary = "GRUSequenceFirstPart VPU layer"; + + let arguments = (ins + 3DTensorOf<[F16, F32]>:$input_data, + 3DTensorOf<[F16, F32]>:$weights, + + IntAttr:$hidden_size, + IntAttr:$seq_length, + F64Attr:$clip + ); + + let results = (outs + 4DTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_GRUSequenceLastPartOp : + VPU_LayerOp< + "GRUSequenceLastPart", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "GRUSequenceLastPart VPU layer"; + + let arguments = (ins + 4DTensorOf<[F16, F32]>:$first_part_output, + 3DTensorOf<[F16, F32]>:$initial_hidden_state, + 3DTensorOf<[F16, F32]>:$recurrence_weights, + 2DTensorOf<[F16, F32]>:$biases, + + IntAttr:$hidden_size, + IntAttr:$seq_length, + IE_RNNSequenceDirectionAttr:$direction, + UnitAttr:$should_linear_before_reset, + F64Attr:$clip + ); + + let results = (outs + 4DTensorOf<[F16, F32]>:$middle_hidden_state, + 3DTensorOf<[F16, F32]>:$output_hidden_state + ); + + let extraClassDeclaration = [{ + OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); + }] # baseExtraClassDeclaration; +} + + +def VPU_LSTMCellOp : + VPU_LayerOp< + "LSTMCell" + > { + let summary = "LSTMCell VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$inputData, + RankedTensorOf<[F16, F32]>:$initialHiddenState, + RankedTensorOf<[F16, F32]>:$initialCellState, + RankedTensorOf<[F16, F32]>:$weights, + RankedTensorOf<[F16, F32]>:$recurrenceWeights, + RankedTensorOf<[F16, F32]>:$biases, + + IntAttr:$hiddenSize + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$outputHiddenState, + RankedTensorOf<[F16, F32]>:$outputCellState + 
); + + let extraClassDeclaration = [{ + static bool isSupported(vpux::IE::LSTMCellOp op); + }] # baseExtraClassDeclaration; + + +} + + +def VPU_LSTMGatesOp : + VPU_LayerOp< + "LSTMGates", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "Computes LSTM activation functions"; + + let description = [{ + This operation is intended to be run as a software stage after computing and adding LSTM matrix multiplications. + + - **gatesInput** - tensor of shape **[batchSize, 4 * hiddenSize]** or **[1, 1, batchSize, 4 * hiddenSize]**. Formula: + ``` + gatesInput = (inputData * weights) + (initialHiddenState * recurrenceWeights) + biases + * - Matrix multiplication + + - Element-wise add + ``` + - The meaning of other operands are identical to those in LSTMCell operation. + }]; + + let arguments = (ins + AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$gatesInput, + AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialCellState, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState, + AnyTypeOf<[2DTensorOf<[F16, F32]>, 4DTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputCellState + ); + + let hasVerifier = 1; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$gatesInput, + "::mlir::Value":$initialCellState + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_LSTMSequenceOp : + VPU_LayerOp< + "LSTMSequence", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "LSTMSequence VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, 
VPU_DistributedTensor]>:$inputData, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialHiddenState, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$initialCellState, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$reccurenceWeights, + Optional, VPU_DistributedTensor]>>:$biases, + AnyTypeOf<[RankedTensorOf<[SI32]>, VPU_DistributedTensor]>:$syncBuffer, + + OptionalAttr:$sequenceLength, + IE_RNNSequenceDirectionAttr:$direction, + OptionalAttr:$useDpu, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenValues, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputHiddenState, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$outputCellState + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$inputData, + "::mlir::Value":$initialHiddenState, + "::mlir::Value":$initialCellState, + "::mlir::Value":$reccurenceWeights, + "::mlir::Value":$biases, + "::mlir::IntegerAttr":$sequenceLength, + "vpux::IE::RNNSequenceDirectionAttr":$direction, + "::mlir::BoolAttr":$useDpu, + "vpux::VPU::MultiClusterStrategyAttr":$multiClusterStrategy + )>, + OpBuilder<(ins + "::mlir::Value":$inputData, + "::mlir::Value":$initialHiddenState, + "::mlir::Value":$initialCellState, + "::mlir::Value":$reccurenceWeights, + "::mlir::Value":$biases, + "::mlir::IntegerAttr":$sequenceLength, + "vpux::IE::RNNSequenceDirectionAttr":$direction, + "vpux::VPU::MultiClusterStrategyAttr":$multiClusterStrategy + )> + ]; + + let extraClassDeclaration = [{ + static bool isSupported(vpux::IE::LSTMSequenceOp origOp, bool useDpu=false); + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git 
a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/reduce.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/reduce.td new file mode 100644 index 0000000000..ffde82a605 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/reduce.td @@ -0,0 +1,374 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_REDUCE +#define VPUX_COMPILER_DIALECT_VPU_OPS_REDUCE + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_ReduceL1Op : + VPU_LayerOp< + "ReduceL1", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceL1 VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceL2Op : + VPU_LayerOp< + "ReduceL2", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceL2 VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, 
VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceLogicalAndOp : + VPU_LayerOp< + "ReduceLogicalAnd", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceLogicalAnd VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceLogicalOrOp : + VPU_LayerOp< + "ReduceLogicalOr", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceLogicalOr VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef 
buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceMaxOp : + VPU_LayerOp< + "ReduceMax", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceMax VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceMeanOp : + VPU_LayerOp< + "ReduceMean", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceMean VPU Layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + 
+ let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceMinOp : + VPU_LayerOp< + "ReduceMin", + [ + DeclareOpInterfaceMethods, + ] + > { + let summary = "ReduceMin VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasFolder = 1; +} + + +def VPU_ReduceProdOp : + VPU_LayerOp< + "ReduceProd", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceProd VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_ReduceSumOp : + VPU_LayerOp< + "ReduceSum", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ReduceSum VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + I64ArrayAttr:$axes_value, + UnitAttr:$keep_dims, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + 
"::mlir::Value":$input, + "::mlir::ArrayAttr":$axes_value, + "::mlir::UnitAttr":$keep_dims + )> + ]; + + let hasFolder = 1; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/shape_manipulation.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/shape_manipulation.td new file mode 100644 index 0000000000..798ad48c3f --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/shape_manipulation.td @@ -0,0 +1,217 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_SHAPE_MANIPULATION +#define VPUX_COMPILER_DIALECT_VPU_OPS_SHAPE_MANIPULATION + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_AffineReshapeOp : + VPU_LayerOp< + "AffineReshape", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods + ] + > { + let summary = "AffineReshape VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + + I64ArrayOfArraysAttr:$dim_mapping, + I64ArrayAttr:$shape_value + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; + let hasFolder = 1; +} + + +def VPU_DynamicReshapeOp : + VPU_LayerOp< + "DynamicReshape", + [ + DeclareOpInterfaceMethods, + VPU_BoundsRepresentationInterface + ] + > { + let summary = "DynamicReshape VPU layer"; + + let arguments = (ins + AnyRankedTensor:$input, + RankedTensorOf<[SI32]>:$shape, + + I64ArrayAttr:$output_shape, + I64ArrayAttr:$output_bounds, + + UnitAttr:$only_set_shape, + + DefaultValuedAttr:$bounds_representation + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let 
extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + + +def VPU_ReshapeOp : + VPU_LayerOp< + "Reshape", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "Reshape VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, + Optional>:$shape, + + UnitAttr:$special_zero, + OptionalAttr:$shape_value + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; + + let hasFolder = 1; + let hasCanonicalizer = 1; +} + + +def VPU_SqueezeOp : + VPU_LayerOp< + "Squeeze", + [ + VPU_ViewLikeOpInterface + ] + > { + let summary = "Squeeze VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, + Optional>:$axes, + + OptionalAttr:$axes_value + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; +} + + +def VPU_UnsqueezeOp : + VPU_LayerOp< + "Unsqueeze", + [ + VPU_ViewLikeOpInterface + ] + > { + + let summary = "Unsqueeze VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$input, + Optional>:$axes, + + OptionalAttr:$axes_value + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor]>:$output + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; +} + + +def VPU_ShapeCastOp : + VPU_LayerOp< + "ShapeCast", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "ShapeCast VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + I64ArrayAttr:$shape + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + 
bool isSupportedTilingDim(DimArrRef tilingDims); + bool isSupportedOutTile(const TileInfo& outTile); + // Note: to be compatible with existing code + mlir::Value getSource() { return getInput(); } + }] # baseExtraClassDeclaration; + + let assemblyFormat = [{ + attr-dict + `inputs` `(` $input `:` type($input) `)` + `->` type(results) + }]; + + let hasFolder = 1; + + let hasCanonicalizer = 1; +} + + +def VPU_ShapeOfOp : + VPU_LayerOp< + "ShapeOf", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "ShapeOf VPU layer"; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor]>:$input + ); + + let results = (outs + 1DTensorOf<[SI32]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/specialized.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/specialized.td new file mode 100644 index 0000000000..34d6b9c071 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/specialized.td @@ -0,0 +1,967 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_VPU_OPS_SPECIALIZED +#define VPUX_COMPILER_DIALECT_VPU_OPS_SPECIALIZED + +include "vpux/compiler/dialect/VPU/ops/base.td" +include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/VPU/ops_interfaces.td" +include "vpux/compiler/dialect/VPU/types.td" + + +def VPU_BucketizeOp : + VPU_LayerOp< + "Bucketize" + > { + let summary = "Bucketize VPU layer"; + + let arguments = (ins + AnyRankedTensor:$data, + 1DTensorOf<[AnyInteger, AnyFloat]>:$buckets, + + TypeAttr:$output_type, + UnitAttr:$with_right_bound + ); + + let results = (outs + RankedTensorOf<[SI32, SI64]>:$output + ); + + let hasVerifier = 1; +} + + +def VPU_DetectionOutputOp : + VPU_LayerOp< + "DetectionOutput", + [ + AttrSizedOperandSegments + ] + > { + let summary = "DetectionOutput VPU layer"; + + let arguments = (ins + 2DTensorOf<[AnyFloat]>:$in_box_logits, + 2DTensorOf<[AnyFloat]>:$in_class_preds, + 3DTensorOf<[AnyFloat]>:$in_proposals, + Optional<2DTensorOf<[AnyFloat]>>:$in_additional_preds, + Optional<2DTensorOf<[AnyFloat]>>:$in_additional_proposals, + + IE_DetectionOutputAttr:$attr + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_DetectionOutputNormalizeOp: + VPU_LayerOp< + "DetectionOutputNormalize" + > { + let summary = "DetectionOutputNormalize VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$prior_boxes, + + IntAttr:$input_width, + IntAttr:$input_height + ); + + let results = (outs + 4DTensorOf<[AnyFloat]>:$out_prior_boxes + ); + + let hasVerifier = 1; +} + + +def VPU_DetectionOutputDecodeBoxesOp: + VPU_LayerOp< + "DetectionOutputDecodeBoxes", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "DetectionOutputDecodeBoxes VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$box_logits, + 4DTensorOf<[AnyFloat]>:$prior_boxes, + + IE_DetectionOutputCodeTypeAttr:$code_type, + BoolAttr:$clip_before_nms + ); + + let results = (outs + 
4DTensorOf<[AnyFloat]>:$out_decoded_boxes + ); +} + + +def VPU_DetectionOutputSortOp: + VPU_LayerOp< + "DetectionOutputSort", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "DetectionOutputSort VPU layer"; + + let arguments = (ins + AnyTypeOf<[4DTensorOf<[AnyFloat]>, VPU_DistributedTensor]>:$confidence, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$indicesBuffer, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$sortingBuffer, + + F64Attr:$confidence_threshold, + IntAttr:$top_k, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[4DTensorOf<[AnyFloat]>, VPU_DistributedTensor]>:$out_confidence, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$out_indices, + AnyTypeOf<[4DTensorOf<[SI32]>, VPU_DistributedTensor]>:$out_sizes + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$confidence, + "::mlir::FloatAttr":$confidence_threshold, + "::mlir::IntegerAttr":$top_k + )> + ]; + + let extraClassDeclaration = [{ + OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_DetectionOutputNmsCaffeOp: + VPU_LayerOp< + "DetectionOutputNmsCaffe", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "DetectionOutputNmsCaffe VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$confidence, + 4DTensorOf<[AnyFloat]>:$boxes, + 4DTensorOf<[SI32]>:$indices, + 4DTensorOf<[SI32]>:$sizes, + + IntAttr:$top_k, + F64Attr:$nms_threshold, + IntAttr:$background_id + ); + + let results = (outs + 4DTensorOf<[AnyFloat]>:$out_confidence, + 4DTensorOf<[AnyFloat]>:$out_boxes, + 4DTensorOf<[SI32]>:$out_sizes + ); + + let extraClassDeclaration = [{ + OutputTiling getOutputTiling(const 
vpux::TileInfo& outputTile, vpux::Logger log); + }] # baseExtraClassDeclaration; +} + + +def VPU_DetectionOutputCollectResultsOp: + VPU_LayerOp< + "DetectionOutputCollectResults" + > { + let summary = "DetectionOutputCollectResults VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$confidence, + 4DTensorOf<[AnyFloat]>:$boxes, + 4DTensorOf<[SI32]>:$sizes, + + IntAttr:$keep_top_k, + BoolAttr:$clip_after_nms + ); + + let results = (outs + 4DTensorOf<[AnyFloat]>:$out_detections + ); +} + + +def VPU_DFTOp : + VPU_LayerOp< + "DFT", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "InferenceEngine DFT layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_RDFTOp : + VPU_LayerOp< + "RDFT" + > { + let summary = "InferenceEngine RDFT layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_IDFTOp : + VPU_LayerOp< + "IDFT", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "InferenceEngine IDFT layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_IRDFTOp : + VPU_LayerOp< + "IRDFT" + > { + let summary = "InferenceEngine IRDFT layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_RDFTUncutOp : + VPU_LayerOp< + "RDFTUncut", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = 
"RDFTUncut VPU layer"; + + let description = [{ + Operation apply RDFT transformation but not cut symmetric part on last axis width value from axes_attr. + }]; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_IRDFTLastAxisOp : + VPU_LayerOp< + "IRDFTLastAxis", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "IRDFTLastAxis VPU layer"; + + let description = [{ + Operation apply IRDFT transformation but just on last axis from standard IRDFT operation. + Used to produce full IRDFT capability (in combination with IDFT) without computation + and the data movement unnecessary for the direction of the last transform axis. + }]; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + Optional>:$twiddle_factors, + I64ArrayAttr:$axes_attr, + I64ArrayAttr:$signal_size_attr + + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_DynamicDataMaskOp : + VPU_LayerOp< + "DynamicDataMask" + > { + let summary = "Create mask to clear garbage from pad area, added by processing dynamic tensors by upper bounds"; + + let arguments = (ins + Optional<1DTensorOf<[SI64, SI32]>>:$realShape, + TypeAttr:$outputTensorType + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_EmbeddingBagOffsetsSumOp : + VPU_LayerOp< + "EmbeddingBagOffsetsSum", + [ + AttrSizedOperandSegments + ] + > { + let summary = "InferenceEngine EmbeddingBagOffsetsSum layer"; + + let arguments = (ins + AnyRankedTensor:$emb_table, + Optional<1DTensorOf<[SI32, SI64]>>:$indices, + Optional<1DTensorOf<[SI32, SI64]>>:$offsets, + Optional<1DTensorOf<[AnyInteger, AnyFloat]>>:$per_sample_weights, + + OptionalAttr:$indices_value, + OptionalAttr:$offsets_value, + IntAttr:$default_index_value, + OptionalAttr:$per_sample_weights_value + ); + + let results = (outs + 
AnyRankedTensor:$output + ); +} + + +def VPU_EmbeddingSegmentsSumOp : + VPU_LayerOp< + "EmbeddingSegmentsSum", + [ + AttrSizedOperandSegments + ] + > { + let summary = "EmbeddingSegmentsSum VPU layer"; + + let arguments = (ins + AnyRankedTensor:$emb_table, + Optional<1DTensorOf<[SI64, SI32]>>:$indices, + Optional<1DTensorOf<[SI64, SI32]>>:$segment_ids, + Optional<1DTensorOf<[AnyInteger, AnyFloat]>>:$per_sample_weights, + + OptionalAttr:$indices_value, + OptionalAttr:$segment_ids_value, + IntAttr:$num_segments_value, + IntAttr:$default_index_value, + OptionalAttr:$per_sample_weights_value + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_EmbeddingBagPackedSumOp : + VPU_LayerOp< + "EmbeddingBagPackedSum" + > { + let summary = "EmbeddingBagPackedSum VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$emb_table, + 2DTensorOf<[SI32, SI64]>:$indices, + Optional<2DTensorOf<[F16, F32]>>:$per_sample_weights + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output + ); +} + + +def VPU_EyeOp : + VPU_LayerOp< + "Eye" + > { + let summary = "Eye VPU layer"; + + let arguments = (ins + 1DTensorOf<[SI32, SI64]>:$diagonal_index, + + IntAttr:$num_rows_value, + IntAttr:$num_columns_value, + I64ArrayAttr:$batch_shape_value, + + TypeAttr:$outputType + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_SDPAOp : + VPU_LayerOp< + "SDPA", + [ + AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "SDPA VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputQ, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputK, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$inputV, + Optional, VPU_DistributedTensor]>>:$inputMask, + Optional, VPU_DistributedTensor]>>:$inputScale, + Optional, VPU_DistributedTensor]>>:$inputBias, + Optional>:$dataStorage, + 
OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$inputQ, + "::mlir::Value":$inputK, + "::mlir::Value":$inputV, + "::mlir::Value":$inputMask, + "::mlir::Value":$inputScale, + "::mlir::Value":$inputBias, + "::mlir::Value":$dataStorage + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_RoPEOp : + VPU_LayerOp< + "RoPE", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "RoPE VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_cos, + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$input_sin, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input, + "::mlir::Value":$input_cos, + "::mlir::Value":$input_sin + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_NonMaxSuppressionOp : + VPU_LayerOp< + "NonMaxSuppression" + > { + let summary = "NonMaxSuppression VPU layer"; + + let arguments = (ins + 3DTensorOf<[F16, F32]>:$in_box_coords, + 3DTensorOf<[F16, F32]>:$in_box_scores, + Optional>:$dataBuffer, + + IE_BoxEncodingTypeAttr:$box_encoding, + UnitAttr:$sort_result_descending, + + 
OptionalAttr:$max_output_boxes_per_class_value, + OptionalAttr:$iou_threshold_value, + OptionalAttr:$score_threshold_value, + OptionalAttr:$soft_nms_sigma_value + ); + + let results = (outs + 2DTensorOf<[SI32]>:$out_selected_indices, + 2DTensorOf<[F16, F32]>:$out_selected_scores, + 1DTensorOf<[SI32]>:$out_valid_outputs + ); +} + + +def VPU_NonZeroOp : + VPU_LayerOp< + "NonZero", + [ + VPU_BoundsRepresentationInterface + ] + > { + let summary = "NonZero VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, SI32, Bool8]>:$input, + + DefaultValuedAttr:$bounds_representation + ); + + let results = (outs + 2DTensorOf<[SI32]>:$output + ); + + let hasVerifier = 1; +} + + +def VPU_OneHotOp : + VPU_LayerOp< + "OneHot" + > { + let summary = "InferenceEngine OneHot layer"; + + let arguments = (ins + RankedTensorOf<[SI32, SI64]> :$input, + + IntAttr:$depth, + F64Attr:$on_value, + F64Attr:$off_value, + IntAttr:$axis, + + TypeAttr:$outputType + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ProposalOp : + VPU_LayerOp< + "Proposal" + > { + let summary = "Proposal VPU layer"; + + let description = [{ + Proposal operation filters bounding boxes and outputs only those with the highest prediction confidence. + The auxiliary buffer has the role of storing the intermediate results obtained inside the operation, + then sorting them. Depending on some criteria, it recalculates the results and extracts the output from them. 
+ }]; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$class_probs, + RankedTensorOf<[F16, F32]>:$bbox_deltas, + RankedTensorOf<[F16, F32]>:$image_shape, + Optional<1DTensorOf<[UI8]>>:$auxiliary, + + IE_ProposalAttr:$proposal_attrs + ); + + let results = (outs + RankedTensorOf<[F16, F32]>:$output, + RankedTensorOf<[F16, F32]>:$probs + ); + +} + + +def VPU_RandomUniformOp : + VPU_LayerOp< + "RandomUniform", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "RandomUniform VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$min, + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$max, + + I64ArrayAttr:$output_shape, + TypeAttr:$outputType, + IntAttr:$global_seed, + IntAttr:$op_seed, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[F16, F32, SI32]>, VPU_DistributedTensor]>:$output + ); + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$min, + "::mlir::Value":$max, + + "::mlir::ArrayAttr":$output_shape, + "::mlir::TypeAttr":$outputType, + "::mlir::IntegerAttr":$global_seed, + "::mlir::IntegerAttr":$op_seed + )> + ]; + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_RangeOp : + VPU_LayerOp< + "Range", + [ + VPU_BoundsRepresentationInterface + ] + > { + let summary = "Range VPU layer"; + + let arguments = (ins + 1DTensorOf<[AnyInteger, AnyFloat]>:$start, + 1DTensorOf<[AnyInteger, AnyFloat]>:$stop, + 1DTensorOf<[AnyInteger, AnyFloat]>:$step, + TypeAttr:$dstElemType, + + DefaultValuedAttr:$bounds_representation + ); + + let results = (outs + 1DTensorOf<[AnyInteger, AnyFloat]>:$output + ); + + let hasVerifier = 1; +} + + + + +def VPU_RegionYoloOp : + 
VPU_LayerOp< + "RegionYolo" + > { + let summary = "RegionYolo VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyFloat]>:$input, + + IntAttr:$coords, + IntAttr:$classes, + IntAttr:$num_regions, + BoolAttr:$do_softmax, + I64ArrayAttr:$mask, + IntAttr:$axis, + IntAttr:$end_axis, + F64ArrayAttr:$anchors + ); + + let results = (outs + AnyRankedTensor:$output + ); +} + + +def VPU_ReorgYoloOp : + VPU_LayerOp< + "ReorgYolo" + > { + let summary = "ReorgYolo VPU layer"; + + let arguments = (ins + 4DTensorOf<[AnyInteger, AnyFloat]>:$input, + + IntAttr:$stride + ); + + let results = (outs + AnyRankedTensor:$output + ); + + let hasVerifier = 1; +} + +def VPU_SelectOp : + VPU_LayerOp< + "Select", + [ + VPU_TilingBuilderOpInterface, + VPU_EltwiseOp, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + ] + > { + let summary = "Select VPU layer"; + + let arguments = (ins + AnyTypeOf<[RankedTensorOf<[Bool8, SI32, F16]>, VPU_DistributedTensor]>:$input1, + AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$input2, + AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$input3, + IE_AutoBroadcastTypeAttr:$auto_broadcast, + + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[RankedTensorOf<[SI32, F16]>, VPU_DistributedTensor]>:$output + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + + bool fitIntoCMX(::llvm::ArrayRef buffers); + }] # baseExtraClassDeclaration; + + let builders = [ + OpBuilder<(ins + "::mlir::Value":$input1, + "::mlir::Value":$input2, + "::mlir::Value":$input3, + "vpux::IE::AutoBroadcastTypeAttr":$auto_broadcast + )> + ]; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + + +def VPU_StubOp : + VPU_Op< + "Stub", + [ + Pure + ] + > { + let summary = "Substitute operation for stubbing."; + + let arguments = (ins + Variadic:$inputs + ); + + let results = (outs + Variadic:$outputs + ); + + let assemblyFormat = [{ + `(` 
operands `)` attr-dict `:` type(operands) `->` type(results) + }]; +} + + +def VPU_TopKOp : + VPU_LayerOp< + "TopK", + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + AttrSizedOperandSegments + ] + > { + let summary = "TopK VPU layer"; + + let description = [{ + * lineBuffer - it is an auxiliary buffer, which has the role of storing + some intermediate results in the software kernel of the operation. + }]; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + Optional>:$k, + Optional>:$lineBuffer, + OptionalAttr:$k_value, + + IntAttr:$axis, + IE_TopKModeAttr:$mode, + IE_TopKSortTypeAttr:$sort, + TypeAttr:$element_type, + OptionalAttr:$multiClusterStrategy + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output_values, + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$target_shape + ); + + let extraClassDeclaration = [{ + bool fitIntoCMX(::llvm::ArrayRef buffers, Byte reservedMem); + bool fitIntoCMX(::llvm::ArrayRef buffers); + OutputTiling getOutputTiling(const vpux::TileInfo& outputTile, vpux::Logger log); + }] # baseExtraClassDeclaration; + + let elemComparisonModes = [IE_TypeComparisonMode_ALLOW_DISTRIBUTED_OUTPUT]; +} + +def VPU_PermuteCastOp : + VPU_LayerOp< + "PermuteCast", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods + ] + > { + let summary = "PermuteCast VPU layer"; + + let description = [{ + The op changes layout information in the following way: + * dst_order: layout attribute of result is set to value of this arg + * mem_perm: describes the permutation applied on the input value's memory shape + to obtain the memory shape of the output value. 
+ }]; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$input, + + AffineMapAttr:$dst_order, + AffineMapAttr:$mem_perm + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_DistributedTensor]>:$output + ); + + let checkInferredDimsOrder = 1; + let checkInferredMemSpace = 1; + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def VPU_PermuteQuantizeOp : + VPU_LayerOp< + "PermuteQuantize", + [ + DeclareOpInterfaceMethods + ] + > { + let summary = "PermuteQuantize VPU layer"; + + let arguments = (ins + RankedTensorOf<[F16, F32]>:$input, + + AffineMapAttr:$dst_order, + AffineMapAttr:$mem_perm, + TypeAttr:$dstElemType, + I64ArrayAttr:$pads_begin, + I64ArrayAttr:$pads_end + ); + + let results = (outs + RankedTensorOf<[quant_QuantizedType]>:$output + ); +} + +def VPU_LayoutCastOp : + VPU_LayerOp< + "LayoutCast", + [ + VPU_ViewLikeOpInterface, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ] + > { + let summary = "This layer overrides layout of a given tensor."; + + let arguments = (ins + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$input, + AffineMapAttr:$dst_order + ); + + let results = (outs + AnyTypeOf<[AnyRankedTensor, VPU_SparseTensor, VPU_DistributedTensor]>:$output + ); + + let hasFolder = 1; + let hasVerifier = 1; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops_interfaces.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops_interfaces.td index e574c5fcc0..0f716abf56 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops_interfaces.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops_interfaces.td @@ -567,24 +567,6 @@ def VPU_DDRAccessOpInterface: OpInterface<"DDRAccessOpInterface"> { ]; } -// -// DefinedInArch op trait -// - -class DefinedInArch - : ParamNativeOpTrait<"DefinedInArch", arch> { - let cppNamespace = "vpux::VPU"; -} - -// -// LimitedToArch op trait -// - -class LimitedToArch archs> - : 
ParamNativeOpTrait<"LimitedToArch", !interleave(archs, ", ")> { - let cppNamespace = "vpux::VPU"; -} - // // AlignedWorkloadChannelOpInterface // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/passes.td index 060fd47951..faf478bebc 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/passes.td @@ -14,7 +14,6 @@ include "mlir/Pass/PassBase.td" def InitResources : PassBase<"init-resources", "vpux::ModulePass"> { let summary = "Initializes compiler for VPU platforms"; - let description = [{ This pass attaches VPU related compilation parameters to Module attributes and initializes run-time resources information. @@ -65,6 +64,103 @@ def InitResources : PassBase<"init-resources", "vpux::ModulePass"> { ]; } +// +// DMATaskProfilingReserveMem +// + +def DMATaskProfilingReserveMem : PassBase<"dma-task-profiling-reserve-mem", "vpux::ModulePass"> { + let summary = "DMA task profiling memory reserving"; + + let description = [{ + This pass adds in ModuleOp information about reserved memory for DMA profiling. + }]; + + let options = [ + Option< + "enableDMAProfiling", "dma-profiling", + "std::string", [{"false"}], + "Enable DMA task profiling (true|static|false)" + > + ]; + + let constructor = "vpux::VPU::createDMATaskProfilingReserveMemPass()"; +} + +// +// Reserve memory for compressed DMA +// + +def CompressDmaReserveMem : PassBase<"compress-dma-reserve-mem", "vpux::ModulePass"> { + let summary = "Reserve memory for additional compressedDMA metadata"; + + let description = [{ + Reserve memory in CMX where additional metadata is stored for compressed DMAs handling activation spilling. 
+ In this memory Compressed DMA wil store a LUT with actual compressed data sizee that is later to be used + by decompress DMA + }]; + + let constructor = "vpux::VPU::createCompressDmaReserveMemPass()"; +} + +// +// SWKernelDataPrefetchReserveMem +// + +def SWKernelDataPrefetchReserveMem : PassBase<"sw-kernel-data-prefetch-reserve-mem", "vpux::ModulePass"> { + let summary = "Reserve memory for SW Kernel data prefetching"; + + let description = [{ + SW Kernel reads extra few bytes of data for better performance. + When input buffer is at the end of CMX, reading extra data would cause ACT SHAVE read violation. + Now we reserve CMX for profiling and activation compression at the end of CMX. + This pass checks the total reserved memory size and inserts a dummy section if necessary, to ensure + the total reserved memory size is safe for SW Kenel data prefetching. + + This pass eliminates the risk of ACT SHAVE read violation, but model performance might be impacted. + One case is described in E#122488, where DMA and DPU execution pipeline is impacted by the memory layout adjustment. + This can be optimized in the future by CMX allocation improvement. + }]; + + let constructor = "vpux::VPU::createSWKernelDataPrefetchReserveMemPass()"; +} + +// +// SWKernelInstructionPrefetchReserveMemForDummyKernels +// + +def SWKernelInstructionPrefetchReserveMemForDummyKernels : PassBase<"sw-kernel-instruction-prefetch-reserve-mem-for-dummy-kernels", "vpux::ModulePass"> { + let summary = "Reserve I/O memory for dummy SW kernels used instruction prefetching"; + + let description = [{ + Dummy SW kernel is used to mimic SW instruction prefetch feature. + To build these tasks up some reserved mememory is required as dummy I/O space. 
+ }]; + + let constructor = "vpux::VPU::createSWKernelInstructionPrefetchReserveMemForDummyKernelsPass()"; +} + +// +// SCFVerticalFusion +// + +def SCFVerticalFusion : PassBase<"scf-vertical-fusion", "vpux::FunctionPass"> { + let summary = "Apply VF to operations and convert it to scf representation"; + + let description = [{ + Applies scf tiling to operation and searches for its parents to merge + to the loop. + For all operations which might be fused the optimal number of tiles is calculated. + For the largest dimension the number is chosen so that VF passes memory requirements to be optimally fused. + + Current algorithm doesn't compare cost of VF among different dimensions tiling and + it doesn't compare the cost of VF vs operations without VF. + Algorithm provides greedy approach to merge as much as possible based on same limitations which + non-scf algorithm has (aligned MC stratgies). To be extended to cost comparison further. + }]; + + let constructor = "vpux::VPU::createSCFVerticalFusionPass()"; +} + // // OutputPipelineTiling // @@ -266,6 +362,19 @@ def WrapVerticalFusionRegion : PassBase<"wrap-in-vertical-fusion", "vpux::Functi let constructor = "vpux::VPU::createWrapVerticalFusionRegionPass()"; + let options = [ + Option< + "workloadManagementModeOpt", "workload-management-mode", + "vpux::WorkloadManagementMode", "", + "Setting which controls mode of WLM", + [{::llvm::cl::values( + clEnumValN(vpux::WorkloadManagementMode::PWLM_V0_LCA, "PWLM_V0_LCA", "WLM enqueue barriers search algorithm at VPURT enabled"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V1_BARRIER_FIFO, "PWLM_V1_BARRIER_FIFO", "WLM enqueue barriers search algorithm at VPURT disabled"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V2_PAGES, "PWLM_V2_PAGES", "WLM with split into subgraphs(pages)") + )}] + > + ]; + let dependentDialects = [ "vpux::VPU::VPUDialect" ]; @@ -1407,6 +1516,11 @@ def OptimizeConcat : PassBase<"optimize-concat", "vpux::FunctionPass"> { 
"optimizeOnlyOuterConcat", "optimize-only-outer-concat", "bool", "false", "Flag to check dimension and only optimize concat when the dimension is the highest." + >, + Option< + "disablePassOnEntryFunction", "disable-pass-on-entry-function", + "bool", "false", + "Flag to disable the pass on entry function for host compilation." > ]; @@ -1468,9 +1582,14 @@ def SetupNpuConstraint : PassBase<"setup-npu-constraint", "vpux::ModulePass"> { let options = [ Option< - "workloadManagementEnable", "workload-management-enable", - "bool", "false", - "[Optional] Set partial workload management to true/false" + "workloadManagementStatus", "workload-management-status", + "vpux::VPU::WorkloadManagementStatus", "vpux::VPU::WorkloadManagementStatus::ENABLED", + "[Optional] Set workload management status", + [{::llvm::cl::values( + clEnumValN(vpux::VPU::WorkloadManagementStatus::ENABLED, "ENABLED", "workload management enabled"), + clEnumValN(vpux::VPU::WorkloadManagementStatus::DISABLED, "DISABLED", "workload management disabled"), + clEnumValN(vpux::VPU::WorkloadManagementStatus::FAILED, "FAILED", "workload management attempted, but failed") + )}] >, Option< "allowCustomValues", "allow-custom-values", @@ -1521,17 +1640,17 @@ def SetupMaxKernelSize : PassBase<"setup-max-kernel-size", "vpux::ModulePass"> { } // -// SetupChannelsAutoPadding +// SetTargetIndependentPassOptions // -def SetupChannelsAutoPadding : PassBase<"setup-channels-auto-padding", "vpux::ModulePass"> { - let summary = "Enable the auto padding for input/output channels and store it in the model/IR"; +def SetTargetIndependentPassOptions : PassBase<"set-target-independent-options", "vpux::ModulePass"> { + let summary = "Enable pass options and store it in the Module/IR"; let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally + Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally. 
Adds several Symbols aka Options to PipelineOptions sumbol table }]; - let constructor = "vpux::VPU::createSetupChannelsAutoPaddingPass()"; + let constructor = "vpux::VPU::createSetTargetIndependentPassOptionsPass()"; let options = [ Option< @@ -1544,136 +1663,61 @@ def SetupChannelsAutoPadding : PassBase<"setup-channels-auto-padding", "vpux::Mo "bool", "false", "[Optional] Set auto padding for IDU to true/false" >, - Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - -// -// SetupIsReduceSupported -// - -def SetupIsReduceSupported : PassBase<"setup-is-reduce-supported", "vpux::ModulePass"> { - let summary = "Enable Reduce operations to be supported on NCE"; - - let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally - }]; - - let constructor = "vpux::VPU::createSetupIsReduceSupportedPass()"; - - let options = [ Option< "enableIsReduceSupported", "enable-is-reduce-supported", "bool", "false", "[Optional] Set IsReduceSupported for NCE to true/false" >, Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - -// -// SetupEnableSEPtrsOperations -// - -def SetupEnableSEPtrsOperations : PassBase<"setup-enable-se-ptrs-operations", "vpux::ModulePass"> { - let summary = "Enable SEPtrs operations to be supported"; - - let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally - }]; - - let constructor = "vpux::VPU::createSetupEnableSEPtrsOperationsPass()"; - - let options = [ + "enableFP16CompressedConvolution", "enable-fp16-compressed-convolution", + "bool", "false", + "Enable FP16 Compressed convolution op" + >, Option< - "enableSEPtrsOperations", "enable-se-ptrs-operations", + 
"enableVPUNNPreSplit", "enable-vpunn-pre-split", "bool", "false", - "[Optional] Set SEPtrsOperations to true/false" + "Enable VPUNN pre-split API" >, Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" + "weightsTableReuseMode", "weights-table-reuse-mode", + "vpux::WeightsTableReuseMode", "", + "Setting the weights table reuse mode", + [{::llvm::cl::values( + clEnumValN(vpux::WeightsTableReuseMode::ENABLED, "ENABLED", "Weights table can be reused for all operations that support it"), + clEnumValN(vpux::WeightsTableReuseMode::VF_ENABLED, "VF_ENABLED", "Weights table can be reused for operations in pure-vertical-fusion region, to avoid possible memory fragmentation"), + clEnumValN(vpux::WeightsTableReuseMode::DISABLED, "DISABLED", "Weights table cannot be reused") + )}] + >, + Option< + "enableSEPtrsOperations", "enable-se-ptrs-operations", + "bool", "false", + "[Optional] Set SEPtrsOperations to true/false" >, Option< "enableExperimentalSEPtrsOperations", "enable-experimental-se-ptrs-operations", "bool", "false", "[Optional] Set ExperimentalSEPtrsOperations to true/false" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - -// -// SetupEnableAdaptiveStripping -// - -def SetupEnableAdaptiveStripping : PassBase<"setup-enable-adaptive-stripping", "vpux::ModulePass"> { - let summary = "Enable adaptive stripping"; - - let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally - }]; - - let constructor = "vpux::VPU::createSetupEnableAdaptiveStrippingPass()"; - - let options = [ + >, Option< "enableAdaptiveStripping", "enable-adaptive-stripping", "bool", "false", "[Optional] Set AdaptiveStripping to true/false" >, - Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - - -// -// 
SetupEnableExtraStaticShapeOps -// - -def SetupEnableExtraStaticShapeOps : PassBase<"setup-enable-extra-static-shape-ops", "vpux::ModulePass"> { - let summary = "Enable extra shape bound ops"; - - let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally - }]; - - let constructor = "vpux::VPU::createSetupEnableExtraStaticShapeOpsPass()"; - - let options = [ Option< "enableExtraStaticShapeOps", "enable-extra-static-shape-ops", "bool", "true", "[Optional] Set ExtraStaticShapeOps to true/false" >, + Option< + "enableSprLUT", "enable-sprlut", + "bool", "false", + "[Optional] Set sprLUT to true/false" + >, + Option< + "enableDCIM", "enable-dcim", + "bool", "false", + "[Optional] Set EnableDCIM for NCE to true/false" + >, Option< "allowCustomValues", "allow-custom-values", "bool", "", @@ -1905,103 +1949,7 @@ def ConvertConstArgsToMultiConstants : PassBase<"convert-const-args-to-multi-con ]; } -// -// SetupEnableFP16CompressedConv -// - -def SetupEnableFP16CompressedConv : PassBase<"setup-enable-fp16-compressed-conv", "vpux::ModulePass"> { - let summary = "Enable fp16 compressed convolution and store it in the model/IR"; - - let description = [{ - Uses the PipelineOptionsOp structure from the IR so the options could be exposed globally - }]; - - let constructor = "vpux::VPU::createSetupEnableFP16CompressedConvPass()"; - - let options = [ - Option< - "enableFP16CompressedConvolution", "enable-fp16-compressed-convolution", - "bool", "false", - "Enable FP16 Compressed convolution op" - >, - Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - -// -// SetupEnableVPUNNPreSplit -// - -def SetupEnableVPUNNPreSplit : PassBase<"setup-vpunn-pre-split", "vpux::ModulePass"> { - let summary = "Enable VPUNN pre-split API and store it in the model/IR"; - - let description = [{ - Use the pre-split 
VPUNN API for cost - }]; - - let constructor = "vpux::VPU::createSetupEnableVPUNNPreSplitPass()"; - - let options = [ - Option< - "enableVPUNNPreSplit", "enable-vpunn-pre-split", - "bool", "false", - "Enable VPUNN pre-split API" - >, - Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} - -// -// SetupWeightsTableReuseMode -// - -def SetupWeightsTableReuseMode : PassBase<"setup-weights-table-reuse-mode", "vpux::ModulePass"> { - let summary = "Set the option of enabling weights table reuse and store it in the model/IR"; - - let description = [{ - Set the option of enabling weights table reuse in the model/IR. - }]; - - let constructor = "vpux::VPU::createSetupWeightsTableReuseModePass()"; - - let options = [ - Option< - "weightsTableReuseMode", "weights-table-reuse-mode", - "vpux::WeightsTableReuseMode", "", - "Setting the weights table reuse mode", - [{::llvm::cl::values( - clEnumValN(vpux::WeightsTableReuseMode::ENABLED, "ENABLED", "Weights table can be reused for all operations that support it"), - clEnumValN(vpux::WeightsTableReuseMode::VF_ENABLED, "VF_ENABLED", "Weights table can be reused for operations in pure-vertical-fusion region, to avoid possible memory fragmentation"), - clEnumValN(vpux::WeightsTableReuseMode::DISABLED, "DISABLED", "Weights table cannot be reused") - )}] - >, - Option< - "allowCustomValues", "allow-custom-values", - "bool", "", - "[Optional] Allows keep predefined values in IR" - > - ]; - - let dependentDialects = [ - "vpux::VPU::VPUDialect" - ]; -} // // ConcatRepeatingBlocksOutlining @@ -2220,6 +2168,20 @@ def QueryWSInfo : PassBase<"query-ws-info", "vpux::ModulePass"> { let constructor = "vpux::VPU::createQueryWSInfoPass()"; } +def ConcatInitInputs : PassBase<"concat-init-inputs", "vpux::ModulePass"> { + let summary = "Concatenates init function inputs into a single tensor"; + + let description = 
[{ + This pass concatenates the inputs of init function into a single tensor, + reducing the overall number of weights passed to init to a single + concatenated weight (note: it is currently highly preferred to limit the + amount of inputs, when possible, as every schedule input uses CMX-placed + meta information). + }]; + + let constructor = "vpux::VPU::createConcatInitInputsPass()"; +} + def ConcatInitResults : PassBase<"concat-init-results", "vpux::ModulePass"> { let summary = "Concatenates init function results into a single tensor"; @@ -2451,3 +2413,32 @@ def MoveReflectPadToCMX : PassBase<"move-reflect-pad-to-cmx", "vpux::FunctionPas let constructor = "vpux::VPU::createMoveReflectPadToCMXPass()"; } + +// +// +// + +def ConvertDynamicToStaticKernels : PassBase<"convert-dynamic-to-static-kernels", "vpux::ModulePass"> { + let summary = "Converts compute function ops with dynamic tensor shapes to static shapes"; + + let description = [{ + The pass identifies scf.for loops where the iteration space is determined by dynamic dimensions of input tensors for compute functions. + Using the step size of the loop and tensor slice operations, the pass will convert the dynamic tensor shapes + to static shapes by using index adjustments through backtracking. NPU compute function / kernel will be transformed + to use static shapes instead of dynamic shapes, based on the static shape inferred from the previous step. For the cases + where backtracking is not possible, the pass throws a runtime error. Pass currently does not support padding of the input tensors. + + This conversion enables better optimization and scheduling as static shapes allow for + more effective tiling and resource allocation during compilation. 
+ }]; + + let constructor = "vpux::VPU::createConvertDynamicToStaticKernelsPass()"; + + let dependentDialects = [ + "vpux::VPU::VPUDialect", + "mlir::tensor::TensorDialect", + "mlir::arith::ArithDialect", + "mlir::scf::SCFDialect", + "mlir::cf::ControlFlowDialect", + ]; +} diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/CMakeLists.txt index ff9541e5fc..263098e246 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPUASM) +add_vpux_ops(VPUASM GENERIC) add_vpux_ops_interface(VPUASM dialect/VPUASM/) add_vpux_attribute(VPUASM ENABLE_VPUX_ATTR) add_vpux_type(VPUASM) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/attributes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/attributes.td index 477e506cb7..78c1292e80 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/attributes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/attributes.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/ops.td index 34aa155100..dc11a649eb 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/ops.td @@ -992,7 +992,7 @@ def VPUASM_DPUInvariantOp : VPUASM_IndexTypeAttr:$task_index, OptionalAttr:$task_location, - SymbolRefAttr:$input, + OptionalAttr:$input, OptionalAttr:$input_sparsity_map, OptionalAttr:$input_storage_element_table, @@ -1067,7 +1067,7 @@ def VPUASM_DPUInvariantOp : $sym_name `idx` `(` $task_index `)` `taskLocation` `(` $task_location `)` - `input` `(` $input `)` + (`input` `(` $input^ `)`)? (`input_sparsity_map` `(` $input_sparsity_map^ `)`)? (`input_storage_element_table` `(` $input_storage_element_table^ `)`)? (`weights` `(` $weights^ `)`)? diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/passes.td index 7123cc62bd..13fb1e3812 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/passes.td @@ -1,4 +1,3 @@ - // // Copyright (C) 2022-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/types.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/types.td index a88ed4cf3a..a24c4e4878 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUASM/types.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/CMakeLists.txt index 3b43bc2c16..617a0996db 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPUIP) +add_vpux_ops(VPUIP GENERIC) add_vpux_ops_interface(VPUIP dialect/VPUIP/) add_vpux_attr_interface(VPUIP dialect/VPUIP/) add_vpux_attribute(VPUIP ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/attributes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/attributes.td index f442b2c956..6bac6a1dbf 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/attributes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/attributes.td @@ -345,23 +345,6 @@ def VPUIP_CompressionStateAttr : VPUIP_EnumAttr, - I64EnumAttrCase<"DISABLED", 1>, - I64EnumAttrCase<"FAILED", 2> - ] - > { -} - -def VPUIP_WlmStatusAttr : VPUIP_EnumAttr; // // VPUIPInlinerDispatch @@ -501,4 +484,56 @@ def VPUIP_DynamicScaleConfigAttr : VPUIP_Attr<"DynamicScaleConfig"> { let assemblyFormat = "`<` struct(params) `>`"; } +// +// VPUIP_FetchDMAAttr +// + +def VPUIP_FetchDMAAttr : VPUIP_Attr<"FetchDMA"> { + let description = [{ + "Represents attributes required to correctly lower with NNDMAs for fetch task descriptors" + }]; + + let parameters = (ins + "vpux::VPU::ExecutorKindAttr":$targetExecutorKindAttr, + "mlir::IntegerAttr":$tileIdx, + "mlir::IntegerAttr":$listIdx, + "mlir::IntegerAttr":$execGroupIdx + ); + + let assemblyFormat = [{ + `<` + $targetExecutorKindAttr `,` + `tile` `=` $tileIdx `,` + `list` `=` $listIdx `,` + `group` `=` $execGroupIdx + `>` + }]; +} + +// +// VPUIP_EnqueueDMAAttr +// + +def VPUIP_EnqueueDMAAttr : VPUIP_Attr<"EnqueueDMA"> { + let description = [{ + "Represents attributes required for 
identifying what operation is to be enqueued using this DMA" + }]; + + let parameters = (ins + "vpux::VPU::ExecutorKindAttr":$targetExecutorKindAttr, + "mlir::IntegerAttr":$tileIdx, + "mlir::IntegerAttr":$listIdx, + "mlir::IntegerAttr":$startTaskIdx, + "mlir::IntegerAttr":$endTaskIdx + ); + + let assemblyFormat = [{ + `<` $targetExecutorKindAttr `,` + `tile` `=` $tileIdx `,` + `list` `=` $listIdx `,` + `startTask` `=` $startTaskIdx `,` + `endTask` `=` $endTaskIdx `>` + }]; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/ops.td index eae9f5d7f6..ff14304241 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/ops.td @@ -166,7 +166,8 @@ def VPUIP_PermuteDMAOp : OptionalAttr:$mem_perm, OptionalAttr:$dma_descriptor, OptionalAttr:$dma_hwp_id, - OptionalAttr:$profilingMetadata + OptionalAttr:$profilingMetadata, + OptionalAttr:$internalDataFlow ); let results = (outs @@ -225,8 +226,8 @@ def VPUIP_GatherDMAOp : let arguments = (ins AnyMemRef:$input, - MemRefOf<[I64]>:$indices, - AnyMemRef:$outputBuff, + AnyTypeOf<[MemRefOf<[I64]>, VPUIP_DistributedBuffer]>:$indices, + AnyTypeOf<[AnyMemRef, VPUIP_DistributedBuffer]>:$outputBuff, IntAttr:$elementSize, @@ -245,7 +246,7 @@ def VPUIP_GatherDMAOp : ); let results = (outs - AnyMemRef:$output + AnyTypeOf<[AnyMemRef, VPUIP_DistributedBuffer]>:$output ); let builders = [ @@ -742,7 +743,7 @@ def VPUIP_NCEClusterTaskOp : }]; let arguments = (ins - AnyTypeOf<[MemRefOf<[F16, BF16, quant_QuantizedType]>, VPUIP_DistributedBuffer, VPUIP_ITIBuffer]>:$input, + Optional, VPUIP_DistributedBuffer, VPUIP_ITIBuffer]>>:$input, Optional, VPUIP_DistributedBuffer]>>:$input_sparsity_map, Optional, VPUIP_DistributedBuffer]>>:$input_storage_element_table, Optional, VPUIP_DistributedBuffer]>>:$weights, @@ -2490,36 +2491,6 @@ def VPUIP_ExtractFlatSliceOp : } -// -// WorkloadCastOp -// - 
-def VPUIP_WorkloadCastOp : - VPUIP_Op< - "WorkloadCast", - [ - DeclareOpInterfaceMethods, - Pure - ] - > { - let summary = "Operation that casts one DistributedBuffer type to another."; - - let arguments = (ins - AnyTypeOf<[VPUIP_DistributedBuffer, VPUIP_SparseBuffer]>:$input - ); - - let results = (outs - AnyTypeOf<[VPUIP_DistributedBuffer, VPUIP_SparseBuffer]>:$output - ); - - let assemblyFormat = [{ - attr-dict - `inputs` `(` $input `:` qualified(type($input)) `)` - `->` qualified(type(results)) - }]; -} - - // // ProfilingSectionInfoOp // @@ -2705,4 +2676,104 @@ def VPUIP_BarProgDMAOp : let hasVerifier = 1; } +// +// FetchDMAOp +// + +def VPUIP_FetchDMAOp : + VPUIP_TaskOp<1, "FetchDMA", + [ + ViewLikeOpInterface, + VPUIP_DMATypeOpInterface, + DotInterface + ] + > { + let summary = "Fetch task descriptors DMA task"; + + let arguments = (ins + AnyMemRef:$input, + AnyMemRef:$output_buff, + OptionalAttr:$port, + UnitAttr:$is_out_of_order, + UnitAttr:$is_critical, + OptionalAttr:$dma_hwp_id, + OptionalAttr:$profilingMetadata, + VPUIP_FetchDMAAttr:$fetch_dma + ); + + let results = (outs + AnyMemRef:$output + ); + + let extraClassDeclaration = [{ + static vpux::VPU::ExecutorKind getExecutorKind() { + return vpux::VPU::ExecutorKind::DMA_NN; + } + + vpux::DotNodeColor getNodeColor() { + return vpux::DotNodeColor::GREEN; + } + }] # commonExtraClassDeclaration; + + let assemblyFormat = [{ + attr-dict + `inputs` `(` $input `:` type($input) `)` + `outputs` `(` $output_buff `:` type($output_buff) `)` + `fetch_dma` `(` $fetch_dma `)` + `->` type(results) + }]; + + let hasVerifier = 1; +} + +// +// EnqueueDMAOp +// + +def VPUIP_EnqueueDMAOp : + VPUIP_TaskOp<1, "EnqueueDMA", + [ + ViewLikeOpInterface, + VPUIP_DMATypeOpInterface, + DotInterface + ] + > { + let summary = "Task enqueue DMA operation"; + + let arguments = (ins + AnyMemRef:$input, + AnyMemRef:$output_buff, + OptionalAttr:$port, + UnitAttr:$is_out_of_order, + UnitAttr:$is_critical, + OptionalAttr:$dma_hwp_id, + 
OptionalAttr:$profilingMetadata, + VPUIP_EnqueueDMAAttr: $enqueue_dma_attr + ); + + let results = (outs + AnyMemRef:$output + ); + + let extraClassDeclaration = [{ + static vpux::VPU::ExecutorKind getExecutorKind() { + return vpux::VPU::ExecutorKind::DMA_NN; + } + + vpux::DotNodeColor getNodeColor() { + return vpux::DotNodeColor::GREEN; + } + }] # commonExtraClassDeclaration; + + let assemblyFormat = [{ + attr-dict + `inputs` `(` $input `:` type($input) `)` + `outputs` `(` $output_buff `:` type($output_buff) `)` + `enqueue_dma_attr` `(` $enqueue_dma_attr `)` + `->` type(results) + }]; + + let hasVerifier = 1; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/passes.td index 20e9b987f3..6a517da801 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/passes.td @@ -19,7 +19,7 @@ def AddSwKernelInstructionPrefetch : PassBase<"add-sw-kernel-instruction-prefetc This pass adds a CACHE_PREFETCH SwKernel which loads specified kernel's instructions into L2 cache. Prefetch operations are added at the beginning of the inference if there is enough idle time on shave which can be configured via minimumShaveStartTimeForPrefetch option. - + Pass will first go over SW kernels present in the inference and gather unique kernels until it runs out of free cache space or until it encounters a cache handling operation. After gathering prefetch candidates it checks if first SW kernel start time is greater than minimumShaveStartTimeForPrefetch and inserts prefetch @@ -37,19 +37,19 @@ def AddSwKernelInstructionPrefetch : PassBase<"add-sw-kernel-instruction-prefetc Before: DMA | DMA ... DMA DMA ... DMA - Cluster0 DPU | DPU ... DPU + Cluster0 DPU | DPU ... DPU Cluster0 SHAVE0 | softmax, convert Cluster0 SHAVE1 | softmax - Cluster1 DPU | DPU ... DPU + Cluster1 DPU | DPU ... 
DPU Cluster1 SHAVE0 | topk Cluster1 SHAVE1 | topk After: DMA | DMA ... DMA DMA ... DMA - Cluster0 DPU | DPU ... DPU + Cluster0 DPU | DPU ... DPU Cluster0 SHAVE0 | prefetch softmax softmax, convert Cluster0 SHAVE1 | prefetch convert softmax - Cluster1 DPU | DPU ... DPU + Cluster1 DPU | DPU ... DPU Cluster1 SHAVE0 | prefetch topk topk Cluster1 SHAVE1 | topk }]; @@ -280,6 +280,7 @@ def ConvertEltwiseToInPlace : PassBase<"convert-eltwise-to-in-place", "vpux::Fun // ConvertSprLUTToConst // + def ConvertSprLUTToConst : PassBase<"convert-sprlut-to-const", "vpux::FunctionPass"> { let summary = "Convert sprLUT from PPE attribute to Const"; @@ -294,6 +295,20 @@ def ConvertSprLUTToConst : PassBase<"convert-sprlut-to-const", "vpux::FunctionPa let constructor = "vpux::VPUIP::createConvertSprLUTToConstPass()"; } +// +// ConvertPalletLUTToConst +// + +def ConvertPalletLUTToConst : PassBase<"convert-pallet-lut-to-const", "vpux::FunctionPass"> { + let summary = "Convert pallet LUT to Const"; + + let description = [{ + Extract pallet LUT from input quantile type and make it Const inputs. + }]; + + let constructor = "vpux::VPUIP::createConvertPalletLUTToConstPass()"; +} + // // InsertCopyForEltwiseInPlaceInput // @@ -441,28 +456,6 @@ def BreakDataFlow : PassBase<"break-data-flow", "vpux::FunctionPass"> { let constructor = "vpux::VPUIP::createBreakDataFlowPass()"; } -// -// DMATaskProfilingReserveMem -// - -def DMATaskProfilingReserveMem : PassBase<"dma-task-profiling-reserve-mem", "vpux::ModulePass"> { - let summary = "DMA task profiling memory reserving"; - - let description = [{ - This pass adds in ModuleOp information about reserved memory for DMA profiling. 
- }]; - - let options = [ - Option< - "enableDMAProfiling", "dma-profiling", - "std::string", [{"false"}], - "Enable DMA task profiling (true|static|false)" - > - ]; - - let constructor = "vpux::VPUIP::createDMATaskProfilingReserveMemPass()"; -} - // // DMATaskProfilingAfterBarrierSched // @@ -474,14 +467,6 @@ def DMATaskProfilingAfterBarrierSched : PassBase<"dma-task-profiling-after-barri This pass adds DMA profiling tasks after barrier scheduler. }]; - let options = [ - Option< - "enableDMAProfiling", "dma-profiling", - "std::string", [{"false"}], - "Enable DMA task profiling (true|static|false)" - > - ]; - let constructor = "vpux::VPUIP::createDMATaskProfilingAfterBarrierSchedPass()"; } @@ -1107,6 +1092,21 @@ def UnrollExpandDMA : PassBase<"unroll-expand-dma", "vpux::FunctionPass"> { let constructor = "vpux::VPUIP::createUnrollExpandDMAPass()"; } +// +// UnrollGatherDMA +// + +def UnrollGatherDMA : PassBase<"unroll-gather-dma", "vpux::FunctionPass"> { + let summary = "Unroll gather DMA task with several NN DMA tasks"; + + let description = [{ + This pass unrolls GatherDMA tasks with several NN DMA tasks, which are functionally equivalent. + Each sub GatherDMA will be converted to a NNDMA. + }]; + + let constructor = "vpux::VPUIP::createUnrollGatherDMAPass()"; +} + // // UnrollPerAxisTileDMA // @@ -1270,24 +1270,6 @@ def ConvertAsyncOpsToTasks : PassBase<"convert-async-ops-to-tasks", "vpux::Funct ]; } - -// -// UnrollPermuteToNNDMA -// - -def UnrollPermuteToNNDMA : PassBase<"unroll-permute-to-nndma", "vpux::FunctionPass"> { - let summary = "Transform PermuteDMA task with one or several PermuteDMA tasks"; - - let description = [{ - This pass unrolls PermuteDMA task to one or several PermuteDMA tasks. - The number of PermuteDMA depend on the number of planes (num_planes <= 256). - 1. NCHW -> NHWC: The number of planes is C. - 2. NHWC -> NCHW: The number of planes is H * W, and W must <= 256. 
- }]; - - let constructor = "vpux::VPUIP::createUnrollPermuteToNNDMAPass()"; -} - // // Swizzling // @@ -1712,44 +1694,6 @@ def AdjustSpillSize : PassBase<"adjust-spill-size", "vpux::FunctionPass"> { let constructor = "vpux::VPUIP::createAdjustSpillSizePass()"; } -// -// Reserve memory for compressed DMA -// - -def CompressDmaReserveMem : PassBase<"compress-dma-reserve-mem", "vpux::ModulePass"> { - let summary = "Reserve memory for additional compressedDMA metadata"; - - let description = [{ - Reserve memory in CMX where additional metadata is stored for compressed DMAs handling activation spilling. - In this memory Compressed DMA wil store a LUT with actual compressed data sizee that is later to be used - by decompress DMA - }]; - - let constructor = "vpux::VPUIP::createCompressDmaReserveMemPass()"; -} - -// -// SWKernelPrefetchingReserveMem -// - -def SWKernelPrefetchingReserveMem : PassBase<"sw-kernel-prefetching-reserve-mem", "vpux::ModulePass"> { - let summary = "Reserve memory for SW Kernel data prefetching"; - - let description = [{ - SW Kernel reads extra few bytes of data for better performance. - When input buffer is at the end of CMX, reading extra data would cause ACT SHAVE read violation. - Now we reserve CMX for profiling and activation compression at the end of CMX. - This pass checks the total reserved memory size and inserts a dummy section if necessary, to ensure - the total reserved memory size is safe for SW Kenel data prefetching. - - This pass eliminates the risk of ACT SHAVE read violation, but model performance might be impacted. - One case is described in E#122488, where DMA and DPU execution pipeline is impacted by the memory layout adjustment. - This can be optimized in the future by CMX allocation improvement. 
- }]; - - let constructor = "vpux::VPUIP::createSWKernelPrefetchingReserveMemPass()"; -} - // // FuseDDRCopiesIntoConcats // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/types.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/types.td index 78d009f129..412d803381 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIP/types.td @@ -207,7 +207,11 @@ def VPUIP_BoundedBuffer : "BoundedBuffer", [ DeclareTypeInterfaceMethods, - DeclareTypeInterfaceMethods + DeclareTypeInterfaceMethods, + DeclareTypeInterfaceMethods ] > { let summary = "VPUIP Bounded Buffer Type"; diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/CMakeLists.txt index 7156d0ae1a..fc880c65d0 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPUIPDPU) +add_vpux_ops(VPUIPDPU GENERIC) add_vpux_ops_interface(VPUIPDPU dialect/VPUIPDPU/) add_vpux_attribute(VPUIPDPU ENABLE_VPUX_ENUMS) add_vpux_pass(VPUIPDPU VPUIPDPU dialect/VPUIPDPU/) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/attributes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/attributes.td index ce28bb75d2..d600425898 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/attributes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/attributes.td @@ -369,6 +369,10 @@ def VPUIPDPU_IDUWorkloadType : I32EnumAttrCase<"REDUCEMEAN", 5>, I32EnumAttrCase<"REDUCESUMSQUARE", 6>, I32EnumAttrCase<"REDUCESUM", 7>, + I32EnumAttrCase<"ACCU_STORE_I32", 8>, + I32EnumAttrCase<"ACCU_STORE_FP32", 9>, + I32EnumAttrCase<"ACCU_LOAD_I32", 10>, + I32EnumAttrCase<"ACCU_LOAD_FP32", 11> ] > { let cppNamespace = "vpux::VPUIPDPU"; diff --git 
a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/ops.td index c262e9ec31..fcbce122f7 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUIPDPU/ops.td @@ -46,7 +46,7 @@ def VPUIPDPU_DPUInvariantOp : VPUASM_IndexTypeAttr:$task_index, OptionalAttr:$task_location, - SymbolRefAttr:$input, + OptionalAttr:$input, OptionalAttr:$input_sparsity_map, OptionalAttr:$input_storage_element_table, diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/CMakeLists.txt index 8581a512ac..8e32ff8b25 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPUMI37XX) +add_vpux_ops(VPUMI37XX GENERIC) add_vpux_ops_interface(VPUMI37XX dialect/VPUMI37XX/) add_vpux_attribute(VPUMI37XX ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) add_vpux_type(VPUMI37XX) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/ops.td index 05bda76eea..2c0e7a8662 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/ops.td @@ -16,6 +16,7 @@ include "vpux/compiler/dialect/VPUMI37XX/ops_interfaces.td" include "vpux/compiler/dialect/VPUMI37XX/types.td" include "vpux/compiler/dialect/VPURT/types.td" include "vpux/compiler/dialect/VPU/attributes.td" +include "vpux/compiler/dialect/config/attributes.td" include "vpux/compiler/dialect/VPU/ops_interfaces.td" include "vpux/compiler/dialect/VPUIP/types.td" include "vpux/compiler/dialect/VPUIP/attributes.td" @@ -707,7 +708,7 @@ def PlatformInfoOp : let summary = "Represents an op describing various platform information, such as arch, 
revision, etc."; let arguments = (ins - VPU_ArchKindAttr:$archKind + Config_ArchKindAttr:$archKind ); let assemblyFormat = [{ diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/types.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/types.td index 0f857f0070..0162fbbf68 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI37XX/types.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/CMakeLists.txt index e86eb731e6..a39996dba9 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPUMI40XX) +add_vpux_ops(VPUMI40XX GENERIC) add_vpux_ops_interface(VPUMI40XX dialect/VPUMI40XX/) add_vpux_attr_interface(VPUMI40XX dialect/VPUMI40XX/) add_vpux_attribute(VPUMI40XX ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/ops.td index 94a9778ebe..6d67d5d56d 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/ops.td @@ -258,7 +258,9 @@ def VPUMI40XX_NNDMAOp : Optional:$indices, Optional:$enqueueBarrier, OptionalAttr:$wlmPage, - OptionalAttr:$physical_barrier_range + OptionalAttr:$physical_barrier_range, + OptionalAttr:$enqueue_dma_attr, + OptionalAttr:$fetch_dma ); let results = (outs @@ -737,7 +739,7 @@ def VPUMI40XX_DPUInvariantOp : let arguments = (ins Optional:$taskLocation, Optional:$previousTask, - MemRefOf<[F16, BF16, quant_QuantizedType]>:$input, + Optional>:$input, 
Optional>:$input_sparsity_map, Optional>:$input_storage_element_table, Optional>:$weights, @@ -818,7 +820,7 @@ def VPUMI40XX_DPUInvariantOp : attr-dict (`taskLocation` `(` $taskLocation^ `:` type($taskLocation) `)`)? ( `previousTask` `(` $previousTask^ `:` type($previousTask) `)`)? - `input` `(` $input `:` type($input) `)` + (`input` `(` $input^ `:` type($input) `)`)? (`input_sparsity_map` `(` $input_sparsity_map^ `:` type($input_sparsity_map) `)`)? (`input_storage_element_table` `(` $input_storage_element_table^ `:` type($input_storage_element_table) `)`)? (`weights` `(` $weights^ `:` type($weights) `)`)? diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/passes.td index 6beff842ee..9d6db13e4d 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/passes.td @@ -29,10 +29,7 @@ def SetupProfilingVPUMI40XX : PassBase<"setup-profiling-VPUMI40XX", "vpux::Modul let dependentDialects = [ "vpux::VPUIP::VPUIPDialect", - "vpux::VPURT::VPURTDialect", - "vpux::VPUMI40XX::VPUMI40XXDialect", - "vpux::VPURegMapped::VPURegMappedDialect", - "vpux::ELFNPU37XX::ELFNPU37XXDialect" + "vpux::VPUMI40XX::VPUMI40XXDialect" ]; } @@ -174,6 +171,20 @@ def LinkEnqueueTargets : PassBase<"link-enqueue-targets", "vpux::FunctionPass"> }]; let constructor = "vpux::VPUMI40XX::createLinkEnqueueTargetsPass()"; + + let options = [ + Option< + "workloadManagementModeOpt", "workload-management-mode", + "vpux::WorkloadManagementMode", "", + "Setting which controls mode of WLM", + [{::llvm::cl::values( + clEnumValN(vpux::WorkloadManagementMode::PWLM_V0_LCA, "PWLM_V0_LCA", "WLM enqueue barriers search algorithm at VPURT ENABLED"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V1_BARRIER_FIFO, "PWLM_V1_BARRIER_FIFO", "WLM enqueue barriers search algorithm at VPURT DISABLED"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V2_PAGES, 
"PWLM_V2_PAGES", "WLM with split into subgraphs(pages)"), + clEnumValN(vpux::WorkloadManagementMode::FWLM_V1_PAGES, "FWLM_V1_PAGES", "Full WLM with split into pages") + )}] + > + ]; } def LinkAllOps : PassBase<"link-all-ops", "vpux::FunctionPass"> { @@ -199,12 +210,12 @@ def UnrollFetchTaskOps : PassBase<"unroll-fetch-ops", "vpux::FunctionPass"> { } def SplitEnqueueOps : PassBase<"split-enqueue-ops", "vpux::FunctionPass"> { - let summary = [{ Split enqueue Ops fot avoid bug when we have last op in task buffer in LL }]; + let summary = [{ Split enqueue Ops to avoid bug when we have last op in task buffer in LL }]; let description = [{ To account for preemption at a group boundary level scenario we have to leave last task in fetch group - unlinked. The problem with preemmption is due to concurrency between thread processing preemption and execution - thread, upon restore of previous state we can't realiably check if last task in group was linked, since it could be + unlinked. The problem with preemption is due to concurrency between thread processing preemption and execution + thread, upon restore of previous state we can't reliably check if last task in group was linked, since it could be replaced already with tasks from next group by execution thread. The solution is to leave last tasks in groups always unlinked. @@ -217,6 +228,25 @@ def SplitEnqueueOps : PassBase<"split-enqueue-ops", "vpux::FunctionPass"> { let constructor = "vpux::VPUMI40XX::createSplitEnqueueOpsPass()"; } +def SplitEnqueueDmaOps : PassBase<"split-enqueue-dma-ops", "vpux::FunctionPass"> { + let summary = [{ Split enqueue Ops to avoid bug when we have last op in task buffer in LL }]; + + let description = [{ + To account for preemption at a group boundary level scenario we have to leave last task in fetch group + unlinked. 
The problem with preemption is due to concurrency between thread processing preemption and execution + thread, upon restore of previous state we can't reliably check if last task in group was linked, since it could be + replaced already with tasks from next group by execution thread. The solution is to leave last tasks in groups + always unlinked. + + This pass splits single enqueue dma operation into multiple based on special attribute + (lastSecondaryTaskInExecutionGroup) set by previous passes in the pipeline (group-execution-ops). Since single + enqueue operation may cover a range of tasks spanning through multiple Fetch groups, it may be split into more + than 2 enqueue dma operations. + }]; + + let constructor = "vpux::VPUMI40XX::createSplitEnqueueDmaOpsPass()"; +} + def UnrollEnqueueOps : PassBase<"unroll-enqueue-ops", "vpux::FunctionPass"> { let summary = [{ Unroll enqueue Ops so that each targets only one task}]; @@ -231,7 +261,7 @@ def LinkEnqueueOpsForSameBarrier : PassBase<"link-enqueue-ops-for-same-barrier", def AddBootstrapBarriers : PassBase<"add-bootstrap-barriers", "vpux::FunctionPass"> { let summary = [{ Add Bootstrap Barriers}]; - let description = [{Set the initial barriers to be used during inference, we set the first BootstrapOp and a count + let description = [{Set the initial barriers to be used during inference, we set the first BootstrapOp and a count of how bootstrap barriers. 
Bootstrap operation allow us start barriers in any order}]; let constructor = "vpux::VPUMI40XX::createAddBootstrapBarriersPass()"; @@ -239,9 +269,27 @@ def AddBootstrapBarriers : PassBase<"add-bootstrap-barriers", "vpux::FunctionPas def AddBootstrapWorkItems : PassBase<"add-bootstrap-work-items", "vpux::FunctionPass"> { let summary = [{ Add Bootstrap WorkItems}]; - let description = [{Set the bootstrap work items in the blob, these WorkiIems are enqueued by FW directly without a consumer interrupt}]; + let description = [{ + Analyze schedule and check presence of already present EnqueueOps (WorkItems) or enqueue DMAs. For initial range + of tasks not handled by those enqueues create bootstrap EnqueueOps that will be scheduled directly by FW + at the beginning of inference. + }]; let constructor = "vpux::VPUMI40XX::createAddBootstrapWorkItemsPass()"; + + let options = [ + Option< + "workloadManagementModeOpt", "workload-management-mode", + "vpux::WorkloadManagementMode", "", + "Setting which controls mode of WLM", + [{::llvm::cl::values( + clEnumValN(vpux::WorkloadManagementMode::PWLM_V0_LCA, "PWLM_V0_LCA", "WLM enqueue barriers search algorithm at VPURT ENABLED"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V1_BARRIER_FIFO, "PWLM_V1_BARRIER_FIFO", "WLM enqueue barriers search algorithm at VPURT DISABLED"), + clEnumValN(vpux::WorkloadManagementMode::PWLM_V2_PAGES, "PWLM_V2_PAGES", "WLM with split into subgraphs(pages)"), + clEnumValN(vpux::WorkloadManagementMode::FWLM_V1_PAGES, "FWLM_V1_PAGES", "Full WLM with split into pages") + )}] + > + ]; } def NextSameIdAssignment : PassBase<"next-same-id-assignment", "vpux::FunctionPass"> { @@ -350,8 +398,8 @@ def AddBarrierConfigurationOps : PassBase<"add-barrier-configuration-ops", "vpux This mode does everything done at WorkloadManagementBarrierProgrammingMode::INITIAL_BARRIER_DMAS_SCHEDULED plus creates DMAs in the schedule at right place which will push barrier configurations to FIFO_BARRIERS_NCE_FILL_BARRIER_FIFO_ADR 
register. In this case runtime doesn't need to program any barriers - In full Workload Management mode, it is recommended to enable the consumer interrupt on at least one barrier per page to maintain inference activity. - This mechanism allows the firmware to send periodic heartbeat refresh to the host, indicating that the inference is still running. To reduce interrupt overhead, + In full Workload Management mode, it is recommended to enable the consumer interrupt on at least one barrier per page to maintain inference activity. + This mechanism allows the firmware to send periodic heartbeat refresh to the host, indicating that the inference is still running. To reduce interrupt overhead, the consumer interrupt is typically set only on the first barrier within each page, though less frequent signaling may still be sufficient depending on system requirements. }]; @@ -400,4 +448,27 @@ def AddEnqueueDMAOps : PassBase<"add-enqueue-dma-ops", "vpux::FunctionPass"> { }]; } +def UpdateEnqueueDMAInputAndOutput : PassBase<"update-enqueue-dma-input-and-output", "vpux::FunctionPass"> { + let summary = "Update input and output data for DMAOps that perform task enqueue"; + + let constructor = "vpux::VPUMI40XX::createUpdateEnqueueDMAInputAndOutput()"; + + let description = [{ + For each DMA that perform task enqueue (has EnqueueDmaAttr) update its input with a constant that contains + the address of task descriptor in CMX metadata space and output with HW FIFO register for given task type + }]; +} + +def ConvertFetchDmasToFetchTaskOps : PassBase<"convert-fetch-dmas-to-fetch-task-ops", "vpux::FunctionPass"> { + let summary = [{ Insert Fetch operations using predefined placeholders }]; + + let description = [{ + Insert FetchOps for DPU & Shv tasks + These tasks are responsible to copy descriptors for Variant,Invariant,ShaveInvocation and ShaveRange + to CMX Metadata space. 
Lower in the pipeline these tasks are lowered to NNDMAs + }]; + + let constructor = "vpux::VPUMI40XX::createConvertFetchDmasToFetchTaskOpsPass()"; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/types.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/types.td index 572373b664..03e2573a39 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPUMI40XX/types.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURT/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURT/CMakeLists.txt index ba34b6207d..944ecf678a 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURT/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURT/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPURT) +add_vpux_ops(VPURT GENERIC) add_vpux_ops_interface(VPURT dialect/VPURT/) add_vpux_type(VPURT) add_vpux_attribute(VPURT ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/CMakeLists.txt index 3efc1de6ff..72cd7199bc 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(VPURegMapped) +add_vpux_ops(VPURegMapped GENERIC) add_vpux_ops_interface(VPURegMapped dialect/VPURegMapped/) add_vpux_attribute(VPURegMapped ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) add_vpux_type(VPURegMapped) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/attr_interfaces.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/attr_interfaces.td index 2e7cf31cae..cc7072217a 100644 --- 
a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/attr_interfaces.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/attr_interfaces.td @@ -36,7 +36,7 @@ class PropertyBase : Property< }]; let convertFromAttribute = [{ auto desc = mlir::cast<}] # dialect # [{::}] # attr # [{>($_attr).getRegMapped(); - $_storage = desc; + $_storage = std::move(desc); return ::mlir::success(); }]; diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops.td index 37036ba195..df2b065ea7 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops.td @@ -90,6 +90,8 @@ def VPURegMapped_FetchTaskOp: let summary = "Op to represent fetching of specific tasks"; let arguments = (ins + Variadic:$waitBarriers, + Variadic:$updateBarriers, Optional:$previousTask, VPURegMapped_IndexType:$primary_start, VPURegMapped_IndexType:$primary_end, @@ -107,6 +109,8 @@ def VPURegMapped_FetchTaskOp: ); let assemblyFormat = [{ + (`waits` `(` $waitBarriers^ `:` type($waitBarriers) `)`)? + (`updates` `(` $updateBarriers^ `:` type($updateBarriers) `)`)? (`previousTask` `(` $previousTask^ `:` type($previousTask) `)` )? 
`primary` `(` $primary_start `->` $primary_end `)` `secondary` `(` $secondary_start `->` $secondary_end `)` diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops_interfaces.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops_interfaces.td index c78af18eb7..34b4a238de 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops_interfaces.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/ops_interfaces.td @@ -218,6 +218,35 @@ def VPURegMapped_DMATypeOpInterface : OpInterface<"DMATypeOpInterface"> { "void", "setPreviousTaskForOp", (ins "mlir::Value":$previousTask), [{}], [{ $_op.getPreviousTaskMutable().assign(previousTask); }] + >, + InterfaceMethod< + "Barriers that will free this task to run", + "mlir::ValueRange", "waitBarriers", (ins), + [{ + return $_op.getWaitBarriers(); + }] + >, + InterfaceMethod< + "Barriers that will free this task to run", + "mlir::MutableOperandRange", "waitBarriersMutable", (ins), + [{ + return $_op.getWaitBarriersMutable(); + }] + >, + + InterfaceMethod< + "Barriers that will be at least partially unlocked when this task is complete", + "mlir::ValueRange", "updateBarriers", (ins), + [{ + return $_op.getUpdateBarriers(); + }] + >, + InterfaceMethod< + "Barriers that will be at least partially unlocked when this task is complete", + "mlir::MutableOperandRange", "updateBarriersMutable", (ins), + [{ + return $_op.getUpdateBarriersMutable(); + }] > ]; } diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/types.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/types.td index 31e215e5cf..a2f84e9f94 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/types.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPURegMapped/types.td @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/CMakeLists.txt index 0abbe47e13..fb8b335dd1 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/CMakeLists.txt @@ -4,4 +4,5 @@ # add_vpux_dialect(config) +add_vpux_ops(config GENERIC) add_vpux_attribute(config ENABLE_VPUX_ENUMS ENABLE_VPUX_ATTR) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/attributes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/attributes.td index a21e229653..d4ed4925f6 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/attributes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/attributes.td @@ -50,4 +50,45 @@ def Config_CompilationMode : def Config_CompilationModeAttr : Config_EnumAttr; +// +// ArchKind +// + +def Config_ArchKind : + Config_I64EnumAttr< + "ArchKind", + "Represents VPU architecture generation", + [ + I64EnumAttrCase<"UNKNOWN", 0>, + I64EnumAttrCase<"NPU37XX", 3>, + I64EnumAttrCase<"NPU40XX", 4>, + ] + > { +} + +def Config_ArchKindAttr : Config_EnumAttr; + +// +// RevisionID +// + +def Config_RevisionID : + Config_I64EnumAttr< + "RevisionID", + "Revision ID", + [ + I64EnumAttrCase<"REVISION_A0", 0>, + I64EnumAttrCase<"REVISION_A1", 1>, + I64EnumAttrCase<"REVISION_A3", 2>, + I64EnumAttrCase<"REVISION_B", 3>, + I64EnumAttrCase<"REVISION_C", 4>, + I64EnumAttrCase<"REVISION_D", 5>, + I64EnumAttrCase<"REVISION_K", 6>, + I64EnumAttrCase<"REVISION_NONE", 7> + ] + > { +} + +def Config_RevisionIDAttr : Config_EnumAttr; + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops.td index e81d1ee45a..049bad2829 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops.td +++ 
b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops.td @@ -60,7 +60,7 @@ def Config_OptionOp : let arguments = (ins SymbolNameAttr:$sym_name, - AnyAttrOf<[IntAttr, F32Attr, BoolAttr]>:$optionValue + AnyAttrOf<[IntAttr, F32Attr, BoolAttr, StrAttr]>:$optionValue ); let assemblyFormat = [{ diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops_interfaces.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops_interfaces.td new file mode 100644 index 0000000000..751a29a6b1 --- /dev/null +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/config/ops_interfaces.td @@ -0,0 +1,29 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef VPUX_COMPILER_DIALECT_CONFIG_OPS_INTERFACES +#define VPUX_COMPILER_DIALECT_CONFIG_OPS_INTERFACES + +include "mlir/IR/OpBase.td" + +// +// DefinedInArch op trait +// + +class DefinedInArch + : ParamNativeOpTrait<"DefinedInArch", arch> { + let cppNamespace = "vpux::config"; +} + +// +// LimitedToArch op trait +// + +class LimitedToArch archs> + : ParamNativeOpTrait<"LimitedToArch", !interleave(archs, ", ")> { + let cppNamespace = "vpux::config"; +} + +#endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/const/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/const/CMakeLists.txt index b383afd5fe..e5fad9d0ad 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/const/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/const/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(const) +add_vpux_ops(const GENERIC) add_vpux_attr_interface(Const dialect/const/) add_vpux_attribute(Const ENABLE_VPUX_ATTR) add_vpux_pass(Const Const dialect/const/) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/CMakeLists.txt index 8379aadd0f..36eed59aac 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/CMakeLists.txt +++ 
b/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/CMakeLists.txt @@ -4,6 +4,7 @@ # add_vpux_dialect(Core) +add_vpux_ops(Core GENERIC) add_vpux_attr_interface(Core core/) add_vpux_ops_interface(Core core/) add_vpux_type_interface(Core core/) diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/passes.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/passes.td index 4cde527540..e3ec686eb4 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/passes.td +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/core/passes.td @@ -208,7 +208,7 @@ def UnpackNestedModules : PassBase<"unpack-nested-modules", "vpux::ModulePass"> // // AddNetInfoToModule -// +// def AddNetInfoToModule : PassBase<"add-netinfo-to-module", "vpux::ModulePass"> { let summary = "add net::NetInfoOp to a nested module based on function argument and results inside the module"; @@ -217,7 +217,33 @@ def AddNetInfoToModule : PassBase<"add-netinfo-to-module", "vpux::ModulePass"> { This pass adds NetInfo to nested ModuleOp for vpux passes that require the inputInfos and outputInfos information. }]; + let options = [ + Option< + "hasTensorSemantics", "has-tensor-semantics", + "bool", "false", + "Specifies if outputs come as inputs, to accommodate the usage for VPUIP or IE-VPU" + >, + ]; + let constructor = "vpux::Core::createAddNetInfoToModulePass()"; } +def WsFoldReinterpretCastIntoConst : PassBase<"ws-fold-reinterpret-cast-into-const", "vpux::FunctionPass"> { + let summary = "ReinterpretCastOp::fold() substitute"; + + let description = [{ + This pass substitutes folding mechanism for Core.ReinterpretCast. Under + normal conditions (production IR), ReinterpretCast is *never* expected + to have Const.Declare as direct origin. However, in some limited scope + the Const.Declare -> ReinterpretCast chain could appear (e.g. debug-only + monolithic weights separation pipeline). In order to have valid + legalization of ReinterpretCast in this case, this pass exists. 
+ + Warning: the pass does NOT attempt to be optimal in any way. Instead, it + does the bare minimum to legalize IR. + }]; + + let constructor = "vpux::Core::createWsFoldReinterpretCastIntoConstPass()"; +} + #endif diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/net/CMakeLists.txt b/src/vpux_compiler/tblgen/vpux/compiler/dialect/net/CMakeLists.txt index 2d273abcbf..2b6952d442 100644 --- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/net/CMakeLists.txt +++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/net/CMakeLists.txt @@ -4,3 +4,4 @@ # add_vpux_dialect(net) +add_vpux_ops(net GENERIC) diff --git a/src/vpux_driver_compiler/CMakeLists.txt b/src/vpux_driver_compiler/CMakeLists.txt index da4f1e53c8..e1d04fc414 100644 --- a/src/vpux_driver_compiler/CMakeLists.txt +++ b/src/vpux_driver_compiler/CMakeLists.txt @@ -21,12 +21,12 @@ add_subdirectory(test) install( FILES - "${InferenceEngineVPUXPlugin_SOURCE_DIR}/src/vpux_driver_compiler/CHANGES.txt" + "${PROJECT_SOURCE_DIR}/src/vpux_driver_compiler/CHANGES.txt" DESTINATION cid COMPONENT ${CID_COMPONENT}) install( FILES - "${InferenceEngineVPUXPlugin_SOURCE_DIR}/src/vpux_driver_compiler/README.md" + "${PROJECT_SOURCE_DIR}/src/vpux_driver_compiler/README.md" DESTINATION cid COMPONENT ${CID_COMPONENT}) diff --git a/src/vpux_driver_compiler/README.md b/src/vpux_driver_compiler/README.md index 592978c75a..b3863be3c7 100644 --- a/src/vpux_driver_compiler/README.md +++ b/src/vpux_driver_compiler/README.md @@ -1,91 +1,66 @@ # What is Driver Compiler -This guide introduces Driver Compiler for Intel® Neural Processing Unit (NPU) devices. Driver Compiler is a set of C++ libraries providing a common API that allows the User Mode Driver to access compiler functions through vcl* interface methods. The action here is essentially compiling the IR format to the blob format. +This guide introduces Driver Compiler (also known as Compiler in Driver or CiD) for Intel® Neural Processing Unit (NPU) devices. 
Driver Compiler is a set of C++ libraries providing a common API that allows the User Mode Driver to access compiler functions through vcl* interface methods. The action here is essentially compiling the IR format to the blob format. To learn more about Driver Compiler, please see [intel_npu/README.md](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/README.md) in [OpenVINO Project]. ## Components -The main components for Driver Compiler are : +The main components for Driver Compiler are: * [CHANGES.txt](CHANGES.txt) contains the Driver Compiler history of changes. * [docs](./docs/) - documents that describe building and testing the Driver Compiler. * [loader](./src/loader/) - contains cmakefile to build and pack the elf from thirdparty used for some testing purposes. * [vpux_compiler_l0](./src/vpux_compiler_l0/) - contains source files of Driver Compiler. * [test](./test/) - contains test tools. +You can refer to the [API documentation](./docs/api_reference.md) for the usage workflow and detailed API descriptions. -## Basic workflow - -The main entrypoint for Driver Compiler is `vclCompilerCreate`. The basic work flow is as follow: -```C -... -vclCompilerCreate -... -vclCompilerGetProperties -... -/* If you want to query the supported layers of a network, please call following three lines. */ -... -vclQueryNetworkCreate -... -/* vclQueryNetwork should be called twice, first time to retrieve data size, second time to get data. */ -vclQueryNetwork -... -vclQueryNetworkDestroy -... -/* Fill buffer/weights with data read from command line arguments. Will set result blob size. */ -... -vclExecutableCreate -... -vclExecutableGetSeriablizableBlob -... -blobSize > 0 -blob = (uint8_t*)malloc(blobSize) -vclExecutableGetSeriablizableBlob -... -/* If log handle is created with vclCompilerCreate, can call vclLogHandleGetString to get last error message.*/ -... -vclLogHandleGetString -... 
-logSize > 0 -log = (char*)malloc(logSize) -vclLogHandleGetString -... -vclExecutableDestroy -vclCompilerDestroy -... -``` + +## Requirements to build related targets locally + +It is recommended to read the requirements section beforehand to ensure that you meet the prerequisites for local building. +- [Linux](./docs/requirements.md#linux-requirements) +- [Windows](./docs/requirements.md#windows-requirements) + +Here are the steps to clone repositories and **configure environment variables** before building. +- [Pre-build preparation](./docs/prebuild.md#pre-build-preparation) ## How to build related targets locally -Driver Compiler provides npu_driver_compiler, compilerTest, profilingTest and loaderTest to compile network and test. To build Driver Compiler related targets locally, refer to +Driver Compiler provides `npu_driver_compiler`, `compilerTest`, `profilingTest` and `loaderTest` to compile network and test. To build Driver Compiler-related targets locally, please refer to -- (Recommended) build using CMake Presets, requiring CMake version 3.19 or higher. - - [linux](./docs/how_to_build_driver_compiler_withCmakePresets_on_linux.md) - - [windows](./docs/how_to_build_driver_compiler_withCmakePresets_on_windows.md) +- (Recommended) Build using CMake Presets, requiring CMake version 3.19 or higher. 
+ - [Linux](./docs/build/build_with_cmake_presets_linux.md) + - [Windows](./docs/build/build_with_cmake_presets_windows.md) -- build with cmake options - - [linux](./docs/how_to_build_driver_compiler_on_linux.md) - - [windows](./docs/how_to_build_driver_compiler_on_windows.md) +- Build with cmake options + - [Linux](./docs/build/build_linux.md) + - [Windows](./docs/build/build_windows.md) +- (Advanced) Build with LLVM Cache + - [Linux](./docs/build/build_with_llvm_cache_linux.md) + - [Windows](./docs/build/build_with_llvm_cache_windows.md) +## How to enable sideloading + +Please refer to [how to sideload the Driver Compiler](./docs/test_and_debug/enable_sideloading.md). -## How to release Driver Compiler package -Unlike local build of Driver Compiler related targets, we need to apply some patches to [OpenVINO Project] and [NPU-Plugin Project] and also pack the elf, pdb and tbb files together to meet driver requirements. ## How to test -Please refer to [How to test](./docs/how_to_test.md). +Please refer to [how to test](./docs/test_and_debug/test.md). -Please note that the instructions for testing the deprecated version [How to test with deprecated version](./docs/how_to_test_with_deprecated_version.md) will be removed in the future. +Please refer to [how to test with legacy methods](./docs/test_and_debug/legacy_test.md). ## How to debug -Please refer to [How to debug](./docs/how_to_debug.md). +Please refer to [how to debug](./docs/test_and_debug/debug.md). + +Please refer to [how to debug with legacy methods](./docs/test_and_debug/legacy_debug.md). -Please note that the instructions for [How to debug with deprecated version](./docs/how_to_debug_with_deprecated_version.md) will be removed in the future. 
[OpenVINO Project]: https://github.com/openvinotoolkit/openvino [NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/FAQ.md b/src/vpux_driver_compiler/docs/FAQ.md new file mode 100644 index 0000000000..78e10ad504 --- /dev/null +++ b/src/vpux_driver_compiler/docs/FAQ.md @@ -0,0 +1,218 @@ +# Driver Compiler Build FAQ & Troubleshooting Guide +## Corresponding Commits Required for OpenVINO and NPU-Plugin + +When building the Driver Compiler for Linux, ensure you use a supported OpenVINO version. + +You can find the required OpenVINO version in the [release notes](https://github.com/intel/linux-npu-driver/releases) under "OpenVINO built from source" or in the [Linux NPU driver source](https://github.com/intel/linux-npu-driver/blob/main/compiler/compiler_source.cmake#L20). + +### Common Clone Failure Issues + +If you encounter errors like the following when cloning repositories, you may need to configure your proxy: + +```sh +fatal: unable to access 'https://github.com/your-repo.git/': Could not resolve host: github.com +fatal: unable to access 'https://github.com/your-repo.git/': Failed to connect to github.com port 443: Connection timed out +fatal: unable to access 'https://github.com/your-repo.git/': Received HTTP code 407 from proxy after CONNECT +``` + +
+Set up proxy + +```sh +# Linux +export http_proxy= +export https_proxy= +export no_proxy= + +# Windows +set http_proxy= +set https_proxy= +set no_proxy= +``` +
+ + +## Common Build and Install Issues + +For installation, it is **not recommended** to use `cmake --install . --prefix /usr --component CiD` as this will also install elf, compilerTest, and other CiD targets which are unnecessary. + +### Linux Build Issues + +- **Error:** `c++: internal compiler error: Killed (program cc1plus)` + **Cause:** Usually due to insufficient memory. + **Solution:** Reduce the number of parallel build jobs (e.g., use `-j4` instead of `-j8`), or increase swap space. + +### Windows Build Issues + +1. **Clone error: `filename too long`** + Run: `git config --global core.longpaths true` and enable the long path feature on Windows as follows or this [image](./imgs/long_path_enable.png): + Open the Registry Editor, go to `HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem`, and set the DWORD value `LongPathsEnabled` to `1`. +2. **MT/MD mismatch error:** + If your commit is earlier than `d0719b79c5847` (2024-08-28), do **not** add `-D CMAKE_TOOLCHAIN_FILE=%OPENVINO_HOME%\cmake\toolchains\onecoreuap.toolchain.cmake`. +3. **Ccache is required for Windows build preset:** + If ccache is not installed, you may see `CreateProcess failed: The system cannot find the file specified during the cmake --build ... step.` + + +## TBB-Related Questions +### Choosing the Right TBB (Linux & Windows) +- **OpenVINO auto-download:** OpenVINO will automatically download a prebuilt oneTBB (see `${OPENVINO_HOME}/temp/tbb`), if + 1. `ENABLE_SYSTEM_TBB` is set to `OFF`, **and** + 2. CMake option `-D TBBROOT=/path/to/alternative/oneTBB` is **not passed**, **and** + 3. Environment variable `TBBROOT` is **not set**. +- **System oneTBB:** To use the system TBB, set `-D ENABLE_SYSTEM_TBB=ON` for CMake. +- **Custom oneTBB:** Download or build your own oneTBB from the [oneTBB Project](https://github.com/oneapi-src/oneTBB) and set `TBBROOT` environment variable. 
+ +>Note: If you plan to use the sideloading feature on Windows, we recommend using the OneCore version of TBB. The official release version of TBB does not support OneCore, so please manually compile it according to the documentation. For instructions on how to build OneCore TBB, please see [how to build OneCore TBB](#windows-onecore-tbb-build). +- **No TBB:** Not recommended, as build speed will decrease. You can set `-D THREADING=SEQ` for CMake to disable TBB. + +#### Windows OneCore TBB Build +- Build OneCore oneTBB by yourself. You must build hwloc first, then build oneTBB. See the detailed steps below. + +
+ 1. Clone repository and build hwloc + + The `hwloc` library is a dependency of the `tbbbind_2_5` binary, so its version must be determined before proceeding with the build. The correct version can be confirmed by dumping the `tbbbind_2_5_debug.lib` binary. The corresponding `hwloc` version is `2.8.0` for oneTBB library `v2021.2.5`. + + Download the ZIP folder from [here](https://github.com/open-mpi/hwloc/archive/refs/tags/hwloc-2.8.0.zip), unzip, build the library or follow the commands below. Next, clone oneTBB and build oneTBB with MultiThreaded, + + ```bat + set WORKDIR=%cd% + + curl -L -o hwloc-2.8.0.zip https://github.com/open-mpi/hwloc/archive/refs/tags/hwloc-2.8.0.zip + tar -xf hwloc-2.8.0.zip + cd hwloc-hwloc-2.8.0\contrib\windows-cmake + cmake -A X64 --install-prefix=%cd%\install -DHWLOC_SKIP_TOOLS=ON -DHWLOC_WITH_LIBXML2=OFF -DBUILD_SHARED_LIBS=ON -D CMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded" -B build + cmake --build build --parallel --config Release + cmake --install build --config Release + + set HWLOC_INSTALL_DIR=%WORKDIR%\hwloc-hwloc-2.8.0\contrib\windows-cmake\install + + cd %WORKDIR% + git clone https://github.com/uxlfoundation/oneTBB.git oneTBB-2021.2.5-static + cd oneTBB-2021.2.5-static + git checkout v2021.2.5 + @REM Use another directory for dynamic build + cd .. + xcopy oneTBB-2021.2.5-static oneTBB-2021.2.5-dynamic /E /H /I /Q + + @REM set command build option environment variable. + @REM CMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded" build with MT + set buildOption=-D CMAKE_INSTALL_PREFIX=%cd%\install ^ + -D CMAKE_HWLOC_2_5_LIBRARY_PATH=%HWLOC_INSTALL_DIR%\lib\hwloc.lib ^ + -D CMAKE_HWLOC_2_5_INCLUDE_PATH=%HWLOC_INSTALL_DIR%\include ^ + -D CMAKE_HWLOC_2_5_DLL_PATH=%HWLOC_INSTALL_DIR%\bin\hwloc.dll ^ + -D TBB_TEST=OFF ^ + -D CMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded" + ``` +
+ +
+ 2. Static oneTBB build + + ```bat + set TBB_HOME=%WORKDIR%\oneTBB-2021.2.5-static + cd %TBB_HOME% + + mkdir build_static + cd build_static + + cmake ^ + -G "Visual Studio 16 2019" -A X64 ^ + -D BUILD_SHARED_LIBS=OFF ^ + %buildOption% ^ + .. + + cmake --build . --config Release + cmake --install . --config Release + cmake --build . --config Debug + cmake --install . --config Debug + + copy ..\LICENSE.txt install\LICENSE + copy %HWLOC_INSTALL_DIR%\bin\hwloc.dll install\lib + ``` + All static oneTBB `.lib` libraries will be found in the `build_static\install` folder. +
+ +
+ 3. Dynamic oneTBB build + + ```bat + set TBB_HOME=%WORKDIR%\oneTBB-2021.2.5-dynamic + + cd %TBB_HOME% + mkdir build_dynamic + cd build_dynamic + + cmake ^ + -G "Visual Studio 16 2019" -A X64 ^ + -D BUILD_SHARED_LIBS=ON ^ + %buildOption% ^ + .. + + cmake --build . --config Release + cmake --install . --config Release + cmake --build . --config Debug + cmake --install . --config Debug + + copy ..\LICENSE.txt install\LICENSE + copy %HWLOC_INSTALL_DIR%\bin\hwloc.dll install\bin + ``` + All dynamic oneTBB `.lib` and `.dll` libraries will be found in the `build_dynamic\install` folder. +
+ +#### TBB Settings for CMake Presets + +- You can set `TBBROOT` via environment variable or in the preset's `cacheVariables`. + +
+ Add TBBROOT option to Cmake Preset + + Adding the path to `TBBROOT` in `cacheVariables`, [under `cid` preset](../../../CMakePresets.json#L223), as shown below: + + ```json + "name": "cid", + "cacheVariables": { + "TBBROOT": { + "type": "FILEPATH", + "value": "/path/to/alternative/oneTBB" + } + } + ``` + Linux Cmake Preset usage is listed [here](./build_with_cmake_presets_linux.md). + Windows Cmake Preset usage is listed [here](./build_with_cmake_presets_windows.md). +
+ +- To disable TBB, set `THREADING=SEQ`. +
+ Disable oneTBB commands + + To build Driver Compiler without using oneTBB library (longer model compile time), replace `"value": "TBB"` with `"value": "SEQ"` for `THREADING` in `cacheVariables` under `cid` ([see here](../../../CMakePresets.json#L228)). + + ```json + "name": "cid", + "cacheVariables": { + "THREADING": { + "type": "STRING", + "value": "SEQ" + }, + }, + ``` + Refer to [this document](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/cmake_options_for_custom_compilation.md#options-affecting-binary-size) for information related to SEQ threading. +
+ +#### TBB Usage with LLVM Cache + +- Any TBB version can be used when generating the LLVM cache, but when building with the cache, the parameters must match those in `build_manifest.txt`. +- Please note that when using sideloading on Windows, you must use the OneCore version of oneTBB. + + +## Additional Notes + +- For detailed CMake options, refer to the [CMake documentation](https://cmake.org/cmake/help/latest/index.html) and the respective `features.cmake` files in [OpenVINO](https://github.com/openvinotoolkit/openvino/blob/master/cmake/features.cmake) and [NPU-Plugin](https://github.com/openvinotoolkit/npu_compiler/blob/develop/cmake/features.cmake) repositories. + + +If you need further clarification or have additional questions, please refer to the main documentation in this repo or contact the development team. + + +[OpenVINO Project]: https://github.com/openvinotoolkit/openvino +[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/api_reference.md b/src/vpux_driver_compiler/docs/api_reference.md new file mode 100644 index 0000000000..c2b3ff6ebb --- /dev/null +++ b/src/vpux_driver_compiler/docs/api_reference.md @@ -0,0 +1,908 @@ +# Intel® NPU Driver Compiler API Guide + +This guide provides a comprehensive introduction to the NPU Driver Compiler API covering both current and deprecated functions, along with usage patterns to help you get started quickly. 
+ + +## Table of Contents + +- [API Overview](#1-api-overview) +- [Main Data Structures](#2-main-data-structures) +- [Basic Workflow](#3-basic-workflow) + - [`vclExecutableCreate` Full Procedure](#31-vclexecutablecreate-full-procedure) + - [`vclAllocatedExecutableCreate2` Full Procedure](#32-vclallocatedexecutablecreate2-full-procedure) + - [`vclAllocatedExecutableCreate` Full Procedure (Deprecated)](#33-vclallocatedexecutablecreate-full-procedure-deprecated) + - [Network Querying Workflow](#34-network-querying-workflow) + - [Error Handling Workflow](#35-error-handling-workflow) +- [Detailed API Reference](#4-detailed-api-reference) + - [Version and Properties Information](#41-version-and-properties-information) + - [Compiler Lifecycle](#42-compiler-lifecycle) + - [Network Capability Query](#43-network-capability-query) + - [Executable Creation and Management](#44-executable-creation-and-management) + - [Profiling Operations](#45-profiling-operations) + - [Logging Functions](#46-logging-functions) + - [Configuration Management](#47-configuration-management) +- [Frequently Asked Questions](#frequently-asked-questions) + + +## 1. API Overview + +The NPU Driver Compiler API provides interfaces for compiling neural network models to Intel® Neural Processing Unit (NPU) devices. It consists of functions for Driver Compiler creation and management, network compilation, profiling, error logging and configuration management. + + +## 2. 
Main Data Structures + +### Handle Types + +- `vcl_compiler_handle_t` — Compiler object handle +- `vcl_executable_handle_t` — Executable object handle +- `vcl_profiling_handle_t` — Profiling object handle +- `vcl_query_handle_t` — Query network object handle +- `vcl_log_handle_t` — Log object handle + +### Structs + +- `vcl_version_info_t` — Version information +- `vcl_compiler_properties_t` — Compiler properties +- `vcl_profiling_properties_t` — Profiling properties +- `vcl_device_desc_t` — Device description +- `vcl_compiler_desc_t` — Compiler description +- `vcl_executable_desc_t` — Executable description +- `vcl_query_desc_t` — Query description +- `vcl_profiling_input_t`, `*p_vcl_profiling_input_t` — Profiling input +- `vcl_profiling_output_t`, `*p_vcl_profiling_output_t` — Profiling output +- `vcl_allocator_t`: Allocator V1 (deprecated) +- `vcl_allocator2_t`: Allocator V2 (recommended) + +### VCL API Return Types + +All vcl API functions return a `vcl_result_t` status code: +| Value | Description | +|--------------------------------------|-----------------------------------| +| VCL_RESULT_SUCCESS | Success | +| VCL_RESULT_ERROR_OUT_OF_MEMORY | Insufficient memory | +| VCL_RESULT_ERROR_UNSUPPORTED_FEATURE | Unsupported feature | +| VCL_RESULT_ERROR_INVALID_ARGUMENT | Invalid argument | +| VCL_RESULT_ERROR_INVALID_NULL_HANDLE | Invalid handle | +| VCL_RESULT_ERROR_IO | IO error | +| VCL_RESULT_ERROR_INVALID_IR | Invalid IR | +| VCL_RESULT_ERROR_UNKNOWN | Unknown/internal error | + + +## 3. Basic Workflow + +A typical workflow consists of the following steps: + +1. **Get API version** using `vclGetVersion()` +2. **Create a compiler instance** using `vclCompilerCreate()` +3. 
**Perform network operations**: + - (Optional) Query network capabilities with `vclQueryNetworkCreate()` and `vclQueryNetwork()` + - Compile network: + - Compile network with Executable: + - Create executable with `vclExecutableCreate()` + - Get compiled blob with `vclExecutableGetSerializableBlob()` + - Compile network with AllocatedExecutable: + - Create executable with `vclAllocatedExecutableCreate2()` or `vclAllocatedExecutableCreate() (Deprecated)` +4. **Profile execution** (Optional) using the profiling functions +5. **Handle errors** by retrieving logs with `vclLogHandleGetString()` +6. **Clean up resources** by destroying handles with the appropriate destroy functions + +>Note: All objects created by `vclCompilerCreate`, `vclExecutableCreate`, and similar functions must be destroyed by their respective `Destroy` functions to avoid memory leaks. + +### 3.1 `vclExecutableCreate` Full Procedure + +```c +// 1. Create compiler and device description and instantiate compiler +vcl_compiler_desc_t compilerDesc = { ... }; +vcl_device_desc_t deviceDesc = { ... }; +vcl_compiler_handle_t compiler; +vcl_log_handle_t log; +vclCompilerCreate(&compilerDesc, &deviceDesc, &compiler, &log); + +// 2. Prepare model IR (e.g. xml+weights) and optional parameters +vcl_executable_desc_t execDesc = { + .modelIRData = ..., // pointer to IR data + .modelIRSize = ..., // IR data size + .options = ..., // compiler options (optional) + .optionsSize = ..., // options size +}; + +// 3. Create executable object +vcl_executable_handle_t exec; +vcl_result_t ret = vclExecutableCreate(compiler, execDesc, &exec); + +// 4. Export blob (if needed) +uint64_t blobSize; +vclExecutableGetSerializableBlob(exec, NULL, &blobSize); +uint8_t* blob = malloc(blobSize); +vclExecutableGetSerializableBlob(exec, blob, &blobSize); +// After use, free(blob); + +// 5. Destroy executable object +vclExecutableDestroy(exec); + +// 6. 
Destroy compiler object +vclCompilerDestroy(compiler); +``` +>Note: For the configuration of the `options` field, please refer to the detailed content of [`vclAllocatedExecutableCreate2` API](#vclallocatedexecutablecreate2-recommended). + +### 3.2 `vclAllocatedExecutableCreate2` Full Procedure + +```c +// 1. Create compiler and device description and instantiate compiler +vcl_compiler_desc_t compilerDesc = { ... }; +vcl_device_desc_t deviceDesc = { ... }; +vcl_compiler_handle_t compiler; +vcl_log_handle_t log; +vclCompilerCreate(&compilerDesc, &deviceDesc, &compiler, &log); + +// 2. Prepare model IR (e.g. xml+weights) and optional parameters +vcl_executable_desc_t execDesc = { + .modelIRData = ..., // pointer to IR data + .modelIRSize = ..., // IR data size + .options = ..., // compiler options (optional) + .optionsSize = ..., // options size +}; + +// 3. Create allocate and deallocate functions +uint8_t* my_allocate(vcl_allocator2_t* self, uint64_t size) { return (uint8_t*)malloc(size); } +void my_deallocate(vcl_allocator2_t* self, uint8_t* ptr) { free(ptr); } +vcl_allocator2_t allocator2 = { my_allocate, my_deallocate }; + +// 4. Compile and export blob +uint8_t* blob = NULL; +uint64_t blobSize = 0; +vclAllocatedExecutableCreate2(compiler, execDesc, &allocator2, &blob, &blobSize); + +// 5. Use the allocated blob ... + +// 6. Free blob +allocator2.deallocate(&allocator2, blob); + +// 7. Destroy compiler object +vclCompilerDestroy(compiler); +``` +>Note: For the configuration of the `options` field, please refer to the detailed content of [`vclAllocatedExecutableCreate2` API](#vclallocatedexecutablecreate2-recommended). + +### 3.3 `vclAllocatedExecutableCreate` Full Procedure (Deprecated) + +```c +// 1. Create compiler and device description and instantiate compiler +vcl_compiler_desc_t compilerDesc = { ... }; +vcl_device_desc_t deviceDesc = { ... 
}; +vcl_compiler_handle_t compiler; +vcl_log_handle_t log; +vclCompilerCreate(&compilerDesc, &deviceDesc, &compiler, &log); + +// 2. Prepare model IR (e.g. xml+weights) and optional parameters +vcl_executable_desc_t execDesc = { + .modelIRData = ..., // pointer to IR data + .modelIRSize = ..., // IR data size + .options = ..., // compiler options (optional) + .optionsSize = ..., // options size +}; + +// 3. Create allocate and deallocate functions +uint8_t* my_allocate(uint64_t size) { return (uint8_t*)malloc(size); } +void my_deallocate(uint8_t* ptr) { free(ptr); } +vcl_allocator_t allocator = { my_allocate, my_deallocate }; + +// 4. Compile, export and free blob (if needed) +uint8_t* blob = NULL; +uint64_t blobSize = 0; +vclAllocatedExecutableCreate(compiler, execDesc, &allocator, &blob, &blobSize); + +// 5. Use the allocated blob ... + +// 6. Free blob +allocator.deallocate(blob); + +// 7. Destroy compiler object +vclCompilerDestroy(compiler); +``` +>Note: For the configuration of the `options` field, please refer to the detailed content of [`vclAllocatedExecutableCreate2` API](#vclallocatedexecutablecreate2-recommended). + +### 3.4 Network Querying Workflow + +```c +// 1. Create query +vcl_query_desc_t queryDesc = { /* initialize with IR data */ }; +vcl_query_handle_t query; +vclQueryNetworkCreate(compiler, queryDesc, &query); + +// 2. Get query result +uint64_t querySize = 0; +vclQueryNetwork(query, NULL, &querySize); +uint8_t* queryBuffer = (uint8_t*)malloc(querySize); +vclQueryNetwork(query, queryBuffer, &querySize); + +// 3. Process query result - the format depends on implementation ... + +// 4. Clean up resources +free(queryBuffer); +vclQueryNetworkDestroy(query); +``` + +### 3.5 Error Handling Workflow + +```c +vcl_result_t result = vclSomeFunction(/* params */); +if (result != VCL_RESULT_SUCCESS) { + // 1. 
Get log message if we have a log handle + if (logHandle != NULL) { + // Get size of log message + size_t logSize = 0; + vclLogHandleGetString(logHandle, &logSize, NULL); + + // Get content of log message + char* logBuffer = (char*)malloc(logSize); + vclLogHandleGetString(logHandle, &logSize, logBuffer); + + fprintf(stderr, "Error: %s\n", logBuffer); + free(logBuffer); + } + + // 2. Handle the error based on error code + switch (result) { + case VCL_RESULT_ERROR_OUT_OF_MEMORY: + fprintf(stderr, "Out of memory\n"); + break; + case VCL_RESULT_ERROR_INVALID_ARGUMENT: + fprintf(stderr, "Invalid argument\n"); + break; + // ... other error cases ... + default: + fprintf(stderr, "Unknown error: 0x%x\n", result); + break; + } +} +``` + +## 4. Detailed API Reference + +### 4.1 Version and Properties Information + +#### vclGetVersion + +**Function**: +```c +vcl_result_t vclGetVersion(vcl_version_info_t* compilerVersion, vcl_version_info_t* profilingVersion); +``` + +**Purpose**: Retrieves the VCL API version. + +**Parameters**: +| Parameter | Type | Direction | Description | +|------------------|---------------------|-----------|---------------------------------------| +| compilerVersion | vcl_version_info_t* | [out] | Returns the vcl API version | +| profilingVersion | vcl_version_info_t* | [out] | Returns the vcl API profiling version | + + +**Usage Example**: +```c +vcl_version_info_t compilerVersion, profilingVersion; +vcl_result_t result = vclGetVersion(&compilerVersion, &profilingVersion); +if (result == VCL_RESULT_SUCCESS) { + printf("Compiler version: %d.%d\n", compilerVersion.major, compilerVersion.minor); + printf("Profiling version: %d.%d\n", profilingVersion.major, profilingVersion.minor); +} +``` + +#### `vclCompilerGetProperties` + +**Function**: +```c +vcl_result_t vclCompilerGetProperties(vcl_compiler_handle_t compiler, vcl_compiler_properties_t* properties); +``` + +**Purpose**: Retrieves the MLIR compiler version. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|------------|----------------------------|-----------|-----------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| properties | vcl_compiler_properties_t* | [out] | Returns the MLIR properties | + +**Usage Example**: +```c +vcl_compiler_properties_t properties; +result = vclCompilerGetProperties(compiler, &properties); +if (result == VCL_RESULT_SUCCESS) { + printf("Compiler ID: %s\n", properties.id); + printf("Supported opsets: 0x%x\n", properties.supportedOpsets); +} +``` + +#### `vclProfilingGetProperties` + +**Function**: +```c +vcl_result_t vclProfilingGetProperties(vcl_profiling_handle_t profilingHandle, vcl_profiling_properties_t* properties); +``` + +**Purpose**: Retrieves properties of the profiling module. + +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------------|-----------------------------|-----------|------------------------| +| profilingHandle | vcl_profiling_handle_t | [in] | The profiling handle | +| properties | vcl_profiling_properties_t* | [out] | Returns the properties | + +**Usage Example**: +```c +vcl_profiling_properties_t profProps; +result = vclProfilingGetProperties(profHandle, &profProps); +if (result == VCL_RESULT_SUCCESS) { + printf("Profiling version: %d.%d\n", + profProps.version.major, + profProps.version.minor); +} +``` + +### 4.2 Compiler Lifecycle + +#### `vclCompilerCreate` + +**Function**: +```c +vcl_result_t vclCompilerCreate(vcl_compiler_desc_t* compilerDesc, vcl_device_desc_t* deviceDesc, vcl_compiler_handle_t* compiler, vcl_log_handle_t* logHandle); +``` + +**Purpose**: Creates a compiler instance for a specific device. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|--------------|------------------------|-----------|--------------------------------| +| compilerDesc | vcl_compiler_desc_t* | [in] | Pointer to compiler descriptor | +| deviceDesc | vcl_device_desc_t* | [in] | Pointer to device descriptor | +| compiler | vcl_compiler_handle_t* | [out] | Returns the compiler handle | +| logHandle | vcl_log_handle_t* | [out] | Returns the log handle | + +**Usage Example**: +```c +vcl_compiler_desc_t compilerDesc = { + .version = {7, 4}, // API version + .debugLevel = VCL_LOG_INFO // Debug level +}; + +vcl_device_desc_t deviceDesc = { + .size = sizeof(vcl_device_desc_t), + .deviceID = 0x1234, // PCI Device ID in lower 16 bits + .revision = 0, // NPU Revision (0 for first stepping) + .tileCount = 1 // Number of slices/tiles +}; + +vcl_compiler_handle_t compiler; +vcl_log_handle_t logHandle; +vcl_result_t result = vclCompilerCreate(&compilerDesc, &deviceDesc, &compiler, &logHandle); +``` + +`vcl_log_level_t` type for `debugLevel` field of `vcl_compiler_desc_t` struct: +| Value | Description | +|-----------------|------------------------| +| VCL_LOG_NONE | Logging disabled | +| VCL_LOG_ERROR | Error events | +| VCL_LOG_WARNING | Warning events | +| VCL_LOG_INFO | Informational messages | +| VCL_LOG_DEBUG | Debug messages | +| VCL_LOG_TRACE | Trace-level messages | + +#### `vclCompilerDestroy` + +**Function**: +```c +vcl_result_t vclCompilerDestroy(vcl_compiler_handle_t compiler); +``` + +**Purpose**: Releases all resources associated with a compiler instance. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|--------------|-----------------------|-----------|-----------------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | Handle to the compiler object to be destroyed | + +**Usage Example**: +```c +result = vclCompilerDestroy(compiler); +``` + +### 4.3 Network Capability Query + +#### `vclQueryNetworkCreate` + +**Function**: +```c +vcl_result_t vclQueryNetworkCreate(vcl_compiler_handle_t compiler, vcl_query_desc_t desc, vcl_query_handle_t* query); +``` + +**Purpose**: Creates a query to check what operations in a network can be executed on the NPU. + +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------|-----------------------|-----------|-------------------------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| desc | vcl_query_desc_t | [in] | Query description including model IR data and options | +| query | vcl_query_handle_t* | [out] | Returns the query handle | + +**Usage Example**: +```c +vcl_query_desc_t queryDesc = { + .modelIRData = irData, // IR model data buffer + .modelIRSize = irSize, // Size of the IR data + .options = options, // Compiler options string + .optionsSize = optionsLength // Length of options string +}; + +vcl_query_handle_t query; +result = vclQueryNetworkCreate(compiler, queryDesc, &query); +``` + +#### `vclQueryNetwork` + +**Function**: +```c +vcl_result_t vclQueryNetwork(vcl_query_handle_t query, uint8_t* queryResult, uint64_t* size); +``` + +**Purpose**: Retrieves the result of a network query, showing what operations are supported. 
+
+**Parameters**:
+| Parameter   | Type               | Direction | Description                             |
+|-------------|--------------------|-----------|-----------------------------------------|
+| query       | vcl_query_handle_t | [in]      | The query handle                        |
+| queryResult | uint8_t*           | [out]     | Buffer to receive the query result data |
+| size        | uint64_t*          | [in,out]  | Pointer to size variable                |
+
+**Usage Example**:
+```c
+// First call: get the required buffer size
+uint64_t querySize = 0;
+result = vclQueryNetwork(query, NULL, &querySize);
+
+// Allocate buffer
+uint8_t* queryBuffer = (uint8_t*)malloc(querySize);
+
+// Second call: get the actual data
+result = vclQueryNetwork(query, queryBuffer, &querySize);
+
+// Process the query results ...
+
+free(queryBuffer);
+```
+
+#### `vclQueryNetworkDestroy`
+
+**Function**:
+```c
+vcl_result_t vclQueryNetworkDestroy(vcl_query_handle_t query);
+```
+
+**Purpose**: Destroys a query handle and releases associated resources.
+
+**Parameters**:
+| Parameter    | Type               | Direction | Description                                |
+|--------------|--------------------|-----------|--------------------------------------------|
+| query        | vcl_query_handle_t | [in]      | Handle to the query object to be destroyed |
+
+**Usage Example**:
+```c
+result = vclQueryNetworkDestroy(query);
+```
+
+### 4.4 Executable Creation and Management
+
+#### `vclExecutableCreate`
+
+**Function**:
+```c
+vcl_result_t vclExecutableCreate(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc, vcl_executable_handle_t* executable);
+```
+
+**Purpose**: Create an executable object. Compiles IR (such as OpenVINO IR xml and weights) into a NPU-executable blob, managed by internal cache. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|------------|--------------------------|-----------|------------------------------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| desc | vcl_executable_desc_t | [in] | Executable description including model IR data and options | +| executable | vcl_executable_handle_t* | [out] | Returns the executable handle | + +**Usage Example**: +```c +// Prepare IR Data: Arrange model IR (such as xml and weights) in memory ... + +// Create Description: + vcl_executable_desc_t execDesc = { + .modelIRData = ..., // Pointer to IR data buffer + .modelIRSize = ..., // IR data size + .options = ..., // Optional compiler options (NULL if not needed) + .optionsSize = ..., // Options size + }; + +// Call the API: + vcl_executable_handle_t exec; + vcl_result_t ret = vclExecutableCreate(compiler, execDesc, &exec); + if (ret != VCL_RESULT_SUCCESS) { + // Error handling, e.g. use log API + } +``` + +#### `vclExecutableGetSerializableBlob` + +**Function**: +```c +vcl_result_t vclExecutableGetSerializableBlob(vcl_executable_handle_t executable, uint8_t* blobBuffer, uint64_t* blobSize); +``` + +**Purpose**: Retrieves the compiled blob from an executable. 
+
+**Parameters**:
+| Parameter  | Type                    | Direction | Description                     |
+|------------|-------------------------|-----------|---------------------------------|
+| executable | vcl_executable_handle_t | [in]      | The executable handle           |
+| blobBuffer | uint8_t*                | [out]     | Buffer to receive the blob data |
+| blobSize   | uint64_t*               | [in,out]  | Pointer to size variable        |
+
+**Usage Example**:
+```c
+// First call: get the required buffer size
+uint64_t blobSize = 0;
+result = vclExecutableGetSerializableBlob(executable, NULL, &blobSize);
+
+// Allocate buffer
+uint8_t* blobBuffer = (uint8_t*)malloc(blobSize);
+
+// Second call: get the actual blob data
+result = vclExecutableGetSerializableBlob(executable, blobBuffer, &blobSize);
+
+// Process the blob ...
+
+// Free the blob buffer
+free(blobBuffer);
+```
+
+#### `vclExecutableDestroy`
+
+**Function**:
+```c
+vcl_result_t vclExecutableDestroy(vcl_executable_handle_t executable);
+```
+
+**Purpose**: Destroys an executable and releases associated resources.
+
+**Parameters**:
+| Parameter    | Type                    | Direction | Description                                     |
+|--------------|-------------------------|-----------|-------------------------------------------------|
+| executable   | vcl_executable_handle_t | [in]      | Handle to the executable object to be destroyed |
+
+**Usage Example**:
+```c
+result = vclExecutableDestroy(executable);
+```
+
+#### `vclAllocatedExecutableCreate2` (Recommended)
+
+**Function**:
+```c
+vcl_result_t vclAllocatedExecutableCreate2(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc, vcl_allocator2_t* allocator, uint8_t** blobBuffer, uint64_t* blobSize);
+```
+
+**Purpose**: Creates an executable with a custom allocator for the blob buffer. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|------------|-----------------------|-----------|------------------------------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| desc | vcl_executable_desc_t | [in] | Executable description including model IR data and options | +| allocator | vcl_allocator2_t* | [in] | Custom memory allocator and deallocator functions | +| blobBuffer | uint8_t** | [out] | Pointer to receive the blob buffer pointer | +| blobSize | uint64_t* | [out] | Pointer to receive the blob size | + +**Usage Example**: +```c +// Prepare IR Data: Arrange model IR (such as xml and weights) in memory ... + +// Create Description + vcl_executable_desc_t execDesc = { + .modelIRData = ..., // Pointer to IR data buffer + .modelIRSize = ..., // IR data size + .options = ..., // Optional compiler options (NULL if not needed) + .optionsSize = ..., // Options size + }; + +// Custom allocator and deallocator implementation +uint8_t* customAllocate(vcl_allocator2_t* allocator, uint64_t size) { + return (uint8_t*)malloc(size); +} + +void customDeallocate(vcl_allocator2_t* allocator, uint8_t* ptr) { + free(ptr); +} + +vcl_allocator2_t allocator = { + .allocate = customAllocate, + .deallocate = customDeallocate +}; + +// Call the API: +uint8_t* blobBuffer; +uint64_t blobSize; +result = vclAllocatedExecutableCreate2( + compiler, execDesc, &allocator, &blobBuffer, &blobSize +); + +// Use the allocated blob ... + +// Free the buffer using the custom deallocate function +allocator.deallocate(&allocator, blobBuffer); +``` +
+
+How to obtain the `options` field for `desc` struct
+
+>Note: The `options` field is only valid for IR V10 models and is used to update their layout and precision.
+
+**Options Format**: `"--inputs_precisions=\"input_node_name:fp16\" --inputs_layouts=\"input_node_name:NCHW\" --outputs_precisions=\"output_node_name:fp16\" --outputs_layouts=\"output_node_name:NC\" --config NPU_PLATFORM=\"4000\" [OTHER_OPTIONS]"`
+
+The `options` field corresponds to the configuration of the legacy usage of [compilerTest](./test_and_debug/legacy_test.md). If you pass a config file to compilerTest, the `options` field should match the content of your config file. Alternatively, you can also construct the `options` string directly. To generate the `options` field for the `desc` struct, proceed as follows:
+ - Use the XML file of the IR model to get the `options` content (It is recommended to use this method to obtain the input configuration. For output node names, see the next section for a more convenient approach):
+   - Find the input layer with type `Parameter`, e.g., a `<layer ... type="Parameter">` entry. Take its `name` attribute as the input node name.
+   - Find the output layer with type `Result`, e.g., a `<layer ... type="Result">` entry, then locate its preceding node to determine the output node name. The reason for using the name of the node preceding the `Result` node in the config file is based on [the code](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp#L398). The final string format of the `options` field should be `"--inputs_precisions=\"input:fp16\" --inputs_layouts=\"input:C\" --outputs_precisions=\"output:fp16\" --outputs_layouts=\"output:C\" --config NPU_PLATFORM=\"4000\""`. 
+   - For multiple inputs or outputs, separate each entry with a space, e.g., `--inputs_precisions=\"input1:fp16 input2:u8\" --inputs_layouts=\"input1:C input2:C\" --outputs_precisions=\"output1:fp16 output2:fp32\" --outputs_layouts=\"output1:C output2:C\" --config NPU_PLATFORM=\"4000\"`
+   - For config-related content, you can refer to this [README.md](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/README.md) to see the supported properties.
+
+ - Use a visualization tool (such as [Netron](https://netron.app/)) to easily find the output node names:
+   - For example, using Netron:
+     - Enter the URL, click `Open Model...`, then select the model you want to use and click `open`. Wait for the network visualization to load.
+     - Press `Ctrl + F` to search for 'result' nodes. Click on these nodes, and the corresponding node will be displayed directly. Click the node preceding the result node to view its attributes in the sidebar. Find the `name` attribute, which is the name of the output node to use in the config.
+
+ - Use `benchmark_app` to obtain the `options` field. If you have the `benchmark_app` tool from OpenVINO Project, you can also use this method to generate the content of the `options`. Please refer to the [legacy test method](./test_and_debug/legacy_test.md).
+
+ +#### `vclAllocatedExecutableCreate` (Deprecated) + +**Deprecated In Favor Of**: `vclAllocatedExecutableCreate2` +**Function**: +```c +vcl_result_t vclAllocatedExecutableCreate(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc, vcl_allocator_t const* allocator, uint8_t** blobBuffer, uint64_t* blobSize); +``` + +**Purpose**: Creates an executable with a custom allocator for the blob buffer. +>Note: Avoid using this function in new code. The deprecated `vcl_allocator_t` structure doesn't include context information for the allocator functions, making it less flexible than the newer `vcl_allocator2_t` structure. + +**Parameters**: +| Parameter | Type | Direction | Description | +|------------|-----------------------|-----------|------------------------------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| desc | vcl_executable_desc_t | [in] | Executable description including model IR data and options | +| allocator | vcl_allocator_t | [in] | Custom memory allocator and deallocator functions | +| blobBuffer | uint8_t** | [out] | Pointer to receive the blob buffer pointer | +| blobSize | uint64_t* | [out] | Pointer to receive the blob size | + +**Usage Example**: + +```c +// Prepare IR Data: Arrange model IR (such as xml and weights) in memory ... 
+// Create Description + vcl_executable_desc_t execDesc = { + .modelIRData = ..., // Pointer to IR data buffer + .modelIRSize = ..., // IR data size + .options = ..., // Optional compiler options (NULL if not needed) + .optionsSize = ..., // Options size + }; + +// Custom allocator and deallocator implementation +uint8_t* my_allocate(uint64_t size) { return (uint8_t*)malloc(size); } +void my_deallocate(uint8_t* ptr) { free(ptr); } +vcl_allocator_t allocator = { my_allocate, my_deallocate }; + +// Call the API +uint8_t* blob = NULL; +uint64_t blobSize = 0; +vclAllocatedExecutableCreate(compiler, execDesc, &allocator, &blob, &blobSize); +// Use the allocated blob ... + +// Free the buffer using the custom deallocate function +allocator.deallocate(blob); +``` + +### 4.5 Profiling Operations + +#### `vclProfilingCreate` + +**Function**: +```c +vcl_result_t vclProfilingCreate(p_vcl_profiling_input_t profilingInput, vcl_profiling_handle_t* profilingHandle, vcl_log_handle_t* logHandle); +``` + +**Purpose**: Creates a profiling handle to analyze execution performance data. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------------|-------------------------|-----------|----------------------------------------------| +| profilingInput | p_vcl_profiling_input_t | [in] | Input data including blob and profiling data | +| profilingHandle | vcl_profiling_handle_t* | [out] | Pointer to receive the profiling handle | +| logHandle | vcl_log_handle_t* | [out] | Pointer to receive the log handle | + +**Usage Example**: +```c +vcl_profiling_input_t profilingInput = { + .blobData = blobBuffer, // Compiled blob data + .blobSize = blobSize, // Size of blob data + .profData = profRawData, // Raw profiling data from execution + .profSize = profRawSize // Size of raw profiling data +}; + +vcl_profiling_handle_t profHandle; +vcl_log_handle_t profLogHandle; +result = vclProfilingCreate(&profilingInput, &profHandle, &profLogHandle); +``` + +#### `vclGetDecodedProfilingBuffer` + +**Function**: +```c +vcl_result_t vclGetDecodedProfilingBuffer(vcl_profiling_handle_t profilingHandle, vcl_profiling_request_type_t requestType, p_vcl_profiling_output_t profilingOutput); +``` + +**Purpose**: Retrieves decoded profiling information for the requested detail level. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------------|------------------------------|-----------|----------------------------------------------------------| +| profilingHandle | vcl_profiling_handle_t | [in] | The profiling handle | +| requestType | vcl_profiling_request_type_t | [in] | Type of profiling data to retrieve (layer, task, or raw) | +| profilingOutput | p_vcl_profiling_output_t | [out] | Pointer to receive the output data | + +*vcl_profiling_request_type_t* type: +| Value | Description | +|---------------------------|-----------------------| +| VCL_PROFILING_LAYER_LEVEL | Layer-level profiling | +| VCL_PROFILING_TASK_LEVEL | Task-level profiling | +| VCL_PROFILING_RAW | Raw profiling data | + +**Usage Example**: +```c +vcl_profiling_output_t profOutput; +result = vclGetDecodedProfilingBuffer( + profHandle, VCL_PROFILING_LAYER_LEVEL, &profOutput +); +if (result == VCL_RESULT_SUCCESS) { + // Process layer-level profiling data + // profOutput.data contains the decoded information + // profOutput.size is the size of the data +} +``` + +#### `vclProfilingDestroy` + +**Function**: +```c +vcl_result_t vclProfilingDestroy(vcl_profiling_handle_t profilingHandle); +``` + +**Purpose**: Destroys a profiling handle and releases associated resources. + +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------------|------------------------|-----------|------------------------------------------------| +| profilingHandle | vcl_profiling_handle_t | [in] | Handle to the profiling object to be destroyed | + +**Usage Example**: +```c +result = vclProfilingDestroy(profHandle); +``` + +### 4.6 Logging Functions + +#### `vclLogHandleGetString` + +**Function**: +```c +vcl_result_t vclLogHandleGetString(vcl_log_handle_t logHandle, size_t* logSize, char* log); +``` + +**Purpose**: Retrieves error/debug messages from a log handle. 
+ +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------|------------------|-----------|----------------------------------| +| logHandle | vcl_log_handle_t | [in] | The log handle | +| logSize | size_t* | [in,out] | Pointer to size variable | +| log | char* | [out] | Buffer to receive the log string | + +**Usage Example**: +```c +// First call: get the required buffer size +size_t logSize = 0; +result = vclLogHandleGetString(logHandle, &logSize, NULL); + +// Allocate buffer +char* logBuffer = (char*)malloc(logSize); + +// Second call: get the actual log data +result = vclLogHandleGetString(logHandle, &logSize, logBuffer); + +// Process the log ... +printf("Log message: %s\n", logBuffer); + +free(logBuffer); +``` + +### 4.7 Configuration Management + +#### `vclGetCompilerSupportedOptions` + +**Function**: +```c +vcl_result_t vclGetCompilerSupportedOptions(vcl_compiler_handle_t compiler, char* result, uint64_t* size); +``` + +**Purpose**: Retrieves the list of compiler options supported by this version of the compiler. + +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------|-----------------------|-----------|--------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| result | char* | [out] | Buffer to receive options data | +| size | uint64_t* | [in,out] | Pointer to size variable | + +**Usage Example**: +```c +// First call: get the required buffer size +uint64_t optionsSize = 0; +result = vclGetCompilerSupportedOptions(compiler, NULL, &optionsSize); + +// Allocate buffer +char* optionsBuffer = (char*)malloc(optionsSize); + +// Second call: get the actual options data +result = vclGetCompilerSupportedOptions(compiler, optionsBuffer, &optionsSize); + +// Process the options data ... 
+printf("Supported options: %s\n", optionsBuffer); + +free(optionsBuffer); +``` + +#### `vclGetCompilerIsOptionSupported` + +**Function**: +```c +vcl_result_t vclGetCompilerIsOptionSupported(vcl_compiler_handle_t compiler, const char* option, const char* value); +``` + +**Purpose**: Checks if a given config option (or option-value pair) is supported by the compiler. + +**Parameters**: +| Parameter | Type | Direction | Description | +|-----------|-----------------------|-----------|-------------------------------------| +| compiler | vcl_compiler_handle_t | [in] | The compiler handle | +| option | const char* | [in] | Option name to check | +| value | const char* | [in] | Option value to check (can be NULL) | + +**Usage Example**: +```c +result = vclGetCompilerIsOptionSupported(compiler, "NPU_PLATFORM", "4000"); +if (result == VCL_RESULT_SUCCESS) { + printf("NPU_PLATFORM=4000 is supported\n"); +} else { + printf("NPU_PLATFORM=4000 is not supported\n"); +} +``` + + +## Frequently Asked Questions + +- **Q: What is the difference between `vclExecutableCreate` and `vclAllocatedExecutableCreate2`?** + A: vclExecutableCreate manages memory internally while vclAllocatedExecutableCreate2 lets you control memory allocation through custom allocators. + +- **Q: How do I choose between layer-level and task-level profiling?** + A: Use layer-level for high-level performance analysis and task-level for detailed optimization. + +## References + +- Header file: [npu_driver_compiler.h](../include/npu_driver_compiler.h) +- For detailed structure and parameter descriptions, refer to the header file comments. + +--- + +This guide covers the main APIs of the NPU Driver Compiler. If you need more detailed parameter explanations or code samples, please refer to the header file or contact the development support team. 
\ No newline at end of file diff --git a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_linux.md b/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_linux.md deleted file mode 100644 index fbe6d7fcd1..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_linux.md +++ /dev/null @@ -1,275 +0,0 @@ -# How to build Driver Compiler on Linux - -## Dependencies - -Before you start to build Driver Compiler targets, please check the necessary components. -- Hardware - - Minimum requirements: 32GB RAM -- Software - - [CMake](https://cmake.org/download/) 3.22.1 for Ubuntu 22.04 (version 3.13 or higher) - - GCC 11.4.0 for Ubuntu 22.04 (version 7.5 or higher) - - Python 3.9 - 3.12 - - Git for Linux (requires installing `git lfs`) - - Ninja (optional, used for the documentation related to the installation part) - -> Notice: RAM is not mandatory either. If your RAM is less than 32GB, you can compensate by reducing the number of threads during the build or by increasing the swap memory. - -## Using Cmake Options - -Driver Compiler is built with OpenVINO static runtime. To build the library and related tests (npu_driver_compiler, npu_elf, compilerTest, profilingTest, loaderTest) using following commands : - -1. Clone repos: - - Clone [OpenVINO Project] repo and [NPU-Plugin Project] repo to special location. **Or** just unpack OPENVINO and NPU-Plugin source code to special location. - -
- Instructions - - ```sh - # set the proxy, if required. - # export http_proxy=xxxx - # export https_proxy=xxxx - - cd /home/useraccount/workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/openvino.git - cd openvino - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursiv - - - cd /home/useraccount/workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/npu_compiler - cd applications.ai.vpu-accelerators.vpux-plugin - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - - export OPENVINO_HOME=/home/useraccount/workspace/openvino (need to change to your own path) - export NPU_PLUGIN_HOME=/home/useraccount/workspace/applications.ai.vpu-accelerators.vpux-plugin (need to change to your own path) - ``` -
- - > Notice: If you are building the Driver Compiler targets with the goal of composing the Linux driver, it is important to pay attention to the version or commit of the [OpenVino Project] being used. Make sure to check the current supported version of the [OpenVino Project] from `OpenVINO built from source` entry in the table under the `Common` section in the [release notes](https://github.com/intel/linux-npu-driver/releases/) or within the [Linux NPU driver code](https://github.com/intel/linux-npu-driver/blob/main/compiler/compiler_source.cmake#L20). - -2. Create build folder and run build instructions: - - 2.1 Build instructions: - - Before building with the following instructions, please make sure `OPENVINO_HOME` and `NPU_PLUGIN_HOME` environment variables have been set. - -
- Instructions - - ```sh - cd $OPENVINO_HOME - mkdir build-x86_64 - cd build-x86_64 - - cmake \ - -D CMAKE_BUILD_TYPE=Release \ - -D BUILD_SHARED_LIBS=OFF \ - -D OPENVINO_EXTRA_MODULES=$NPU_PLUGIN_HOME \ - -D ENABLE_LTO=OFF \ - -D ENABLE_FASTER_BUILD=OFF \ - -D ENABLE_CPPLINT=OFF \ - -D ENABLE_TESTS=OFF \ - -D ENABLE_FUNCTIONAL_TESTS=OFF \ - -D ENABLE_SAMPLES=OFF \ - -D ENABLE_JS=OFF \ - -D ENABLE_PYTHON=OFF \ - -D ENABLE_PYTHON_PACKAGING=OFF \ - -D ENABLE_WHEEL=OFF \ - -D ENABLE_OV_ONNX_FRONTEND=OFF \ - -D ENABLE_OV_PYTORCH_FRONTEND=OFF \ - -D ENABLE_OV_PADDLE_FRONTEND=OFF \ - -D ENABLE_OV_TF_FRONTEND=OFF \ - -D ENABLE_OV_TF_LITE_FRONTEND=OFF \ - -D ENABLE_OV_JAX_FRONTEND=OFF \ - -D ENABLE_OV_IR_FRONTEND=ON \ - -D THREADING=TBB \ - -D ENABLE_TBBBIND_2_5=OFF \ - -D ENABLE_SYSTEM_TBB=OFF \ - -D ENABLE_TBB_RELEASE_ONLY=OFF \ - -D ENABLE_HETERO=OFF \ - -D ENABLE_MULTI=OFF \ - -D ENABLE_AUTO=OFF \ - -D ENABLE_AUTO_BATCH=OFF \ - -D ENABLE_TEMPLATE=OFF \ - -D ENABLE_PROXY=OFF \ - -D ENABLE_INTEL_CPU=OFF \ - -D ENABLE_INTEL_GPU=OFF \ - -D ENABLE_NPU_PLUGIN_ENGINE=OFF \ - -D ENABLE_ZEROAPI_BACKEND=OFF \ - -D ENABLE_DRIVER_COMPILER_ADAPTER=OFF \ - -D ENABLE_INTEL_NPU_INTERNAL=OFF \ - -D ENABLE_INTEL_NPU_PROTOPIPE=OFF \ - -D BUILD_COMPILER_FOR_DRIVER=ON \ - -D ENABLE_PRIVATE_TESTS=OFF \ - -D ENABLE_DIRECTML=OFF \ - -D ENABLE_NPU_LSP_SERVER=OFF \ - .. - - cmake --build . --config Release --target npu_driver_compiler compilerTest profilingTest vpuxCompilerL0Test loaderTest -j8 - ``` -
- - > Notice: If you encounter the following error during building `c++: internal compiler error: Killed (program cc1plus)`, you should consider decreasing the number of threads during compiling or try increasing the swap file size. For instance, to decrease the thread count, you could consider using `-j4` to decrease the thread number to 4 or a smaller value. - - 2.2 Build instructions notes: - - Many build options are listed here. To clarify these options, the following explains the list of CMake parameters. - -
- 2.2.1 Common build option - - ```sh - # Build type - CMAKE_BUILD_TYPE - - # Build library type - BUILD_SHARED_LIBS - ``` - -
- - -
- 2.2.2 Build option list in OpenVino Project - - For more details on the build options, please refer to [features.cmake](https://github.com/openvinotoolkit/openvino/blob/master/cmake/features.cmake) and intel NPU's [features.cmake](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/cmake/features.cmake) in [OpenVINO Project], which provide explanations for all the available build options. - - ```sh - # Specify external repo - OPENVINO_EXTRA_MODULES - - # Build optimization option - ENABLE_LTO - ENABLE_FASTER_BUILD - - # Cpplint checks during build time - ENABLE_CPPLINT - - # Tests and samples - ENABLE_TESTS - ENABLE_FUNCTIONAL_TESTS - ENABLE_SAMPLES - - # Enable JS API - ENABLE_JS - - # Enable Python API and generate python binary - ENABLE_PYTHON - ENABLE_PYTHON_PACKAGING - ENABLE_WHEEL - - # Frontend - ENABLE_OV_ONNX_FRONTEND - ENABLE_OV_PYTORCH_FRONTEND - ENABLE_OV_PADDLE_FRONTEND - ENABLE_OV_TF_FRONTEND - ENABLE_OV_TF_LITE_FRONTEND - ENABLE_OV_JAX_FRONTEND - ENABLE_OV_IR_FRONTEND - - # TBB related option - THREADING - ENABLE_TBBBIND_2_5 - ENABLE_SYSTEM_TBB - ENABLE_TBB_RELEASE_ONLY - - # Plugin platform - ENABLE_HETERO - ENABLE_MULTI - ENABLE_AUTO - ENABLE_AUTO_BATCH - ENABLE_PROXY - ENABLE_INTEL_CPU - ENABLE_INTEL_GPU - - # NPU plugin and its tools related option - ENABLE_NPU_PLUGIN_ENGINE - ENABLE_ZEROAPI_BACKEND - ENABLE_DRIVER_COMPILER_ADAPTER - ENABLE_INTEL_NPU_INTERNAL - ENABLE_INTEL_NPU_PROTOPIPE - ``` -
- -
- 2.2.3 Build option list in NPU-Plugin Project - - For more details on the build options, please refer to this [features.cmake](https://github.com/openvinotoolkit/npu_compiler/blob/develop/cmake/features.cmake) file in [NPU-Plugin Project], which provides explanations for all the available build options. - - ```sh - # Build Driver Compiler targets - BUILD_COMPILER_FOR_DRIVER - - # Compiler private tests - ENABLE_PRIVATE_TESTS - - # for Windows system - ENABLE_DIRECTML - - # Debug tools - ENABLE_NPU_LSP_SERVER - ``` -
- - 2.3 (Optional) Instruction notes about TBB: - -
- 2.3.1 Default tbb location - - The build instructions uses the `-DENABLE_SYSTEM_TBB=OFF` option, which means that the TBB library downloaded by [OpenVINO Project] will be used. The download path for this TBB library is `$OPENVINO_HOME/temp/tbb`. Within the downloaded TBB folder, `$OPENVINO_HOME/temp/tbb/lib/libtbb.so.12` and `$OPENVINO_HOME/temp/tbb/lib/libtbbmalloc.so.2` are required for the Release version. - -
- -
- 2.3.2 Use different TBB version - - If you wish to build with system TBB, you need to install TBB in your local system first and then use `-DENABLE_SYSTEM_TBB=ON` option instead of `-DENABLE_SYSTEM_TBB=OFF` option. - - If you wish to build with a specific version of TBB, you can download it from [oneTBB Project] and unzip its release package. Then use the `-DENABLE_SYSTEM_TBB=OFF -DTBBROOT=/home/username/path/to/downloaded/tbb` option to build. - - The version of TBB download by [OpenVINO Project] is 2021.13.0 and you can find the version info in this [file](https://github.com/openvinotoolkit/openvino/blob/master/cmake/dependencies.cmake#L120) in [OpenVINO Project]. If you would like to build TBB on your own, please refer to [INSTALL.md](https://github.com/oneapi-src/oneTBB/blob/master/INSTALL.md#build-onetbb) in [oneTBB Project] -
- -
- 2.3.3 Do not use TBB - - If you wish to build without TBB (which will result in a slower build process), you need to change `-D THREADING=TBB` to `-D THREADING=SEQ`. For more information about SEQ mode, please refer to this [file](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/cmake_options_for_custom_compilation.md#options-affecting-binary-size). - -
- -3. (Optional) Prepare final Driver Compiler package for driver: - -
- Instructions - - All Driver Compiler related targets have now been generated in `$OPENVINO_HOME/bin/intel/Release` folder, where the binary libnpu_driver_compiler.so can be found. The following instructions are provided to pack Driver Compiler related targets to the specified location. - - ```sh - #install Driver compiler related targets to current path. A `cid` folder will be generated to `$OPENVINO_HOME/build-x86_64/`. - cd $OPENVINO_HOME/build-x86_64 - cmake --install . --prefix $PWD/ --component CiD - - # or to get a related compressed file. A RELEASE-CiD.tar.gz compressed file will be generated to `$OPENVINO_HOME/build-x86_64/`. - cpack -D CPACK_COMPONENTS_ALL=CiD -D CPACK_CMAKE_GENERATOR=Ninja -D CPACK_PACKAGE_FILE_NAME="RELEASE" -G "TGZ" - ``` -
- - > Notice: It is not recommended to use `cmake --install . --prefix /usr --component CiD` to install the Driver Compiler targets on the system, as this will not only install `libnpu_driver_compiler.so` but also many other related targets (such as `elf`, `compilerTest`) to the specified folder. - - -### See also - -Follow the blow guide to build the Driver Compiler library and test targets with Ninja: - * `Using ninja` section of [how-to-build.md](../../../guides/how-to-build.md) of [NPU-Plugin Project]. - -To use cmake presets to build, please see -* [how to build Driver Compiler with Cmake Presets on Linux](./how_to_build_driver_compiler_withCmakePresets_on_linux.md) - -Driver compiler build is a static build, to get a static build of [NPU-Plugin Project] repo, please see - * [how to build static](../../../guides/how-to-build-static.md). -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler -[oneTBB Project]: https://github.com/oneapi-src/oneTBB diff --git a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_windows.md b/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_windows.md deleted file mode 100644 index 719f8ade96..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_on_windows.md +++ /dev/null @@ -1,285 +0,0 @@ -# How to build Driver Compiler on Windows - -## Dependencies - -Before you start to build Driver Compiler targets, please ensure that you have installed the necessary components. After installation, please make sure they are available from system environment path. - -- Hardware: - - Minimum requirements: 40 GB of disk space. - -- Software: - - [CMake](https://cmake.org/download/) 3.13 or higher - - Microsoft Visual Studio 2019 (recommended) or higher, version 16.3 or later - > Notice: Windows SDK and spectre libraries are required for building OpenVINO and NPU-Plugin. 
Install them via Visual Studio Installer: Modify -> Individual components -> Search components: "Windows SDK" and "Spectre x64/x86 latest". - - SDK (install via Visual Studio or from this [link](https://developer.microsoft.com/en-us/windows/downloads/sdk-archive/)) and WDK (install from this [link](https://learn.microsoft.com/en-ie/windows-hardware/drivers/other-wdk-downloads#step-2-install-the-wdk)). Please make sure the version matches your system. - - Python 3.9 - 3.12 - - Git for Windows (requires installing `git lfs`) - - Ninja for installation (optional) -Before you start building, please refer to the notes to avoid potential build issues. - -## Using Cmake Options - -Driver Compiler is built with OpenVINO static runtime. To build the library and related tests (npu_driver_compiler, npu_elf, compilerTest, profilingTest, loaderTest), use the following commands: - -All instructions are performed on **x64 Native Tools Command Prompt for VS XXXX**. - -1. Clone repos and set environment variables: - - Clone [OpenVINO Project] repo and [NPU-Plugin Project] repo to a special location. - -
- Executed in x64 Native Tools Command Prompt for VS XXXX - - ```sh - # set the proxy, if required. - # set http_proxy=xxxx - # set https_proxy=xxxx - - cd C:\Users\Local_Admin\workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/openvino.git - cd openvino - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - - cd C:\Users\Local_Admin\workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/npu_compiler - cd applications.ai.vpu-accelerators.vpux-plugin - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - - set OPENVINO_HOME=C:\Users\Local_Admin\workspace\openvino (need to change to your own path) - set NPU_PLUGIN_HOME=C:\Users\Local_Admin\workspace\applications.ai.vpu-accelerators.vpux-plugin (need to change to your own path) - ``` -
- - > Notice: Please place the cloned repositories in the shortest possible path. - - > Notice: If you encounter the `filename too long` issue when cloning a repo, please use the command `git config --global core.longpaths true` in Git Bash. - - > Notice: To enable the long path feature on windows, please open the `Registry Editor` by pressing `win + R` and typing `regedit`. Navigate to `HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem, then find or create a DWORD (32-bit) value named `LongPathsEnabled`, and set its value to 1. - Follow these [steps](./imgs/long_path_enable.png) to complete the process. - -2. Create build folder and run build commands: - - 2.1 Build instructions - - Before building with the following instructions, please make sure `OPENVINO_HOME` and `NPU_PLUGIN_HOME` environment variables have been set. - -
- Executed in x64 Native Tools Command Prompt for VS XXXX - - ```sh - cd %OPENVINO_HOME% - md build-x86_64 - cd build-x86_64 - cmake ^ - -D CMAKE_BUILD_TYPE=Release ^ - -D BUILD_SHARED_LIBS=OFF ^ - -D OPENVINO_EXTRA_MODULES=%NPU_PLUGIN_HOME% ^ - -D ENABLE_LTO=OFF ^ - -D ENABLE_FASTER_BUILD=OFF ^ - -D ENABLE_CPPLINT=OFF ^ - -D ENABLE_TESTS=OFF ^ - -D ENABLE_FUNCTIONAL_TESTS=OFF ^ - -D ENABLE_SAMPLES=OFF ^ - -D ENABLE_JS=OFF ^ - -D ENABLE_PYTHON=OFF ^ - -D ENABLE_PYTHON_PACKAGING=OFF ^ - -D ENABLE_WHEEL=OFF ^ - -D ENABLE_OV_ONNX_FRONTEND=OFF ^ - -D ENABLE_OV_PYTORCH_FRONTEND=OFF ^ - -D ENABLE_OV_PADDLE_FRONTEND=OFF ^ - -D ENABLE_OV_TF_FRONTEND=OFF ^ - -D ENABLE_OV_TF_LITE_FRONTEND=OFF ^ - -D ENABLE_OV_JAX_FRONTEND=OFF ^ - -D ENABLE_OV_IR_FRONTEND=ON ^ - -D THREADING=TBB ^ - -D ENABLE_TBBBIND_2_5=OFF ^ - -D ENABLE_SYSTEM_TBB=OFF ^ - -D ENABLE_TBB_RELEASE_ONLY=OFF ^ - -D ENABLE_HETERO=OFF ^ - -D ENABLE_MULTI=OFF ^ - -D ENABLE_AUTO=OFF ^ - -D ENABLE_AUTO_BATCH=OFF ^ - -D ENABLE_TEMPLATE=OFF ^ - -D ENABLE_PROXY=OFF ^ - -D ENABLE_INTEL_CPU=OFF ^ - -D ENABLE_INTEL_GPU=OFF ^ - -D ENABLE_NPU_PLUGIN_ENGINE=OFF ^ - -D ENABLE_ZEROAPI_BACKEND=OFF ^ - -D ENABLE_DRIVER_COMPILER_ADAPTER=OFF ^ - -D ENABLE_INTEL_NPU_INTERNAL=OFF ^ - -D ENABLE_INTEL_NPU_PROTOPIPE=OFF ^ - -D BUILD_COMPILER_FOR_DRIVER=ON ^ - -D ENABLE_PRIVATE_TESTS=OFF ^ - -D ENABLE_DIRECTML=OFF ^ - -D ENABLE_NPU_LSP_SERVER=OFF ^ - -D CMAKE_TOOLCHAIN_FILE=%OPENVINO_HOME%\cmake\toolchains\onecoreuap.toolchain.cmake ^ - .. - - cmake --build . --config Release --target npu_driver_compiler compilerTest profilingTest vpuxCompilerL0Test loaderTest -j8 - ``` -
- - > Notice: If the commit is older than `d0719b79c5847` (Aug 28, 2024), the build without the option `-D CMAKE_TOOLCHAIN_FILE=%OPENVINO_HOME%\cmake\toolchains\onecoreuap.toolchain.cmake` can also pass. - - 2.2 Build instructions notes: - - Many build options are listed here. To clarify these options, the following explains the list of CMake parameters. - -
- 2.2.1 Common build option - - ```sh - # Build type - CMAKE_BUILD_TYPE - - # Build library type - BUILD_SHARED_LIBS - - # specifies locations for compilers and toolchain utilities, - CMAKE_TOOLCHAIN_FILE - ``` - -
- - -
- 2.2.2 Build option list in OpenVino Project - - For more details on the build options, please refer to [features.cmake](https://github.com/openvinotoolkit/openvino/blob/master/cmake/features.cmake) and intel NPU's [features.cmake](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/cmake/features.cmake) in [OpenVINO Project], which provide explanations for all the available build options. - - ```sh - # Specify external repo - OPENVINO_EXTRA_MODULES - - # Build optimization option - ENABLE_LTO - ENABLE_FASTER_BUILD - - # Cpplint checks during build time - ENABLE_CPPLINT - - # Tests and samples - ENABLE_TESTS - ENABLE_FUNCTIONAL_TESTS - ENABLE_SAMPLES - - # Enable JS API - ENABLE_JS - - # Enable Python API and generate python binary - ENABLE_PYTHON - ENABLE_PYTHON_PACKAGING - ENABLE_WHEEL - - # Frontend - ENABLE_OV_ONNX_FRONTEND - ENABLE_OV_PYTORCH_FRONTEND - ENABLE_OV_PADDLE_FRONTEND - ENABLE_OV_TF_FRONTEND - ENABLE_OV_TF_LITE_FRONTEND - ENABLE_OV_JAX_FRONTEND - ENABLE_OV_IR_FRONTEND - - # TBB related option - THREADING - ENABLE_TBBBIND_2_5 - ENABLE_SYSTEM_TBB - ENABLE_TBB_RELEASE_ONLY - - # Plugin platform - ENABLE_HETERO - ENABLE_MULTI - ENABLE_AUTO - ENABLE_AUTO_BATCH - ENABLE_PROXY - ENABLE_INTEL_CPU - ENABLE_INTEL_GPU - - # NPU plugin and its tools related option - ENABLE_NPU_PLUGIN_ENGINE - ENABLE_ZEROAPI_BACKEND - ENABLE_DRIVER_COMPILER_ADAPTER - ENABLE_INTEL_NPU_INTERNAL - ENABLE_INTEL_NPU_PROTOPIPE - ``` -
- -
- 2.2.3 Build option list in NPU-Plugin Project - - For more details on the build options, please refer to this [features.cmake](https://github.com/openvinotoolkit/npu_compiler/blob/develop/cmake/features.cmake) file in [NPU-Plugin Project], which provides explanations for all the available build options. - - ```sh - # Build Driver Compiler targets - BUILD_COMPILER_FOR_DRIVER - - # Compiler private tests - ENABLE_PRIVATE_TESTS - - # for Windows system - ENABLE_DIRECTML - - # Debug tools - ENABLE_NPU_LSP_SERVER - ``` -
- - 2.3 (Optional) Instruction notes about TBB: - -
- 2.3.1 Default tbb location - - The build instructions uses the `-DENABLE_SYSTEM_TBB=OFF` option, which means that the TBB library downloaded by [OpenVINO Project] will be used. The download path for this TBB library is `%OPENVINO_HOME%\temp\tbb`. Within the downloaded TBB folder, `%OPENVINO_HOME%\temp\tbb\bin\tbb12.dll` and `%OPENVINO_HOME%\temp\tbb\bin\tbbmalloc.dll` are required for the Release version. - -
- -
- 2.3.2 Use different TBB version - - If you wish to build with system TBB, you need to install TBB in your local system first and then use `-DENABLE_SYSTEM_TBB=ON` option instead of `-DENABLE_SYSTEM_TBB=OFF` option. - - If you wish to build with a specific version of TBB, you can download it from [oneTBB Project] and unzip its release package. Then use the `-DENABLE_SYSTEM_TBB=OFF -DTBBROOT=C:\Users\Local_Admin\workspace\path\to\downloaded\tbb` option to build. - - The version of TBB downloaded by [OpenVINO Project] is 2021.2.5 and you can find the version info in this [file](https://github.com/openvinotoolkit/openvino/blob/master/cmake/dependencies.cmake#L105) in [OpenVINO Project]. If you would like to build TBB on your own, please refer to [INSTALL.md](https://github.com/oneapi-src/oneTBB/blob/master/INSTALL.md#build-onetbb) in [oneTBB Project] -
- -
- 2.3.3 Do not use TBB - - If you wish to build without TBB (which will result in a slower build process), you need to change `-D THREADING=TBB` to `-D THREADING=SEQ`. For more information about SEQ mode, please refer to this [file](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/cmake_options_for_custom_compilation.md#options-affecting-binary-size). - -
- -3. (Optional) Prepare final Driver Compiler package for driver: - -
- Instructions executed in x64 Native Tools Command Prompt for VS XXXX - - All Driver Compiler related targets have now been generated in `%OPENVINO_HOME%\bin\intel\Release` folder, where the binary npu_driver_compiler.dll can be found. The following instructions are provided to pack Driver Compiler related targets to the specified location. - - ```sh - #install Driver compiler related targets to current path. A `cid` folder will be generated to `%OPENVINO_HOME%\build-x86_64\`. - cd %OPENVINO_HOME%\build-x86_64 - cmake --install .\ --prefix .\ --component CiD - - # or to get a related compressed file. A RELEASE-CiD.zip compressed file will be generated to `%OPENVINO_HOME%\build-x86_64\`. - cpack -D CPACK_COMPONENTS_ALL=CiD -D CPACK_CMAKE_GENERATOR=Ninja -D CPACK_PACKAGE_FILE_NAME="RELEASE" -G "ZIP" - ``` -
- - - -### See also - -Follow the blow guide to build the Driver Compiler library and test targets with Ninja: - * `Using ninja` section of [how-to-build.md](../../../guides/how-to-build.md) of [NPU-Plugin Project]. - -To use cmake presets to build, please see -* [how to build Driver Compiler with Cmake Presets on Windows](./how_to_build_driver_compiler_withCmakePresets_on_windows.md) - -Driver compiler build is a static build, to get a static build of [NPU-Plugin Project] repo, please see - * [how to build static](../../../guides/how-to-build-static.md). -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_linux.md b/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_linux.md deleted file mode 100644 index 753fb89ec1..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_linux.md +++ /dev/null @@ -1,147 +0,0 @@ -# How to build Driver Compiler with cmake presets on Linux - -## Dependencies - -Before you start to build Driver Compiler targets, please check the necessary components. -- Hardware - - Minimum requirements: 32GB RAM -- Software - - [CMake](https://cmake.org/download/) 3.22.1 for Ubuntu 22.04 (version 3.19 or higher) - - GCC 11.4.0 for Ubuntu 22.04 (version 7.5 or higher) - - Python 3.9 - 3.12 - - Git for Linux (requires installing `git lfs`) - - Ccache - - Ninja - -> Notice: RAM is not mandatory either. If your RAM is less than 32GB, you can compensate by reducing the number of threads during the build or by increasing the swap memory. - -> Notice: Ccache and Ninja are required for the build options defined in the CMake Presets. Therefore, both of these tools are necessary. 
If you are unable to install them, please remove and update the relevant sections in [CMakePresets.json](https://github.com/openvinotoolkit/npu_compiler/blob/develop/CMakePresets.json#L7C1-L16C19). - -## Using CMakePresets to build - -#### Using CMakePresets to build and using NPU-Plugin as an extra module of OpenVINO - -Here provides a default pre-configured CMake presets for users named: "npuCidReleasexxx", `npuCidReleaseLinux` for Linux and `npuCidReleaseWindows` for Windows. The setting is to build [NPU-Plugin Project] as an extra module of [OpenVINO Project]. In this case, `NPU_PLUGIN_HOME` environment variable must be set. - -1. Clone repos: - -
- Instructions - - ```sh - # set the proxy, if required. - # export http_proxy=xxxx - # export https_proxy=xxxx - - cd /home/useraccount/workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/openvino.git - cd openvino - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - - cd /home/useraccount/workspace (Just an example, you should use your own path.) - git clone https://github.com/openvinotoolkit/npu_compiler - cd applications.ai.vpu-accelerators.vpux-plugin - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - ``` -
- - > Notice: If you are building the Driver Compiler targets with the goal of composing the Linux driver, it is important to pay attention to the version or commit of the [OpenVino Project] being used. Make sure to check the current supported version of the [OpenVino Project] from `OpenVINO built from source` entry in the table under the `Common` section in the [release notes](https://github.com/intel/linux-npu-driver/releases/) or within the [Linux NPU driver code](https://github.com/intel/linux-npu-driver/blob/main/compiler/compiler_source.cmake#L20). - -2. Set environment variables and create a symlink for a preset file in OpenVINO Project root: - -
- Instructions - - ```sh - # set the environment variables - export OPENVINO_HOME=/home/useraccount/workspace/openvino (need to change to your own path) - export NPU_PLUGIN_HOME=/home/useraccount/workspace/applications.ai.vpu-accelerators.vpux-plugin (need to change to your own path) - - cd $OPENVINO_HOME - ln -s $NPU_PLUGIN_HOME/CMakePresets.json ./CMakePresets.json - ``` -
- -3. Build with the following commands: - - Before building with the following instructions, please make sure `OPENVINO_HOME` and `NPU_PLUGIN_HOME` environment variables have been set. - -
- Instructions - - ```sh - cd $OPENVINO_HOME - cmake --preset npuCidReleaseLinux - cd build-x86_64/Release/ - cmake --build ./ --target npu_driver_compiler compilerTest profilingTest vpuxCompilerL0Test loaderTest -j8 - ``` -
- - The defined build option for npuCidReleaseLinux Cmake Preset is listed [here](../../../CMakePresets.json#L240). For additional information about its build options, please refer to section `2.2 Build instructions notes` in [how to build Driver Compiler on linux](./how_to_build_driver_compiler_on_linux.md). - -4. (Optional) Prepare final Driver Compiler package for driver: - -
- Instructions - - All Driver Compiler related targets have now been generated in `$OPENVINO_HOME/bin/intel/Release` folder, where the binary libnpu_driver_compiler.so can be found. The following instructions are provided to pack Driver Compiler related targets to the specified location. - - ```sh - #install Driver compiler related targets to current path. A `cid` folder will be generated to `$OPENVINO_HOME/build-x86_64/`. - cd $OPENVINO_HOME/build-x86_64 - cmake --install . --prefix $PWD/ --component CiD - - - # or to get a related compressed file. A RELEASE-CiD.tar.gz compressed file will be generated to `$OPENVINO_HOME/build-x86_64/`. - cpack -D CPACK_COMPONENTS_ALL=CiD -D CPACK_CMAKE_GENERATOR=Ninja -D CPACK_PACKAGE_FILE_NAME="RELEASE" -G "TGZ" - ``` -
- - > Notice: It is not recommended to use `cmake --install . --prefix /usr --component CiD` to install the Driver Compiler targets on the system, as this will not only install `libnpu_driver_compiler.so` but also many other related targets (such as `elf`, `compilerTest`) to the specified folder. - - -5. (Optional) Instruction notes about TBB: - -
- 5.1 Default tbb location - - The [build instructions](../../../CMakePresets.json#L274) uses the `"ENABLE_SYSTEM_TBB": false` option, which means that the TBB library downloaded by [OpenVINO Project] will be used. The download path for this TBB library is `$OPENVINO_HOME/temp/tbb`. Within the downloaded TBB folder, `$OPENVINO_HOME/temp/tbb/lib/libtbb.so.12` and `$OPENVINO_HOME/temp/tbb/lib/libtbbmalloc.so.2` are required for the Release version. - -
- -
- 5.2 Use different TBB version - - If you wish to build with system TBB, you need to install TBB in your local system first and then use `"ENABLE_SYSTEM_TBB": true` option instead of `"ENABLE_SYSTEM_TBB": false` option in [here](../../../CMakePresets.json#L274). - - If you wish to build with a specific version of TBB, you can download it from [oneTBB Project] and unzip its release package. Then, add the following new lines after line 251 in [CMakePresets.json](../../../CMakePresets.json#L251) file. - - ```sh - "TBBROOT": { - "type": "FILEPATH", - "value": "/home/username/path/to/downloaded/tbb" - } - ``` - - The version of TBB downloaded by [OpenVINO Project] is 2021.13.0, and you can find the version information in the [corresponding file](https://github.com/openvinotoolkit/openvino/blob/master/cmake/dependencies.cmake#L120) within [OpenVINO Project]. If you would like to build TBB on your own, please refer to [INSTALL.md](https://github.com/oneapi-src/oneTBB/blob/master/INSTALL.md#build-onetbb) in [oneTBB Project] -
- -
- 5.3 Do not use TBB - - If you wish to build without TBB (which will result in a slower build process), you need to change `"value": "TBB"` to `"value": "SEQ"` in [here](../../../CMakePresets.json#L228). For more information about SEQ mode, please refer to this [file](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/cmake_options_for_custom_compilation.md#options-affecting-binary-size). - -
- -### Note - -1. Presets step for "npuCidReleasexxx" need to be built in [OpenVINO Project] folder. -2. The presets for "npuCidReleasexxx" define the build directory build-x86_64/Release. -3. The presets are configured to use Ninja as default generator, so installing Ninja package is an extra requirement. -4. Currently Presets for "npuCidReleasexxx" will build the smallest size targets of Driver Compiler. If the user wishes to build Driver Compiler and other targets, can directly inherit "Cid" preset and enable the needed build option to self-configuration presets. - -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler -[oneTBB Project]: https://github.com/oneapi-src/oneTBB diff --git a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_windows.md b/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_windows.md deleted file mode 100644 index 5674c5cb27..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_build_driver_compiler_withCmakePresets_on_windows.md +++ /dev/null @@ -1,158 +0,0 @@ -# How to build Driver Compiler with cmake presets on Windows - -## Dependencies - -Before you start to build Driver Compiler targets, please make sure you have installed the necessary components. After installation, please make sure they are available from system environment path. - -- Hardware: - - Minimum requirements: 40 GB of disk space. - -- Software: - - [CMake](https://cmake.org/download/) 3.19 or higher - - Microsoft Visual Studio 2019 (recommended) or higher, version 16.3 or later - > **Notice**: Windows SDK and spectre libraries are required for building OpenVINO and NPU-Plugin. Install them via Visual Studio Installer: Modify -> Individual components -> Search components: "Windows SDK" and "Spectre x64/x86 latest". 
- - SDK (install via Visual Studio or from this [link](https://developer.microsoft.com/en-us/windows/downloads/sdk-archive/)) and WDK (install for this [link](https://learn.microsoft.com/en-ie/windows-hardware/drivers/other-wdk-downloads#step-2-install-the-wdk)). Please make sure the version is match to your system. - - Python 3.9 - 3.12 - - Git for Windows (requires installing `git lfs`) - - Ccache (Download latest version of ccache binaries or build from source code on this [link](https://github.com/ccache/ccache/releases)) - - Ninja (Install it via Visual Studio Installer or "Getting Ninja" section on this [link](https://ninja-build.org/)) - -> Notice: Ccache and Ninja are required for the build options defined in the CMake Presets. Therefore, both of these tools are necessary. If you are unable to install them, please remove and update the relevant sections in [CMakePresets.json](https://github.com/openvinotoolkit/npu_compiler/blob/develop/CMakePresets.json#L7C1-L16C19). - -Before you start building, please refer to the notes at the end to avoid potential build issue. - - -## Using CMakePresets to build - -#### Using CMakePresets to build and using NPU-Plugin as an extra module of OpenVINO - -Here provides a default pre-configured CMake presets for users named: "npuCidReleasexxx", `npuCidReleaseLinux` for Linux and `npuCidReleaseWindows` for Windows. The setting is to build [NPU-Plugin Project] as an extra module of [OpenVINO Project]. In this case, `NPU_PLUGIN_HOME` environment variable must be set. - -All instructions are perfromed on **x64 Native Tools Command Prompt for VS XXXX(run as administrator)**. - -1. Clone repos: -
- Executed in x64 Native Tools Command Prompt for VS XXXX(run as administrator) - - ```sh - # set the proxy, if required. - # set http_proxy=xxxx - # set https_proxy=xxxx - - cd C:\workspace(Just an example, you could use your own branch/tag/commit.) - git clone https://github.com/openvinotoolkit/openvino.git - cd openvino - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - - cd C:\workspace (Just an example, you could use your own branch/tag/commit.) - git clone https://github.com/openvinotoolkit/npu_compiler - cd applications.ai.vpu-accelerators.vpux-plugin - git checkout -b master origin/master (Just an example, you could use your own branch/tag/commit.) - git submodule update --init --recursive - ``` -
- - > Notice: Please place the cloned repositories in the shortest possible path. - - > Notice: If you encounter the `filename too long` issue when cloning a repo, please use the command `git config --global core.longpaths true` in Git Bash. - - > Notice: To enable the long path feature on windows, please open the `Registry Editor` by pressing `win + R` and typing `regedit`. Navigate to `HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem, then find or create a DWORD (32-bit) value named `LongPathsEnabled`, and set its value to 1. - Follow these [steps](./imgs/long_path_enable.png) to complete the process. - -2. Set environment variables and create a symlink for a preset file in OpenVINO Project root: -
- Executed in x64 Native Tools Command Prompt for VS XXXX(run as administrator) - - ```sh - # set the environment variables - set OPENVINO_HOME=C:\workspace\openvino (need to change to your own path) - set NPU_PLUGIN_HOME=C:\workspace\applications.ai.vpu-accelerators.vpux-plugin (need to change to your own path) - - cd %OPENVINO_HOME% - mklink .\CMakePresets.json %NPU_PLUGIN_HOME%\CMakePresets.json - ``` -
- - > Notice: Please make sure you do not have CMakePresets.json before you use `mklink .\CMakePresets.json %NPU_PLUGIN_HOME%\CMakePresets.json`. - -3. Build with the following commands: - - Before building with the following instructions, please make sure `OPENVINO_HOME` and `NPU_PLUGIN_HOME` environment variables have been set. - -
- Executed in x64 Native Tools Command Prompt for VS XXXX - - ```sh - cd %OPENVINO_HOME% - cmake --preset npuCidReleaseWindows - cd build-x86_64\Release\ - cmake --build .\ --target npu_driver_compiler compilerTest profilingTest vpuxCompilerL0Test loaderTest - ``` -
- - The defined build option for npuCidReleaseWindows cmake preset is listed [here](../../../CMakePresets.json#L280). For additional information about its build options, please refer to section `2.2 Build instructions notes` in [how to build Driver Compiler on windows](./how_to_build_driver_compiler_on_windows.md). - - > Notice: If you build Driver compiler using cmake presets and ccache is not installed on this device, the build will fail with the error: `CreateProcess failed: The system cannot find the file specified` during `cmake --build ...` on Windows. - -4. (Optional) Prepare final Driver Compiler package for driver: -
- Instructions executed in x64 Native Tools Command Prompt for VS XXXX - - All Driver Compiler related targets have now been generated in `%OPENVINO_HOME%\bin\intel\Release` folder, where the binary npu_driver_compiler.dll can be found. The following instructions are provided to pack Driver Compiler related targets to the specified location. - - ```sh - #install Driver compiler related targets to current path. A `cid` folder will be generated to `%OPENVINO_HOME%\build-x86_64`. - cd %OPENVINO_HOME%\build-x86_64 - cmake --install .\ --prefix .\ --component CiD - - - # or to get a related compressed file. A RELEASE-CiD.zip compressed file will be generated to `%OPENVINO_HOME%\build-x86_64\`. - cpack -D CPACK_COMPONENTS_ALL=CiD -D CPACK_CMAKE_GENERATOR=Ninja -D CPACK_PACKAGE_FILE_NAME="RELEASE" -G "ZIP" - ``` -
- -5. (Optional) Instruction notes about TBB: - -
- 5.1 Default tbb location - - The [build instructions](../../../CMakePresets.json#L274) uses the `"ENABLE_SYSTEM_TBB": false` option, which means that the TBB library downloaded by [OpenVINO Project] will be used. The download path for this TBB library is `%OPENVINO_HOME%\temp\tbb`. Within the downloaded TBB folder, `%OPENVINO_HOME%\temp\tbb\bin\tbb12.dll` and `%OPENVINO_HOME%\temp\tbb\bin\tbbmalloc.dll` are required for the Release version. - -
- -
- 5.2 Use different TBB version - - If you wish to build with system TBB, you need to install TBB in your local system first and then use `"ENABLE_SYSTEM_TBB": true` option instead of `"ENABLE_SYSTEM_TBB": false` option in [here](../../../CMakePresets.json#L274). - - If you wish to build with a specific version of TBB, you can download it from [oneTBB Project] and unzip its release package. Then, add the following new lines after line 251 in [CMakePresets.json](../../../CMakePresets.json#L251) file. - - ```sh - "TBBROOT": { - "type": "FILEPATH", - "value": "C:\Users\Local_Admin\workspace\path\to\downloaded\tbb" - } - ``` - - The version of TBB download by [OpenVINO Project] is 2021.2.5 and you can find the version info in this [file](https://github.com/openvinotoolkit/openvino/blob/master/cmake/dependencies.cmake#L105) in [OpenVINO Project]. If you would like to build TBB on your own, please refer to [INSTALL.md](https://github.com/oneapi-src/oneTBB/blob/master/INSTALL.md#build-onetbb) in [oneTBB Project] -
- -
- 5.3 Do not use TBB - - If you wish to build without TBB (which will result in a slower build process), you need to change `"value": "TBB"` to `"value": "SEQ"` in [here](../../../CMakePresets.json#L228). For more information about SEQ mode, please refer to this [file](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/cmake_options_for_custom_compilation.md#options-affecting-binary-size). - -
- -### Note - -1. Presets step for "npuCidReleasexxx" need to be built in [OpenVINO Project] folder. -2. The presets for "npuCidReleasexxx" and define the build directory %OPENVINO_HOME%\build-x86_64\Release. -3. The presets are configured to use Ninja as default generator, so installing Ninja package is an extra requirement. -4. Currently Presets for "npuCidReleasexxx" will build the smallest size targets of Driver Compiler. If the user wishes to build Driver Compiler and other targets, they can directly inherit "Cid" preset and enable the needed build option to self-configuration presets. - - -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/how_to_debug_with_deprecated_version.md b/src/vpux_driver_compiler/docs/how_to_debug_with_deprecated_version.md deleted file mode 100644 index 3018d0c22a..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_debug_with_deprecated_version.md +++ /dev/null @@ -1,79 +0,0 @@ -# How to debug - -## Logs - -To change the compiler behavior, you can use a configuration file with the `compilerTest`. To change the log level, use `LOG_LEVEL="LOG_TRACE"` in the configuration file. - -A full configuration for `googlenet-v1` is as follows: -``` bash ---inputs_precisions="input:fp16" --inputs_layouts="input:NCHW" --outputs_precisions="InceptionV1/Logits/Predictions/Softmax:fp16" --outputs_layouts="InceptionV1/Logits/Predictions/Softmax:NC" --config NPU_PLATFORM="4000" DEVICE_ID="NPU.4000" LOG_LEVEL="LOG_TRACE" NPU_COMPILATION_MODE="DefaultHW" NPU_COMPILATION_MODE_PARAMS="swap-transpose-with-fq=1 force-z-major-concat=1 quant-dequant-removal=1 propagate-quant-dequant=0" -``` - -## Other tools - -One can use the tools from [NPU-Plugin Project] and [OpenVINO Project]. - -### compile_tool - -`compile_tool` can compile a network into a blob. 
If you test it for Driver Compiler, you need to set the configuration option in the configuration file. - -The general command on Git Bash is: -``` bash -./compile_tool -m -d NPU.4000 -c -``` - -Here is an example: -```bash -./compile_tool -m path/to/googlenet-v1.xml -d NPU.4000 -c /path/to/config.txt -``` -where the content of config.txt is: -```bash -NPU_COMPILER_TYPE DRIVER -``` - - -### benchmark_app - -`benchmark_app` is used to estimate inference performance. If you test it for Driver Compiler, you need to set the configuration option in the configuration file. - -The general command in Git Bash: -```bash -./benchmark_app -m -load_config= -d NPU.4000 -``` - -Here is an example: -``` bash -./benchmark_app -m /path/to/mobilenet-v2.xml -load_config=/path/to/config.txt -d NPU.4000 -``` -where the content of config.txt is: -``` -{ - "NPU" : { - "NPU_COMPILER_TYPE" : "DRIVER", "NPU_PLATFORM" : "4000", "LOG_LEVEL" : "LOG_INFO" - } -} -``` - -### timetest suite - -`timetest suite` is used to measure both total and partial execution time. You can install the timetest suite by following the [time_tests/README.md](https://github.com/openvinotoolkit/openvino/blob/master/tests/time_tests/README.md). If you test it for Driver Compiler, you need to set the configuration option in the configuration file. - -The general command in Git Bash: -```bash -python3 ./scripts/run_timetest.py ../../bin/intel64/Release/timetest_infer_api_2.exe -m -d NPU.4000 -f -``` - -Here is an example: -```bash -python3 scripts\run_timetest.py build\src\timetests\Release\timetest_infer.exe -m googlenet-v1.xml -d NPU.4000 -f config.txt -``` -where the content of config.txt is: -``` -NPU_COMPILER_TYPE DRIVER -``` - ->Note: For more debug methods and details, refer to **[how to debug](../../vpux_compiler/docs/guides/how_to_debug.md)** in the vpux_compiler section. 
- - -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/how_to_test.md b/src/vpux_driver_compiler/docs/how_to_test.md deleted file mode 100644 index 49a2c016fc..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_test.md +++ /dev/null @@ -1,48 +0,0 @@ -# How to test - -## compilerTest - -`compilerTest` can check the full Driver Compiler API demo. You can use the IR models to test it in Git Bash on Windows or Linux shell. -General command: -```bash -./compilerTest -m xxx.xml -d NPU.XXXX -``` -Commonly used command line parameters (same command line options as [compile_tool](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/tools/compile_tool/main.cpp)): - -(required) `-m model.xml`: Specifies the model path. Please ensure the model IR file is complete. - -(required) `-d NPU.XXXX` : Specifies the simulated platform. - -(optional) `-o output.net` : Specifies the output network name. - -(optional) `-c config.file` : Uses the same configuration format as `compile_tool`. -To save the serialized IR, please use the `CID_GET_SERIALIZED_MODEL` environment variable. - -## profilingTest - -`profilingTest` is used to output profiling information. You can test it in Git Bash on Windows or Linux shell. - -General command: -```bash -./profilingTest .blob profiling-0.bin -``` - -To get the .blob, please use the compilerTest or [compile_tool](https://github.com/openvinotoolkit/npu_compiler/tree/master/tools/compile_tool) of the [NPU-Plugin Project]. - -To get the profiling-0.bin and more profiling detail, please see **[how to use profiling.md](../../../guides/how-to-use-profiling.md)** in the [NPU-Plugin Project]. - - -## loaderTest - -`loaderTest` is used to check whether driver compiler header is available. You can test it in Git Bash on Windows or Linux shell. 
- -General command: -```bash -./loaderTest -v=1 -./loaderTest -v=0 -``` ->Note: For more debug method and detail, refer to **[how to debug](../../vpux_compiler/docs/guides/how_to_debug.md)** in vpux_compiler part. - - -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/how_to_test_with_deprecated_version.md b/src/vpux_driver_compiler/docs/how_to_test_with_deprecated_version.md deleted file mode 100644 index db5030bb8b..0000000000 --- a/src/vpux_driver_compiler/docs/how_to_test_with_deprecated_version.md +++ /dev/null @@ -1,69 +0,0 @@ -# How to test - -## compilerTest - -`compilerTest` can check the full Driver Compiler API demo. -Typically, four models are used for testing: GoogLeNet-v1, MobileNet-v2, ResNet-50-PyTorch, and YOLO_v4_subgraph. You can use their IR models for testing in Git Bash on Windows or in a Linux shell. -General command: -```bash -./compilerTest .xml .bin output.net -./compilerTest .xml .bin output.net config.file -``` - -### usage explanation - -For example, a configuration for googlenet-v1 for old usage is as follows: -``` ---inputs_precisions="input:fp16" --inputs_layouts="input:NCHW" --outputs_precisions="InceptionV1/Logits/Predictions/Softmax:fp16" --outputs_layouts="InceptionV1/Logits/Predictions/Softmax:NC" --config NPU_PLATFORM="4000" DEVICE_ID="NPU.4000" NPU_COMPILATION_MODE="DefaultHW" NPU_COMPILATION_MODE_PARAMS="swap-transpose-with-fq=1 force-z-major-concat=1 quant-dequant-removal=1 propagate-quant-dequant=0" -``` - -In the configuration, the necessary command params are (need to be passed in order): -- `inputs_precisions`: Precision of input node. -- `inputs_layouts`: Layout of input node. -- `outputs_precisions`: Precision of output node. -- `outputs_layouts`: Layout of output node. 
- -The optional command params are: -- `config`: set device info, log level and other properties defined in [`Supported Properties` part](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/README.md#supported-properties). -- `NPU_COMPILATION_MODE_PARAMS`: set compile configuration defined [here](../../../src/vpux_compiler/include/vpux/compiler/core/pipelines_options.hpp). - ->Note: In `compilerTest`, there defined the [default configuration](https://github.com/openvinotoolkit/npu_compiler/blob/master/src/vpux_driver_compiler/test/compilerTest.c#L231) file for googlenet-v1. If you not pass configuration file in command line, this default configuration will be used for the tested model. - -To obtain a complete configuration file for a model, here is an example: - -To get a configuration file, you need to run the test model by benchmarking first to get its node names. Run `./benchmark -m /path/to/model.xml` in windows Git Bash or linux shell, here using googlenet-v1 as example: -```bash -./benchmark_app -m /path/to/googlenet-v1.xml -``` -The output info of `[step4/11] Reading model files` and `[step6/11] Configuring input of the model` shows the input and output node info. The log info of googlenet-v1 is as following image: - ![alt text](./imgs/image_config.png) - -Each parameter is composed of a node name and precision separate by a colon. If the parameter contain multiple input nodes or output nodes, separate each node with a space between them. -## profilingTest - -`profilingTest` is used to output profiling information. You can test it in Git Bash on Windows or Linux shell. - -General command: -```bash -./profilingTest .blob profiling-0.bin -``` - -To get the .blob, please use the compilerTest or the [compile_tool](https://github.com/openvinotoolkit/npu_compiler/tree/master/tools/compile_tool) of the [NPU-Plugin Project]. 
- -To get the profiling-0.bin and more profiling detail, please see **[how to use profiling.md](../../../guides/how-to-use-profiling.md)** in the [NPU-Plugin Project]. - - -## loaderTest - -`loaderTest` is used to check whether the driver compiler header is available. You can test it in Git Bash on Windows or Linux shell. - -General command: -```bash -./loaderTest -v=1 -./loaderTest -v=0 -``` ->Note: For more debug method and detail, refer to **[how to debug](../../vpux_compiler/docs/guides/how_to_debug.md)** in vpux_compiler part. - - -[OpenVINO Project]: https://github.com/openvinotoolkit/openvino -[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/prebuild.md b/src/vpux_driver_compiler/docs/prebuild.md new file mode 100644 index 0000000000..13ce8bf3ad --- /dev/null +++ b/src/vpux_driver_compiler/docs/prebuild.md @@ -0,0 +1,53 @@ +# Pre-build for Driver Compiler +## Pre-build Preparation + +Before building, clone the required repositories (or download and extract the source code), and set the required environment variables. Set the cloned [OpenVINO Project] as environment variable `OPENVINO_HOME`, set the cloned [NPU-Plugin Project] as environment variable `NPU_PLUGIN_HOME`, and update their submodules. Follow these steps: + +
+Commands (Linux shell and Windows x64 Native Tools Command Prompt) + +```sh +# Set working directory +export WORKDIR=$(pwd) +cd ${WORKDIR} +git clone https://github.com/openvinotoolkit/openvino.git openvino +git clone https://github.com/openvinotoolkit/npu_compiler vpux_plugin + +export OPENVINO_HOME=${WORKDIR}/openvino +export NPU_PLUGIN_HOME=${WORKDIR}/vpux_plugin +# On Windows +# set OPENVINO_HOME=%WORKDIR%\openvino +# set NPU_PLUGIN_HOME=%WORKDIR%\vpux_plugin + +cd ${OPENVINO_HOME} +git checkout <commit-or-tag-or-branch> # Replace with your desired commit, tag, or branch +git submodule update --init --recursive +cd ${NPU_PLUGIN_HOME} +git checkout develop # Replace with your desired commit, tag, or branch +git submodule update --init --recursive +``` +
+ +Before proceeding with the build, it is recommended to do the following: +> - For matching corresponding commits, refer to the [commit correspondence section of OpenVINO and NPU-Plugin](./FAQ.md#corresponding-commits-required-for-openvino-and-npu-plugin). +> - Check [common clone issues](./FAQ.md#common-clone-failure-issues) and [common build issues](./FAQ.md#build-issues). +> - See [TBB selection](./FAQ.md#choosing-the-right-tbb-linux--windows) for threading library options. + + +## Next Steps + +Linux: build with CMake Options (Recommended for the first Driver Compiler build): +* [how to build Driver Compiler on Linux](./build/build_linux.md) + +Linux: build with CMake Presets: +* [how to build Driver Compiler with CMake Presets on Linux](./build/build_with_cmake_presets_linux.md) + +Windows: build with CMake Options (Recommended for the first Driver Compiler build): +* [how to build Driver Compiler on Windows](./build/build_windows.md) + +Windows: build with CMake Presets: +* [how to build Driver Compiler with CMake Presets on Windows](./build/build_with_cmake_presets_windows.md) + + +[OpenVINO Project]: https://github.com/openvinotoolkit/openvino +[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/requirements.md b/src/vpux_driver_compiler/docs/requirements.md new file mode 100644 index 0000000000..a126c8a27b --- /dev/null +++ b/src/vpux_driver_compiler/docs/requirements.md @@ -0,0 +1,56 @@ +# Requirements and Setup to Build Driver Compiler +## Linux Requirements + +Ensure the following components are available on your system before building Driver Compiler. 
+ +- Hardware: + - Minimum 32 GB RAM required + +- Software: + - [CMake](https://cmake.org/download) 3.22.1 for Ubuntu 22.04 (version 3.13 or higher) + - GCC 11.4.0 for Ubuntu 22.04 (version 7.5 or higher) + - Python 3.9 - 3.12 + - Git for Linux (required for `git lfs`) + - Ninja + - Additional dependency to build with CMake Presets: + - ccache (Download latest version of ccache binaries or [build from source](https://github.com/ccache/ccache/releases)) + >Note: The build options defined in the `CMakePresets.json` require ccache, and it must be installed to use the default presets. If installation is not possible, remove or update the relevant entries in [CMakePresets.json](../../../CMakePresets.json#L7-L16) accordingly. + +>Note: 32GB RAM is recommended, but not strictly required. For systems with smaller RAM, you can compensate by reducing the number of build threads or increasing swap memory. + +## Windows Requirements + +Ensure the following components are available on your system before building Driver Compiler. Make sure they are added to your system PATH after installation. + +- Hardware: + - Minimum 40 GB of disk space required + +- Software: + - [CMake](https://cmake.org/download) 3.13 or higher + - Microsoft Visual Studio 2022 + >Note: Windows SDK and spectre libraries are required to build OpenVINO and NPU-Plugin. Install them from Microsoft Visual Studio Installer: Modify -> Individual components -> Search components: "Windows SDK" and "Spectre x64/x86 latest". + - SDK (install from Microsoft Visual Studio or [here](https://developer.microsoft.com/en-us/windows/downloads/sdk-archive)) and [WDK](https://learn.microsoft.com/en-ie/windows-hardware/drivers/other-wdk-downloads#step-2-install-the-wdk). Ensure the versions match your Windows system. 
+ - Python 3.9 - 3.12 + - Git for Windows (required for `git lfs`) + - Ninja + - Additional dependency to build with CMake Presets: + - ccache (Download latest version of ccache binaries or [build from source](https://github.com/ccache/ccache/releases)) + >Note: The build options defined in the CMakePresets.json require ccache, and it must be installed to use the default presets. If installation is not possible, the relevant entries in [CMakePresets.json](../../../CMakePresets.json#L7-L16) should be removed and updated accordingly. +## Next Steps + +Pre-build Preparation: +* [pre-build](./prebuild.md) + +Linux: build with CMake Options (Recommended for the first Driver Compiler build): +* [how to build Driver Compiler on Linux](./build/build_linux.md) + +Linux: build with CMake Presets: +* [how to build Driver Compiler with CMake Presets on Linux](./build/build_with_cmake_presets_linux.md) + +Windows: build with CMake Options (Recommended for the first Driver Compiler build): +* [how to build Driver Compiler on Windows](./build/build_windows.md) + +Windows: build with CMake Presets: +* [how to build Driver Compiler with CMake Presets on Windows](./build/build_with_cmake_presets_windows.md) + + diff --git a/src/vpux_driver_compiler/docs/how_to_debug.md b/src/vpux_driver_compiler/docs/test_and_debug/debug.md similarity index 82% rename from src/vpux_driver_compiler/docs/how_to_debug.md rename to src/vpux_driver_compiler/docs/test_and_debug/debug.md index fee280c1de..82333905d8 100644 --- a/src/vpux_driver_compiler/docs/how_to_debug.md +++ b/src/vpux_driver_compiler/docs/test_and_debug/debug.md @@ -1,5 +1,4 @@ -# How to debug - +# Driver Compiler Debug Methods ## Logs To change the compiler behavior, a configuration file can be used with the `compilerTest` tool. For example, to change the logging level, use `-log_level LOG_TRACE` in the command line or by using a configuration file. 
The content for the configuration file is as follows: @@ -7,11 +6,11 @@ To change the compiler behavior, a configuration file can be used with the `comp LOG_LEVEL LOG_TRACE ``` -## Other tools +## Other Tools One can also use the tools from [NPU-Plugin Project] and [OpenVINO Project]. -### compile_tool +### `compile_tool` `compile_tool` can compile a network into a blob. If you test it for Driver Compiler, you need to set the configuration option in the configuration file. @@ -20,29 +19,29 @@ The general command on Git Bash is: ./compile_tool -m -d NPU.4000 -c ``` -Here is an example: +Example: ```bash ./compile_tool -m path/to/googlenet-v1.xml -d NPU.4000 -c /path/to/config.txt ``` -where the content of config.txt is: +where the content of `config.txt` is: ```bash NPU_COMPILER_TYPE DRIVER ``` -### benchmark_app +### `benchmark_app` `benchmark_app` is used to estimate inference performance. If you test it for Driver Compiler, you need to set the configuration option in the configuration file. -The general command in Git Bash: +The general command in Git Bash is: ```bash ./benchmark_app -m -load_config= -d NPU.4000 ``` -Here is an example: +Example: ``` bash ./benchmark_app -m /path/to/mobilenet-v2.xml -load_config=/path/to/config.txt -d NPU.4000 ``` -where the content of config.txt is: +where the content of `config.txt` is: ``` { "NPU" : { @@ -51,7 +50,7 @@ where the content of config.txt is: } ``` -### timetest suite +### `timetest suite` `timetest suite` is used to measure both total and partial execution time. You can install the timetest suite by following the [time_tests/README.md](https://github.com/openvinotoolkit/openvino/blob/master/tests/time_tests/README.md). If you test it for Driver Compiler, you need to set the configuration option in the configuration file. 
@@ -60,16 +59,16 @@ The general command in Git Bash: python3 ./scripts/run_timetest.py ../../bin/intel64/Release/timetest_infer_api_2.exe -m -d NPU.4000 -f ``` -Here is an example: +Example: ```bash python3 scripts\run_timetest.py build\src\timetests\Release\timetest_infer.exe -m googlenet-v1.xml -d NPU.4000 -f config.txt ``` -where the content of config.txt is: +where the content of `config.txt` is: ``` NPU_COMPILER_TYPE DRIVER ``` ->Note: For more debug methods and details, refer to **[how to debug](../../vpux_compiler/docs/guides/how_to_debug.md)** in the vpux_compiler section. +>Note: For more debug methods and details, refer to [how to debug](../../../vpux_compiler/docs/guides/how_to_debug.md) in the vpux_compiler section. [OpenVINO Project]: https://github.com/openvinotoolkit/openvino diff --git a/src/vpux_driver_compiler/docs/test_and_debug/enable_sideloading.md b/src/vpux_driver_compiler/docs/test_and_debug/enable_sideloading.md new file mode 100644 index 0000000000..2c355da9f5 --- /dev/null +++ b/src/vpux_driver_compiler/docs/test_and_debug/enable_sideloading.md @@ -0,0 +1,165 @@ +# Enable Sideloading for Driver Compiler + +This guide explains how to sideload the Driver Compiler (also known as Compiler in Driver and CiD) on Linux and Windows. Sideloading allows you to test a custom-built Driver Compiler without reinstalling the driver. + +For example, sideloading can be used with `compile_tool` or `benchmark_app` from OpenVINO to compile or infer neural-network models with an external Driver Compiler. + +## Table of Contents +- [General Steps](#general-steps) +- [Linux Sideloading Example](#linux-sideloading-example) +- [Windows Sideloading Example](#windows-sideloading-example) +- [Notes](#notes) + +## General Steps + +1. **Create a new directory** (e.g., `cid_alt_rel`, `cid_alt_deb`, or `cid_alt_rdi`) for your sideload libraries. +2. 
**Copy the Driver Compiler library** (`libnpu_driver_compiler.so` for Linux, `npu_driver_compiler.dll` for Windows) into the directory. + - For Windows Debug builds, rename `npu_driver_compilerd.dll` to `npu_driver_compiler.dll`. +3. **Copy the required oneTBB libraries** (release or debug, matching your build type) into the same directory. +4. **Set the appropriate environment variable** to point to this directory: + - Linux: `export LD_LIBRARY_PATH=/path/to/your/dir` + - Windows: `set NPU_ALT_DEPENDENCY_PATH=C:\path\to\your\dir` +5. **(Windows Debug only)**: Copy `ucrtbased.dll` from `C:\Windows\System32` into your sideloading directory. + +>**Note**: For sideloading oneTBB libraries on Windows, you must use the OneCore version. If you are unsure how to build the OneCore version of oneTBB, please refer to [the instructions for building the OneCore version of oneTBB](../FAQ.md#windows-onecore-tbb-build). + + +## Linux Sideloading Example + +
+Example for Release + +```sh +# 1. Prepare directory +mkdir cid_alt_rel && cd cid_alt_rel + +# 2. Copy Driver Compiler +cp /path/to/libnpu_driver_compiler.so . + +# 3. Copy Release oneTBB libraries +# Release: +cp /path/to/libtbb.so.xx.xx libtbb.so.12 +cp /path/to/libtbbmalloc.so.xx.xx libtbbmalloc.so.2 + +# 4. Set sideloading environment variable +export LD_LIBRARY_PATH=$(pwd) +``` +>Note: Use `ldd libnpu_driver_compiler.so` to check which TBB variant is required. +
+ +
+Example for RelWithDebInfo + +```sh +# 1. Prepare directory +mkdir cid_alt_rdi && cd cid_alt_rdi + +# 2. Copy Driver Compiler +cp /path/to/libnpu_driver_compiler.so . + +# 3. Copy **Debug** oneTBB libraries +cp /path/to/libtbb_debug.so.xx.xx libtbb_debug.so.12 +cp /path/to/libtbbmalloc_debug.so.xx.xx libtbbmalloc_debug.so.2 + +# 4. Set sideloading environment variable +export LD_LIBRARY_PATH=$(pwd) +``` +
+ +
+Example for Debug + +```sh +# 1. Prepare directory +mkdir cid_alt_deb && cd cid_alt_deb + +# 2. Copy Driver Compiler +cp /path/to/libnpu_driver_compiler.so . + +# 3. Copy Debug oneTBB libraries +cp /path/to/libtbb_debug.so.xx.xx libtbb_debug.so.12 +cp /path/to/libtbbmalloc_debug.so.xx.xx libtbbmalloc_debug.so.2 + +# 4. Set sideloading environment variable +export LD_LIBRARY_PATH=$(pwd) +``` +
+ +## Windows Sideloading Example + +
+Example for Release + +```bat +@REM 1. Prepare directory +md cid_alt_rel +cd cid_alt_rel + +@REM 2. Copy Driver Compiler +copy C:\path\to\npu_driver_compiler.dll . + +@REM 3. Copy Release oneTBB libraries +@REM Release: +copy C:\path\to\tbb12.dll . +copy C:\path\to\tbbmalloc.dll . + +@REM 4. Set sideloading environment variable +set NPU_ALT_DEPENDENCY_PATH=%cd% +``` +
+ +
+Example for RelWithDebInfo + +```bat +@REM 1. Prepare directory +md cid_alt_rdi +cd cid_alt_rdi + +@REM 2. Copy Driver Compiler +copy C:\path\to\npu_driver_compiler.dll . + + +@REM 3. Copy **Debug** oneTBB libraries +copy C:\path\to\tbb12_debug.dll . +copy C:\path\to\tbbmalloc_debug.dll . + +@REM 4. Set sideloading environment variable +set NPU_ALT_DEPENDENCY_PATH=%cd% +``` +
+ +
+Example for Debug + +```bat +@REM 1. Prepare directory +md cid_alt_deb +cd cid_alt_deb + +@REM 2. Copy Debug Driver Compiler and rename for Debug build +copy C:\path\to\npu_driver_compilerd.dll npu_driver_compiler.dll + +@REM 3. Copy Debug oneTBB libraries +@REM Debug: +copy C:\path\to\tbb12_debug.dll . +copy C:\path\to\tbbmalloc_debug.dll . + +@REM For Debug builds, copy Debug CRT library +copy C:\Windows\System32\ucrtbased.dll . + +@REM 4. Set sideloading environment variable +set NPU_ALT_DEPENDENCY_PATH=%cd% +``` +>Note: For Debug builds, always rename the DLL and copy `ucrtbased.dll` if not present. +
+ + +## Notes + +- **TBB Source:** You can use prebuilt TBB from + - [oneTBB releases](https://github.com/oneapi-src/oneTBB/releases), + - or your own build of TBB +- **Library Names:** Always rename TBB libraries as required by your platform and build type. +- **Dependency Check:** Use `ldd` (Linux) or `dumpbin /dependents` (Windows) to verify all dependencies are satisfied. +- **RelWithDebInfo:** On both platforms, this build type may require debug TBB libraries. diff --git a/src/vpux_driver_compiler/docs/test_and_debug/legacy_debug.md b/src/vpux_driver_compiler/docs/test_and_debug/legacy_debug.md new file mode 100644 index 0000000000..a666daf1ef --- /dev/null +++ b/src/vpux_driver_compiler/docs/test_and_debug/legacy_debug.md @@ -0,0 +1,20 @@ +# Legacy Debug Methods for Driver Compiler + +>**Note**: The usage of `compilerTest` has been updated. This document describes legacy debug methods for historical reference only. For up-to-date debugging, please refer to the [main debug guide](./debug.md). + +## Logs for Legacy Usage of CompilerTest + +To change the compiler behavior, use a configuration file with the `compilerTest`. To change the log level, use `LOG_LEVEL="LOG_TRACE"` in the configuration file. 
+ +Example configuration for `googlenet-v1`: +```sh +--inputs_precisions="input:fp16" --inputs_layouts="input:NCHW" --outputs_precisions="InceptionV1/Logits/Predictions/Softmax:fp16" --outputs_layouts="InceptionV1/Logits/Predictions/Softmax:NC" --config NPU_PLATFORM="4000" DEVICE_ID="NPU.4000" LOG_LEVEL="LOG_TRACE" NPU_COMPILATION_MODE="DefaultHW" NPU_COMPILATION_MODE_PARAMS="swap-transpose-with-fq=1 force-z-major-concat=1 quant-dequant-removal=1 propagate-quant-dequant=0" +``` + +## See Also + +See the main debug guide: +* [how to debug](./debug.md) + +See the main test guide: +* [how to test](./test.md) diff --git a/src/vpux_driver_compiler/docs/test_and_debug/legacy_test.md b/src/vpux_driver_compiler/docs/test_and_debug/legacy_test.md new file mode 100644 index 0000000000..b8cb6659a7 --- /dev/null +++ b/src/vpux_driver_compiler/docs/test_and_debug/legacy_test.md @@ -0,0 +1,55 @@ +# Legacy Test Methods for Driver Compiler + +>Note: The usage of `compilerTest` has been updated. This document describes legacy test methods for historical reference only. For up-to-date debugging, please refer to the [main test guide](./test.md). + +## Legacy Usage of compilerTest + +`compilerTest` demonstrates the full Driver Compiler API. You can use IR models for testing in Git Bash (Windows) or Linux shell. +General usage: +```sh +./compilerTest .xml .bin output.net +./compilerTest .xml .bin output.net config.file +``` + +>Note: In `compilerTest`, if you do not pass a configuration file in command line, an empty configuration will be used for the tested model. The `config.file` is used for updating the IR V10 version model's precision and layout. You can also use an empty `config.file` as input. For `config.file` details, please refer to the next section [Explanation for Configuration](#explanation-for-configuration). 
+ +### Explanation for Configuration + +For example, a configuration for googlenet-v1 for legacy usage is as follows: +```sh +--inputs_precisions="input:fp16" +--inputs_layouts="input:NCHW" +--outputs_precisions="InceptionV1/Logits/Predictions/Softmax:fp16" +--outputs_layouts="InceptionV1/Logits/Predictions/Softmax:NC" +--config NPU_PLATFORM="4000" DEVICE_ID="NPU.4000" NPU_COMPILATION_MODE="DefaultHW" NPU_COMPILATION_MODE_PARAMS="swap-transpose-with-fq=1 force-z-major-concat=1 quant-dequant-removal=1 propagate-quant-dequant=0" +``` + +In the configuration, the contents are: +- `inputs_precisions`: Precision of input node to be used. +- `inputs_layouts`: Layout of input node to be used. +- `outputs_precisions`: Precision of output node to be used. +- `outputs_layouts`: Layout of output node to be used. +- `config`: sets compile configurations as defined in [`Supported Properties` part](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/README.md#supported-properties). + + +To obtain a configuration file, first run the test model with benchmark_app from the [OpenVINO Project] to get the model's node names. Run `./benchmark_app -m /path/to/model.xml` in Windows Git Bash or Linux shell, for example, using googlenet-v1: +```sh +./benchmark_app -m /path/to/googlenet-v1.xml +``` +The output of `[step4/11] Reading model files` and `[step6/11] Configuring input of the model` shows the input and output node information. The googlenet-v1 log output is shown below: + ![alt text](../imgs/image_config.png) + +Each parameter consists of a node name and precision, separated by a colon. If there are multiple input or output nodes, separate each node with a space. +Another method to generate the configuration file can be found [here](../api_reference.md#vclallocatedexecutablecreate2). 
+ + +## See Also + +See the main test and debug guide: + +* [how to test](./test.md) +* [how to debug](./debug.md) + + +[OpenVINO Project]: https://github.com/openvinotoolkit/openvino +[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/docs/test_and_debug/test.md b/src/vpux_driver_compiler/docs/test_and_debug/test.md new file mode 100644 index 0000000000..175222c15b --- /dev/null +++ b/src/vpux_driver_compiler/docs/test_and_debug/test.md @@ -0,0 +1,97 @@ +# Driver Compiler Test Methods + +This guide covers the test tools provided by the Driver Compiler for validating functionality and performance. All tests run in Git Bash (Windows) or Linux shell. + +## compilerTest + +`compilerTest` demonstrates the full Driver Compiler API. +General command: +```sh +./compilerTest -m xxx.xml -d NPU.XXXX +``` +Commonly used command line parameters (same command line options as [compile_tool](https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_npu/tools/compile_tool/main.cpp)): + +- (required) `-m model.xml`: Specifies the model path. Please ensure the model IR file is complete. + +- (required) `-d NPU.XXXX`: Specifies the simulated platform. + +- (optional) `-o output.net`: Specifies the output network name. + +- (optional) `-c config.file`: Uses the same configuration format as `compile_tool`. +To save the serialized IR, please use the `CID_GET_SERIALIZED_MODEL` environment variable. + +## profilingTest + +`profilingTest` is used to output profiling information. + +General command: +```sh +./profilingTest .blob profiling-0.bin +``` + +To get the blob file, use compilerTest or [compile_tool](https://github.com/openvinotoolkit/npu_compiler/tree/master/tools/compile_tool) of the [NPU-Plugin Project]. + +To get the profiling-0.bin and more profiling details, please see [how to use profiling.md](../../../../guides/how-to-use-profiling.md) in the [NPU-Plugin Project]. 
+ + +## loaderTest + +`loaderTest` is used to check whether the Driver Compiler header is available. + +General command: +```sh +./loaderTest -v=1 +./loaderTest -v=0 +``` + +## vpuxCompilerL0Test + +`vpuxCompilerL0Test` is the test suite of the Driver Compiler. Its test range is defined in the [vpux_driver_compiler/test/functional/scripts](../../test/functional/scripts). + +### Setup + +Set `POR_PATH` manually. `POR_PATH` is the test models' root folder. +```sh +# Copy and unpack POR model to special location +tar -xvjf path/to/por_model.tar.bz2 +export POR_PATH=/path/to/por_model +``` + +Set `CID_TOOL` to load the configuration JSON files. You can use the configuration JSON files in the [vpux_driver_compiler/test/functional/scripts](../../test/functional/scripts) folder of this repository. +```sh +# Set the configuration JSON files +export CID_TOOL=/path/to/configuration/JSON +``` + +
+Configuration JSON Format + +The [configuration JSON](../../test/functional/scripts) files contain: + +- `device`: Platform on which it will run. +- `enabled`: Whether this model will be executed. +- `network`: Name of the model to run (the simple_function model is defined [here](../../test/functional/vcl_tests_common.cpp#L126)). +- `info`: Model configuration (optional), format: `"--inputs_precisions=\"result.1:fp16\" --inputs_layouts=\"result.1:NCHW\" --outputs_precisions=\"473:fp16\" --outputs_layouts=\"473:NC\" --config NPU_COMPILATION_MODE=\"DefaultHW\""`. + - `--inputs_precisions` and `--outputs_precisions` consist of the names of the inputs and outputs along with the precision to be used. + - `--inputs_layouts` and `--outputs_layouts` consist of the names of the inputs and outputs along with the desired layout modifications. + - `--config` is what config you want to use. +- `path`: Relative path to the current model using the POR model or a custom model. +
+ +### Running Tests + +Run all tests: +```sh +./vpuxCompilerL0Test +``` +Or run specific tests using `gtest_filter`, e.g., to test resnet-50-pytorch: +```sh +./vpuxCompilerL0Test --gtest_filter=*resnet*50*pytorch* +``` + + +>Note: For more debugging methods and details, refer to [how to debug](../../../vpux_compiler/docs/guides/how_to_debug.md) in vpux_compiler part. + + +[OpenVINO Project]: https://github.com/openvinotoolkit/openvino +[NPU-Plugin Project]: https://github.com/openvinotoolkit/npu_compiler diff --git a/src/vpux_driver_compiler/src/loader/CMakeLists.txt b/src/vpux_driver_compiler/src/loader/CMakeLists.txt index 7245a39531..7aa2bb6a2a 100644 --- a/src/vpux_driver_compiler/src/loader/CMakeLists.txt +++ b/src/vpux_driver_compiler/src/loader/CMakeLists.txt @@ -9,12 +9,12 @@ endif() set(TARGET_NAME VPUXLoader) -set(ELF_DIR "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/") -set(LOADER_DIR_INC "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/include/") -set(LOADER_DIR_HPI_INC "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/hpi_component/include/") -set(LOADER_DIR_HPI_SRC "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/hpi_component/src") -set(LOADER_DIR_SRC "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/src") -set(LOADER_DIR "${InferenceEngineVPUXPlugin_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/") +set(ELF_DIR "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/") +set(LOADER_DIR_INC "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/include/") +set(LOADER_DIR_HPI_INC "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/hpi_component/include/") +set(LOADER_DIR_HPI_SRC "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/hpi_component/src") +set(LOADER_DIR_SRC "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/src") +set(LOADER_DIR "${PROJECT_SOURCE_DIR}/thirdparty/elf/vpux_elf/loader/") if (WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git 
a/src/vpux_driver_compiler/src/vpux_compiler_l0/CMakeLists.txt b/src/vpux_driver_compiler/src/vpux_compiler_l0/CMakeLists.txt index ccdfd9fd5a..3f9e5eacbe 100644 --- a/src/vpux_driver_compiler/src/vpux_compiler_l0/CMakeLists.txt +++ b/src/vpux_driver_compiler/src/vpux_compiler_l0/CMakeLists.txt @@ -69,6 +69,10 @@ target_link_libraries(${TARGET_NAME} npu_profiling_utils ) +if(UNIX) + target_link_options(${TARGET_NAME} PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/vcl.exports) +endif() + add_dependencies(${TARGET_NAME} VPUXLoader) ov_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) @@ -89,6 +93,6 @@ endif() install( FILES - "${InferenceEngineVPUXPlugin_SOURCE_DIR}/src/vpux_driver_compiler/include/npu_driver_compiler.h" + "${PROJECT_SOURCE_DIR}/src/vpux_driver_compiler/include/npu_driver_compiler.h" DESTINATION cid COMPONENT ${CID_COMPONENT}) diff --git a/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl.exports b/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl.exports new file mode 100644 index 0000000000..be7020fcd0 --- /dev/null +++ b/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl.exports @@ -0,0 +1,12 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +{ + global: + vcl*; + local: + *; +}; diff --git a/src/vpux_driver_compiler/test/functional/scripts/test.json b/src/vpux_driver_compiler/test/functional/scripts/test.json new file mode 100644 index 0000000000..e6f115e4bc --- /dev/null +++ b/src/vpux_driver_compiler/test/functional/scripts/test.json @@ -0,0 +1,8 @@ +{"device": "3720", "enabled": "false", "network": "yolo_v4", "info": "--inputs_precisions=\"image_input:fp16\" --inputs_layouts=\"image_input:NCHW\" --outputs_precisions=\"conv2d_93/BiasAdd/Add:fp16 conv2d_101/BiasAdd/Add:fp16 conv2d_109/BiasAdd/Add:fp16\" --outputs_layouts=\"conv2d_93/BiasAdd/Add:NCHW conv2d_101/BiasAdd/Add:NCHW conv2d_109/BiasAdd/Add:NCHW\" --config NPU_COMPILATION_MODE=\"DefaultHW\"", "path": "yolo_v4/tf/FP16-INT8"} 
+{"device": "3720", "enabled": "true", "network": "resnet-50-pytorch", "info": "", "path": "resnet-50-pytorch/onnx/model/dense/FP16"} +{"device": "3720", "enabled": "true", "network": "unet-camvid-onnx-0001", "info": "", "path": "unet-camvid-onnx-0001/onnx/model/dense/FP16"} +{"device": "3720", "enabled": "true", "network": "yolo-v4-tiny", "info": "", "path": "yolo-v4-tiny/tf/model/dense/FP16"} +{"device": "4000", "enabled": "true", "network": "resnet-50-pytorch", "info": "", "path": "resnet-50-pytorch/onnx/model/dense/FP16"} +{"device": "4000", "enabled": "true", "network": "edsr_nas_s", "info": "", "path": "edsr_nas_s/onnx/model/dense/FP16"} +{"device": "4000", "enabled": "true", "network": "unet-camvid-onnx-0001", "info": "", "path": "unet-camvid-onnx-0001/onnx/model/dense/FP16"} +{"device": "4000", "enabled": "true", "network": "yolo-v4-tiny", "info": "", "path": "yolo-v4-tiny/tf/model/dense/FP16"} diff --git a/src/vpux_driver_compiler/test/functional/scripts/test_smoke.json b/src/vpux_driver_compiler/test/functional/scripts/test_smoke.json new file mode 100644 index 0000000000..2d62a07af4 --- /dev/null +++ b/src/vpux_driver_compiler/test/functional/scripts/test_smoke.json @@ -0,0 +1,6 @@ +{"device": "3720", "enabled": "true", "network": "Intel_DNS", "info": "", "path": "Intel_DNS/onnx/model/dense/FP32"} +{"device": "3720", "enabled": "true", "network": "mobilenet-v2", "info": "", "path": "mobilenet-v2/onnx/model/dense/FP16"} +{"device": "3720", "enabled": "true", "network": "simple_function", "info": "", "path": ""} +{"device": "4000", "enabled": "true", "network": "Intel_DNS", "info": "", "path": "Intel_DNS/onnx/model/dense/FP32"} +{"device": "4000", "enabled": "true", "network": "mobilenet-v2", "info": "", "path": "mobilenet-v2/onnx/model/dense/FP16"} +{"device": "4000", "enabled": "true", "network": "simple_function", "info": "", "path": ""} diff --git a/src/vpux_elf/tests/act-kernel.mlir b/src/vpux_elf/tests/act-kernel.mlir index cede734044..7ba762ea69 
100644 --- a/src/vpux_elf/tests/act-kernel.mlir +++ b/src/vpux_elf/tests/act-kernel.mlir @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DMA_NN IE.ExecutorResource 1 of @SHAVE_ACT IE.TileResource 1 of @NCE { diff --git a/src/vpux_elf/tests/nncmx-two-dma-barrier-elf.mlir b/src/vpux_elf/tests/nncmx-two-dma-barrier-elf.mlir index 5a0b380f7e..f3da0a83dc 100644 --- a/src/vpux_elf/tests/nncmx-two-dma-barrier-elf.mlir +++ b/src/vpux_elf/tests/nncmx-two-dma-barrier-elf.mlir @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DMA_NN IE.ExecutorResource 1 of @SHAVE_ACT IE.TileResource 1 of @NCE { 
diff --git a/src/vpux_elf/tests/two-dma-barrier-elf.mlir b/src/vpux_elf/tests/two-dma-barrier-elf.mlir index d23fa9a52f..c002f005e1 100644 --- a/src/vpux_elf/tests/two-dma-barrier-elf.mlir +++ b/src/vpux_elf/tests/two-dma-barrier-elf.mlir @@ -3,9 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DMA_NN IE.ExecutorResource 1 of @SHAVE_ACT IE.TileResource 1 of @NCE { diff --git a/src/vpux_elf/tests/two-dma-ports.mlir b/src/vpux_elf/tests/two-dma-ports.mlir index a9f11dff1c..03e4819a73 100644 --- a/src/vpux_elf/tests/two-dma-ports.mlir +++ b/src/vpux_elf/tests/two-dma-ports.mlir @@ -4,15 +4,15 @@ // #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @mainModule attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @mainModule attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + 
IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @race_condition_dma_f16_f16 inputsInfo : { DataInfo "input_0" : tensor<1x16x16x16xf16, {order = #NHWC}> } outputsInfo : { diff --git a/src/vpux_utils/include/vpux/utils/core/checked_cast.hpp b/src/vpux_utils/include/vpux/utils/core/checked_cast.hpp index 131120c7d0..d8acde23ae 100644 --- a/src/vpux_utils/include/vpux/utils/core/checked_cast.hpp +++ b/src/vpux_utils/include/vpux/utils/core/checked_cast.hpp @@ -20,26 +20,6 @@ namespace vpux { -namespace details { - -template -std::enable_if_t staticIf(Func&& func) { - func(); -} - -template -std::enable_if_t staticIf(Func&&) { -} - -// To overcame syntax parse error, when `>` comparison operator is threated as -// template closing bracket -template -constexpr bool Greater(T1&& v1, T2&& v2) { - return v1 > v2; -} - -} // namespace details - template enable_t> checked_cast(InT value) { return value; @@ -49,15 +29,15 @@ template enable_t, std::is_signed, std::is_integral, std::is_signed, not_>> checked_cast(InT value) { - details::staticIf::lowest() < std::numeric_limits::lowest()>([&] { + if constexpr (std::numeric_limits::lowest() < std::numeric_limits::lowest()) { VPUX_THROW_UNLESS(value >= std::numeric_limits::lowest(), "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - }); + } - details::staticIf::max(), std::numeric_limits::max())>([&] { + if constexpr (std::numeric_limits::max() > std::numeric_limits::max()) { VPUX_THROW_UNLESS(value <= std::numeric_limits::max(), "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - }); + } return static_cast(value); } @@ -66,10 +46,10 @@ template enable_t, std::is_unsigned, std::is_integral, std::is_unsigned, 
not_>> checked_cast(InT value) { - details::staticIf::max(), std::numeric_limits::max())>([&] { + if constexpr (std::numeric_limits::max() > std::numeric_limits::max()) { VPUX_THROW_UNLESS(value <= std::numeric_limits::max(), "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - }); + } return static_cast(value); } @@ -77,12 +57,12 @@ checked_cast(InT value) { template enable_t, std::is_unsigned, std::is_integral, std::is_signed> checked_cast( InT value) { - details::staticIf::max(), - static_cast>(std::numeric_limits::max()))>([&] { + if constexpr (std::numeric_limits::max() > + static_cast>(std::numeric_limits::max())) { VPUX_THROW_UNLESS(value <= static_cast>(std::numeric_limits::max()), "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - }); + } return static_cast(value); } @@ -93,12 +73,12 @@ enable_t, std::is_signed, std::is_integral= 0, "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - details::staticIf>(std::numeric_limits::max()), - std::numeric_limits::max())>([&] { + if constexpr (static_cast>(std::numeric_limits::max()) > + std::numeric_limits::max()) { VPUX_THROW_UNLESS(static_cast>(value) <= std::numeric_limits::max(), "Can not safely cast {0} from {1} to {2}", static_cast(value), llvm::getTypeName(), llvm::getTypeName()); - }); + } return static_cast(value); } diff --git a/src/vpux_utils/include/vpux/utils/core/format.hpp b/src/vpux_utils/include/vpux/utils/core/format.hpp index 221e15e8f7..9202517bc7 100644 --- a/src/vpux_utils/include/vpux/utils/core/format.hpp +++ b/src/vpux_utils/include/vpux/utils/core/format.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,8 +29,6 @@ #include #include -#include - namespace vpux { // diff --git a/src/vpux_utils/include/vpux/utils/profiling/parser/records.hpp 
b/src/vpux_utils/include/vpux/utils/profiling/parser/records.hpp index adfca5d7d3..d4ab6895bf 100644 --- a/src/vpux_utils/include/vpux/utils/profiling/parser/records.hpp +++ b/src/vpux_utils/include/vpux/utils/profiling/parser/records.hpp @@ -16,7 +16,6 @@ #include "vpux/utils/profiling/parser/hw.hpp" #include "vpux/utils/profiling/tasknames.hpp" -#include "schema/graphfile_generated.h" #include "schema/profiling_generated.h" #include @@ -164,22 +163,6 @@ class RawProfilingRecord { : _name(name), _layerType(layerType), _waitBarriers(wBarriers), _updateBarriers(uBarriers) { } -private: - RawProfilingRecord(const std::string& cleanName, const std::string& layerType, const MVCNN::Task* task) - : _name(cleanName), _layerType(layerType) { - VPUX_THROW_WHEN(task == nullptr, "Invalid task"); - VPUX_THROW_WHEN(task->name() == nullptr, "Invalid task name"); - VPUX_THROW_WHEN(task->associated_barriers() == nullptr, "Task should have associated barriers"); - - auto barriers = task->associated_barriers(); - if (auto wBarriers = barriers->wait_barriers()) { - _waitBarriers = BarriersSet(wBarriers->cbegin(), wBarriers->cend()); - } - if (auto uBarriers = barriers->update_barriers()) { - _updateBarriers = BarriersSet(uBarriers->cbegin(), uBarriers->cend()); - } - } - protected: virtual ~RawProfilingRecord() = default; diff --git a/src/vpux_utils/src/profiling/CMakeLists.txt b/src/vpux_utils/src/profiling/CMakeLists.txt index 1879a4a46d..007c7d8bd9 100644 --- a/src/vpux_utils/src/profiling/CMakeLists.txt +++ b/src/vpux_utils/src/profiling/CMakeLists.txt @@ -17,7 +17,7 @@ endif() vpux_add_flatc_target(npu_profiling_schema SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/schema" - DST_DIR "${PROJECT_BINARY_DIR}/${gen_base_dst_include_dir}/profiling/generated" + DST_DIR "${PROJECT_BINARY_DIR}/profiling/generated" ARGS "--gen-object-api" "--reflect-names") set_target_properties(npu_profiling_schema PROPERTIES FOLDER "src") @@ -60,7 +60,6 @@ ov_link_system_libraries(${TARGET_NAME} PRIVATE 
flatbuffers npu_elf - npu_mlir_compiler_schema npu_profiling_schema ) diff --git a/src/vpux_utils/src/profiling/metadata.cpp b/src/vpux_utils/src/profiling/metadata.cpp index 550ed44db5..e5ef4d00af 100644 --- a/src/vpux_utils/src/profiling/metadata.cpp +++ b/src/vpux_utils/src/profiling/metadata.cpp @@ -5,7 +5,6 @@ #include "vpux/utils/profiling/metadata.hpp" -#include "schema/graphfile_generated.h" #include "schema/profiling_generated.h" #include diff --git a/sw_runtime_kernels/kernels/CMakeLists.txt b/sw_runtime_kernels/kernels/CMakeLists.txt index ab237ee29f..1c1b5c29a8 100644 --- a/sw_runtime_kernels/kernels/CMakeLists.txt +++ b/sw_runtime_kernels/kernels/CMakeLists.txt @@ -5,34 +5,40 @@ cmake_minimum_required(VERSION 3.20) project(kernels) + option(ENABLE_SHAVE_BINARIES_BUILD "Enable shave binaries build, if disabled, prebuilt binaries will be used" OFF) option(ENABLE_MANAGEMENT_KERNEL_BUILD "Enable management kernel build" OFF) option(ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD "Enable firmware.vpu.client sources kernels build" OFF) +set(prebuild_binary_dir "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/act_shave_bin") set(target_binary_dir "${CMAKE_CURRENT_BINARY_DIR}/act_shave_bin") file(MAKE_DIRECTORY ${target_binary_dir}) set(target_asm_dir "${CMAKE_CURRENT_BINARY_DIR}/act_shave_asm") file(MAKE_DIRECTORY ${target_asm_dir}) # Establish if we want to build the kernels or just use prebuilts -if(NOT (ENABLE_SHAVE_BINARIES_BUILD OR ENABLE_MANAGEMENT_KERNEL_BUILD OR ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD)) - set(build_kernels FALSE) -else() +if(ENABLE_SHAVE_BINARIES_BUILD OR ENABLE_MANAGEMENT_KERNEL_BUILD OR ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD) set(build_kernels TRUE) +else() + set(build_kernels FALSE) endif() # Defines section and general use code set(asm_suffix ".s") set(obj_suffix ".o") set(elf_suffix ".elf") -set(kernel_descrip_dir "${CMAKE_CURRENT_SOURCE_DIR}/descrip/") -set(kernel_descrip_list "") -set(act_shave_kernels "") +set(kernel_descrip_dir 
"${CMAKE_CURRENT_SOURCE_DIR}/descrip") +set(kernel_descrip_list) +set(all_kernels) +set(kernels_to_build) -add_custom_target(act_shave_kernels_ready) add_custom_target(act_shave_kernels_build) add_custom_target(act_shave_kernels_asm) +if(NOT DEFINED MV_TOOLS_PATH AND DEFINED ENV{MV_TOOLS_DIR} AND DEFINED ENV{MV_TOOLS_VERSION}) + set(MV_TOOLS_PATH $ENV{MV_TOOLS_DIR}/$ENV{MV_TOOLS_VERSION}) +endif() + # Enable building kernels from separate build folder if needed if(ENABLE_MLIR_COMPILER AND build_kernels) exists_mv_tools_version(available) @@ -45,12 +51,6 @@ else() set(MV_TOOLS_PATH "$ENV{IE_NPU_MV_TOOLS_PATH}") endif() -# Delete previously build binaries from the build folder to avoid copy of old binaries -file(REMOVE_RECURSE "${target_binary_dir}/*${elf_suffix}") -file(REMOVE_RECURSE "${target_asm_dir}/*${asm_suffix}") -file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/*${obj_suffix}") -file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/generated_shave_binary_resources.cpp") - if(build_kernels) if(UNIX) set(mv_tools_compile "${MV_TOOLS_PATH}/linux64/bin/moviCompile") @@ -65,31 +65,16 @@ if(build_kernels) set(link_libraries_list_VPU3720 "${MV_TOOLS_PATH}/common/moviCompile/lib/37xxxx/mlibm.a" "${MV_TOOLS_PATH}/common/moviCompile/lib/37xxxx/mlibc_lite.a" - "${MV_TOOLS_PATH}/common/moviCompile/lib/37xxxx/mlibc_lite_lgpl.a" "${MV_TOOLS_PATH}/common/moviCompile/lib/37xxxx/mlibcrt.a" ) set(link_libraries_list_VPU4000 "${MV_TOOLS_PATH}/common/moviCompile/lib/40xxxx/mlibm.a" "${MV_TOOLS_PATH}/common/moviCompile/lib/40xxxx/mlibc_lite.a" - "${MV_TOOLS_PATH}/common/moviCompile/lib/40xxxx/mlibc_lite_lgpl.a" "${MV_TOOLS_PATH}/common/moviCompile/lib/40xxxx/mlibcrt.a" ) endif() -# Main code block -# Only in case of binaries build we run through the whole descrip list -if(ENABLE_SHAVE_BINARIES_BUILD) - file(GLOB kernel_list "${kernel_descrip_dir}*.txt") - list(APPEND kernel_descrip_list ${kernel_list}) - # Remove MGMT kernels - file(GLOB mgmt_kernel_list 
"${kernel_descrip_dir}nnActEntry_*.txt") - list(REMOVE_ITEM kernel_descrip_list ${mgmt_kernel_list}) - # Remove kernels with firmware sources - file(GLOB firmware_kernel_list "${kernel_descrip_dir}*_fw.txt") - list(REMOVE_ITEM kernel_descrip_list ${firmware_kernel_list}) -endif() - if(ENABLE_MANAGEMENT_KERNEL_BUILD OR ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD) if(ENABLE_NPU_MONO) if(DEFINED ENV{FIRMWARE_VPU_DIR}) @@ -106,23 +91,19 @@ if(ENABLE_MANAGEMENT_KERNEL_BUILD OR ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD) set(FIRMWARE_VPU_DIR "$ENV{FIRMWARE_VPU_DIR}") endif() +endif() - # If we want to build MGMT kernels we should iterate through those descrip files - if(ENABLE_MANAGEMENT_KERNEL_BUILD) - file(GLOB mgmt_kernel_list "${kernel_descrip_dir}nnActEntry_*.txt") - list(APPEND kernel_descrip_list ${mgmt_kernel_list}) - endif() - - # If we want to build kernels with firmware sources we should iterate through those descrip files - if(ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD) - file(GLOB firmware_kernel_list "${kernel_descrip_dir}*_fw.txt") - list(APPEND kernel_descrip_list ${firmware_kernel_list}) - endif() - +# Make list of all kernel descriptors +if(build_kernels) + file(GLOB kernel_descrip_list RELATIVE "${kernel_descrip_dir}" CONFIGURE_DEPENDS "${kernel_descrip_dir}/*.txt") + set(KERNELS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") endif() -# Main iterator -foreach(kernel_descrip ${kernel_descrip_list}) +# +# Process descriptors and build kernels +# +foreach(kernel_descrip IN LISTS kernel_descrip_list) + ### PARSE DESCRIPTOR ### # Initial list of parameters, will be populated after we read descrip files set(kernel_entry "") set(kernel_src_dir "src") @@ -138,246 +119,274 @@ foreach(kernel_descrip ${kernel_descrip_list}) set(asm_include_list "") set(link_script_file "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/shave_kernel.ld") - # Reading descrip files one by one from the list created above - get_filename_component(kernel_descrip_path ${kernel_descrip} DIRECTORY) + # Reading descrip file + 
include("${kernel_descrip_dir}/${kernel_descrip}") - include("${kernel_descrip}") - get_filename_component(kernel_name ${kernel_src} NAME_WE) + if(NOT kernel_cpunum) + message(SEND_ERROR "Missing kernel_cpunum in descriptor ${kernel_descrip}") + elseif(NOT kernel_src) + message(SEND_ERROR "Missing kernel_src in descriptor ${kernel_descrip}") + endif() + get_filename_component(kernel_name ${kernel_src} NAME_WE) if(kernel_entry STREQUAL "") set(kernel_entry "${kernel_name}") endif() - if(build_kernels) - if(kernel_cpunum STREQUAL "3720") - list(APPEND define_symbols_list "USE_3720_INSTRUCTIONS") - set(link_libraries_list ${link_libraries_list_VPU3720}) - if(rt_kernel STREQUAL "yes") - list(APPEND define_symbols_list "CONFIG_TARGET_SOC_3720") - endif() - elseif(kernel_cpunum STREQUAL "4000") - list(APPEND define_symbols_list "USE_4000_INSTRUCTIONS") - set(link_libraries_list ${link_libraries_list_VPU4000}) - if(rt_kernel STREQUAL "yes") - list(APPEND define_symbols_list "CONFIG_TARGET_SOC_4000") - endif() - endif() + # Collect names of all binaries for prebuild clean-up + set(kernel_cpu "${kernel_cpunum}xx") + set(kernel_cpu_suffix ".${kernel_cpunum}xx") + set(elf_file "${kernel_name}${kernel_cpu_suffix}${elf_suffix}") + list(APPEND all_kernels "${elf_file}") + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround + set(elf_file_lsu0_wo "${kernel_name}${kernel_cpu_suffix}_lsu0_wo${elf_suffix}") + list(APPEND all_kernels "${elf_file_lsu0_wo}") + endif() + + # Skip kernel build if not enabled + if((rt_kernel AND NOT ENABLE_MANAGEMENT_KERNEL_BUILD) OR + (firmware_kernel AND NOT ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD) OR + (NOT (rt_kernel OR firmware_kernel) AND NOT ENABLE_SHAVE_BINARIES_BUILD)) + continue() + endif() - if(always_inline STREQUAL "yes") - list(APPEND define_symbols_list "CONFIG_ALWAYS_INLINE") + if(kernel_cpunum STREQUAL "3720") + list(APPEND define_symbols_list "USE_3720_INSTRUCTIONS") + set(link_libraries_list 
${link_libraries_list_VPU3720}) + if(rt_kernel) + list(APPEND define_symbols_list "CONFIG_TARGET_SOC_3720") + endif() + elseif(kernel_cpunum STREQUAL "4000") + list(APPEND define_symbols_list "USE_4000_INSTRUCTIONS") + set(link_libraries_list ${link_libraries_list_VPU4000}) + if(rt_kernel) + list(APPEND define_symbols_list "CONFIG_TARGET_SOC_4000") endif() + endif() - list(INSERT include_dirs_list 0 "${CMAKE_CURRENT_SOURCE_DIR}/inc") - list(INSERT define_symbols_list 0 "__act_shave__") + if(always_inline) + list(APPEND define_symbols_list "CONFIG_ALWAYS_INLINE") endif() - set(kernel_cpu "${kernel_cpunum}xx") - set(kernel_cpu_suffix ".${kernel_cpunum}xx") + list(INSERT include_dirs_list 0 "${CMAKE_CURRENT_SOURCE_DIR}/inc") + list(INSERT define_symbols_list 0 "__act_shave__") + + ### RESOLVE FILE NAMES ### set(kernel_src_file "${kernel_src_dir}/${kernel_src}") + if (IS_ABSOLUTE "${kernel_src_file}") + set(kernel_src_path "${kernel_src_file}") + else() + set(kernel_src_path "${CMAKE_CURRENT_SOURCE_DIR}/${kernel_src_file}") + endif() set(obj_file "${kernel_src}${kernel_cpu_suffix}${obj_suffix}") - set(elf_file "${kernel_name}${kernel_cpu_suffix}${elf_suffix}") + set(dep_file "${kernel_src}${kernel_cpu_suffix}.d") set(elf_path "${target_binary_dir}/${elf_file}") set(asm_file "${kernel_name}${kernel_cpu_suffix}${asm_suffix}") set(asm_path "${target_asm_dir}/${asm_file}") - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround set(obj_file_lsu0_wo "${kernel_src}${kernel_cpu_suffix}_lsu0_wo${obj_suffix}") - set(elf_file_lsu0_wo "${kernel_name}${kernel_cpu_suffix}_lsu0_wo${elf_suffix}") + set(dep_file_lsu0_wo "${kernel_src}${kernel_cpu_suffix}_lsu0_wo.d") + set(elf_path_lsu0_wo "${target_binary_dir}/${elf_file_lsu0_wo}") set(asm_file_lsu0_wo "${kernel_name}${kernel_cpu_suffix}_lsu0_wo${asm_suffix}") + set(asm_path_lsu0_wo 
"${target_asm_dir}/${asm_file_lsu0_wo}") set(cppflags_list_lsu0_wo "-mllvm" "-shave-lsu-load-policy=use-only-lsu1" "-mllvm" "-shave-lsu-store-policy=prefer-lsu0" ) - set(elf_path_lsu0_wo "${target_binary_dir}/${elf_file_lsu0_wo}") - set(asm_path_lsu0_wo "${target_asm_dir}/${asm_file_lsu0_wo}") endif() - if(build_kernels) - if(rt_kernel STREQUAL "yes") - set(kernel_src_path "${kernel_src_file}") - # ACT management kernel must be built with Shave preemption checks disabled - set(shave_preemption_opt "-mshave-preemption-checks=off") - set(link_script_file "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/shave_rt_kernel.ld") - else() - set(kernel_src_path "${CMAKE_CURRENT_SOURCE_DIR}/${kernel_src_file}") - set(shave_preemption_opt "-mshave-preemption-checks=restore") - list(APPEND shave_preemption_opt "-mshave-low-impact-preemption" ) - list(APPEND shave_preemption_opt "-mshave-preemption-max-loop-depth=1") - endif() + if(rt_kernel) + # ACT management kernel must be built with Shave preemption checks disabled + set(shave_preemption_opt "-mshave-preemption-checks=off") + set(link_script_file "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/shave_rt_kernel.ld") + else() + set(shave_preemption_opt "-mshave-preemption-checks=restore") + list(APPEND shave_preemption_opt "-mshave-low-impact-preemption" ) + list(APPEND shave_preemption_opt "-mshave-preemption-max-loop-depth=1") endif() # List needed to create the final embedded description file - list(APPEND act_shave_kernels ${kernel_src_file}) + list(TRANSFORM include_dirs_list PREPEND "-I") + list(TRANSFORM define_symbols_list PREPEND "-D") + + set(compile_options + "-mcpu=${kernel_cpu}" + ${optimization_opts} + ${cppflags_list} + ${include_dirs_list} + ${define_symbols_list} + ${shave_preemption_opt}) + + # Compile the kernel and output an assembly file + # No output, no dependencies -- always regenerate + add_custom_target("${asm_file}" + COMMAND "${mv_tools_compile}" ${compile_options} -S "${kernel_src_path}" -o "${asm_path}" + COMMENT 
"Generating ${asm_file}" + ) + add_dependencies(act_shave_kernels_asm "${asm_file}") + + # Compile the kernel and output an object file + add_custom_command( + OUTPUT "${obj_file}" + DEPENDS "${kernel_src_path}" + DEPFILE "${dep_file}" + COMMAND "${mv_tools_compile}" -MD ${compile_options} -c "${kernel_src_path}" -o "${obj_file}" + ) - if(build_kernels) - # List needed to create the final embedded description file - list(TRANSFORM include_dirs_list PREPEND "-I") - list(TRANSFORM define_symbols_list PREPEND "-D") + set(obj_file_list "${obj_file}") - # Compile the kernel and output an assembly file - add_custom_command( - OUTPUT "${asm_path}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${optimization_opts} ${cppflags_list} ${include_dirs_list} ${define_symbols_list} ${shave_preemption_opt} -S "${kernel_src_path}" -o "${asm_path}" + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround + add_custom_target("${asm_file_lsu0_wo}" + COMMAND "${mv_tools_compile}" ${compile_options} ${cppflags_list_lsu0_wo} -S "${kernel_src_path}" -o "${asm_path_lsu0_wo}" + COMMENT "Generating ${asm_file_lsu0_wo}" ) + add_dependencies(act_shave_kernels_asm "${asm_file_lsu0_wo}") - add_custom_target("${asm_file}" DEPENDS "${asm_path}") - add_dependencies(act_shave_kernels_asm "${asm_file}") - - # Compile the kernel and output an object file add_custom_command( - OUTPUT "${obj_file}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${optimization_opts} ${cppflags_list} ${include_dirs_list} ${define_symbols_list} ${shave_preemption_opt} -c "${kernel_src_path}" -o "${obj_file}" + OUTPUT "${obj_file_lsu0_wo}" + DEPENDS "${kernel_src_path}" + DEPFILE "${dep_file_lsu0_wo}" + COMMAND "${mv_tools_compile}" -MD ${compile_options} ${cppflags_list_lsu0_wo} -c "${kernel_src_path}" -o "${obj_file_lsu0_wo}" ) - set(obj_file_list "${obj_file}") + set(obj_file_list_lsu0_wo "${obj_file_lsu0_wo}") + 
endif() - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround - add_custom_command( - OUTPUT "${asm_path_lsu0_wo}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${optimization_opts} ${cppflags_list} ${cppflags_list_lsu0_wo} ${include_dirs_list} ${define_symbols_list} ${shave_preemption_opt} -S "${kernel_src_path}" -o "${asm_path_lsu0_wo}" - ) + # Compile extra sources if specified in the descrip file + if(extra_src_list) + foreach(extra_src_file ${extra_src_list}) + get_filename_component(src_name ${extra_src_file} NAME_WE) + get_filename_component(dir_name ${extra_src_file} DIRECTORY) + + # Discard full path, just keep name of the dir the file is in + get_filename_component(dir_name ${dir_name} NAME_WE) - add_custom_target("${asm_file_lsu0_wo}" DEPENDS "${asm_path_lsu0_wo}") - add_dependencies(act_shave_kernels_asm "${asm_file_lsu0_wo}") + # Some extra_src files have the same filename so the .o files must be in separate directories to avoid overwriting. 
+ set(obj_path "${CMAKE_CURRENT_BINARY_DIR}/extra_src/${kernel_cpu}/${dir_name}") + file(MAKE_DIRECTORY "${obj_path}") + set(base_name "${obj_path}/${src_name}.${kernel_cpu}") + set(obj_file "${base_name}${obj_suffix}") + set(dep_file "${base_name}.d") add_custom_command( - OUTPUT "${obj_file_lsu0_wo}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${optimization_opts} ${cppflags_list} ${cppflags_list_lsu0_wo} ${include_dirs_list} ${define_symbols_list} ${shave_preemption_opt} -c "${kernel_src_path}" -o "${obj_file_lsu0_wo}" + OUTPUT "${obj_file}" + DEPENDS "${kernel_src_path}" + DEPFILE "${dep_file}" + COMMAND "${mv_tools_compile}" -MD ${compile_options} -c "${extra_src_file}" -o "${obj_file}" ) - set(obj_file_list_lsu0_wo "${obj_file_lsu0_wo}") - endif() + list(APPEND obj_file_list "${obj_file}") + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround + list(APPEND obj_file_list_lsu0_wo "${obj_file}") + endif() + endforeach() + endif() - # Compile extra sources if specified in the descrip file - if(NOT extra_src_list STREQUAL "") - foreach(extra_src_file ${extra_src_list}) - get_filename_component(src_name ${extra_src_file} NAME_WE) - get_filename_component(dir_name ${extra_src_file} DIRECTORY) - - # Discard full path, just keep name of the dir the file is in - get_filename_component(dir_name ${dir_name} NAME_WE) - - # Some extra_src files have the same filename so the .o files must be in separate directories to avoid overwriting. 
- file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/extra_src/${kernel_cpu}/${dir_name}) - set(obj_file "${CMAKE_CURRENT_BINARY_DIR}/extra_src/${kernel_cpu}/${dir_name}/${src_name}.${kernel_cpu}${obj_suffix}") - - add_custom_command( - OUTPUT "${obj_file}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${optimization_opts} ${cppflags_list} ${include_dirs_list} ${define_symbols_list} ${shave_preemption_opt} -c "${extra_src_file}" -o "${obj_file}" - ) - - list(APPEND obj_file_list "${obj_file}") - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround - list(APPEND obj_file_list_lsu0_wo "${obj_file}") - endif() - endforeach() - endif() + # Compile asm files if specified in the descrip file + if(asm_src_list) + # Due to lack of depfile support + # find all .inc files in source directories + set(inc_list) + foreach(inc_dir IN LISTS asm_src_list kernel_src_dir) + file(GLOB_RECURSE inc_files CONFIGURE_DEPENDS "${inc_dir}/*.inc") + LIST(APPEND inc_list ${inc_files}) + endforeach() - # Compile asm files if specified in the descrip file - if(NOT asm_src_list STREQUAL "") - foreach(asm_src_file ${asm_src_list}) - get_filename_component(src_name ${asm_src_file} NAME_WE) - get_filename_component(dir_name ${asm_src_file} DIRECTORY) - - # Discard full path, just keep name of the dir the file is in - get_filename_component(dir_name ${dir_name} NAME_WE) - - # Some asm_src files have the same filename so the .o files must be in separate directories to avoid overwriting. 
- file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/asm_src/${kernel_cpu}/${dir_name}) - set(obj_file "${CMAKE_CURRENT_BINARY_DIR}/asm_src/${kernel_cpu}/${dir_name}/${src_name}.${kernel_cpu}${obj_suffix}") - - add_custom_command( - OUTPUT "${obj_file}" - DEPENDS "${kernel_src_file}" - COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${asm_include_list} -c "${asm_src_file}" -o "${obj_file}" - ) - - list(APPEND obj_file_list "${obj_file}") - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround - list(APPEND obj_file_list_lsu0_wo "${obj_file}") - endif() - endforeach() - endif() + foreach(asm_src_file ${asm_src_list}) + get_filename_component(src_name ${asm_src_file} NAME_WE) + get_filename_component(dir_name ${asm_src_file} DIRECTORY) - # Link the sources, add entry point and windowed sections, then dump the elf file - add_custom_command( - OUTPUT "${elf_path}" - DEPENDS ${obj_file_list} - COMMAND "${mv_tools_link}" --script "${link_script_file}" -entry "${kernel_entry}" --gc-sections --strip-debug --discard-all -zmax-page-size=16 ${obj_file_list} -EL ${link_libraries_list} --output "${elf_path}" - ) + # Discard full path, just keep name of the dir the file is in + get_filename_component(dir_name ${dir_name} NAME_WE) + + # Some asm_src files have the same filename so the .o files must be in separate directories to avoid overwriting. 
+ file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/asm_src/${kernel_cpu}/${dir_name}") + set(obj_file "${CMAKE_CURRENT_BINARY_DIR}/asm_src/${kernel_cpu}/${dir_name}/${src_name}.${kernel_cpu}${obj_suffix}") - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround add_custom_command( - OUTPUT "${elf_path_lsu0_wo}" - DEPENDS ${obj_file_list_lsu0_wo} - COMMAND "${mv_tools_link}" --script "${link_script_file}" -entry "${kernel_entry}" --gc-sections --strip-debug --discard-all -zmax-page-size=16 ${obj_file_list_lsu0_wo} -EL ${link_libraries_list} --output "${elf_path_lsu0_wo}" + OUTPUT "${obj_file}" + DEPENDS "${asm_src_file}" "${inc_list}" + COMMAND "${mv_tools_compile}" -mcpu=${kernel_cpu} ${asm_include_list} -c "${asm_src_file}" -o "${obj_file}" ) - endif() + list(APPEND obj_file_list "${obj_file}") - # Delete extra and asm sources after compile such that we can apply different optimizations if needed - if(NOT extra_src_list STREQUAL "") - foreach(extra_src_file ${extra_src_list}) - get_filename_component(src_name ${extra_src_file} NAME_WE) - file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/extra_src/${kernel_cpu}/${dir_name}/${src_name}.${kernel_cpu}${obj_suffix}") - endforeach() - endif() - - if(NOT asm_src_list STREQUAL "") - foreach(asm_src_file ${asm_src_list}) - get_filename_component(src_name ${asm_src_file} NAME_WE) - file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/asm_src/${kernel_cpu}/${dir_name}/${src_name}.${kernel_cpu}${obj_suffix}") - endforeach() - endif() + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround + list(APPEND obj_file_list_lsu0_wo "${obj_file}") + endif() + endforeach() endif() + set(link_options + "--script" "${link_script_file}" + "-entry" "${kernel_entry}" + "--gc-sections" + "--strip-debug" + "--discard-all" + "-zmax-page-size=16") + + # Link the sources, add entry point and windowed sections, then dump the elf file + add_custom_command( + OUTPUT 
"${elf_path}" + DEPENDS ${obj_file_list} "${link_script_file}" ${link_libraries_list} + COMMAND "${mv_tools_link}" ${link_options} ${obj_file_list} -EL ${link_libraries_list} --output "${elf_path}" + ) + list(APPEND kernels_to_build "${elf_path}") add_custom_target("${elf_file}" DEPENDS "${elf_path}") add_dependencies(act_shave_kernels_build "${elf_file}") - if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel STREQUAL "yes") # MTL DDR split access workaround + if(kernel_cpunum STREQUAL "3720" AND NOT rt_kernel) # MTL DDR split access workaround + add_custom_command( + OUTPUT "${elf_path_lsu0_wo}" + DEPENDS ${obj_file_list_lsu0_wo} "${link_script_file}" ${link_libraries_list} + COMMAND "${mv_tools_link}" ${link_options} ${obj_file_list_lsu0_wo} -EL ${link_libraries_list} --output "${elf_path_lsu0_wo}" + ) + list(APPEND kernels_to_build "${elf_path_lsu0_wo}") add_custom_target("${elf_file_lsu0_wo}" DEPENDS "${elf_path_lsu0_wo}") add_dependencies(act_shave_kernels_build "${elf_file_lsu0_wo}") endif() + endforeach() -# If we don't want to rebuild anything just copy the prebuilts and create the embedded sections file +### UPDATE PREBUILT KERNELS ### +file(GLOB prebuild_kernels RELATIVE "${prebuild_binary_dir}" CONFIGURE_DEPENDS "${prebuild_binary_dir}/*") if(build_kernels) - # Copy binaries after compile - add_custom_target(act_shave_kernels_copy - DEPENDS ${act_shave_kernels} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${target_binary_dir}/" "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/act_shave_bin/" - ) - add_dependencies(act_shave_kernels_copy act_shave_kernels_build) - add_dependencies(act_shave_kernels_ready act_shave_kernels_copy) - - # If we don't rebuild everything, copy the prebuilts for the kernels we haven't built - if(NOT (ENABLE_SHAVE_BINARIES_BUILD AND ENABLE_MANAGEMENT_KERNEL_BUILD AND ENABLE_FIRMWARE_SOURCES_KERNEL_BUILD)) - add_custom_target(update_prebuilt_binaries - # Update copy_directory function #E-167654 - COMMAND ${CMAKE_COMMAND} -E copy_directory 
"${CMAKE_CURRENT_SOURCE_DIR}/prebuild/act_shave_bin" "${target_binary_dir}" - # Make sure we do this after updating the prebuilts, otherwise we would overwrite the rebuilt binaries - DEPENDS act_shave_kernels_copy - ) - add_dependencies(act_shave_kernels_ready update_prebuilt_binaries) + if(NOT kernels_to_build) + message(FATAL_ERROR "No kernels to build") endif() -else() + + # Remove prebuild binaries if descriptor is removed + set(kernels_to_remove ${prebuild_kernels}) + list(REMOVE_ITEM kernels_to_remove ${all_kernels}) + foreach(kernel IN LISTS kernels_to_remove) + message(STATUS "No descriptor for prebuild kernel ${kernel}, removing.") + file(REMOVE "${prebuild_binary_dir}/${kernel}") + endforeach() + + # If building kernels update prebuilt kernels unconditionally to ensure the + # prebuilt kernels are up to date even after git operations + # No output or byproducts to avoid removing these during clean add_custom_target(update_prebuilt_binaries - COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/prebuild/act_shave_bin" "${target_binary_dir}" + DEPENDS act_shave_kernels_build + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${kernels_to_build} "${prebuild_binary_dir}" + COMMENT "Updating prebuilt kernels" ) - add_dependencies(act_shave_kernels_ready act_shave_kernels_build update_prebuilt_binaries) +else() + # dummy target if not building kernels + add_custom_target(update_prebuilt_binaries) endif() -# Add file-level and target-level dependencies +### LIBRARY TARGET #### +# Add file-level and target-level dependencies, we need this regenerated after +# kernels build and after any git actions +list(TRANSFORM prebuild_kernels PREPEND "${prebuild_binary_dir}/" OUTPUT_VARIABLE prebuild_bins) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/generated_shave_binary_resources.cpp" - COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/embed_shave_binaries.cmake" - DEPENDS ${act_shave_kernels} act_shave_kernels_ready + 
COMMAND ${CMAKE_COMMAND} -D KERNELS_BIN_DIR="${prebuild_binary_dir}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/embed_shave_binaries.cmake" + DEPENDS act_shave_kernels_build update_prebuilt_binaries ${prebuild_bins} ${kernels_to_build} ) add_library(act_shave_kernels_lib OBJECT "${CMAKE_CURRENT_BINARY_DIR}/generated_shave_binary_resources.cpp") diff --git a/cmake/embed_shave_binaries.cmake b/sw_runtime_kernels/kernels/cmake/embed_shave_binaries.cmake similarity index 87% rename from cmake/embed_shave_binaries.cmake rename to sw_runtime_kernels/kernels/cmake/embed_shave_binaries.cmake index d7fad685c3..ad8278505a 100644 --- a/cmake/embed_shave_binaries.cmake +++ b/sw_runtime_kernels/kernels/cmake/embed_shave_binaries.cmake @@ -1,5 +1,5 @@ -# Copyright (C) 2022-2025 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 # Creates C resources file from files in given directory function(create_resources dir output) @@ -42,4 +42,4 @@ function(create_resources dir output) endfunction() # Embed all the binaries from act_shave_bin folder into generated_shave_binary_resources.cpp -create_resources("${CMAKE_CURRENT_BINARY_DIR}/act_shave_bin" "${CMAKE_CURRENT_BINARY_DIR}/generated_shave_binary_resources.cpp") +create_resources("${KERNELS_BIN_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/generated_shave_binary_resources.cpp") diff --git a/sw_runtime_kernels/kernels/cmake/mv_tools.cmake b/sw_runtime_kernels/kernels/cmake/mv_tools.cmake new file mode 100644 index 0000000000..e153c7688f --- /dev/null +++ b/sw_runtime_kernels/kernels/cmake/mv_tools.cmake @@ -0,0 +1,102 @@ +# +# Copyright (C) 2023 Intel Corporation. 
+# SPDX-License-Identifier: Apache 2.0 +# + +set(NPU_MV_TOOLS_DIR "${VPUX_SOURCE_DIR}/bin/MoviTools") + +function(get_mv_tools_url output) + file(READ "${VPUX_SOURCE_DIR}/artifacts/vpuip_2/revisions.json" json_string) + string(JSON json_common GET ${json_string} "common") + string(JSON json_runtime_kernels GET ${json_common} "runtime_kernels") + string(JSON json_movitools GET ${json_runtime_kernels} "movitools") + string(JSON json_artifactory_url GET ${json_runtime_kernels} "artifactory_url") + string(REPLACE "{movitools_version}" ${json_movitools} movitools_url ${json_artifactory_url}) + set(${output} ${movitools_url} PARENT_SCOPE) +endfunction() + +function(get_mv_tools_path output) + if(DEFINED ENV{IE_NPU_FORCE_MV_TOOLS_PATH}) + set(${output} $ENV{IE_NPU_FORCE_MV_TOOLS_PATH} PARENT_SCOPE) + else() + get_mv_tools_version(mv_tools_version) + file(MAKE_DIRECTORY "${NPU_MV_TOOLS_DIR}") + set(${output} "${NPU_MV_TOOLS_DIR}/${mv_tools_version}" PARENT_SCOPE) + endif() +endfunction() + +function(get_mv_tools_version output) + # get the last folder name from url, which is also the tools version + get_mv_tools_url(mv_tools_url) + get_filename_component(mv_tools_directory ${mv_tools_url} DIRECTORY) + get_filename_component(mv_tools_version ${mv_tools_directory} NAME) + set(${output} ${mv_tools_version} PARENT_SCOPE) +endfunction() + +function(exists_mv_tools_version exists) + if(DEFINED ENV{IE_NPU_FORCE_MV_TOOLS_PATH}) + message(WARNING "You are using forced MoviTools version which is recommended for debugging only.") + set(${exists} TRUE PARENT_SCOPE) + return() + endif() + + get_mv_tools_path(mv_tools_path) + + if(EXISTS "${mv_tools_path}") + set(${exists} TRUE PARENT_SCOPE) + else() + set(${exists} FALSE PARENT_SCOPE) + endif() +endfunction() + +function(remove_old_mv_tools mv_tools_dir limit) + file(GLOB children RELATIVE ${mv_tools_dir} "${mv_tools_dir}/*") + list(LENGTH children num_dirs) + + math(EXPR num_to_remove "${num_dirs} - ${limit}") + if(num_to_remove 
LESS_EQUAL 0) + return() + endif() + + set(dirs_with_date) + foreach(child ${children}) + file(TIMESTAMP ${mv_tools_dir}/${child} mtime "%Y%m%d%H%M%S") + list(APPEND dirs_with_date "${mtime},${mv_tools_dir}/${child}") + endforeach() + + message(${dirs_with_date}) + + list(SORT dirs_with_date) + + list(SUBLIST dirs_with_date 0 ${num_to_remove} dirs_to_remove) + foreach(dir_with_date ${dirs_to_remove}) + string(REGEX REPLACE "^[0-9]*," "" folder ${dir_with_date}) + message("Removing old tools: ${folder}") + file(REMOVE_RECURSE "${folder}") + endforeach() +endfunction() + +function(get_mv_tools) + get_mv_tools_url(mv_tools_url) + get_mv_tools_path(mv_tools_path) + get_mv_tools_version(mv_tools_version) + + get_filename_component(tools_archive_name ${mv_tools_url} NAME) + set(temp_dir "${CMAKE_BINARY_DIR}/MoviTools-temporary") + + remove_old_mv_tools("${NPU_MV_TOOLS_DIR}" 2) + + message("Downloading MoviTools to location ${temp_dir}/${tools_archive_name}") + file(DOWNLOAD ${mv_tools_url} "${temp_dir}/${tools_archive_name}" SHOW_PROGRESS STATUS download_status) + list(GET download_status 0 download_status_code) + if(download_status_code) + list(GET download_status 1 download_status_message) + message(SEND_ERROR "MoviTools download failed with the error: ${download_status_message}") + endif() + + file(ARCHIVE_EXTRACT INPUT "${temp_dir}/${tools_archive_name}" DESTINATION ${NPU_MV_TOOLS_DIR}) + file(TOUCH_NOCREATE ${mv_tools_path}) + message("MoviTools extracted to ${mv_tools_path}") + + file(REMOVE_RECURSE ${temp_dir}) +endfunction() diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/dynamic_broadcast.3720xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/dynamic_broadcast.3720xx.elf deleted file mode 100755 index 6521326bfb..0000000000 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/dynamic_broadcast.3720xx.elf +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:5c7176578ea09bf9863108e9fe4c272f0c3655ba9cf8456f661e38ee7daf93d7 -size 684 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx.elf index 54953b004a..70b4f76af6 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec48aa7bae8f68d1290d704a6381f742d37ee38aea4f2d3aeff1f333b78bc078 -size 17316 +oid sha256:6b03084df7034275171ad7111ff959ba316c00bd1b18878d3b99f1922eb6882b +size 16612 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx_lsu0_wo.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx_lsu0_wo.elf index 382e8faa6e..5ea56adc8c 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx_lsu0_wo.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.3720xx_lsu0_wo.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad67b491760fce3a257fc10d663e0aa725415f326a27238f7e0fb14c0f9c3a5d -size 17300 +oid sha256:190cb5b0c81c2d2bc5479be37e83fd8559d3228f815862486d11080bacb11931 +size 16596 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.4000xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.4000xx.elf index b092b4a62e..8b8aea0c15 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.4000xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/eltwise_power.4000xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5676427cf96150527aab1f341456beb5babfae30767914c1c626c95dd3654af -size 31812 +oid sha256:058f21c2ccf43e2c2b5c1865ce90476ae153f244061731a1ef6ac1bf2c52dfd9 +size 22464 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/grid_sample.4000xx.elf 
b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/grid_sample.4000xx.elf index 9d4af7209a..86bcdc4ea6 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/grid_sample.4000xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/grid_sample.4000xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3c748d305d624b372db62dc1f611f615b3520ec437fda4f0a64dc35f75b34d5 -size 20004 +oid sha256:bc34eb6a5eceeddd7f6433bfcda9290a2b1a060fb96ae0c7e6ed8f4a6557a926 +size 21144 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/softmax.4000xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/softmax.4000xx.elf index 4c24cf0bfd..18c8295de3 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/softmax.4000xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/softmax.4000xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b841541644cbe37d9700f05b59c65a96981bf55dccb8d276896565125ea9570f +oid sha256:24fe395f1b8985b796357b83e65eac01a233c0db286a9602c2b3f59c394746d6 size 21448 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx.elf index 39733092ca..8b891220cd 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56a88586091940208da8f9ea63162bd3fa1f6034419290aa6877f3648f856695 -size 16344 +oid sha256:99faad0c5861d3d17627ac1c6cdeacd77cdab3ebb4a95b37846fb2539ae5c2db +size 21460 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx_lsu0_wo.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx_lsu0_wo.elf index 2932a1a8f9..0f837b0af7 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx_lsu0_wo.elf +++ 
b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.3720xx_lsu0_wo.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e9bd9db8e54b0a7d694ea072046240d61c16fa6cfecbf75cf7b1ea5cbc77858 -size 16392 +oid sha256:daeaed0227e0c64c5598b866300c7040305c19195dcb6ca704ae5d1d060835c5 +size 21556 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.4000xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.4000xx.elf index 9c69f15b25..6f62953377 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.4000xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/strided_slice.4000xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:194a88763836d8246cf96972d905334056466ab357ef84b8b502880df5d4eac9 -size 16024 +oid sha256:8cf52e28116b987db881d128aaab4f77ec703f34d9c581eb98c91505c787c44c +size 21396 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx.elf index 5c3a1b2c27..413688cd01 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6f1f0d43a9fb82a22ecf85f1c7b9d2a1e6bb231c4ebfa3036cf1b7b9a409174 -size 30616 +oid sha256:c87740ad35e7dbaf32e5e9cc24ad6e7aed2a157fb54b7b496663eb8473465e88 +size 37224 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx_lsu0_wo.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx_lsu0_wo.elf index 3fb4207d38..6aae5ee6e6 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx_lsu0_wo.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.3720xx_lsu0_wo.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad31320e72de07cee0acc62ec098574e65efbbc57970eee0c0ccc3c3b8e4361a -size 30728 +oid 
sha256:9782e41466ab5761398e9cabbb9513e1e78838e2dc12e41da32effe95b83dc15 +size 37320 diff --git a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.4000xx.elf b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.4000xx.elf index aab4b13a34..3d0602c728 100755 --- a/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.4000xx.elf +++ b/sw_runtime_kernels/kernels/prebuild/act_shave_bin/topk.4000xx.elf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b22bb981aa9e8bfa35926548c5904a3db240c1d921c63a41ab1767d445ee578 -size 32944 +oid sha256:b9feef16ca4ac47b22c38cc018732de7c9c5601833b25fbf3b76fed705b774ea +size 40640 diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt index c6a302ba79..c1b31d078c 100644 --- a/tests/functional/CMakeLists.txt +++ b/tests/functional/CMakeLists.txt @@ -8,28 +8,24 @@ if(ENABLE_LTO) endif() set(TARGET_NAME npuFuncTests) -set(EXCLUDED_FUNC_TESTS_DIR "") set(OPTIONAL_FUNC_TESTS_INCLUDES "") set(OPTIONAL_FUNC_TESTS_LIBS "") set(SKIP_CONFIG "npu_skip_func_tests.xml") set(SKIP_CONFIG_PATH ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/${SKIP_CONFIG}) -set(LEGACY_SKIP_CONFIG_PATH ${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/skip_tests.xml) ov_add_test_target( NAME ${TARGET_NAME} ROOT ${CMAKE_CURRENT_SOURCE_DIR} - EXCLUDED_SOURCE_PATHS - ${EXCLUDED_FUNC_TESTS_DIR} INCLUDES ${CMAKE_CURRENT_SOURCE_DIR} ${OPTIONAL_FUNC_TESTS_INCLUDES} - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/tests/functional/shared_tests_instances" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/tests/functional/behavior" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/tests/functional/subgraph_tests" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/tests/functional/custom/single_layer_tests/classes" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/artifacts/vpuip_2" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/src/vpux_translate_utils/include" + "${PROJECT_SOURCE_DIR}/tests/functional/shared_tests_instances" + "${PROJECT_SOURCE_DIR}/tests/functional/behavior" + 
"${PROJECT_SOURCE_DIR}/tests/functional/subgraph_tests" + "${PROJECT_SOURCE_DIR}/tests/functional/custom/single_layer_tests/classes" + "${PROJECT_SOURCE_DIR}/artifacts/vpuip_2" + "${PROJECT_SOURCE_DIR}/src/vpux_translate_utils/include" "${CMAKE_CURRENT_SOURCE_DIR}/internal" "${CMAKE_CURRENT_SOURCE_DIR}/shared_test_classes/include" ${LLVM_INCLUDE_DIRS} @@ -42,7 +38,6 @@ ov_add_test_target( npu_llvm_utils npu_ov_utils openvino::npu_al - npu_mlir_compiler_schema # used by subgraph_tests/compress_weights_btc.cpp DEFINES DATA_PATH=\"${DATA_PATH}\" MODELS_PATH=\"${MODELS_PATH}\" @@ -86,14 +81,6 @@ install(FILES ${SKIP_CONFIG_PATH} EXCLUDE_FROM_ALL ) -#TODO remove legacy skip file after CI pipelines update -install( - FILES ${LEGACY_SKIP_CONFIG_PATH} - DESTINATION tests - COMPONENT tests - EXCLUDE_FROM_ALL -) - # For developer builds we need to find the output folder of compiled binaries get_target_property(EXE_LOCATION ${TARGET_NAME} RUNTIME_OUTPUT_DIRECTORY) @@ -112,8 +99,8 @@ if(ENABLE_DEVELOPER_BUILD AND UNIX) # PARALLEL_WORKERS env variable as -w workers command line argument add_test(NAME ${TARGET_NAME}-parallel COMMAND - ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/scripts/run_gtest_parallel_manual_threads.sh - ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/gtest-parallel/gtest-parallel + ${PROJECT_SOURCE_DIR}/scripts/run_gtest_parallel_manual_threads.sh + ${PROJECT_SOURCE_DIR}/thirdparty/gtest-parallel/gtest-parallel $/${TARGET_NAME} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} ) set_tests_properties(${TARGET_NAME}-parallel PROPERTIES diff --git a/tests/functional/behavior/compilation_pipeline_cfg_consistency.cpp b/tests/functional/behavior/compilation_pipeline_cfg_consistency.cpp index de15177195..dc981b9c54 100644 --- a/tests/functional/behavior/compilation_pipeline_cfg_consistency.cpp +++ b/tests/functional/behavior/compilation_pipeline_cfg_consistency.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include #include #include 
"common/functions.h" @@ -33,6 +33,7 @@ class CompilationPipelineCfgConsistencyTests : std::replace(targetDevice.begin(), targetDevice.end(), ':', '.'); std::ostringstream result; result << "targetDevice=" << targetDevice << "_"; + result << "targetPlatform=" << LayerTestsUtils::getTestsPlatformFromEnvironmentOr(targetDevice) << "_"; if (!configuration.empty()) { using namespace ov::test::utils; for (auto& configItem : configuration) { @@ -60,8 +61,8 @@ TEST_P(CompilationPipelineCfgConsistencyTests, CompilationWithBatchUnrollingDefa TEST_P(CompilationPipelineCfgConsistencyTests, CompilationWithBatchUnrollingSkipBatchOptions) { SKIP_IF_CURRENT_TEST_IS_DISABLED() { auto cfg = configuration; - cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = - "batch-compile-method=unroll batch-unroll-settings={skip-unroll-batch=true}"; + cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = "batch-compile-method=unroll " + "batch-unroll-settings={skip-unroll-batch=true}"; OV_ASSERT_NO_THROW(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg)); } } @@ -78,7 +79,8 @@ TEST_P(CompilationPipelineCfgConsistencyTests, CompilationWithDebatchNonDefaultO SKIP_IF_CURRENT_TEST_IS_DISABLED() { auto cfg = configuration; cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = - "batch-compile-method=debatch debatcher-settings={debatching-inlining-method=reordering}"; + "batch-compile-method=debatch " + "debatcher-settings={debatching-inlining-method=reordering}"; OV_ASSERT_NO_THROW(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg)); } } @@ -87,32 +89,26 @@ TEST_P(CompilationPipelineCfgConsistencyTests, CompilationMixUnrollWithDebatchNo SKIP_IF_CURRENT_TEST_IS_DISABLED() { auto cfg = configuration; cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = - "batch-compile-method=unroll debatcher-settings={debatching-inlining-method=reordering}"; - std::string device_id = cfg["DEVICE_ID"].as(); - if (device_id.find("3720") != 
std::string::npos) { - OV_ASSERT_NO_THROW(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg)); - } else { - OV_EXPECT_THROW_HAS_SUBSTRING(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg), - std::runtime_error, "is inconsistent"); - } + "batch-compile-method=unroll " + "debatcher-settings={debatching-inlining-method=reordering}"; + OV_EXPECT_THROW_HAS_SUBSTRING(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg), + std::runtime_error, "is inconsistent"); } } TEST_P(CompilationPipelineCfgConsistencyTests, CompilationMixDebatchWithBatchUnrollingSkipBatchOptions) { SKIP_IF_CURRENT_TEST_IS_DISABLED() { auto cfg = configuration; - cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = - "batch-compile-method=debatch batch-unroll-settings={skip-unroll-batch=true}"; - std::string device_id = cfg["DEVICE_ID"].as(); + cfg[ov::intel_npu::batch_compiler_mode_settings.name()] = "batch-compile-method=debatch " + "batch-unroll-settings={skip-unroll-batch=true}"; OV_EXPECT_THROW_HAS_SUBSTRING(auto compiled_model = core->compile_model(ov_stub_model, target_device, cfg), std::runtime_error, "is inconsistent"); } } const std::vector configs = { - {{ov::device::id("3720")}, ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::MLIR)}, - {{ov::device::id("4000")}, ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::MLIR)}, -}; + {{ov::intel_npu::platform(ov::test::utils::getTestsPlatformCompilerInPlugin())}, + ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::MLIR)}}; INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, CompilationPipelineCfgConsistencyTests, ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), diff --git a/tests/functional/behavior/elf_config.cpp b/tests/functional/behavior/elf_config.cpp index 2e23f1d3c3..927a971b8c 100644 --- a/tests/functional/behavior/elf_config.cpp +++ b/tests/functional/behavior/elf_config.cpp @@ -1,8 +1,9 @@ +// // Copyright (C) 2023-2025 
Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include #include #include "common/functions.h" diff --git a/tests/functional/behavior/remote_tensor_tests/dma_buf_remote_run.hpp b/tests/functional/behavior/remote_tensor_tests/dma_buf_remote_run.hpp index 8d0af72cbe..51f153daf3 100644 --- a/tests/functional/behavior/remote_tensor_tests/dma_buf_remote_run.hpp +++ b/tests/functional/behavior/remote_tensor_tests/dma_buf_remote_run.hpp @@ -27,10 +27,10 @@ #include #include -#include "base/ov_behavior_test_utils.hpp" #include "behavior/ov_infer_request/infer_request_dynamic.hpp" #include "common/npu_test_env_cfg.hpp" #include "common/utils.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" namespace ov { namespace test { diff --git a/tests/functional/behavior/remote_tensor_tests/dx12_remote_run.hpp b/tests/functional/behavior/remote_tensor_tests/dx12_remote_run.hpp index addd42758e..521e74ee43 100644 --- a/tests/functional/behavior/remote_tensor_tests/dx12_remote_run.hpp +++ b/tests/functional/behavior/remote_tensor_tests/dx12_remote_run.hpp @@ -43,10 +43,10 @@ #include #include -#include "base/ov_behavior_test_utils.hpp" #include "behavior/ov_infer_request/infer_request_dynamic.hpp" #include "common/npu_test_env_cfg.hpp" #include "common/utils.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" namespace ov { namespace test { diff --git a/tests/functional/behavior/work_with_devices.cpp b/tests/functional/behavior/work_with_devices.cpp index 8f1fe7a1b3..20f1b5a019 100644 --- a/tests/functional/behavior/work_with_devices.cpp +++ b/tests/functional/behavior/work_with_devices.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/behavior/work_with_devices.hpp b/tests/functional/behavior/work_with_devices.hpp index 1914079f86..97778b0a9d 100644 --- a/tests/functional/behavior/work_with_devices.hpp +++ 
b/tests/functional/behavior/work_with_devices.hpp @@ -1,12 +1,12 @@ // -// Copyright (C) 2021-2025 Intel Corporation. +// Copyright (C) 2021-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include #include +#include #include #include #include "common/functions.h" diff --git a/tests/functional/common/npu_test_env_cfg.hpp b/tests/functional/common/npu_test_env_cfg.hpp index 548803359a..70cbe4f029 100644 --- a/tests/functional/common/npu_test_env_cfg.hpp +++ b/tests/functional/common/npu_test_env_cfg.hpp @@ -5,8 +5,8 @@ #pragma once -#include "base/ov_behavior_test_utils.hpp" #include "common/utils.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" #include "vpux/utils/IE/private_properties.hpp" #include diff --git a/tests/functional/common/utils.cpp b/tests/functional/common/utils.cpp index 26b6b19d3c..b634d1cbe6 100644 --- a/tests/functional/common/utils.cpp +++ b/tests/functional/common/utils.cpp @@ -3,17 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // +#include #include #include "intel_npu/npu_private_properties.hpp" #include "utils.hpp" std::string getBackendName(const ov::Core& core) { - return core.get_property("NPU", ov::intel_npu::backend_name.name()).as(); + return core.get_property(ov::test::utils::DEVICE_NPU, ov::intel_npu::backend_name.name()).as(); } std::vector getAvailableDevices(const ov::Core& core) { - return core.get_property("NPU", ov::available_devices.name()).as>(); + return core.get_property(ov::test::utils::DEVICE_NPU, ov::available_devices.name()).as>(); } std::string modelPriorityToString(const ov::hint::Priority priority) { diff --git a/tests/functional/common/utils.hpp b/tests/functional/common/utils.hpp index ec166b3547..941ae2490f 100644 --- a/tests/functional/common/utils.hpp +++ b/tests/functional/common/utils.hpp @@ -1,12 +1,14 @@ +// // Copyright (C) 2021-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include +#include #include -#include 
"common_test_utils/unicode_utils.hpp" + +#include std::string getBackendName(const ov::Core& core); diff --git a/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/conv_act_base.cpp b/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/conv_act_base.cpp index d92d32d131..e41cf42727 100644 --- a/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/conv_act_base.cpp +++ b/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/conv_act_base.cpp @@ -45,7 +45,7 @@ std::string ConvActTest::getTestCaseName(const testing::TestParamInfo kernel, stride, dilation; std::vector padBegin, padEnd; @@ -81,7 +81,7 @@ void ConvActTest::buildFloatFunction() { } void ConvActTest::buildFQFunction() { - auto modelType = ov::element::undefined; + auto modelType = ov::element::dynamic; ov::op::PadType padType; std::vector kernel, stride, dilation; std::vector padBegin, padEnd; diff --git a/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/mixed_precision_convolution.cpp b/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/mixed_precision_convolution.cpp index 76c8dd1e2f..2cc6ccd42f 100644 --- a/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/mixed_precision_convolution.cpp +++ b/tests/functional/shared_test_classes/src/shared_test_classes/subgraph/mixed_precision_convolution.cpp @@ -65,7 +65,7 @@ void MixedPrecisionConvSubGraphTest::SetUp() { mixedPrecisionConvSpecificParams mixedPrecisionConvParams; std::vector inputShape; - auto modelType = ov::element::undefined; + auto modelType = ov::element::dynamic; std::tie(mixedPrecisionConvParams, modelType, inputShape, std::ignore) = this->GetParam(); diff --git a/tests/functional/shared_tests_instances/behavior/compiled_model/properties.cpp b/tests/functional/shared_tests_instances/behavior/compiled_model/properties.cpp deleted file mode 100644 index 418579b43f..0000000000 --- 
a/tests/functional/shared_tests_instances/behavior/compiled_model/properties.cpp +++ /dev/null @@ -1,318 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "behavior/compiled_model/properties.hpp" -#include "common/functions.h" -#include "common/npu_test_env_cfg.hpp" -#include "common/utils.hpp" -#include "intel_npu/npu_private_properties.hpp" - -using namespace ov::test::behavior; - -namespace { - -std::vector operator+(std::vector origAnyMapVector, - const std::vector>& newPair) { - std::vector newAnyMapVector(origAnyMapVector.size() * newPair.size()); - size_t index = 0; - for (const auto& pair : newPair) { - for (auto&& anyMap : origAnyMapVector) { - ov::AnyMap newAnyMap = anyMap; - ov::AnyMap& newAnyMapRef = (anyMap.find(ov::device::properties.name()) != anyMap.end()) - ? newAnyMap.find(ov::device::properties.name()) - ->second.as() - .begin() - ->second.as() - : newAnyMap; - - if (newAnyMapRef.find(pair.first) == newAnyMapRef.end()) { - newAnyMapRef.emplace(pair); - newAnyMapVector.at(index) = newAnyMap; - ++index; - } - } - } - newAnyMapVector.resize(index); - return newAnyMapVector; -} - -const std::vector> compiledModelProperties = { - {ov::intel_npu::dynamic_shape_to_static.name(), ov::Any(true)}}; - -// [Tracking number: E#132531], intel_npu::turbo appears dynamically based on backend -/*{{ov::supported_properties.name(), // needed for HETERO - ov::Any(std::vector{ - ov::PropertyName(ov::device::id.name()), ov::PropertyName(ov::hint::enable_cpu_pinning.name()), - ov::PropertyName(ov::execution_devices.name()), ov::PropertyName(ov::hint::execution_mode.name()), - ov::PropertyName(ov::hint::inference_precision.name()), - ov::PropertyName(ov::loaded_from_cache.name()), ov::PropertyName(ov::hint::model_priority.name()), - ov::PropertyName(ov::model_name.name()), - ov::PropertyName(ov::intel_npu::compilation_mode_params.name()), - ov::PropertyName(ov::intel_npu::turbo.name()), - 
ov::PropertyName(ov::optimal_number_of_infer_requests.name()), - ov::PropertyName(ov::hint::performance_mode.name()), ov::PropertyName(ov::hint::num_requests.name()), - ov::PropertyName(ov::enable_profiling.name()), ov::PropertyName(ov::supported_properties.name())})}}};*/ - -const std::vector> allModelPriorities = { - ov::hint::model_priority(ov::hint::Priority::LOW), ov::hint::model_priority(ov::hint::Priority::MEDIUM), - ov::hint::model_priority(ov::hint::Priority::HIGH)}; - -std::vector> compiledModelPropertiesAnyToString = - []() -> const std::vector> { - std::vector> compiledModelProps(compiledModelProperties.size()); - for (auto it = compiledModelProperties.cbegin(); it != compiledModelProperties.cend(); ++it) { - auto&& distance = it - compiledModelProperties.cbegin(); - compiledModelProps.at(distance) = {it->first, it->second.as()}; - } - return compiledModelProps; -}(); - -std::vector compiledModelConfigs = []() -> std::vector { - std::vector compiledModelConfigsMap(compiledModelProperties.size()); - for (auto it = compiledModelProperties.cbegin(); it != compiledModelProperties.cend(); ++it) { - auto&& distance = it - compiledModelProperties.cbegin(); - compiledModelConfigsMap.at(distance) = {*it}; - } - return compiledModelConfigsMap; -}(); - -auto heteroCompiledModelConfigs = []() -> std::vector { - std::vector heteroConfigs(compiledModelConfigs.size()); - for (auto it = compiledModelConfigs.cbegin(); it != compiledModelConfigs.cend(); ++it) { - auto&& distance = it - compiledModelConfigs.cbegin(); - heteroConfigs.at(distance) = { - ov::device::priorities(ov::test::utils::DEVICE_NPU), - {ov::device::properties.name(), ov::Any(ov::AnyMap{{ov::test::utils::DEVICE_NPU, ov::Any(*it)}})}}; - } - return heteroConfigs; -}(); - -auto combineParamsExecDevices = []() -> std::vector> { - std::vector> execParams(compiledModelConfigs.size()); - for (auto it = compiledModelConfigs.cbegin(); it != compiledModelConfigs.cend(); ++it) { - auto&& distance = it - 
compiledModelConfigs.cbegin(); - execParams.at(distance) = std::make_pair(*it, ov::test::utils::DEVICE_NPU); - } - return execParams; -}(); - -auto combineHeteroParamsExecDevices = []() -> std::vector> { - std::vector> execHeteroParams(heteroCompiledModelConfigs.size()); - for (auto it = heteroCompiledModelConfigs.cbegin(); it != heteroCompiledModelConfigs.cend(); ++it) { - auto&& distance = it - heteroCompiledModelConfigs.cbegin(); - execHeteroParams.at(distance) = std::make_pair(*it, ov::test::utils::DEVICE_NPU); - } - return execHeteroParams; -}(); - -const std::vector configsWithSecondaryProperties = { - {ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT))}, - {ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))}}; - -const std::vector driverCompilerConfigsWithSecondaryProperties = { - {ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - {ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}}; - -const std::vector multiConfigsWithSecondaryProperties = { - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - 
ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))}}; - -const std::vector autoConfigsWithSecondaryProperties = { - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::device::priorities(ov::test::utils::DEVICE_NPU), - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY))}}; - -const std::vector driverCompilerMultiConfigsWithSecondaryProperties = { - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - 
ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}}; - -const std::vector driverCompilerAutoConfigsWithSecondaryProperties = { - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - {ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}, - 
{ov::device::priorities(ov::test::utils::DEVICE_CPU), - ov::device::properties("AUTO", ov::enable_profiling(false), - ov::device::priorities(ov::test::utils::DEVICE_NPU), - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_CPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER)), - ov::device::properties(ov::test::utils::DEVICE_NPU, - ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY), - ov::intel_npu::compiler_type(ov::intel_npu::CompilerType::DRIVER))}}; - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVClassCompiledModelPropertiesTests, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(compiledModelConfigs)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVClassCompiledModelPropertiesTests, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(heteroCompiledModelConfigs)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(compiledModelConfigs)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(heteroCompiledModelConfigs)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests_OVClassLoadNetworkWithCorrectSecondaryPropertiesTest, - OVClassCompileModelWithCorrectPropertiesTest, - 
::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU, "AUTO:NPU", "MULTI:NPU"), - ::testing::ValuesIn(configsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_NPU_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest_Driver, - OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU, "AUTO:NPU", "MULTI:NPU"), - ::testing::ValuesIn(driverCompilerConfigsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest, - OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values("MULTI"), - ::testing::ValuesIn(multiConfigsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_AUTO_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest, - OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values("AUTO"), - ::testing::ValuesIn(autoConfigsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest_Driver, - OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values("MULTI"), - ::testing::ValuesIn(driverCompilerMultiConfigsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_AUTO_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest_Driver, - OVClassCompileModelWithCorrectPropertiesTest, - ::testing::Combine(::testing::Values("AUTO"), - ::testing::ValuesIn(driverCompilerAutoConfigsWithSecondaryProperties))); - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVClassCompiledModelSetCorrectConfigTest, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(compiledModelPropertiesAnyToString)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVClassCompiledModelSetCorrectConfigTest, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - 
ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(compiledModelPropertiesAnyToString)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P( - smoke_BehaviorTests, OVClassCompiledModelGetPropertyTest_MODEL_PRIORITY, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(compiledModelConfigs + allModelPriorities)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Hetero_BehaviorTests, OVClassCompiledModelGetPropertyTest_MODEL_PRIORITY, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(heteroCompiledModelConfigs + allModelPriorities)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Hetero_BehaviorTests, OVClassCompiledModelGetPropertyTest_DEVICE_PRIORITY, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(heteroCompiledModelConfigs)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVClassCompiledModelGetPropertyTest_EXEC_DEVICES, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(combineParamsExecDevices)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVClassCompiledModelGetPropertyTest_EXEC_DEVICES, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(combineHeteroParamsExecDevices)), - ov::test::utils::appendPlatformTypeTestName); - -INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVCompileModelGetExecutionDeviceTests, - ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(combineParamsExecDevices)), - ov::test::utils::appendPlatformTypeTestName); - 
-INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVCompileModelGetExecutionDeviceTests, - ::testing::Combine(::testing::Values(std::string(ov::test::utils::DEVICE_HETERO) + ":" + - ov::test::utils::DEVICE_NPU), - ::testing::ValuesIn(combineHeteroParamsExecDevices)), - ov::test::utils::appendPlatformTypeTestName); - -} // namespace diff --git a/tests/functional/shared_tests_instances/npu_skip_func_tests.xml b/tests/functional/shared_tests_instances/npu_skip_func_tests.xml index b88469cb8a..3b157eb5e0 100644 --- a/tests/functional/shared_tests_instances/npu_skip_func_tests.xml +++ b/tests/functional/shared_tests_instances/npu_skip_func_tests.xml @@ -177,18 +177,6 @@ Example: - - - Tests are failing due to smoke_BI_PAD=0 - - LEVEL0 - !3720 - - - .*DeformableConvolution2DTest_Strides.* - - - Exception during loading to the device @@ -963,8 +951,6 @@ Example: .*ClassExecutableNetworkInvalidDeviceIDTestSuite.InvalidNPUdeviceIDTest.*DEVICE_ID=NPU.3990.* - - LSTM Sequence accuracy fails @@ -976,6 +962,17 @@ Example: + + + Compile option batch-compile-method=unroll not supported on MTL + + 3720 + + + .*CompilationPipelineCfgConsistencyTests.* + + + NPU_DPU_GROUPS is a deprecated property @@ -984,4 +981,45 @@ Example: + + + Test-cases failing with sri_7.0.y25ww26 SIMICS + + IMD + + + + + + + + Test-cases failing after driver update + + + + .*compatibility_smoke_BehaviorTests/OVCompileAndInferRequestTurbo.CompiledModelTurbo.* + + + + + + Skip test-cases until operators fix for GatherND and GatherElements. + + .*smoke_precommit.*GatherElementsLayerTestCommon.* + .*smoke_precommit.*GatherNDLayerTestCommon.* + .*smoke_tiling.*GatherNDLayerTestCommon.* + + + + + + Skip tests with u2 precision not supported on NPU3720. 
+ + 3720 + + + .*InferRequestCheckTensorPrecision.*type=u2.* + + + diff --git a/tests/functional/shared_tests_instances/single_layer_tests/activation.cpp b/tests/functional/shared_tests_instances/single_layer_tests/activation.cpp index 739a514e29..f5dbe3dfce 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/activation.cpp +++ b/tests/functional/shared_tests_instances/single_layer_tests/activation.cpp @@ -177,7 +177,8 @@ const std::map>> shaveCodeGenAct {Cos, {{1.0f}}}, {Exp, {{1.0f}}}, {Log, {{1.0f}}}, {Sin, {{1.0f}}}, {Erf, {{1.0f}}}, {Sqrt, {{1.0f}}}, {RoundHalfToEven, {}}, {RoundHalfAwayFromZero, {}}, {Clamp, {{-1.0f, 1.0f}}}, {Tanh, {{1.0f}}}, {Tan, {{1.0f}}}, {Sinh, {{1.0f}}}, - {Cosh, {{1.0f}}}, {Atanh, {{1.0f}}}, {Atan, {{1.0f}}}}; + {Cosh, {{1.0f}}}, {Atanh, {{1.0f}}}, {Atan, {{1.0f}}}, {Abs, {{1.0f}}}, + {Negative, {{0.01f}}}, {Sign, {{1.0f}}}, {HSwish, {{1.0f}}}, {HSigmoid, {{1.0f}}}}; const std::map>> shaveCodeGenIntActivationTypes = { {Clamp, {{-1.0f, 1.0f}}}, diff --git a/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp b/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp index e0d79a48bd..5d2c1601f3 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp +++ b/tests/functional/shared_tests_instances/single_layer_tests/convolution_backprop_data.cpp @@ -144,6 +144,21 @@ const auto se_conv2DParams_OutputPadding = ::testing::Combine( ::testing::ValuesIn(sePadEnds), ::testing::ValuesIn(seDilations), ::testing::ValuesIn(numOutChannels), ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::ValuesIn(seOutputPadding)); +/* ============= 2D ConvolutionBackpropData SETable Patch SEP Op ============= */ +const std::vector> seTablePatchInputShapes = {{{1, 16, 4, 4}}}; + +const std::vector> seTablePatchKernels = {{3, 3}}; +const std::vector> seTablePatchStrides = {{1, 1}}; +const std::vector> seTablePatchPadBegins = 
{{1, 1}}; +const std::vector> seTablePatchPadEnds = {{1, 1}}; +const std::vector> seTablePatchDilations = {{1, 1}}; + +const auto se_conv2DParams_SETablePatch = + ::testing::Combine(::testing::ValuesIn(seTablePatchKernels), ::testing::ValuesIn(seTablePatchStrides), + ::testing::ValuesIn(seTablePatchPadBegins), ::testing::ValuesIn(seTablePatchPadEnds), + ::testing::ValuesIn(seTablePatchDilations), ::testing::ValuesIn(numOutChannels), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::ValuesIn(emptyOutputPadding)); + // ------ NPU3720 ------ INSTANTIATE_TEST_SUITE_P(smoke_precommit_SEP_ConvolutionBackpropData2D_ExplicitPadding, ConvolutionBackpropDataSEPLayerTest_NPU3720, @@ -173,6 +188,13 @@ INSTANTIATE_TEST_SUITE_P(smoke_precommit_SEP_ConvolutionBackpropData2D_OutputPad ::testing::ValuesIn(static_shapes_to_test_representation(seInputShapes)), ::testing::ValuesIn(emptyOutputShape), ::testing::Values(DEVICE_NPU)), ConvolutionBackpropDataSEPLayerTest_NPU4000::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_precommit_SEP_ConvolutionBackpropData2D_SETablePatch, ConvolutionBackpropDataSEPLayerTest_NPU4000, + ::testing::Combine(se_conv2DParams_SETablePatch, ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(static_shapes_to_test_representation(seTablePatchInputShapes)), + ::testing::ValuesIn(emptyOutputShape), ::testing::Values(DEVICE_NPU)), + ConvolutionBackpropDataSEPLayerTest_NPU4000::getTestCaseName); /* ============= 2D ConvolutionBackpropData with outputShape Convert to SEP Op ============= */ const std::vector> seInputShapesWithOS = {{{1, 16, 128, 128}}}; const std::vector seSpecifiedOutputShape = {{128, 128}}; diff --git a/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp b/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp index 8a95df47ef..95c25a20a2 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp +++ 
b/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp @@ -220,7 +220,7 @@ const auto typesParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(bigShape)), ::testing::ValuesIn(eltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_EltwiseTypes, EltwiseLayerTestCommon, typesParams, @@ -259,7 +259,7 @@ std::vector secondInputType = { const auto eltwise_params_dynamic = ::testing::Combine( ::testing::ValuesIn(in_shapes_dynamic), ::testing::ValuesIn(DynamicEltwiseOpTypes), ::testing::ValuesIn(secondInputType), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(precision), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); // Dynamic shapes cases @@ -278,7 +278,7 @@ const auto broadcastTestParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(broadcastTestInputShape)), ::testing::ValuesIn(broadcastTestEltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisionsF16), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_InputBroadcastEltwise, EltwiseLayerTestCommon, broadcastTestParams, @@ -293,8 +293,8 @@ 
std::vector> scalarInput2broadcastTestInputShape = { const auto scalarInput2BroadcastTestParams = ::testing::Combine( ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(scalarInput2broadcastTestInputShape)), ::testing::ValuesIn(scalarInput2broadcastTestEltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), - ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisionsF16), ::testing::Values(ov::element::undefined), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisionsF16), ::testing::Values(ov::element::dynamic), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_scalarInput2BroadcastEltwise, EltwiseLayerTestCommon, scalarInput2BroadcastTestParams, EltwiseLayerTestCommon::getTestCaseName); @@ -310,8 +310,8 @@ std::vector> batchInputTestInputShape = {{{361, 4, 48, 48 const auto batchInputTestParams = ::testing::Combine( ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(batchInputTestInputShape)), ::testing::ValuesIn(batchInputTestEltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), - ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisionsF16), ::testing::Values(ov::element::undefined), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(opTypes), ::testing::ValuesIn(netPrecisionsF16), ::testing::Values(ov::element::dynamic), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_BatchInputEltwise, EltwiseLayerTestCommon, batchInputTestParams, @@ -332,7 +332,7 @@ const auto scalarParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShapesScalar)), ::testing::ValuesIn(eltwiseTypes), 
::testing::ValuesIn(secondaryInputTypes), ::testing::Values(ov::test::utils::OpType::SCALAR), ::testing::ValuesIn(netPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_ScalarShapesND, EltwiseLayerTestCommon, scalarParams, @@ -369,7 +369,7 @@ const auto vectorParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShapesVector)), ::testing::ValuesIn(eltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(netPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_VectorShapesND, EltwiseLayerTestCommon, vectorParams, @@ -419,7 +419,7 @@ const auto bitwiseParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(bitwiseInput)), ::testing::ValuesIn(bitwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(bitwiseNetPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_Bitwise, EltwiseLayerTestCommon, bitwiseParams, @@ -435,7 +435,7 @@ const auto bitwiseParamsi8 = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(bitwiseInputi8)), ::testing::ValuesIn(bitwiseTypesi8), 
::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(bitwiseNetPrecisionsi8), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_Bitwisei8, EltwiseLayerTestCommon, bitwiseParamsi8, @@ -447,7 +447,7 @@ const auto bitwiseNotParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(bitwiseNotInput)), ::testing::Values(EltwiseTypes::BITWISE_NOT), ::testing::Values(InputLayerType::CONSTANT), ::testing::ValuesIn(opTypes), ::testing::ValuesIn(bitwiseNetPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(precommit_BitwiseNot, EltwiseLayerTestCommon, bitwiseNotParams, @@ -470,7 +470,7 @@ const auto typesParamsUnsigned = ::testing::Combine( ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShape)), ::testing::ValuesIn(eltwiseTypesUnsigned), ::testing::Values(InputLayerType::PARAMETER), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(netPrecisionsUnsigned), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_Eltwise_Unsigned, EltwiseIntegerLayerTest, typesParamsUnsigned, @@ -488,7 +488,7 @@ const auto typesParamsInteger = ::testing::Combine( ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShape)), 
::testing::ValuesIn(eltwiseTypesInteger), ::testing::Values(InputLayerType::PARAMETER), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(netPrecisionsInteger), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_Eltwise_Signed, EltwiseIntegerLayerTest, typesParamsInteger, @@ -555,7 +555,7 @@ const auto scgScalarParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShapesScalar)), ::testing::ValuesIn(scgEltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::Values(ov::test::utils::OpType::SCALAR), ::testing::ValuesIn(netPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_ScalarShapesND, ShaveCodeGenEltwiseLayerTestCommon, scgScalarParams, @@ -594,7 +594,7 @@ const auto scgVectorParams = ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(scgInShapesVector)), ::testing::ValuesIn(scgEltwiseTypes), ::testing::ValuesIn(secondaryInputTypes), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(netPrecisions), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_VectorShapesND, ShaveCodeGenEltwiseLayerTestCommon, scgVectorParams, @@ -621,7 +621,7 @@ const auto scgTypesParamsUnsigned = ::testing::Combine( 
::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShape)), ::testing::ValuesIn(scgEltwiseTypesUnsigned), ::testing::Values(InputLayerType::PARAMETER), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(netPrecisionsUnsigned), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_Eltwise_Unsigned, ShaveCodeGenEltwiseIntegerLayerTest, scgTypesParamsUnsigned, @@ -638,7 +638,7 @@ const auto scgTypesParamsInteger = ::testing::Combine( ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inShape)), ::testing::ValuesIn(scgEltwiseTypesInteger), ::testing::Values(InputLayerType::PARAMETER), ::testing::Values(ov::test::utils::OpType::VECTOR), ::testing::ValuesIn(scgNetPrecisionsInteger), - ::testing::Values(ov::element::undefined), ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::dynamic), ::testing::Values(ov::element::dynamic), ::testing::Values(ov::test::utils::DEVICE_NPU), ::testing::Values(ov::test::Config{})); INSTANTIATE_TEST_SUITE_P(smoke_Eltwise_Signed, ShaveCodeGenEltwiseIntegerLayerTest, scgTypesParamsInteger, diff --git a/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp b/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp index a98148fd3c..f9dc81cb50 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp +++ b/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp @@ -43,8 +43,8 @@ const std::vector epsilon = {0.0001}; std::vector additionalConfig = {{}}; const auto groupNormalizationParams = - testing::Combine(::testing::ValuesIn(netPrecisions), ::testing::Values(ov::element::undefined), - 
::testing::Values(ov::element::undefined), + testing::Combine(::testing::ValuesIn(netPrecisions), ::testing::Values(ov::element::dynamic), + ::testing::Values(ov::element::dynamic), ::testing::ValuesIn(ov::test::static_shapes_to_test_representation(staticInputShapes)), ::testing::ValuesIn(numGroups), ::testing::ValuesIn(epsilon), ::testing::Values(DEVICE_NPU), ::testing::ValuesIn(additionalConfig)); diff --git a/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp b/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp index 1a8b43dc09..7ccc7352f8 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp +++ b/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp @@ -692,6 +692,20 @@ const auto pool5DParams = ::testing::Combine( static_shapes_to_test_representation(std::vector{{1, 4, 16, 8, 12}})), // inputShapes ::testing::Values(DEVICE_NPU)); +// 5d usecase, no 4D conversion +const auto pool5DParams_no4D = ::testing::Combine( + ::testing::Combine(::testing::Values(PoolingTypes::AVG, PoolingTypes::MAX), + ::testing::ValuesIn>({{1, 7, 7}}), // kernels + ::testing::ValuesIn>({{1, 1, 1}}), // strides + ::testing::ValuesIn>({{0, 0, 0}}), // padBegins + ::testing::ValuesIn>({{0, 0, 0}}), // padEnds + ::testing::Values(ov::op::RoundingType::FLOOR), ::testing::Values(ov::op::PadType::EXPLICIT), + ::testing::Values(true)), // excludePad + ::testing::Values(ov::element::f32), // netPrc + ::testing::Values( + static_shapes_to_test_representation(std::vector{{1, 960, 4, 7, 7}})), // inputShapes + ::testing::Values(DEVICE_NPU)); + // pad outside of kernel size/2. Pad is valid until at kerneSize-1. 
const auto pooligBigPadEndParams = ::testing::Combine( ::testing::Combine(::testing::Values(PoolingTypes::AVG, PoolingTypes::MAX), @@ -866,6 +880,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Pooling_3D, PoolingLayerTest_NPU3720, pool3DParam // 5d usecase INSTANTIATE_TEST_SUITE_P(smoke_Pooling_5D, PoolingLayerTest_NPU3720, pool5DParams, PoolingLayerTest_NPU3720::getTestCaseName); +// 5d usecase, no 4D conversion +INSTANTIATE_TEST_SUITE_P(smoke_Pooling_5D_no4D, PoolingLayerTest_NPU3720, pool5DParams_no4D, + PoolingLayerTest_NPU3720::getTestCaseName); // pad outside of kernel size/2. Pad is valid until at kerneSize-1. INSTANTIATE_TEST_SUITE_P(smoke_Pooling_BigPadEndParams, PoolingLayerTest_NPU3720, pooligBigPadEndParams, PoolingLayerTest_NPU3720::getTestCaseName); @@ -955,6 +972,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Pooling_AllPadType, PoolingLayerTest_NPU4000, poo INSTANTIATE_TEST_SUITE_P(smoke_Pooling_3D, PoolingLayerTest_NPU4000, pool3DParams, PoolingLayerTest::getTestCaseName); // 5d usecase INSTANTIATE_TEST_SUITE_P(smoke_Pooling_5D, PoolingLayerTest_NPU4000, pool5DParams, PoolingLayerTest::getTestCaseName); +// 5d usecase, no 4D conversion +INSTANTIATE_TEST_SUITE_P(smoke_Pooling_5D_no4D, PoolingLayerTest_NPU4000, pool5DParams_no4D, + PoolingLayerTest::getTestCaseName); // pad outside of kernel size/2. Pad is valid until at kerneSize-1. 
INSTANTIATE_TEST_SUITE_P(smoke_Pooling_BigPadEndParams, PoolingLayerTest_NPU4000, pooligBigPadEndParams, PoolingLayerTest_NPU4000::getTestCaseName); diff --git a/tests/functional/shared_tests_instances/single_layer_tests/topk.cpp b/tests/functional/shared_tests_instances/single_layer_tests/topk.cpp index 4c6bca7724..60a9b4325b 100644 --- a/tests/functional/shared_tests_instances/single_layer_tests/topk.cpp +++ b/tests/functional/shared_tests_instances/single_layer_tests/topk.cpp @@ -154,6 +154,18 @@ INSTANTIATE_TEST_SUITE_P(smoke_TopK_K1, TopKLayerTestCommon, ::testing::Values(ov::test::utils::DEVICE_NPU)), TopKLayerTestCommon::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_TopK_K300, TopKLayerTestCommon, + ::testing::Combine(::testing::ValuesIn(std::vector{300}), + ::testing::ValuesIn(std::vector{1}), + ::testing::ValuesIn(modes_Tilling), + ::testing::ValuesIn(std::vector{ + ov::op::v3::TopK::SortType::SORT_VALUES}), + ::testing::ValuesIn(modelTypes_Tilling), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation( + std::vector>{{{1, 3600}}})), + ::testing::Values(ov::test::utils::DEVICE_NPU)), + TopKLayerTestCommon::getTestCaseName); + } // namespace namespace { // opset v11 diff --git a/tests/functional/shared_tests_instances/skip_tests.xml b/tests/functional/shared_tests_instances/skip_tests.xml deleted file mode 100644 index 11d40d7ce6..0000000000 --- a/tests/functional/shared_tests_instances/skip_tests.xml +++ /dev/null @@ -1,975 +0,0 @@ - - - - - - Tests break due to starting infer on IA side - - .*CorrectConfigAPITests.* - - - - - - ARM CPU Plugin is not available on Yocto - - .*IEClassLoadNetworkTest.*HETERO.* - .*IEClassLoadNetworkTest.*MULTI.* - - - - - - Hetero plugin doesn't throw an exception in case of big device ID - - .*OVClassLoadNetworkTestNPU.*LoadNetworkHETEROWithBigDeviceIDThrows.* - - - - - - NPU Plugin doesn't handle DEVICE_ID in QueryNetwork implementation - - .*OVClassQueryNetworkTest.* - - - - - - Cannot detect npu 
platform when it's not passed; Skip tests on Yocto which passes device without platform - - .*IEClassLoadNetworkTest.LoadNetworkWithDeviceIDNoThrow.* - .*IEClassLoadNetworkTest.LoadNetworkWithBigDeviceIDThrows.* - .*IEClassLoadNetworkTest.LoadNetworkWithInvalidDeviceIDThrows.* - - - - - - Disabled test E#28335 - - .*smoke_LoadNetworkToDefaultDeviceNoThrow.* - - - - - - Disabled test E#28335 - - .*LoadNetwork.*CheckDeviceInBlob.* - - - - - - double free detected - - .*InferConfigInTests\\.CanInferWithConfig.* - - - - - - GetExecGraphInfo function is not implemented for NPU plugin - - .*checkGetExecGraphInfoIsNotNullptr.* - .*CanCreateTwoExeNetworksAndCheckFunction.* - .*CanCreateTwoCompiledModelsAndCheckRuntimeModel.* - .*CheckExecGraphInfo.* - .*canLoadCorrectNetworkToGetExecutable.* - - - - - - Disabled test E#28335 - - .*checkInferTime.* - .*OVExecGraphImportExportTest.* - - - - - - Test uses legacy OpenVINO 1.0 API, no need to support it - - .*ExecutableNetworkBaseTest.checkGetMetric.* - - - - - - SetConfig function is not implemented for ExecutableNetwork interface (implemented only for npu plugin) - - .*ExecutableNetworkBaseTest.canSetConfigToExecNet.* - .*ExecutableNetworkBaseTest.canSetConfigToExecNetAndCheckConfigAndCheck.* - .*CanSetConfigToExecNet.* - - - - - - Exception 'Not implemented - - .*OVClassNetworkTestP.*LoadNetworkCreateDefaultExecGraphResult.* - - - - - - This is openvino specific test - - .*ExecutableNetworkBaseTest.canExport.* - - - - - - TensorIterator layer is not supported - - .*ReturnResultNotReadyFromWaitInAsyncModeForTooSmallTimeout.* - .*OVInferRequestDynamicTests.* - .*OVInferenceChaining.* - - - - - - Tests with unsupported precision - - .*InferRequestCheckTensorPrecision.*type=boolean.* - .*InferRequestCheckTensorPrecision.*type=bf16.* - .*InferRequestCheckTensorPrecision.*type=f64.* - .*InferRequestCheckTensorPrecision.*type=u1\D.* - - - - - - Tests enabled only for L0 NPU3720 and NPU4000 - - LEVEL0 - - - 
.*InferRequestCheckTensorPrecision.* - .*InferRequestIOTensorSetPrecisionTest.* - .*DriverCompilerAdapterDowngradeInterpolate11TestNPU.* - .*DriverCompilerAdapterInputsOutputsTestNPU.* - - - - - - Exception during loading to the device - - .*OVClassLoadNetworkTestNPU.*LoadNetworkHETEROwithMULTINoThrow.* - .*OVClassLoadNetworkTestNPU.*LoadNetworkMULTIwithHETERONoThrow.* - - - - - - compiler: Unsupported arch kind: NPUX311X - - .*CompilationForSpecificPlatform.*(3800|3900).* - - - - - - Cannot call setShape for Blobs - - .*(smoke_Behavior|smoke_Multi_Behavior|smoke_Auto_Behavior).*OVInferRequestIOTensorTest.*canInferAfterIOBlobReallocation.* - .*(smoke_Behavior|smoke_Multi_Behavior).*OVInferRequestIOTensorTest.*InferStaticNetworkSetChangedInputTensorThrow.*targetDevice=(NPU_|MULTI_configItem=MULTI_DEVICE_PRIORITIES_NPU).* - - - - - - Can't loadNetwork without cache for ReadConcatSplitAssign with precision f32 - - .*CachingSupportCase_NPU.*CompileModelCacheTestBase.*CompareWithRefImpl.*ReadConcatSplitAssign.* - - - - - - NPU Plugin currently fails to get a valid output in these test cases - - .*OVInferRequestIOTensorTest.InferStaticNetworkSetChangedInputTensorThrow.* - .*OVInferRequestIOTensorTestNPU.InferStaticNetworkSetChangedInputTensorThrow.* - .*OVInferRequestIOTensorTestNPU.InferStaticNetworkSetChangedInputTensorThrow/targetDevice=(NPU3720_|NPU4000_).* - .*OVInferRequestIOTensorTestNPU.InferStaticNetworkSetChangedInputTensorThrow/targetDevice=(NPU3720_|NPU4000_)configItem=MULTI_DEVICE_PRIORITIES_NPU_.* - .*OVInferRequestIOTensorTest.InferStaticNetworkSetChangedInputTensorThrow/targetDevice=(NPU3720_|NPU4000_).* - .*OVInferRequestIOTensorTest.InferStaticNetworkSetChangedInputTensorThrow/targetDevice=(NPU3720_|NPU4000_)configItem=MULTI_DEVICE_PRIORITIES_NPU_.* - - - - - - OV requires the plugin to throw when value of DEVICE_ID is unrecognized, but plugin does not throw - - 
smoke_BehaviorTests.*IncorrectConfigTests.SetConfigWithIncorrectKey.*(SOME_DEVICE_ID|DEVICE_UNKNOWN).* - smoke_BehaviorTests.*IncorrectConfigTests.SetConfigWithNoExistingKey.*SOME_DEVICE_ID.* - smoke_BehaviorTests.*IncorrectConfigAPITests.SetConfigWithNoExistingKey.*(SOME_DEVICE_ID|DEVICE_UNKNOWN).* - - - - - - OV requires the plugin to throw on network load when config file is incorrect, but plugin does not throw - - .*smoke_Auto_BehaviorTests.*IncorrectConfigTests.CanNotLoadNetworkWithIncorrectConfig.*AUTO_config.*unknown_file_MULTI_DEVICE_PRIORITIES=(NPU_|NPU,CPU_).* - - - - - - OV expects the plugin to not throw any exception on network load, but it actually throws - - .*(smoke_Multi_Behavior|smoke_Auto_Behavior).*SetPropLoadNetWorkGetPropTests.*SetPropLoadNetWorkGetProperty.* - - - - - - Plugin can not perform SetConfig for value like: device=NPU config key=LOG_LEVEL value=0 - - smoke_BehaviorTests/DefaultValuesConfigTests.CanSetDefaultValueBackToPlugin.* - - - - - - Disabled with ticket number 48480 - - .*OVExecutableNetworkBaseTest.* - - - - - - Disabled with ticket number 63708 - - .*smoke_BehaviorTests.*InferStaticNetworkSetInputTensor.* - .*smoke_Multi_BehaviorTests.*InferStaticNetworkSetInputTensor.* - - - - - - Disabled with ticket number 64490 - - .*OVClassNetworkTestP.*SetAffinityWithConstantBranches.* - - - - - - The output tensor gets freed when the inference request structure's destructor is called. The issue is unrelated to the caching feature. - - .*CacheTestBase.CompareWithRefImpl.* - - - - - - Expected: SetConfig(configuration, target_device) throws an exception of type InferenceEngine::Exception. Throws nothing. - - .*AutoBatch.*Behavior.*IncorrectConfigAPITests.SetConfigWithNoExistingKey.*AUTO_BATCH_TIMEOUT.* - - - - - - Expected: SetConfig(configuration, target_device) throws an exception of type InferenceEngine::Exception. Throws nothing. 
- - .*AutoBatch.*Behavior.*IncorrectConfigTests.SetConfigWithIncorrectKey.*AUTO_BATCH_TIMEOUT.* - .*AutoBatch.*Behavior.*IncorrectConfigTests.CanNotLoadNetworkWithIncorrectConfig.*AUTO_BATCH_TIMEOUT.* - - - - - - Dynamic I/O shapes are being used when running the tests. This feature is not yet supported by the NPU plugin. - - .*SetPreProcessTo.* - - - - - - This scenario became invalid upon refactoring the implementation as to use the 2.0 OV API. - - .*smoke_BehaviorTests/VersionTest.pluginCurrentVersionIsCorrect.* - - - - - - Tests throw errors as expected but drivers post-v.1657 will fail to catch them - - .*FailGracefullyTest.* - .*QueryNetworkTestSuite3NPU.* - - - - - Tests are disabled for all devices except NPU3720 - - !3720 - - - - .*NPU3720.* - - .*DriverCompilerAdapterDowngradeInterpolate11TestNPU.* - .*DriverCompilerAdapterInputsOutputsTestNPU.* - - - - - - Disabled for when backend is empty (i.e., no device) - - - - - - .*InferRequest.* - .*OVInferRequest.* - .*OVInferenceChaining.* - .*ExecutableNetworkBaseTest.* - .*OVExecutableNetworkBaseTest.* - .*ExecNetSetPrecision.* - .*SetBlobTest.* - .*InferRequestCallbackTests.* - .*PreprocessingPrecisionConvertTest.* - .*SetPreProcessToInputInfo.* - .*InferRequestPreprocess.* - .*HoldersTestOnImportedNetwork.* - .*HoldersTest.Orders.* - .*HoldersTestImportNetwork.Orders.* - - .*OVExecGraphImportExportTest.* - .*OVHoldersTest.* - .*OVClassExecutableNetworkGetMetricTest.* - .*OVClassExecutableNetworkGetConfigTest.* - .*OVClassNetworkTestP.*SetAffinityWithConstantBranches.* - .*OVClassNetworkTestP.*SetAffinityWithKSO.* - .*OVClassNetworkTestP.*LoadNetwork.* - .*FailGracefullyTest.* - .*DriverCompilerAdapterInputsOutputsTestNPU.* - - - .*OVClassImportExportTestP.* - .*OVClassLoadNetworkTestNPU.*LoadNetwork.* - - .*DriverCompilerAdapterDowngradeInterpolate11TestNPU.* - .*QueryNetworkTestSuite.* - - - - - - Failing test for NPU device - - 
.*OVClassImportExportTestP.*OVClassCompiledModelImportExportTestP.*ImportNetworkThrowWithDeviceName.* - - - - - - These tests runs only on LevelZero backend - - !LEVEL0 - - - .*InferRequestRunTests.* - .*OVClassGetMetricAndPrintNoThrow.* - .*IEClassGetMetricAndPrintNoThrow.* - .*CompileModelLoadFromFileTestBase.* - .*CorrectConfigTests.* - - - - - - Runs only on NPU3720 with Level Zero enabled #85493 - - !3720 - - - .*InferRequestRunTests.MultipleExecutorStreamsTestsSyncInfers.* - - - - - - Other devices than NPU doesn't allow to set NPU properties with OV1.0 and CACHE_DIR + MLIR is not supported - - .*smoke_AutoBatch_BehaviorTests/CorrectConfigTests.* - - - - - OpenVINO issues when using caching mechanism - - - .*smoke_Auto_BehaviorTests_CachingSupportCase_NPU_Driver/CompileModelLoadFromFileTestBase.* - - .*smoke_BehaviorTests_CachingSupportCase_NPU_Driver/CompileModelLoadFromFileTestBase.* - - - - - - IfTest segfaults npuFuncTest on Ubuntu - - LEVEL0 - 3720 - linux - - - .*smoke_IfTest.* - - - - - - IMD/Simics do not support the tests - - IMD - - - .*smoke_ClassPluginProperties.*DEVICE_UUID.* - - - - - - Run long time on IMD/Simics - - IMD - - - .*PreprocessingPrecisionConvertTestNPU.* - - - - - Memory tests take too long in SIMICS, application crashes - - IMD - - - .*VpuDeviceAllocMemSizeSameAfterDestroy.* - - - - - - Unicode paths are known to fail in SIMICS environments - - IMD - - - .*smoke_registerPluginsLibrariesUnicodePath.* - - - - - - Dynamic output issue in sync infer request where keepDims is off - - .*smoke_ReduceAllAxis/ReduceLayerTest_SW_FP16.*KeepDims=0.* - - - - - - M2I tests failing with device hung - - .*InterpolateM2ILayerTest.* - .*PreProcessTest_M2I.* - - - - - - Newly enabled, never tested - - LEVEL0 - - - .*smoke_BehaviorTests_Driver/OVCompiledGraphImportExportTest.importExportedFunctionConstantResultOnly.* - .*smoke_BehaviorTests_OVClassImportExportTestP/OVClassCompiledModelImportExportTestP.smoke_ImportNetworkThrowWithDeviceName.* - 
.*ClassExecutableNetworkTestSuite1NPU.PropertyIsSupportedAndImmutableAndGet.* - .*ClassExecutableNetworkTestSuite2NPU.PropertyIsSupportedAndImmutableAndCanNotSet.* - .*ClassPluginPropertiesTestSuite4NPU.CanNotSetGetInexistentProperty.* - .*BehaviorTests_OVCheckSetSupportedRWMandatoryMetricsPropsTests/OVCheckSetSupportedRWMetricsPropsTests.ChangeCorrectProperties.* - - - - - - QueryNetwork is only supported by 3720 platform - - !3720 - - - .*QueryNetworkTestSuite.* - - - - - - Failing properties tests for AUTO / MULTI - - .*OVCheckSetSupportedRWMetricsPropsTests.ChangeCorrectProperties.*MULTI.*LOG_LEVEL.* - .*OVCheckSetSupportedRWMetricsPropsTests.ChangeCorrectProperties.*AUTO.*LOG_LEVEL.* - - - - - - Disabled tests for NPU3720 and NPU4000 - - LEVEL0 - 3720 - 4000 - - - .*InferRequestVariableStateTest.inferreq_smoke_VariableState_2infers.* - .*OVInferRequestIOTensorTest.*InferStaticNetworkSetChangedInputTensorThrow.* - - - - - - GetExecGraphInfo function is not implemented for NPU plugin - - .*CanCreateTwoCompiledModelsAndCheckRuntimeModel.* - - - - - - Fails with CID - - .*smoke_BehaviorTests_OVClassLoadNetworkTest/OVClassLoadNetworkTestNPU.LoadNetworkHETEROWithDeviceIDNoThrow.* - - - - - - Unicode paths for ov::cache_dir are not correctly handled on Windows - - windows - - - .*CompiledKernelsCacheTest.*CanCreateCacheDirAndDumpBinariesUnicodePath.* - - - - - - Unsupported NPU properties - - LEVEL0 - - - .*OVCheckMetricsPropsTests_ModelDependceProps.* - .*OVClassCompileModelAndCheckSecondaryPropertiesTest.* - - - - - - Failing properties tests - - LEVEL0 - - - .*OVSpecificDeviceSetConfigTest.GetConfigSpecificDeviceNoThrow.* - - .*OVPropertiesIncorrectTests.SetPropertiesWithIncorrectKey.*DEVICE_ID.* - - - - - - GetExecGraphInfo function is not implemented for NPU plugin - - .*CanCreateTwoCompiledModelsAndCheckRuntimeModel.* - - - - - - Fails with CID - - 
.*smoke_BehaviorTests_OVClassLoadNetworkTest/OVClassLoadNetworkTestNPU.LoadNetworkHETEROWithDeviceIDNoThrow.*NPU_COMPILER_TYPE_DRIVER.* - - - - - - The implementation does not allow passing extra configuration options. This prohibits specifying the private platform codes explicitly. - - !3720 - - - .*smoke_OVClassImportExportTestP/OVClassCompiledModelImportExportTestP.smoke_ImportNetworkNoThrowWithDeviceName.* - .*nightly_OVClassModelOptionalTestP/OVClassModelOptionalTestP.CompileModel.* - .*smoke_BehaviorTests_OVCheckSetSupportedRWMetricsPropsTests.* - .*OVHoldersTest.* - .*OVHoldersTestOnImportedNetwork.* - .*OVHoldersTestWithConfig.* - .*OVInferRequestInferenceTests.* - - - - - - The private platform names cannot be identified via the \"ov::available_devices\" configuration. - - !3720 - - - .*smoke_BehaviorTests_OVClassSetDefaultDeviceIDPropTest/OVClassSetDefaultDeviceIDPropTest.SetDefaultDeviceIDNoThrow.* - .*smoke_BehaviorTests_OVClassSpecificDeviceTest/OVSpecificDeviceGetConfigTest.GetConfigSpecificDeviceNoThrow.* - .*smoke_BehaviorTests_OVClassSpecificDeviceTest/OVSpecificDeviceTestSetConfig.SetConfigSpecificDeviceNoThrow.* - - - - - - The tests are not actually running the compiler-in-driver module. 
- - .*smoke_BehaviorTests_OVCheckSetSupportedRWMetricsPropsTests_Driver.* - - - - - - Disabled tests for NPU3720 and NPU4000 - - 3720 - 4000 - - - .*smoke.*_BehaviorTests_Driver/OVInferRequestCheckTensorPrecision.*type=i16.*DRIVER.* - .*smoke.*_BehaviorTests_Driver/OVInferRequestCheckTensorPrecision.*type=u16.*DRIVER.* - .*smoke.*_BehaviorTests_Driver/OVInferRequestCheckTensorPrecision.*type=u64.*DRIVER.* - .*smoke_OVClassLoadNetworkTest_Driver/OVClassLoadNetworkTestNPU.*DRIVER.* - - - - - - Failing properties tests - - LEVEL0 - - - .*OVSpecificDeviceSetConfigTest.GetConfigSpecificDeviceNoThrow.* - - .*OVPropertiesIncorrectTests.SetPropertiesWithIncorrectKey.*DEVICE_ID.* - - - - - - platform and compiler_type are private - - LEVEL0 - !3720 - - - .*smoke_Multi_BehaviorTests/OVInferRequestCallbackTests.* - .*smoke_Auto_BehaviorTests/OVInferRequestCallbackTests.* - .*smoke_Auto_BehaviorTests/OVInferRequestCallbackTestsNPU.* - .*smoke_Multi_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest/OVClassCompileModelWithCorrectPropertiesTest.* - .*smoke_AUTO_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest/OVClassCompileModelWithCorrectPropertiesTest.* - - .*smoke_Auto_BehaviorTests/OVInferRequestIOTensorTest.* - .*smoke_Multi_BehaviorTests/OVInferRequestCallbackTestsNPU.* - .*smoke_Multi_BehaviorTests/OVInferRequestIOTensorTestNPU.* - .*smoke_Multi_BehaviorTests/OVInferRequestIOTensorTest.* - .*smoke_Multi_BehaviorTests/OVInferRequestMultithreadingTests.* - .*smoke_Multi_BehaviorTests/OVInferRequestMultithreadingTestsNPU.* - .*smoke_Multi_BehaviorTests/OVInferRequestPerfCountersExceptionTest.* - .*smoke_Multi_BehaviorTests/OVInferRequestPerfCountersTest.* - .*smoke_Multi_BehaviorTests/OVInferRequestWaitTests.* - .*smoke_Auto_BehaviorTests/OVInferRequestMultithreadingTests.* - .*smoke_Auto_BehaviorTests/OVInferRequestMultithreadingTestsNPU.* - .*smoke_Auto_BehaviorTests/OVInferRequestPerfCountersExceptionTest.* - 
.*smoke_Auto_BehaviorTests/OVInferRequestPerfCountersTest.* - .*smoke_Auto_BehaviorTests/OVInferRequestWaitTests.* - .*smoke_OVClassNetworkTestP/OVClassNetworkTestPNPU.* - .*smoke_OVClassLoadNetworkTest/OVClassLoadNetworkTestNPU.* - .*smoke_Hetero_BehaviorTests_VariableState/OVInferRequestVariableStateTest.* - - - - - - Private properties cannot be accessed by HETERO compiled model - - LEVEL0 - - - .*smoke_Hetero_BehaviorTests.*OVClassCompiledModelGetPropertyTest_MODEL_PRIORITY.* - .*smoke_Hetero_BehaviorTests.*OVClassCompiledModelGetPropertyTest_EXEC_DEVICES.* - .*smoke_Hetero_BehaviorTests.*OVCompileModelGetExecutionDeviceTests.* - - - - - - platform and compiler_type are private - - LEVEL0 - !3720 - - - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestCallbackTests.* - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestCallbackTests.* - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestCallbackTestsNPU.* - .*smoke_Multi_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest_Driver/OVClassCompileModelWithCorrectPropertiesTest.* - .*smoke_AUTO_BehaviorTests_OVClassCompileModelWithCorrectPropertiesTest_Driver/OVClassCompileModelWithCorrectPropertiesTest.* - - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestIOTensorTest.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestCallbackTestsNPU.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestIOTensorTestNPU.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestIOTensorTest.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestMultithreadingTests.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestPerfCountersTest.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestPerfCountersExceptionTest.* - .*smoke_Multi_BehaviorTests_Driver/OVInferRequestWaitTests.* - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestMultithreadingTestsNPU.* - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestPerfCountersTest.* - .*smoke_Auto_BehaviorTests_Driver/OVInferRequestPerfCountersExceptionTest.* - 
.*smoke_Auto_BehaviorTests_Driver/OVInferRequestWaitTests.* - .*smoke_OVClassNetworkTestP_Driver/OVClassNetworkTestPNPU.* - - - - - - NPU plugin doesn't support infer dynamic - - .*OVInferRequestBatchedTests.SetInputTensors_Can_Infer_Dynamic.* - - - - - - NPU fails for `OVIterationChaining.Simple` tests - - .*OVIterationChaining.Simple.* - - - - - - Missing model ops in profiling info - - .*OVInferRequestPerfCountersTest.CheckOperationInProfilingInfo.* - - - - - - NPU needs to implement ROITensor logic in zero_infer_request - - .*OVInferRequestInferenceTests.Inference_ROI_Tensor/roi_nchw.* - - - - - - OVClassQueryModel tests do not work with COMPILER_TYPE=DRIVER - - .*OVClassQueryModelTest.QueryModelHETEROWithDeviceIDNoThrow.* - .*OVClassQueryModelTest.QueryModelWithBigDeviceIDThrows.* - .*OVClassQueryModelTest.QueryModelWithInvalidDeviceIDThrows.* - - - - - - CheckWrongGraphExtAndThrow tests do not work with COMPILER_TYPE=DRIVER - - .*DriverCompilerAdapterExpectedThrowNPU.CheckWrongGraphExtAndThrow.* - - - - - - Skip tests that can not wrong when DRIVER is default compiler type - - .*OVClassLoadNetworkTestNPU.LoadNetworkHETEROWithDeviceIDNoThrow.* - .*MatMulTransposeConcatTest.* - - - - - - Compiler adapter is not extracting network name from metadata - - .*smoke_BehaviorTests/OVClassCompiledModelGetPropertyTest.GetMetricNoThrow_NETWORK_NAME.* - - - - - - Error message for empty model from stream must be changed to have \"device xml header\" - - .*smoke_BehaviorTests/OVClassCompiledModelImportExportTestP.smoke_ImportNetworkThrowWithDeviceName.* - .*smoke_Hetero_BehaviorTests/OVClassCompiledModelImportExportTestP.smoke_ImportNetworkThrowWithDeviceName.* - - - - - - NPU cannot set properties for compiled models - - .*OVClassCompiledModelSetCorrectConfigTest.canSetConfig.* - - - - - - Failing runtime model tests - - .*OVCompiledModelGraphUniqueNodeNamesTest.CheckUniqueNodeNames.* - .*OVExecGraphSerializationTest.ExecutionGraph.* - - - - - - Disabling test - it will 
set the default supported properties value which is an invalid value - - .*OVCompiledModelPropertiesDefaultSupportedTests.CanCompileWithDefaultValueFromPlugin.* - - - - - - Template plugin doesn't implement evaluate method for RoPE Op - - .*FuseRoPE.* - - - - - - compiled_blob test uses CACHE_MODE which is not supported on NPU. - - .*smoke_BehaviorTests/OVCompiledModelBaseTest.*import_from_.*_blob.*targetDevice=NPU.* - .*smoke_BehaviorTests/OVCompiledModelBaseTest.*compile_from_.*_blob.*targetDevice=NPU.* - .*smoke_BehaviorTests/OVCompiledModelBaseTest.*compile_from_cached_weightless_blob.*targetDevice=NPU.* - .*smoke_BehaviorTests/OVCompiledModelBaseTest.*use_blob_hint_.*targetDevice=NPU.* - .*smoke_Hetero_BehaviorTests/OVCompiledModelBaseTest.*import_from_.*_blob.*HETERO.*NPU.* - .*smoke_Hetero_BehaviorTests/OVCompiledModelBaseTest.*compile_from_.*_blob.*HETERO.*NPU.* - .*smoke_Hetero_BehaviorTests/OVCompiledModelBaseTest.*compile_from_cached_weightless_blob.*HETERO.*NPU.* - .*smoke_Hetero_BehaviorTests/OVCompiledModelBaseTest.*use_blob_hint_.*HETERO.*NPU.* - - - - - - Unsupported value f32 for INFERENCE_PRECISION_HINT property. - - .*CoreThreadingTestsWithIter.*CompileModel_Accuracy_SingleCore.* - .*CoreThreadingTestsWithIter.*CompileModel_Accuracy_MultipleCores.* - - - - - - ShaveCodeGen is currently working only on Ubuntu, skipping for Windows - - windows - - - .*ShaveCodeGen.* - - - - - - OVBlobCompatibilityNPU tests designed for NPU3720 - - !3720 - - - .*OVBlobCompatibilityNPU.* - - - - - - OVBlobCompatibilityNPU_PV_Driver_No_Throw tests designed for NPU3720, PV Driver - - 3720 - - - .*OVBlobCompatibilityNPU_PV_Driver_No_Throw.* - - - - - - Tests enabled only for public platforms, NPU3720 and NPU4000. Related to new getDevice logic from OV PR#30586. 
- - - - .*ClassExecutableNetworkInvalidDeviceIDTestSuite.InvalidNPUdeviceIDTest.*DEVICE_ID=NPU.1.* - .*ClassExecutableNetworkInvalidDeviceIDTestSuite.InvalidNPUdeviceIDTest.*DEVICE_ID=NPU.3990.* - - - - - - - LSTM Sequence accuracy fails - - - - .*smoke_precommit_LSTMSequencePt/LSTMSequenceLayerTestCommon.*direction=bidirectional_clip=0_WRBType=CONSTANT_modelType=f32.* - .*smoke_precommit_LSTMSequenceCommonZeroClip/LSTMSequenceLayerTestCommon.* - - - - - - NPU_DPU_GROUPS is a deprecated property - - .*NPU_DPU_GROUPS.* - - - - diff --git a/tests/functional/shared_tests_instances/subgraph_tests/nce_tasks.cpp b/tests/functional/shared_tests_instances/subgraph_tests/nce_tasks.cpp index 422b910ae5..d048ecd901 100644 --- a/tests/functional/shared_tests_instances/subgraph_tests/nce_tasks.cpp +++ b/tests/functional/shared_tests_instances/subgraph_tests/nce_tasks.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/shared_tests_instances/vpu_ov2_layer_test.cpp b/tests/functional/shared_tests_instances/vpu_ov2_layer_test.cpp index b0bb878ce4..ea299efdff 100644 --- a/tests/functional/shared_tests_instances/vpu_ov2_layer_test.cpp +++ b/tests/functional/shared_tests_instances/vpu_ov2_layer_test.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -31,7 +32,7 @@ VpuOv2LayerTest::VpuOv2LayerTest(): testTool(envConfig) { _log.setName("VPUTest"); _log.setLevel(vpux::LogLevel::Info); - this->targetDevice = ov::test::utils::DEVICE_NPU; + this->targetDevice = DEVICE_NPU; if (!envConfig.IE_NPU_TESTS_LOG_LEVEL.empty()) { const auto logLevel = ::intel_npu::OptionParser::parse(envConfig.IE_NPU_TESTS_LOG_LEVEL); @@ -139,9 +140,7 @@ void VpuOv2LayerTest::run() { } summary.updateOPsStats(function, ov::test::utils::PassRate::Statuses::CRASHED); - ASSERT_FALSE(targetStaticShapes.empty()) << "Target Static Shape is empty!"; - auto crashHandler = std::make_unique(); #ifdef _WIN32 @@ -337,7 
+336,7 @@ void VpuOv2LayerTest::printNetworkConfig() const { item.second.print(ostr); ostr << "; "; } - _log.info("NPU Plugin config: {0}", ostr.str()); + _log.info("{0} Plugin config: {1}", this->targetDevice, ostr.str()); } void VpuOv2LayerTest::setPlatform(const std::string_view platform) { diff --git a/tests/functional/shared_tests_instances/vpu_ov2_layer_test.hpp b/tests/functional/shared_tests_instances/vpu_ov2_layer_test.hpp index 2e76ce0fb1..57e0b0a602 100644 --- a/tests/functional/shared_tests_instances/vpu_ov2_layer_test.hpp +++ b/tests/functional/shared_tests_instances/vpu_ov2_layer_test.hpp @@ -1,19 +1,20 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include +#include "common/npu_test_env_cfg.hpp" +#include "vpu_test_tool.hpp" +#include "vpux/utils/logger/logger.hpp" + #include -#include #include + +#include +#include #include -#include -#include "common/npu_test_env_cfg.hpp" -#include "common/utils.hpp" -#include "vpu_test_tool.hpp" namespace ov::test::utils { diff --git a/tests/functional/shared_tests_instances/vpu_test_tool.cpp b/tests/functional/shared_tests_instances/vpu_test_tool.cpp index 54b2b0e4e7..df1608f817 100644 --- a/tests/functional/shared_tests_instances/vpu_test_tool.cpp +++ b/tests/functional/shared_tests_instances/vpu_test_tool.cpp @@ -14,7 +14,7 @@ namespace ov::test::utils { VpuTestTool::VpuTestTool(const VpuTestEnvConfig& envCfg) : envConfig(envCfg), - DEVICE_NAME(envConfig.IE_NPU_TESTS_DEVICE_NAME.empty() ? "NPU" : envConfig.IE_NPU_TESTS_DEVICE_NAME), + DEVICE_NAME(envConfig.IE_NPU_TESTS_DEVICE_NAME.empty() ? 
DEVICE_NPU : envConfig.IE_NPU_TESTS_DEVICE_NAME), _log(vpux::Logger::global().nest("VpuTestTool", 1)) { } diff --git a/tests/functional/single_layer_tests/dynamic_strided_slice.cpp b/tests/functional/single_layer_tests/dynamic_strided_slice.cpp index be2c65db6c..6b2355345c 100644 --- a/tests/functional/single_layer_tests/dynamic_strided_slice.cpp +++ b/tests/functional/single_layer_tests/dynamic_strided_slice.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "shared_test_classes/base/ov_subgraph.hpp" #include "vpu_ov2_layer_test.hpp" #include "vpux/utils/core/error.hpp" @@ -142,18 +143,27 @@ class DynamicStridedSliceDynamicEndsLayerTest : public testing::WithParamInterface, public VpuOv2LayerTest { public: - void generate_inputs(const std::vector&) override { + void generate_inputs(const std::vector& staticShapes) override { inputs.clear(); const auto& funcInputs = function->inputs(); - auto& [_, endsValues, type] = this->GetParam(); - auto tensorValues = std::vector(endsValues); - auto dataTensorRank = tensorValues.size(); - auto tensor = ov::Tensor(ov::element::i64, ov::Shape{dataTensorRank}); + auto type = std::get(GetParam()); + auto& dataStaticShape = staticShapes[0]; + auto dataTensor = utils::create_and_fill_tensor(type, dataStaticShape); - std::copy_n(tensorValues.begin(), dataTensorRank, tensor.data()); + inputs.insert({funcInputs[0].get_node_shared_ptr(), dataTensor}); - inputs.insert({funcInputs[0].get_node_shared_ptr(), tensor}); + auto endsValues = static_cast>(std::get(GetParam())); + // Clamp ends values by data static shape + for (auto i = 0; i < static_cast(endsValues.size()); i++) { + endsValues[i] = std::min(endsValues[i], static_cast(dataStaticShape[i])); + } + + auto endsSize = endsValues.size(); + auto endsTensor = ov::Tensor(ov::element::i64, ov::Shape{endsSize}); + std::copy_n(endsValues.begin(), endsSize, endsTensor.data()); + + inputs.insert({funcInputs[1].get_node_shared_ptr(), endsTensor}); } protected: @@ -164,21 
+174,24 @@ class DynamicStridedSliceDynamicEndsLayerTest : std::tie(dataTestShape, endsValues, type) = this->GetParam(); auto endsShape = ov::Shape{endsValues.size()}; - init_input_shapes({generateTestShape(endsShape)}); - auto inputParams = ov::ParameterVector{}; - for (auto&& shape : inputDynamicShapes) { - inputParams.push_back(std::make_shared(ov::element::i64, shape)); - } + init_input_shapes({dataTestShape, generateTestShape(endsShape)}); - const auto dataShape = dataTestShape.first.to_shape(); + VPUX_THROW_UNLESS(inputDynamicShapes.size() == 2, "Expected to have 2 input shapes, got {0}", + inputDynamicShapes.size()); + + auto inputParams = ov::ParameterVector{ + std::make_shared(type, inputDynamicShapes.at(0)), + std::make_shared(ov::element::i64, inputDynamicShapes.at(1))}; + + inputParams[0]->set_friendly_name("data"); + inputParams[1]->set_friendly_name("ends"); + + const auto dataShape = dataTestShape.first.get_max_shape(); const auto dataRank = dataShape.size(); VPUX_THROW_UNLESS(dataRank == endsValues.size(), "Input shape rank '{0}' and the size of 'ends' input '{1}' must be equal", dataRank, endsValues.size()); - auto dataValues = generateConst(dataShape); - auto dataInput = ov::op::v0::Constant::create(type, dataShape, dataValues); - const auto begins = std::vector(dataRank, 0); const auto strides = std::vector(dataRank, 1); const auto attrShape = ov::Shape{dataRank}; @@ -187,10 +200,9 @@ class DynamicStridedSliceDynamicEndsLayerTest : auto stridesParam = ov::op::v0::Constant::create(ov::element::i64, attrShape, strides); auto stridedSlice = std::make_shared( - dataInput, beginsParam, inputParams[0], stridesParam, std::vector{}, + inputParams[0], beginsParam, inputParams[1], stridesParam, std::vector{}, std::vector{}, std::vector{}, std::vector{}); - inputParams[0]->set_friendly_name("ends"); function = std::make_shared(stridedSlice, inputParams, "DynamicStridedSlice"); } }; @@ -207,6 +219,10 @@ TEST_P(DynamicStridedSliceDynamicEndsLayerTest, 
NPU4000_HW) { run(Platform::NPU4000); } +// dynamic shape inputs will cause the test to fail because a strided slice layer +// works with strides of the input buffer. But the input data is packed and does +// not respect the strides of an upper-bounded buffer. +// Need to have a full support of strided data for dynamic tensors by the NPU plugin. auto in = std::vector{generateTestShape(1, 2, 35, 512)}; auto ends = std::vector{{1, 2, 35, 512}, {1, 2, 10, 512}, {1, 2, 1, 512}, {1, 1, 10, 512}}; diff --git a/tests/functional/single_layer_tests/group_conv_backpropdata_test.cpp b/tests/functional/single_layer_tests/group_conv_backpropdata_test.cpp new file mode 100644 index 0000000000..952af1c75e --- /dev/null +++ b/tests/functional/single_layer_tests/group_conv_backpropdata_test.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/opsets/opset6.hpp" +#include "vpu_ov2_layer_test.hpp" + +namespace ov::test::subgraph { + +class GroupConvBackpropDataInputFilterLayerTest : public VpuOv2LayerTest { +public: + void SetUp() override { + const ov::Shape staticInputShape{1, 64, 64, 64}; + const std::vector inferenceInputShapes = {staticInputShape}; + const ov::test::InputShape dataShape = {staticInputShape, inferenceInputShapes}; + + const ov::Shape staticFilterShape{64, 1, 1, 4, 4}; + const std::vector inferenceFilterShapes = {staticFilterShape}; + const ov::test::InputShape filterShape = {staticFilterShape, inferenceFilterShapes}; + + init_input_shapes({dataShape, filterShape}); + + auto input = std::make_shared(ov::element::f16, inputDynamicShapes.at(0)); + auto filter = std::make_shared(ov::element::f16, inputDynamicShapes.at(1)); + + ov::Strides strides = {2, 2}; + ov::CoordinateDiff pads_begin = {1, 1}; + ov::CoordinateDiff pads_end = {1, 1}; + ov::Strides dilations = {1, 1}; + + // Example of GroupConvolutionBackpropData using non-constant inputs + auto group_conv = std::make_shared( + input, 
filter, strides, pads_begin, pads_end, dilations, ov::op::PadType::EXPLICIT); + + auto results = ov::ResultVector{std::make_shared(group_conv)}; + function = std::make_shared(results, ov::ParameterVector{input, filter}, + "GroupConvBackpropDataInputFilter"); + } +}; + +TEST_F(GroupConvBackpropDataInputFilterLayerTest, NPU3720_HW) { + // The threshold is marked because the test runs with fp16 precision + abs_threshold = 0.5f; + setDefaultHardwareMode(); + run(Platform::NPU3720); +} + +TEST_F(GroupConvBackpropDataInputFilterLayerTest, NPU4000_HW) { + // The threshold is marked because the test runs with fp16 precision + abs_threshold = 0.5f; + setDefaultHardwareMode(); + run(Platform::NPU4000); +} + +} // namespace ov::test::subgraph diff --git a/tests/functional/subgraph_tests/add_with_transpose.cpp b/tests/functional/subgraph_tests/add_with_transpose.cpp index 97b3621d1c..2878c117a3 100644 --- a/tests/functional/subgraph_tests/add_with_transpose.cpp +++ b/tests/functional/subgraph_tests/add_with_transpose.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/autopad.cpp b/tests/functional/subgraph_tests/autopad.cpp index 670bb405f7..e004ff7553 100644 --- a/tests/functional/subgraph_tests/autopad.cpp +++ b/tests/functional/subgraph_tests/autopad.cpp @@ -222,6 +222,7 @@ INSTANTIATE_TEST_SUITE_P( Params{ov::Shape{1, 3, 16, 16}, OpType::MAXPOOL, OpType::SOFTMAX, /*quantized=*/false}, Params{ov::Shape{1, 3, 16, 16}, OpType::AVGPOOL, OpType::SOFTMAX, /*quantized=*/false}, Params{ov::Shape{1, 3, 16, 16}, OpType::REDUCE_SUM, OpType::SOFTMAX, /*quantized=*/false}, + Params{ov::Shape{1, 3, 16, 16}, OpType::REDUCE_MEAN, OpType::SOFTMAX, /*quantized=*/false}, }), AutoPaddingTest::getTestCaseName); diff --git a/tests/functional/subgraph_tests/conv_softmax_conv.cpp b/tests/functional/subgraph_tests/conv_softmax_conv.cpp index d20b690637..a2a55d7e18 100644 --- 
a/tests/functional/subgraph_tests/conv_softmax_conv.cpp +++ b/tests/functional/subgraph_tests/conv_softmax_conv.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/conv_with_transpose.cpp b/tests/functional/subgraph_tests/conv_with_transpose.cpp index 88916ec06f..50273051d0 100644 --- a/tests/functional/subgraph_tests/conv_with_transpose.cpp +++ b/tests/functional/subgraph_tests/conv_with_transpose.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/depth_to_space.cpp b/tests/functional/subgraph_tests/depth_to_space.cpp index b49b7508d2..ab1fd4146b 100644 --- a/tests/functional/subgraph_tests/depth_to_space.cpp +++ b/tests/functional/subgraph_tests/depth_to_space.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/dilated_convolution.cpp b/tests/functional/subgraph_tests/dilated_convolution.cpp index 38875465f5..03c4712c44 100644 --- a/tests/functional/subgraph_tests/dilated_convolution.cpp +++ b/tests/functional/subgraph_tests/dilated_convolution.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2025 Intel Corporation. 
+// +// Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/dynamic_lstm_sequence.cpp b/tests/functional/subgraph_tests/dynamic_lstm_sequence.cpp index b88befadbc..902daefaae 100644 --- a/tests/functional/subgraph_tests/dynamic_lstm_sequence.cpp +++ b/tests/functional/subgraph_tests/dynamic_lstm_sequence.cpp @@ -120,9 +120,10 @@ class DynamicTensorIteratorNPUTest : void generate_inputs(const std::vector& targetInputStaticShapes) override { inputs.clear(); - size_t num_directions = seq_direction == ov::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1; - ov::Shape default_shape{batch_size, num_directions, hidden_size}; + size_t numDirections = seq_direction == ov::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1; + ov::Shape defaultShape{batch_size, numDirections, hidden_size}; auto itTargetShape = targetInputStaticShapes.begin(); + int seed = 1; for (const auto& param : function->get_parameters()) { std::shared_ptr inputNode = param; for (size_t i = 0; i < param->get_output_size(); i++) { @@ -132,14 +133,14 @@ class DynamicTensorIteratorNPUTest : if (itTargetShape != targetInputStaticShapes.end()) { if (nodePtr->get_input_node_ptr(port)->shared_from_this() == inputNode->shared_from_this()) { - ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(param->get_element_type(), - *itTargetShape, 100, 0); + ov::Tensor tensor = ov::test::utils::create_and_fill_tensor_real_distribution( + param->get_element_type(), *itTargetShape, -1.0f, 1.0f, seed++); inputs.insert({param, tensor}); break; } } else { - ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(param->get_element_type(), - default_shape, 100, 0); + ov::Tensor tensor = ov::test::utils::create_and_fill_tensor_real_distribution( + param->get_element_type(), defaultShape, -1.0f, 1.0f, seed++); inputs.insert({param, tensor}); } } @@ -153,6 +154,7 @@ class DynamicTensorIteratorNPUTest : }; TEST_P(DynamicTensorIteratorNPUTest, 
NPU4000_HW_TestKindSubgraph) { + abs_threshold = 0.0001f; setDefaultHardwareMode(); run(Platform::NPU4000); } @@ -164,14 +166,14 @@ std::vector hidden_sizes = {128}; std::vector model_types = {ov::element::f32}; -std::vector reccurent_sequence_direction = { +std::vector recurent_sequence_direction = { ov::op::RecurrentSequenceDirection::FORWARD, ov::op::RecurrentSequenceDirection::REVERSE, ov::op::RecurrentSequenceDirection::BIDIRECTIONAL}; INSTANTIATE_TEST_SUITE_P(smoke_DynamicTensorIterator_LSTMSequence, DynamicTensorIteratorNPUTest, testing::Combine(testing::ValuesIn({LSTMType::LSTMSequence}), testing::ValuesIn(input_shapes), testing::ValuesIn(hidden_sizes), - testing::ValuesIn(reccurent_sequence_direction), + testing::ValuesIn(recurent_sequence_direction), testing::Values(ov::test::utils::DEVICE_NPU), testing::ValuesIn(model_types)), DynamicTensorIteratorNPUTest::getTestCaseName); diff --git a/tests/functional/subgraph_tests/eltwise_different_inputs.cpp b/tests/functional/subgraph_tests/eltwise_different_inputs.cpp index 25732c1408..6500bc5021 100644 --- a/tests/functional/subgraph_tests/eltwise_different_inputs.cpp +++ b/tests/functional/subgraph_tests/eltwise_different_inputs.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/fake_quantize_mul_fuse.cpp b/tests/functional/subgraph_tests/fake_quantize_mul_fuse.cpp index dcbd5080ab..fb0acb00df 100644 --- a/tests/functional/subgraph_tests/fake_quantize_mul_fuse.cpp +++ b/tests/functional/subgraph_tests/fake_quantize_mul_fuse.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/fq_clamp.cpp b/tests/functional/subgraph_tests/fq_clamp.cpp index d9ef5b08e7..f7aaef4509 100644 --- a/tests/functional/subgraph_tests/fq_clamp.cpp +++ b/tests/functional/subgraph_tests/fq_clamp.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 
2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/fuse_mvn.cpp b/tests/functional/subgraph_tests/fuse_mvn.cpp index 9a6c07a399..9b110eda53 100644 --- a/tests/functional/subgraph_tests/fuse_mvn.cpp +++ b/tests/functional/subgraph_tests/fuse_mvn.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/handle_fake_quant_has_negative_scales.cpp b/tests/functional/subgraph_tests/handle_fake_quant_has_negative_scales.cpp index 9a8411bfab..fe5ae82b6d 100644 --- a/tests/functional/subgraph_tests/handle_fake_quant_has_negative_scales.cpp +++ b/tests/functional/subgraph_tests/handle_fake_quant_has_negative_scales.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/matmul_with_transpose.cpp b/tests/functional/subgraph_tests/matmul_with_transpose.cpp index ecd255f64c..6bf598340f 100644 --- a/tests/functional/subgraph_tests/matmul_with_transpose.cpp +++ b/tests/functional/subgraph_tests/matmul_with_transpose.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. 
+// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/mixed_precision.cpp b/tests/functional/subgraph_tests/mixed_precision.cpp index 69fe3b387b..6ff7e22f4a 100644 --- a/tests/functional/subgraph_tests/mixed_precision.cpp +++ b/tests/functional/subgraph_tests/mixed_precision.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2022-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/mvn_internal_reshape.cpp b/tests/functional/subgraph_tests/mvn_internal_reshape.cpp index 68af7bda04..4160470d43 100644 --- a/tests/functional/subgraph_tests/mvn_internal_reshape.cpp +++ b/tests/functional/subgraph_tests/mvn_internal_reshape.cpp @@ -38,7 +38,6 @@ namespace ov::test::subgraph { using MvnInternalReshapeParams = std::tuple; // precision @@ -47,20 +46,19 @@ class MvnInternalReshapeTestCommon : public testing::WithParamInterface { public: static std::string getTestCaseName(testing::TestParamInfo obj) { - const auto& [ioShape, mvnShape, acrossChannels, normVariance, prc] = obj.param; + const auto& [ioShape, mvnShape, normVariance, prc] = obj.param; const std::string sep = "_"; std::ostringstream result; result << "TestKind" << ov::test::utils::testKind(__FILE__) << sep; result << "IOS={" << vec2str(ioShape) << "}" << sep; result << "MvnS={" << vec2str(mvnShape) << "}" << sep; - result << "acrossChannels={" << acrossChannels << "}" << sep; result << "norm={" << normVariance << "}" << sep; result << "prc={" << prc << "}" << sep; return result.str(); } void SetUp() override { - const auto& [ioShape, mvnShape, acrossChannels, normVariance, prc] = GetParam(); + const auto& [ioShape, mvnShape, normVariance, prc] = GetParam(); const auto C = ioShape.at(1); const auto K = mvnShape.at(1); ASSERT_GT(C, K); @@ -79,7 +77,7 @@ class MvnInternalReshapeTestCommon : auto add1 = std::make_shared(params[0], ct1); auto reshape1 = buildReshape(add1, mvnShape); - auto mvn = 
std::make_shared(reshape1, acrossChannels, normVariance, 1.0E-6); + auto mvn = std::make_shared(reshape1, false, normVariance, 1.0E-6); auto reshape2 = buildReshape(mvn, ioShape); auto add2 = std::make_shared(reshape2, ct2); @@ -129,21 +127,10 @@ std::vector ioShape1 = { {1, C1 * 20, 1, W1 / 20}, #endif }; -const auto testParams1 = - ::testing::Combine(::testing::ValuesIn(ioShape1), ::testing::Values(mvnShape1), ::testing::Values(false), - ::testing::Values(true), ::testing::ValuesIn(precision)); +const auto testParams1 = ::testing::Combine(::testing::ValuesIn(ioShape1), ::testing::Values(mvnShape1), + ::testing::Values(true), ::testing::ValuesIn(precision)); INSTANTIATE_TEST_SUITE_P(smoke_MvnInternalReshape1, MvnInternalReshapeTestCommon, testParams1, MvnInternalReshapeTestCommon::getTestCaseName); -// Test batched MVN -ov::Shape mvnShape2 = {32, 16, 64, 64}; -ov::Shape ioShape2 = {1, 512, 64, 64}; -const auto testParams2 = - ::testing::Combine(::testing::Values(ioShape2), ::testing::Values(mvnShape2), ::testing::Values(true), - ::testing::Values(true), ::testing::ValuesIn(precision)); - -INSTANTIATE_TEST_SUITE_P(smoke_MvnInternalReshape2, MvnInternalReshapeTestCommon, testParams2, - MvnInternalReshapeTestCommon::getTestCaseName); - } // namespace diff --git a/tests/functional/subgraph_tests/mvn_with_scale_bias.cpp b/tests/functional/subgraph_tests/mvn_with_scale_bias.cpp index bab3451548..2b29f09064 100644 --- a/tests/functional/subgraph_tests/mvn_with_scale_bias.cpp +++ b/tests/functional/subgraph_tests/mvn_with_scale_bias.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/mvn_with_transpose.cpp b/tests/functional/subgraph_tests/mvn_with_transpose.cpp index dfc7964517..0d01e6a9b8 100644 --- a/tests/functional/subgraph_tests/mvn_with_transpose.cpp +++ b/tests/functional/subgraph_tests/mvn_with_transpose.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel 
Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/process_asymmetric_zero_points_for_convolution.cpp b/tests/functional/subgraph_tests/process_asymmetric_zero_points_for_convolution.cpp index a759b7b4cb..dde498e3cb 100644 --- a/tests/functional/subgraph_tests/process_asymmetric_zero_points_for_convolution.cpp +++ b/tests/functional/subgraph_tests/process_asymmetric_zero_points_for_convolution.cpp @@ -23,7 +23,7 @@ class Conv2dInMixedModeIncorrectZeroPoint : public VpuOv2LayerTest { VPUX_THROW_UNLESS(funcInputs.size() == 1, "Only 1 input is supported"); const auto& inputStaticShape = inputShapes[0]; const auto totalSize = ov::shape_size(inputStaticShape); - const auto inputTensor = ov::Tensor{ov::element::f32, inputStaticShape}; + auto inputTensor = ov::Tensor{ov::element::f32, inputStaticShape}; auto inputData = inputTensor.data::value_type>(); for (size_t i = 0; i < totalSize; i++) { inputData[i] = 1 + i % 7; diff --git a/tests/functional/subgraph_tests/propagate_fq.cpp b/tests/functional/subgraph_tests/propagate_fq.cpp index 304c793664..e3c081ff58 100644 --- a/tests/functional/subgraph_tests/propagate_fq.cpp +++ b/tests/functional/subgraph_tests/propagate_fq.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/quantized_convolution.cpp b/tests/functional/subgraph_tests/quantized_convolution.cpp index da7ec360f8..9a940158e5 100644 --- a/tests/functional/subgraph_tests/quantized_convolution.cpp +++ b/tests/functional/subgraph_tests/quantized_convolution.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2021-2025 Intel Corporation. 
+// Copyright (C) 2021-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/quantized_convolution_clamp.cpp b/tests/functional/subgraph_tests/quantized_convolution_clamp.cpp index ea931e9a55..e0d5cb2953 100644 --- a/tests/functional/subgraph_tests/quantized_convolution_clamp.cpp +++ b/tests/functional/subgraph_tests/quantized_convolution_clamp.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/quantized_dilated_convolution.cpp b/tests/functional/subgraph_tests/quantized_dilated_convolution.cpp index 993fb6ac3f..b9e7d1130b 100644 --- a/tests/functional/subgraph_tests/quantized_dilated_convolution.cpp +++ b/tests/functional/subgraph_tests/quantized_dilated_convolution.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2025 Intel Corporation. +// +// Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/quantized_interpolate.cpp b/tests/functional/subgraph_tests/quantized_interpolate.cpp index e284998520..fabf0841a3 100644 --- a/tests/functional/subgraph_tests/quantized_interpolate.cpp +++ b/tests/functional/subgraph_tests/quantized_interpolate.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2023-2025 Intel Corporation. 
+// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/reshape_with_transpose.cpp b/tests/functional/subgraph_tests/reshape_with_transpose.cpp index f84e9beaa9..3f3aa8f559 100644 --- a/tests/functional/subgraph_tests/reshape_with_transpose.cpp +++ b/tests/functional/subgraph_tests/reshape_with_transpose.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/functional/subgraph_tests/tiling_with_concat.cpp b/tests/functional/subgraph_tests/tiling_with_concat.cpp index 70646c1c67..b4a38630b1 100644 --- a/tests/functional/subgraph_tests/tiling_with_concat.cpp +++ b/tests/functional/subgraph_tests/tiling_with_concat.cpp @@ -1,3 +1,4 @@ +// // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/tests/fuzz/src/pipeline_default_hw_mode/fuzz_pipeline_default_hw_mode.cpp b/tests/fuzz/src/pipeline_default_hw_mode/fuzz_pipeline_default_hw_mode.cpp index cd1ccdcd18..4b65ca9951 100644 --- a/tests/fuzz/src/pipeline_default_hw_mode/fuzz_pipeline_default_hw_mode.cpp +++ b/tests/fuzz/src/pipeline_default_hw_mode/fuzz_pipeline_default_hw_mode.cpp @@ -59,7 +59,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { const auto log = vpux::Logger::global(); mlir::PassManager pm(moduleOp.get()->getName(), mlir::OpPassManager::Nesting::Implicit); - auto initCompilerOptions = VPU::InitCompilerOptions(VPU::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); + auto initCompilerOptions = + VPU::InitCompilerOptions(config::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); VPU::buildInitCompilerPipeline(pm, initCompilerOptions, log); diff --git a/tests/fuzz/src/pipeline_default_hw_mode/seeds/fc.mlir b/tests/fuzz/src/pipeline_default_hw_mode/seeds/fc.mlir index 8c5cfe68cc..b1068572c1 100644 --- a/tests/fuzz/src/pipeline_default_hw_mode/seeds/fc.mlir +++ 
b/tests/fuzz/src/pipeline_default_hw_mode/seeds/fc.mlir @@ -2,7 +2,6 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // - module { net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/fuzz/src/pipeline_default_hw_mode/seeds/interpolate.mlir b/tests/fuzz/src/pipeline_default_hw_mode/seeds/interpolate.mlir index f19eacff61..8deafdd138 100644 --- a/tests/fuzz/src/pipeline_default_hw_mode/seeds/interpolate.mlir +++ b/tests/fuzz/src/pipeline_default_hw_mode/seeds/interpolate.mlir @@ -2,7 +2,6 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // - module { net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/fuzz/src/pipeline_default_hw_mode/seeds/layout_cast.mlir b/tests/fuzz/src/pipeline_default_hw_mode/seeds/layout_cast.mlir index 9de88cf3bb..69dd1ed5c5 100644 --- a/tests/fuzz/src/pipeline_default_hw_mode/seeds/layout_cast.mlir +++ b/tests/fuzz/src/pipeline_default_hw_mode/seeds/layout_cast.mlir @@ -2,7 +2,6 @@ // Copyright (C) 2025 Intel Corporation. 
// SPDX-License-Identifier: Apache-2.0 // - #NCWH = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> diff --git a/tests/lit/CMakeLists.txt b/tests/lit/CMakeLists.txt index de5f16f6fd..45daafe406 100644 --- a/tests/lit/CMakeLists.txt +++ b/tests/lit/CMakeLists.txt @@ -5,35 +5,12 @@ # -vpux_setup_lit_tool(flatc) - -if(ENABLE_NPU_MONO) - if(NOT DEFINED GRAPHFILE_SCHEMA_SUBMODULE_PATH) - message(FATAL_ERROR "Graphfile schema submodule path is not set while `npu_mono` was activated") - endif() - set(SCHEMA_SOURCE_DIR ${GRAPHFILE_SCHEMA_SUBMODULE_PATH}/src/schema) -else() - set(SCHEMA_SOURCE_DIR ${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/thirdparty/elf/src/schema) # Legacy path -endif(ENABLE_NPU_MONO) - -add_custom_target(copy_vpuip_schema_tests ALL - COMMAND - ${CMAKE_COMMAND} -E remove_directory - "$/lit-tests/schema" - COMMAND - ${CMAKE_COMMAND} -E copy_directory - "${SCHEMA_SOURCE_DIR}" - "$/lit-tests/schema" - COMMENT "[LIT] Copy VPUIP schema files for tests" -) -set_target_properties(copy_vpuip_schema_tests PROPERTIES FOLDER "tests") - -set(vpuip_schema_file "path('../schema/graphfile.fbs')") +vpux_setup_lit_tool() add_custom_target(copy_lit_tests_script ALL COMMAND ${CMAKE_COMMAND} -E copy - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/scripts/run_all_lit_tests.sh" + "${PROJECT_SOURCE_DIR}/scripts/run_all_lit_tests.sh" "$/lit-tests/run_all_lit_tests.sh" COMMENT "[LIT] Copy runner script" ) @@ -43,10 +20,10 @@ set(data_path_npu "path('NPU/data')") vpux_setup_lit_tests(NPU PATTERNS "*.mlir" "*.json" EXTRA_SOURCES "lit.local.cfg" "*.bin" "*.txt" "*.xml" - VARS "vpuip_schema_file" "data_path_npu" + VARS "data_path_npu" PARAMS "arch" PARAMS_DEFAULT_VALUES "NPU37XX" - SUBSTITUTIONS "vpuip_schema_file" "data_path_npu" + SUBSTITUTIONS "data_path_npu" ) set(_lit_tests_dir_full_path "$/lit-tests") diff --git a/tests/lit/NPU/backend/dual_tile_37XX.mlir b/tests/lit/NPU/backend/dual_tile_37XX.mlir index 94e6d4238c..d033065656 100644 --- 
a/tests/lit/NPU/backend/dual_tile_37XX.mlir +++ b/tests/lit/NPU/backend/dual_tile_37XX.mlir @@ -16,7 +16,7 @@ !qtype = !quant.uniform -module @dual_tile attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @dual_tile attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { @@ -25,10 +25,10 @@ module @dual_tile attributes {VPU.arch = #VPU.arch_kind, config.compila DataInfo "output_0" : tensor<2x16x16x16xf16> } - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8, VPU.derateFactor = 6.000000e-01} + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8, config.derateFactor = 6.000000e-01} IE.ExecutorResource 2 of @DMA_NN IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_40XX+.mlir b/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_40XX+.mlir index b9e4fc006b..8edec75ab6 100644 --- a/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_40XX+.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --split-input-file --convert-VPUASM-to-NPUReg40XX --create-elf-relocations %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} { +module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 1 of @DPU } @@ -14,8 +14,8 @@ module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} IE.ExecutorResource 
1 of @M2I IE.ExecutorResource 1 of @DMA_NN IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : tensor<1x2x3x4xf16> } outputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_indices_40XX+.mlir b/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_indices_40XX+.mlir index 27e7b45122..104c27d29f 100644 --- a/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_indices_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/AddELFRelocations/one_dma_indices_40XX+.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --split-input-file --convert-VPUASM-to-NPUReg40XX --create-elf-relocations %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} { +module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 1 of @DPU } @@ -14,8 +14,8 @@ module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, 
config.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : tensor<1x2x3x4xf16> } outputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE_37XX_40XX.mlir b/tests/lit/NPU/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE_37XX_40XX.mlir index a62a2a2763..d14e677ef1 100644 --- a/tests/lit/NPU/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE_37XX_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/IE2VPU/convert_IE_to_VPU_NCE_37XX_40XX.mlir @@ -1310,3 +1310,28 @@ func.func @QuantConvWithNegativeScales(%arg0: tensor<1x16x16x16xf16, {order = #N // CHECK: return [[VAL0]] : tensor<1x16x16x16xf16, {order = #NHWC}> } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @DontConvertGroupConvToNCEIfDilatedConv +func.func @DontConvertGroupConvToNCEIfDilatedConv(%arg0: tensor<1x16x48x48xf16, {order = #NHWC}>) -> tensor<1x16x48x48xf16, {order = #NHWC}> { + %weights = const.Declare tensor<16x1x3x3xf16, {order = #NHWC}> = + dense<1.000000e+00> : tensor<16x1x3x3xf16>, [#const.Reorder<#NHWC>] + + %0 = IE.GroupConvolution(%arg0, %weights) { + dilations = [2, 2], + groups = 16, + pads_begin = [2, 2], + pads_end = [2, 2], + strides = [1, 1], + post_op = #IE.LeakyRelu + } : tensor<1x16x48x48xf16, {order = #NHWC}>, tensor<16x1x3x3xf16, {order = #NHWC}> + -> tensor<1x16x48x48xf16, {order = #NHWC}> + + return %0 : tensor<1x16x48x48xf16, {order = #NHWC}> + + // CHECK-NOT: VPU.NCE.DepthConvolution + // CHECK: IE.GroupConvolution +} diff --git a/tests/lit/NPU/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math_40XX+.mlir b/tests/lit/NPU/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math_40XX+.mlir index 47a5cec082..f9e6b9d121 100644 --- a/tests/lit/NPU/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math_40XX+.mlir +++ 
b/tests/lit/NPU/conversion/passes/ShaveCodeGen/convert_eltwise_layers_to_math_40XX+.mlir @@ -359,7 +359,7 @@ module @SingleLogLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Log - // CHECK: [[VAR0:%.+]] = math.log [[ARG0:%.+]] fastmath : tensor<1x1x1x1000xf16> + // CHECK: {{.*}} = math.log {{.*}} fastmath : tensor<1x1x1x1000xf16> } } // ----- @@ -379,7 +379,7 @@ module @SingleExpLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Log - // CHECK: [[VAR0:%.+]] = math.exp [[ARG0:%.+]] fastmath : tensor<1x1x1x1000xf16> + // CHECK: {{.*}} = math.exp {{.*}} fastmath : tensor<1x1x1x1000xf16> } } @@ -401,7 +401,7 @@ module @SingleSinLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Sin - // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: {{.*}} = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { // CHECK-NEXT: ^bb0([[IN:%.+]]: f16, {{%.+}}: f16): // CHECK-NEXT: [[EXT:%.+]] = arith.extf [[IN]] : f16 to f32 // CHECK-NEXT: [[SIN:%.+]] = math.sin [[EXT]] : f32 @@ -430,7 +430,7 @@ module @SingleSqrtLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Sqrt - // CHECK: [[VAR0:%.+]] = math.sqrt [[ARG0:%.+]] fastmath : tensor<1x1x1x1000xf16> + // CHECK: {{.*}} = math.sqrt {{.*}} fastmath : tensor<1x1x1x1000xf16> } } @@ -452,7 +452,7 @@ module @SingleRoundLayerHalfToEven { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Round - // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: {{.*}} = linalg.generic 
{indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { // CHECK-NEXT: ^bb0([[IN:%.+]]: f16, {{%.+}}: f16): // CHECK-NEXT: [[VAR0:%.+]] = math.roundeven [[IN]] : f16 // CHECK-NEXT: linalg.yield [[VAR0]] : f16 @@ -477,7 +477,7 @@ module @SingleRoundLayerHalfAwayFromZero { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Round - // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: {{.*}} = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { // CHECK-NEXT: ^bb0([[IN:%.+]]: f16, {{%.+}}: f16): // CHECK-NEXT: [[VAR0:%.+]] = math.round [[IN]] : f16 // CHECK-NEXT: linalg.yield [[VAR0]] : f16 @@ -503,7 +503,7 @@ module @SingleErfLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Erf - // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: {{.*}} = linalg.generic {indexing_maps = [[[NCHW]], [[NCHW]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.*]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { // CHECK-NEXT: ^bb0([[IN:%.+]]: f16, {{%.+}}: f16): // CHECK-NEXT: [[EXT:%.+]] = arith.extf [[IN]] : f16 to f32 // CHECK-NEXT: [[ERF:%.+]] = math.erf [[EXT]] : f32 @@ -533,8 +533,8 @@ module @SingleConvertFPToSILayer { // CHECK-NOT: IE.Convert // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xi32> // CHECK: [[LINALG_OP:%.+]] = linalg.generic 
{indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[EMPTY]] : tensor<1x1x1x1000xi32>) { -// CHECK: ^bb0([[ARG0:%.+]]: f16, {{%.+}}: i32): -// CHECK: [[OP:%.+]] = arith.fptosi %{{.+}} : f16 to i32 +// CHECK: ^bb0([[IN:%.+]]: f16, {{%.+}}: i32): +// CHECK: [[OP:%.+]] = arith.fptosi [[IN]] : f16 to i32 // CHECK: linalg.yield [[OP]] : i32 // CHECK: [[RET:%.+]] = tensor.bitcast [[LINALG_OP]] : tensor<1x1x1x1000xi32> to tensor<1x1x1x1000xsi32> // CHECK: IE.CGCYield [[RET]] : tensor<1x1x1x1000xsi32> @@ -561,8 +561,8 @@ module @SingleConvertFPToUILayer { // CHECK-NOT: IE.Convert // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xi32> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[EMPTY]] : tensor<1x1x1x1000xi32>) { -// CHECK: ^bb0([[ARG0:%.+]]: f16, {{%.+}}: i32): -// CHECK: [[OP:%.+]] = arith.fptoui %{{.+}} : f16 to i32 +// CHECK: ^bb0([[IN:%.+]]: f16, {{%.+}}: i32): +// CHECK: [[OP:%.+]] = arith.fptoui [[IN]] : f16 to i32 // CHECK: linalg.yield [[OP]] : i32 // CHECK: [[RET:%.+]] = tensor.bitcast [[LINALG_OP]] : tensor<1x1x1x1000xi32> to tensor<1x1x1x1000xui32> // CHECK: IE.CGCYield [[RET]] : tensor<1x1x1x1000xui32> @@ -590,10 +590,10 @@ module @SingleConvertSIToFPLayer { // CHECK: [[RET:%.+]] = tensor.bitcast [[ARG0:%.+]] : tensor<1x1x1x1000xsi32> to tensor<1x1x1x1000xi32> // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xf16> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[RET]] : tensor<1x1x1x1000xi32>) outs([[EMPTY]] : tensor<1x1x1x1000xf16>) { -// CHECK: ^bb0([[ARG0:%.+]]: i32, {{%.+}}: f16): -// CHECK: [[OP:%.+]] = arith.sitofp %{{.+}} : i32 to f16 +// CHECK: ^bb0([[IN:%.+]]: i32, {{%.+}}: f16): +// 
CHECK: [[OP:%.+]] = arith.sitofp [[IN]] : i32 to f16 // CHECK: linalg.yield [[OP]] : f16 -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xf16> +// CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> } @@ -619,10 +619,10 @@ module @SingleConvertUIToFPLayer { // CHECK: [[RET:%.+]] = tensor.bitcast [[ARG0:%.+]] : tensor<1x1x1x1000xui32> to tensor<1x1x1x1000xi32> // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xf16> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[RET]] : tensor<1x1x1x1000xi32>) outs([[EMPTY]] : tensor<1x1x1x1000xf16>) { -// CHECK: ^bb0([[ARG0:%.+]]: i32, {{%.+}}: f16): -// CHECK: [[OP:%.+]] = arith.uitofp %{{.+}} : i32 to f16 +// CHECK: ^bb0([[IN:%.+]]: i32, {{%.+}}: f16): +// CHECK: [[OP:%.+]] = arith.uitofp [[IN]] : i32 to f16 // CHECK: linalg.yield [[OP]] : f16 -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xf16> +// CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> } } @@ -646,10 +646,10 @@ module @SingleConvertExtFPLayer { // CHECK-NOT: IE.Convert // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xf32> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[EMPTY]] : tensor<1x1x1x1000xf32>) { -// CHECK: ^bb0([[ARG0:%.+]]: f16, {{%.+}}: f32): -// CHECK: [[OP:%.+]] = arith.extf %{{.+}} : f16 to f32 +// CHECK: ^bb0([[IN:%.+]]: f16, {{%.+}}: f32): +// CHECK: [[OP:%.+]] = arith.extf [[IN]] : f16 to f32 // CHECK: linalg.yield [[OP]] : f32 -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xf32> +// CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf32> } } @@ -671,12 +671,13 @@ module @SingleConvertTruncFPLayer { return %0 : tensor<1x1x1x1000xf16> // CHECK-NOT: IE.Convert +// CHECK: func.func @main([[ARG0:%.+]]: tensor<1x1x1x1000xf32>) -> tensor<1x1x1x1000xf16> 
{ // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xf16> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf32>) outs([[EMPTY]] : tensor<1x1x1x1000xf16>) { -// CHECK: ^bb0([[ARG0:%.+]]: f32, {{%.+}}: f16): -// CHECK: [[OP:%.+]] = arith.truncf %{{.+}} : f32 to f16 +// CHECK: ^bb0([[IN:%.+]]: f32, {{%.+}}: f16): +// CHECK: [[OP:%.+]] = arith.truncf [[IN]] : f32 to f16 // CHECK: linalg.yield [[OP]] : f16 -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xf16> +// CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> } } @@ -701,11 +702,11 @@ module @SingleConvertExtSILayer { // CHECK: [[RET:%.+]] = tensor.bitcast [[ARG:%.+]] : tensor<1x1x1x1000xsi16> to tensor<1x1x1x1000xi16> // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xi32> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[RET]] : tensor<1x1x1x1000xi16>) outs([[EMPTY]] : tensor<1x1x1x1000xi32>) { -// CHECK: ^bb0([[ARG0:%.+]]: i16, {{%.+}}: i32): -// CHECK: [[OP:%.+]] = arith.extsi %{{.+}} : i16 to i32 +// CHECK: ^bb0([[IN:%.+]]: i16, {{%.+}}: i32): +// CHECK: [[OP:%.+]] = arith.extsi [[IN]] : i16 to i32 // CHECK: linalg.yield [[OP]] : i32 // CHECK: [[RET:%.+]] = tensor.bitcast [[LINALG_OP]] : tensor<1x1x1x1000xi32> to tensor<1x1x1x1000xsi32> -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xsi32> +// CHECK: IE.CGCYield [[RET]] : tensor<1x1x1x1000xsi32> } } @@ -734,7 +735,7 @@ module @SingleConvertExtUILayer { // CHECK: [[OP:%.+]] = arith.extui %{{.+}} : i16 to i32 // CHECK: linalg.yield [[OP]] : i32 // CHECK: [[RET:%.+]] = tensor.bitcast [[LINALG_OP]] : tensor<1x1x1x1000xi32> to tensor<1x1x1x1000xui32> -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xui32> +// CHECK: IE.CGCYield [[RET]] : tensor<1x1x1x1000xui32> } } @@ -758,10 +759,200 @@ 
module @SingleConvertTruncILayer { // CHECK-NOT: IE.Convert // CHECK: [[EMPTY:%.+]] = tensor.empty() : tensor<1x1x1x1000xi16> // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG:%.+]] : tensor<1x1x1x1000xi32>) outs([[EMPTY]] : tensor<1x1x1x1000xi16>) { -// CHECK: ^bb0([[ARG0:%.+]]: i32, {{%.+}}: i16): -// CHECK: [[OP:%.+]] = arith.trunci %{{.+}} : i32 to i16 +// CHECK: ^bb0([[IN:%.+]]: i32, {{%.+}}: i16): +// CHECK: [[OP:%.+]] = arith.trunci [[IN]] : i32 to i16 // CHECK: linalg.yield [[OP]] : i16 -// CHECK: IE.CGCYield [[RET:%.+]] : tensor<1x1x1x1000xi16> +// CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xi16> + } +} +// ----- +// IE.Abs + +module @SingleAbsFloatLayer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x1x1x1000xf16> } + + func.func @main(%arg0: tensor<1x1x1x1000xf16>) -> tensor<1x1x1x1000xf16> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xf16>) { + %1 = IE.Abs(%arg1) : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + IE.CGCYield %1 : tensor<1x1x1x1000xf16> + } -> tensor<1x1x1x1000xf16> + return %0 : tensor<1x1x1x1000xf16> + + // CHECK-NOT: IE.Abs + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: ^bb0([[IN:%.+]]: f16, [[OUT:%.+]]: f16): + // CHECK: [[ABS:%.+]] = math.absf [[IN]] : f16 + // CHECK: linalg.yield [[ABS]] : f16 + // CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> + } +} + +// ----- +// IE.Negative + +// CHECK: module @SingleNegativeFloatLayer +module @SingleNegativeFloatLayer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xf16> + } outputsInfo : { + DataInfo "output" 
: tensor<1x1x1x1000xf16> + } + + func.func @main(%arg0: tensor<1x1x1x1000xf16>) -> tensor<1x1x1x1000xf16> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xf16>) { + %1 = IE.Negative(%arg1) : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + IE.CGCYield %1 : tensor<1x1x1x1000xf16> + } -> tensor<1x1x1x1000xf16> + return %0 : tensor<1x1x1x1000xf16> + + // CHECK-NOT: IE.Negative + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG]] : tensor<1x1x1x1000xf16>) { + // CHECK: ^bb0([[IN:%.+]]: f16, [[OUT:%.+]]: f16): + // CHECK: [[ZERO:%.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: [[NEG:%.+]] = arith.subf [[ZERO]], [[IN]] : f16 + // CHECK: linalg.yield [[NEG]] : f16 + // CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> + } +} + +// ----- + +// CHECK: module @SingleNegativeSI32Layer +module @SingleNegativeSI32Layer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xsi32> + } outputsInfo : { + DataInfo "output" : tensor<1x1x1x1000xsi32> + } + + func.func @main(%arg0: tensor<1x1x1x1000xsi32>) -> tensor<1x1x1x1000xsi32> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xsi32>) { + %1 = IE.Negative(%arg1) : tensor<1x1x1x1000xsi32> -> tensor<1x1x1x1000xsi32> + IE.CGCYield %1 : tensor<1x1x1x1000xsi32> + } -> tensor<1x1x1x1000xsi32> + return %0 : tensor<1x1x1x1000xsi32> + + // CHECK-NOT: IE.Negative + // CHECK: [[BC_ARG:%.+]] = tensor.bitcast [[ARG:%.+]] : tensor<1x1x1x1000xsi32> to tensor<1x1x1x1000xi32> + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[BC_ARG]] : tensor<1x1x1x1000xi32>) outs([[BC_ARG]] : tensor<1x1x1x1000xi32>) { + // CHECK: ^bb0([[IN:%.+]]: i32, [[OUT:%.+]]: i32): + // CHECK: [[ZERO:%.+]] = arith.constant 0 : i32 + // 
CHECK: [[NEG:%.+]] = arith.subi [[ZERO]], [[IN]] : i32 + // CHECK: linalg.yield [[NEG]] : i32 + // CHECK: [[RES:%.+]] = tensor.bitcast [[LINALG_OP]] : tensor<1x1x1x1000xi32> to tensor<1x1x1x1000xsi32> + // CHECK: IE.CGCYield [[RES]] : tensor<1x1x1x1000xsi32> + } +} + +// ----- +// IE.Sign + +module @SingleSignFloatLayer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x1x1x1000xf16> + } + + func.func @main(%arg0: tensor<1x1x1x1000xf16>) -> tensor<1x1x1x1000xf16> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xf16>) { + %1 = IE.Sign(%arg1) : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + IE.CGCYield %1 : tensor<1x1x1x1000xf16> + } -> tensor<1x1x1x1000xf16> + return %0 : tensor<1x1x1x1000xf16> + + // CHECK-NOT: IE.Sign + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: ^bb0([[IN:%.+]]: f16, [[OUT:%.+]]: f16): + // CHECK: [[ZERO:%.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: [[NEGF_ONE:%.+]] = arith.constant -1.000000e+00 : f16 + // CHECK: [[POSF_ONE:%.+]] = arith.constant 1.000000e+00 : f16 + // CHECK: [[BITCAST:%.+]] = arith.bitcast [[IN]] : f16 to i16 + // CHECK: [[CST_32768:%.+]] = arith.constant -32768 : i16 + // CHECK: [[AND:%.+]] = arith.andi [[BITCAST]], [[CST_32768]] : i16 + // CHECK: [[ONE:%.+]] = arith.constant 1 : i16 + // CHECK: [[SHL:%.+]] = arith.shli [[BITCAST]], [[ONE]] : i16 + // CHECK: [[INT_ZERO:%.+]] = arith.constant 0 : i16 + // CHECK: [[CMP_EQ:%.+]] = arith.cmpi eq, [[SHL]], [[INT_ZERO]] : i16 + // CHECK: [[INT_ZERO2:%.+]] = arith.constant 0 : i16 + // CHECK: [[CMP_NE:%.+]] = arith.cmpi ne, [[AND]], [[INT_ZERO2]] : i16 + // CHECK: [[SELECT_1:%.+]] = arith.select [[CMP_NE]], [[NEGF_ONE]], [[POSF_ONE]] : f16 + // CHECK: 
[[SELECT_2:%.+]] = arith.select [[CMP_EQ]], [[ZERO]], [[SELECT_1]] : f16 + // CHECK: linalg.yield [[SELECT_2]] : f16 + // CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> + } +} + +// ----- +// IE.HSwish + +module @SingleHSwishFloatLayer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x1x1x1000xf16> + } + + func.func @main(%arg0: tensor<1x1x1x1000xf16>) -> tensor<1x1x1x1000xf16> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xf16>) { + %1 = IE.HSwish(%arg1) : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + IE.CGCYield %1 : tensor<1x1x1x1000xf16> + } -> tensor<1x1x1x1000xf16> + return %0 : tensor<1x1x1x1000xf16> + + // CHECK-NOT: IE.HSwish + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: ^bb0([[IN:%.+]]: f16, [[OUT:%.+]]: f16): + // CHECK: [[ZERO:%.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: [[THREE:%.+]] = arith.constant 3.000000e+00 : f16 + // CHECK: [[SIX:%.+]] = arith.constant 6.000000e+00 : f16 + // CHECK: [[DIV_CST:%.+]] = arith.constant 1.666260e-01 : f16 + // CHECK: [[ADD:%.+]] = arith.addf %{{.+}}, [[THREE]] : f16 + // CHECK: [[MAX:%.+]] = arith.maximumf [[ADD]], [[ZERO]] fastmath : f16 + // CHECK: [[MIN:%.+]] = arith.minimumf [[MAX]], [[SIX]] fastmath : f16 + // CHECK: [[DIV:%.+]] = arith.mulf [[MIN]], [[DIV_CST]] : f16 + // CHECK: [[MUL:%.+]] = arith.mulf [[IN]], [[DIV]] : f16 + // CHECK: linalg.yield [[MUL]] : f16 + // CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> + + } +} + +// ----- +// IE.HSigmoid + +module @SingleHSigmoidFloatLayer { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1x1x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x1x1x1000xf16> + } + + func.func 
@main(%arg0: tensor<1x1x1x1000xf16>) -> tensor<1x1x1x1000xf16> { + %0 = IE.CodeGenCapsule inputs(%arg0 as %arg1: tensor<1x1x1x1000xf16>) { + %1 = IE.HSigmoid(%arg1) : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + IE.CGCYield %1 : tensor<1x1x1x1000xf16> + } -> tensor<1x1x1x1000xf16> + return %0 : tensor<1x1x1x1000xf16> + + // CHECK-NOT: IE.HSigmoid + // CHECK: [[LINALG_OP:%.+]] = linalg.generic {indexing_maps = [#NCHW, #NCHW], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins([[ARG0:%.+]] : tensor<1x1x1x1000xf16>) outs([[ARG0]] : tensor<1x1x1x1000xf16>) { + // CHECK: ^bb0([[IN:%.+]]: f16, [[OUT:%.+]]: f16): + // CHECK: [[ZERO:%.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: [[THREE:%.+]] = arith.constant 3.000000e+00 : f16 + // CHECK: [[SIX:%.+]] = arith.constant 6.000000e+00 : f16 + // CHECK: [[DIV_CST:%.+]] = arith.constant 1.666260e-01 : f16 + // CHECK: [[ADD:%.+]] = arith.addf %{{.+}}, [[THREE]] : f16 + // CHECK: [[MAX:%.+]] = arith.maximumf [[ADD]], [[ZERO]] fastmath : f16 + // CHECK: [[MIN:%.+]] = arith.minimumf [[MAX]], [[SIX]] fastmath : f16 + // CHECK: [[MUL:%.+]] = arith.mulf [[MIN]], [[DIV_CST]] : f16 + // CHECK: linalg.yield [[MUL]] : f16 + // CHECK: IE.CGCYield [[LINALG_OP]] : tensor<1x1x1x1000xf16> + + } } diff --git a/tests/lit/NPU/conversion/passes/VPU2VPUIP/bufferize_VPU_ops_to_VPUIP_ops.mlir b/tests/lit/NPU/conversion/passes/VPU2VPUIP/bufferize_VPU_ops_to_VPUIP_ops.mlir index b8985fccce..3245a14d86 100644 --- a/tests/lit/NPU/conversion/passes/VPU2VPUIP/bufferize_VPU_ops_to_VPUIP_ops.mlir +++ b/tests/lit/NPU/conversion/passes/VPU2VPUIP/bufferize_VPU_ops_to_VPUIP_ops.mlir @@ -1407,14 +1407,6 @@ func.func @StridedConcat(%input0: tensor<1x16x16x16xf16>, %input1: tensor<1x16x1 !type_DDR_tensor = tensor<1x32x16x16xf16, {mem_space = @DDR, order = #NHWC}> !type_CMX_tensor = tensor<1x32x16x16xf16, {mem_space = @CMX_NN, order = #NHWC}> -// Copy operation with memref output -// Original operation before lowering: -// func.func 
@CopyOpTensorResult(%input0: !type_DDR_tensor) -> !type_CMX_tensor{ -// %tensor_cmx = IE.Copy(%input0) { out_mem_space = @CMX_NN } : !type_DDR_tensor -> !type_CMX_tensor - -// return %tensor_cmx : !type_CMX_tensor -// } - // CHECK-LABEL: @CopyOpTensorResult // CHECK-SAME: ([[ARG0:%.+]]: memref<1x32x16x16xf16, #NHWC, @DDR>) func.func @CopyOpTensorResult(%input0: !type_DDR_tensor) -> !type_CMX_tensor{ @@ -1479,16 +1471,6 @@ func.func @CopyOpDistributedResult(%input0: !type_DDR_tensor) -> !typeCmxDistrib !type_CMX_tensor = tensor<1x32x16x16xf16, {mem_space = @CMX_NN, order = #NHWC}> !type_CMX_memref = memref<1x32x16x16xf16, #NHWC, @CMX_NN> -// 2 Operations with distributed type passed in between -// Original operation before lowering -// func.func @DistributedCopy2CopyOp(%input0: !type_DDR_tensor) -> !type_DDR_tensor { -// %tensor_distributed_cmx = IE.Copy(%input0) { out_mem_space = @CMX_NN } : !type_DDR_tensor -> !typeCMXDistributed - -// %tensor_ddr = IE.Copy(%tensor_distributed_cmx) { out_mem_space = @DDR } : !typeCMXDistributed -> !type_DDR_tensor - -// return %tensor_ddr : !type_DDR_tensor -// } - // CHECK-LABEL: @DistributedCopy2CopyOp // CHECK-SAME: ([[ARG0:%.+]]: memref<1x32x16x16xf16, #NHWC, @DDR>) func.func @DistributedCopy2CopyOp(%input0: !type_DDR_tensor) -> !type_DDR_tensor { diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/act_shave_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/act_shave_40XX.mlir index d722822a99..deb188c7d0 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/act_shave_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/act_shave_40XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUASM-to-NPUReg40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @SingleHswishFP16 attributes {VPU.arch = #VPU.arch_kind} { +module @SingleHswishFP16 attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 
6.000000e+02 MHz net.NetworkInfo entryPoint : @single_hswish inputsInfo : { @@ -97,7 +97,7 @@ module @SingleHswishFP16 attributes {VPU.arch = #VPU.arch_kind} { // ----- -module @QuadripleHswishFP16 attributes {VPU.arch = #VPU.arch_kind} { +module @QuadripleHswishFP16 attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @single_hswish inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/barriers_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/barriers_40XX.mlir index 151dc71f9e..e41f5c976c 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/barriers_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/barriers_40XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUASM-to-NPUReg40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} { +module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN IE.TileResource 6 of @NCE at 6.000000e+02 MHz diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/compress_weights_dma_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/compress_weights_dma_40XX.mlir index efceb552f5..5f4b0ffe67 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/compress_weights_dma_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/compress_weights_dma_40XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUASM-to-NPUReg40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo 
entryPoint : @dma_compressed_constant inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/nnrt_config_40xx.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/nnrt_config_40xx.mlir index 4a968fb4cb..18bec7ab52 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/nnrt_config_40xx.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/nnrt_config_40xx.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUASM-to-NPUReg40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @Test attributes {VPU.arch = #VPU.arch_kind} { +module @Test attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN IE.TileResource 6 of @NCE at 6.000000e+02 MHz diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/one-dma-convert_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/one-dma-convert_40XX.mlir index 46ac4a8532..eb3d5f126c 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/one-dma-convert_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/one-dma-convert_40XX.mlir @@ -75,3 +75,42 @@ module @OneDMAWithoutAttributes { return } } + +// ----- + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform + +module @OneDMAWithoutAttributes { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input_0" : tensor<1x2x3x4xui8> + } outputsInfo : { + DataInfo "output_0" : tensor<1x3x3x4xui8> + } + VPUASM.IOBindings inputDeclarations : { + VPUASM.DeclareBuffer @input_0_buffDecl !VPUASM.Buffer< "NetworkInput"[0] <0> : memref<1x2x3x4x!qElemType, @DDR> : swizzling(0)> + } outputDeclarations : { + VPUASM.DeclareBuffer @output_0_buffDecl !VPUASM.Buffer< "NetworkOutput"[0] <0> : memref<1x3x3x4x!qElemType1, @DDR> : swizzling(0)> + } profilingBuffDeclarations : { + } + func.func @main() { + ELF.Main @ELFMain { + VPUASM.DeclareBuffer @DeclareBuffer0 !VPUASM.Buffer< 
"NetworkInput"[0] <0> : memref<1x2x3x4x!qElemType, @DDR> : swizzling(0)> + VPUASM.DeclareBuffer @DeclareBuffer1 !VPUASM.Buffer< "NetworkOutput"[0] <0> : memref<1x3x3x4x!qElemType1, @DDR> : swizzling(0)> + ELF.CreateLogicalSection @builtin.tasks.DMA0 aligned(64) secType(SHT_NOBITS) secFlags(SHF_ALLOC) secLocation() { + VPUASM.DeclareTaskBuffer @DeclareTaskBuffer_DMA_0 idx(!VPURegMapped.Index<0:0:0>) + } + ELF.CreateSection @text.nndma0 aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { + VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) taskLocation(@builtin.tasks.DMA0::@DeclareTaskBuffer_DMA_0) input(@DeclareBuffer0) outputs([@DeclareBuffer1]) waits([]) updates([]) start_after(0) clean_after(0) dma_descriptor(#VPUIP.DMADescriptorAttr) acceleration_mode() + // CHECK-NOT: VPUASM.NNDMA + // CHECK: NPUReg40XX.NNDMA + // CHECK: UINT dma_cfg_fields_conversion_cfg = 0 + // CHECK: UINT dma_width_src = 0x18 + // CHECK: UINT dma_width_dst = 0x24 + // CHECK: UINT dma_src = 0 + // CHECK: UINT dma_dst = 0 + } + } + return + } +} diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/spil-dma_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/spil-dma_40XX.mlir index 7ddc97eb7c..d46532f2a5 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/spil-dma_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/spil-dma_40XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind} { +module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/three_dmas_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/three_dmas_40XX.mlir index 2dcb9822e7..33257dd745 100644 --- 
a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/three_dmas_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/three_dmas_40XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @race_condition_dma_f16_f16 inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/two_dma_ports_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/two_dma_ports_40XX.mlir index 29d1868f80..ac6291828e 100644 --- a/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/two_dma_ports_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUASM2NPUReg40XX/two_dma_ports_40XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @race_condition_dma_f16_f16 inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/enqueue_dma_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/enqueue_dma_40XX.mlir new file mode 100644 index 0000000000..f1e4264373 --- /dev/null +++ b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/enqueue_dma_40XX.mlir @@ -0,0 +1,50 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --convert-VPUIP-to-VPUMI40XX %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +module @Module0 { + net.NetworkInfo entryPoint : @EnqueueDma inputsInfo : { + DataInfo "input_0" : tensor<1x16x3x6xf16> + } outputsInfo : { + DataInfo "output_0" : tensor<1x4x6x12xf16> + } + + func.func @EnqueueDma(%arg0: memref<1x16x3x6xf16>, %arg1: memref<1x16x3x6xf16>) -> memref<1x16x3x6xf16> { + %bar0 = VPURT.ConfigureBarrier<0> -> !VPURT.Barrier + %bar1 = VPURT.ConfigureBarrier<1> {isFinalBarrier} -> !VPURT.Barrier + + %dummy_in = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %dummy_out = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + + %dpu_in = VPURT.DeclareBuffer [0] <9216> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> + %dpu_out = VPURT.DeclareBuffer [0] <0> -> memref<1x64x9x8xf16, #NHWC, [@CMX_NN, 0]> + %dpu_par_in = VPURT.DeclareBuffer [0] <9216> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> + %dpu_par_out = VPURT.DeclareBuffer [0] <0> -> memref<1x64x9x8xf16, #NHWC, [@CMX_NN, 0]> + %dpu_wt = VPURT.DeclareBuffer [0] <42000> -> memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]> + + VPURT.Task updates(%bar0 : !VPURT.Barrier) { + %enq_dma = VPUIP.EnqueueDMA {port = 0 : i64} inputs(%dummy_in : memref<0x0x0x0xi32, @DDR>) outputs(%dummy_out : memref<0x0x0x0xi32, @DDR>) enqueue_dma_attr(<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 0 : i64>) -> memref<0x0x0x0xi32, @DDR> + } + + VPURT.Task waits(%bar0 : !VPURT.Barrier) updates(%bar1 : !VPURT.Barrier) { + %dpu = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [2, 2], kernel_strides = [2, 2], task_type = #VPUIP.nce_task_type} input(%dpu_in : memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%dpu_wt : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) parent_input(%dpu_par_in : 
memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%dpu_par_out : memref<1x64x9x8xf16, #NHWC, [@CMX_NN, 0]>) outputs(%dpu_out : memref<1x64x9x8xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x64x9x8xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inStart = [0, 0, 0], inEnd = [15, 15, 15], outEnd = [7, 8, 63], mpe_mode = #VPU.mpe_mode, pad = #VPU.Padding, outStart = [0, 0, 0]} + } PPE : { + PPETask {ppe = #VPU.PPEStub<>} + } + } + + return %arg1 : memref<1x16x3x6xf16> + } +} + +// CHECK: VPUMI40XX.NNDMA +// CHECK-SAME: enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 0 : i64> +// CHECK-SAME: port = 0 : i64 +// CHECK-SAME: !VPURegMapped.Index<0:0:0> diff --git a/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/permute_dma_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/permute_dma_40XX.mlir index 527844363d..dfe1eeb4f7 100644 --- a/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/permute_dma_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/permute_dma_40XX.mlir @@ -6,143 +6,292 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --convert-VPUIP-to-VPUMI40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX +// Based on PermuteDMAWithNHWCToNCHW from VPUIP PermuteDMA unrolling + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + module @permuteDMA { net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "input_0" : tensor<16x256xf16> + DataInfo "input_0" : tensor<1xf16> // Dummy value } outputsInfo : { - DataInfo "output_0" : tensor<16x256xf16> + DataInfo "output_0" : tensor<1xf16> // Dummy value } - func.func @main(%arg0: memref<16x256xf16, @DDR>, %arg1: memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> { - %0 = VPURT.DeclareBuffer [0] <0> -> memref<16x256xf16, @DDR> - %1 = VPURT.DeclareBuffer [0] <0> -> memref<256x16xf16, @DDR> - VPURT.Task attributes 
{isTrailingSWLayer = false} { - %3 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> - } - - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL0:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:0> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> - } - - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL1:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) previousDMA(%[[VAL0]] : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:1> - VPURT.Task attributes {isTrailingSWLayer = false} { - %5 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> + // Func simply returns arg1 without copying any PermuteDMA results to it beforehand + func.func @main(%arg0: memref<1xf16, @DDR>, %arg1: memref<1xf16, @DDR>) -> memref<1xf16, @DDR> { + %0 = VPURT.DeclareBuffer [0] <0> -> memref<1x8x8x16xf16, {order = #NHWC, strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]> + %1 = VPURT.DeclareBuffer [0] <2048> -> memref<1x8x8x16xf16, {order = #NHWC, strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]> + + %2 = VPURT.DeclareBuffer [0] <4096> -> memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]> + %3 = VPURT.DeclareBuffer [0] <4352> -> memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]> + + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x8x8x16xf16, {order = #NHWC, 
strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]>, + outputType = memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]>, + mappingOrder = #NCHW, loopOrder = #NHWC + >, + port = 0 : i64 + } + inputs(%0 : memref<1x8x8x16xf16, {order = #NHWC, strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]>) + outputs(%2 : memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]>) + -> memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]> } - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL2:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) previousDMA(%[[VAL1]] : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:2> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %6 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x8x8x16xf16, {order = #NHWC, strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]>, + outputType = memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]>, + mappingOrder = #NCHW, loopOrder = #NHWC + >, + port = 1 : i64 + } + inputs(%1 : memref<1x8x8x16xf16, {order = #NHWC, strides = [2048, 1, 128, 8]}, [@CMX_NN, 0]>) + outputs(%3 : memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]>) + -> memref<1x8x8x16xf16, {order = #NCHW, strides = [2048, 256, 16, 1]}, [@CMX_NN, 0]> } - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL3:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) 
previousDMA(%[[VAL2]] : !VPURegMapped.Index<0:0:2>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:3> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %7 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> - } - - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL4:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) previousDMA(%[[VAL3]] : !VPURegMapped.Index<0:0:3>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:4> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %8 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> - } - - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL5:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) previousDMA(%[[VAL4]] : !VPURegMapped.Index<0:0:4>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:5> - - return %arg1 : memref<16x256xf16, @DDR> + return %arg1 : memref<1xf16, @DDR> + + // CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + // CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <2048> -> [[INPUT_TYPE_1:.+]] + + // CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+]] + // CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4352> -> [[OUTPUT_TYPE_1:.+]] + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_0:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 0 + // CHECK-SAME: inputs([[INPUT_BUFFER_0]] : [[INPUT_TYPE_0]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_0]] : [[OUTPUT_TYPE_0]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: 
dma_transaction + // CHECK-SAME: #VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_0]] + // CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + // CHECK-SAME: mappingOrder = #NCHW + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<0:1:0> + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_1:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 1 + // CHECK-SAME: inputs([[INPUT_BUFFER_1]] : [[INPUT_TYPE_1]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_1]] : [[OUTPUT_TYPE_1]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: dma_transaction + // CHECK-SAME: #VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_1]] + // CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + // CHECK-SAME: mappingOrder = #NCHW + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<1:1:0> } } // ----- +// Based on PermuteDMAFromTranspose from VPUIP PermuteDMA unrolling + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> + module @permuteDMA { net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "input_0" : tensor<16x256xf16> + DataInfo "input_0" : tensor<1xf16> // Dummy value } outputsInfo : { - DataInfo "output_0" : tensor<16x256xf16> + DataInfo "output_0" : tensor<1xf16> // Dummy value } - func.func @main(%arg0: memref<16x256xf16, @DDR>, %arg1: memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> { - %0 = VPURT.DeclareBuffer [0] <0> -> memref<16x256xf16, @DDR> - %1 = VPURT.DeclareBuffer [0] <0> -> memref<256x16xf16, @DDR> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %2 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> - } - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL0:.*]] = 
VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:0> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %3 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> - } - - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL1:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) previousDMA(%[[VAL0]] : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:1> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> - } - - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL2:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) previousDMA(%[[VAL1]] : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:2> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %5 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) -> memref<16x256xf16, @DDR> + // Func simply returns arg1 without copying any PermuteDMA results to it beforehand + func.func @main(%arg0: memref<1xf16, @DDR>, %arg1: memref<1xf16, @DDR>) -> memref<1xf16, @DDR> { + %0 = VPURT.DeclareBuffer [0] <0> -> memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 1, 256, 8]}, [@CMX_NN, 0]> + %1 = VPURT.DeclareBuffer [0] <256> -> memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 
1, 256, 8]}, [@CMX_NN, 0]> + + %2 = VPURT.DeclareBuffer [0] <4096> -> memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]> + %3 = VPURT.DeclareBuffer [0] <4128> -> memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]> + + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 1, 256, 8]}, [@CMX_NN, 0]>, + outputType = memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]>, + mappingOrder = #NWHC, + loopOrder = #NHWC + >, + port = 0 : i64 + } + inputs(%0 : memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 1, 256, 8]}, [@CMX_NN, 0]>) + outputs(%2 : memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]>) + -> memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]> } - // CHECK-NOT: VPUIP.NNDMA - // CHECK: %[[VAL3:.*]] = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%0 : memref<16x256xf16, @DDR>) previousDMA(%[[VAL2]] : !VPURegMapped.Index<0:0:2>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:3> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %6 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 1, 256, 8]}, [@CMX_NN, 0]>, + outputType = memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]>, + mappingOrder = #NWHC, + loopOrder = #NHWC + >, + port = 1 : i64 + } + inputs(%1 : memref<1x8x1x16xf16, {order = #NHWC, strides = [256, 1, 256, 8]}, [@CMX_NN, 0]>) + outputs(%3 : memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]>) 
+ -> memref<1x16x1x8xf16, {order = #NHWC, strides = [256, 1, 256, 32]}, [@CMX_NN, 0]> } - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL4:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) previousDMA(%[[VAL3]] : !VPURegMapped.Index<0:0:3>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:4> - - VPURT.Task attributes {isTrailingSWLayer = false} { - %7 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) -> memref<256x16xf16, @DDR> - } - - // CHECK-NOT: VPUIP.PermuteDMA - // CHECK: %[[VAL5:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%0 : memref<16x256xf16, @DDR>) outputs(%1 : memref<256x16xf16, @DDR>) previousDMA(%[[VAL4]] : !VPURegMapped.Index<0:0:4>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:5> - - return %arg1 : memref<16x256xf16, @DDR> + return %arg1 : memref<1xf16, @DDR> + + // CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + // CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <256> -> [[INPUT_TYPE_1:.+]] + + // CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+]] + // CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4128> -> [[OUTPUT_TYPE_1:.+]] + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_0:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 0 + // CHECK-SAME: inputs([[INPUT_BUFFER_0]] : [[INPUT_TYPE_0]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_0]] : [[OUTPUT_TYPE_0]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: dma_transaction + // CHECK-SAME: 
#VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_0]] + // CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + // CHECK-SAME: mappingOrder = #NWHC + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<0:1:0> + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_1:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 1 + // CHECK-SAME: inputs([[INPUT_BUFFER_1]] : [[INPUT_TYPE_1]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_1]] : [[OUTPUT_TYPE_1]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: dma_transaction + // CHECK-SAME: #VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_1]] + // CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + // CHECK-SAME: mappingOrder = #NWHC + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<1:1:0> } } // ----- -#NC = affine_map<(d0, d1) -> (d0, d1)> +// Based on ClusterPermuteDMAWithDistributedInputAndOutput from VPUIP PermuteDMA unrolling + +!qElemType = !quant.uniform +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + module @permuteDMA { -net.NetworkInfo entryPoint : @UnrollDistributedPermuteDMAOutput inputsInfo : { - DataInfo "input_0" : tensor<1x16x16x16xf16> +net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input_0" : tensor<1xf16> // Dummy value } outputsInfo : { - DataInfo "output_0" : tensor<64x32x1x1xf16> + DataInfo "output_0" : tensor<1xf16> // Dummy value } -func.func @UnrollDistributedPermuteDMAOutput(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<64x32x1x1xf16, @DDR>) -> memref<64x32x1x1xf16, @DDR> { - %cst = const.Declare memref<16x256xf16, #NC, @DDR> = dense<1.000000e+00> : tensor<16x256xf16>, [#const.Reorder<#NC>] - // CHECK-DAG: %[[CST:.*]] = const.Declare memref<16x256xf16, @DDR> - - %3 = VPURT.DeclareBuffer [0, 1] <0> -> 
!VPUIP.DistributedBuffer<16x256xf16, {order = #NC, strides = [256, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments}> - // CHECK-NOT: VPURT.DeclareBuffer [0, 1] <0> -> !VPUIP.DistributedBuffer<16x256xf16, {order = #NC, strides = [256, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments}> +func.func @main(%arg0: memref<1xf16, @DDR>, %arg1: memref<1xf16, @DDR>) -> memref<1xf16, @DDR> { + %0 = VPURT.DeclareBuffer [0] <0> -> memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]> + %1 = VPURT.DeclareBuffer [0] <128> -> memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]> + + %2 = VPURT.DeclareBuffer [0, 1] <2000> -> !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %3 = VPURT.DeclareBuffer [0, 1] <2032> -> !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]>, + outputType = !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, + mappingOrder = #NCHW, loopOrder = #NHWC + >, + port = 0 : i64 + } + inputs(%0 : memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]>) + outputs(%2 : !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + -> !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + } + VPURT.Task { + %4 = VPUIP.PermuteDMA { + internalDataFlow = #VPUIP.InternalDataFlowAttr< + inputType = memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]>, + outputType = !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, 
#NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, + mappingOrder = #NCHW, + loopOrder = #NHWC + >, + port = 1 : i64 + } + inputs(%1 : memref<1x4x4x8x!qElemType, {order = #NHWC, strides = [256, 1, 32, 4]}, [@CMX_NN, 0]>) + outputs(%3 : !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + -> !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + } - VPURT.Task attributes {isTrailingSWLayer = false} { - %4 = VPUIP.PermuteDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%cst : memref<16x256xf16, #NC, @DDR>) outputs(%3 : !VPUIP.DistributedBuffer<16x256xf16, {order = #NC, strides = [256, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments}>) -> !VPUIP.DistributedBuffer<16x256xf16, {order = #NC, strides = [256, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments}> + return %arg1: memref<1xf16, @DDR> + + // CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + // CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <128> -> [[INPUT_TYPE_1:.+]] + + // CHECK: [[OUTPUT_BUFFER_0_0:%.+]] = VPURT.DeclareBuffer [0] <2000> -> [[OUTPUT_TYPE_0_0:.+]] + // CHECK: [[OUTPUT_BUFFER_0_1:%.+]] = VPURT.DeclareBuffer [1] <2000> -> [[OUTPUT_TYPE_0_1:.+]] + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_0:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 0 + // CHECK-SAME: inputs([[INPUT_BUFFER_0]] : [[INPUT_TYPE_0]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_0_0]], [[OUTPUT_BUFFER_0_1]] : [[OUTPUT_TYPE_0_0]], [[OUTPUT_TYPE_0_1]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: dma_transaction + // CHECK-SAME: #VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_0]] + + // PermuteDMATransaction type does not 
get updated here + // CHECK-SAME: outputType = !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + + // CHECK-SAME: mappingOrder = #NCHW + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<0:1:0> + + // CHECK: [[OUTPUT_BUFFER_1_0:%.+]] = VPURT.DeclareBuffer [0] <2032> -> [[OUTPUT_TYPE_1_0:.+]] + // CHECK: [[OUTPUT_BUFFER_1_1:%.+]] = VPURT.DeclareBuffer [1] <2032> -> [[OUTPUT_TYPE_1_1:.+]] + + // CHECK-NOT: VPUIP.NNDMA + // CHECK: [[DMA_1:%.+]] = VPUMI40XX.NNDMA + // CHECK-SAME: allow_different_in_out_shapes + // CHECK-SAME: port = 1 + // CHECK-SAME: inputs([[INPUT_BUFFER_1]] : [[INPUT_TYPE_1]]) + // CHECK-SAME: outputs([[OUTPUT_BUFFER_1_0]], [[OUTPUT_BUFFER_1_1]] : [[OUTPUT_TYPE_1_0]], [[OUTPUT_TYPE_1_1]]) + // CHECK-SAME: start_after(0) + // CHECK-SAME: clean_after(0) + // CHECK-SAME: acceleration_mode() + // CHECK-SAME: dma_transaction + // CHECK-SAME: #VPUMI40XX.PermuteDMATransaction + // CHECK-SAME: inputType = [[INPUT_TYPE_1]] + + // PermuteDMATransaction type does not get updated here + // CHECK-SAME: outputType = !VPUIP.DistributedBuffer<1x4x4x8x!qElemType, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + + // CHECK-SAME: mappingOrder = #NCHW + // CHECK-SAME: loopOrder = #NHWC + // CHECK-SAME: -> !VPURegMapped.Index<1:1:0> } - // CHECK: %[[BUFF_TILE_0:.*]] = VPURT.DeclareBuffer [0] <0> -> memref<16x256xf16, [@CMX_NN, 0]> - // CHECK: %[[BUFF_TILE_1:.*]] = VPURT.DeclareBuffer [1] <0> -> memref<16x256xf16, [@CMX_NN, 1]> - // CHECK-NOT: VPURT.Task - // CHECK: %[[DMA0:.*]] = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64} inputs(%[[CST]] : memref<16x256xf16, @DDR>) outputs(%[[BUFF_TILE_0]], %[[BUFF_TILE_1]] : memref<16x256xf16, [@CMX_NN, 0]>, memref<16x256xf16, [@CMX_NN, 1]>) start_after(0) clean_after(0) acceleration_mode(){{.*}}-> !VPURegMapped.Index<0:0:0> - - return %arg1 : memref<64x32x1x1xf16, @DDR> -} } 
diff --git a/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/sections_and_padding_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/sections_and_padding_40XX.mlir index 338c0499a5..cd7be4e7a7 100644 --- a/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/sections_and_padding_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUIP2VPUMI40XX/sections_and_padding_40XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX // this test can only be (correctly) run manually until E#48620 is solved -module @Test attributes {VPU.arch = #VPU.arch_kind} { +module @Test attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/dummy_dinamic_shape_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/dummy_dinamic_shape_37XX.mlir index c59f5af577..6772b1704a 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/dummy_dinamic_shape_37XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/dummy_dinamic_shape_37XX.mlir @@ -6,16 +6,16 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI37XX-to-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - 
IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @SWKernelDynamicInputs inputsInfo : { DataInfo "input_bound" : tensor<1x3x10x10xf16> DataInfo "input_shape" : tensor<4xsi32> diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/sigmoid_dual_tile_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/sigmoid_dual_tile_37XX.mlir index 182be30e00..cda158138c 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/sigmoid_dual_tile_37XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/sigmoid_dual_tile_37XX.mlir @@ -7,11 +7,11 @@ // REQUIRES: arch-NPU37XX // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} IE.ExecutorResource 1 of @DMA_NN IE.TileResource 2 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_hswish_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_hswish_37XX.mlir index b8f1303176..605cc3ba64 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_hswish_37XX.mlir +++ 
b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_hswish_37XX.mlir @@ -7,11 +7,11 @@ // REQUIRES: arch-NPU37XX // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_relu_dynamic_shape_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_relu_dynamic_shape_37XX.mlir index b0f19d3382..b2fe5eef36 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_relu_dynamic_shape_37XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_relu_dynamic_shape_37XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI37XX-to-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @SimpleActivation attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @SimpleActivation attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_ReLU(memref<2x4x20x20xf16, [@CMX_NN, 0]>, memref<2x4x20x20xf16, 
[@CMX_NN, 0]>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu"} diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_softmax_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_softmax_37XX.mlir index 9caf8e9e41..661ce22f78 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_softmax_37XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/single_softmax_37XX.mlir @@ -6,11 +6,11 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI37XX-to-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/triple_hswish_37XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/triple_hswish_37XX.mlir index b827383515..152d0386d0 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/triple_hswish_37XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI37XX2ELFNPU37XX/act_shave/triple_hswish_37XX.mlir @@ -6,11 +6,11 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" 
--convert-VPUMI37XX-to-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX+.mlir index 92dfb3646f..51aaf866d0 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX+.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s // REQUIRES: arch-NPU40XX -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_0d_to_0d inputsInfo : { @@ -39,7 +39,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz 
net.NetworkInfo entryPoint : @nndma_1d_to_1d inputsInfo : { @@ -72,7 +72,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_1d_to_3d inputsInfo : { @@ -105,7 +105,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_2d_to_3d inputsInfo : { @@ -138,7 +138,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_2d_to_3d_with_single_shape inputsInfo : { @@ -171,7 +171,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_3d_to_2d inputsInfo : { @@ -204,7 +204,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_3d_to_2d_with_single_shape inputsInfo : { @@ -237,7 +237,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = 
affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_3d_to_1d inputsInfo : { @@ -272,7 +272,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz !qElemType = !quant.uniform -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_2d_to_3d_input_stride_on_the_highest_dim inputsInfo : { @@ -308,7 +308,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz !qElemType = !quant.uniform -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_3d_to_2d_output_stride_on_the_highest_dim inputsInfo : { @@ -344,7 +344,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_nf4 inputsInfo : { @@ -377,7 +377,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @dma_writing_to_register inputsInfo : { @@ -405,9 +405,9 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz } // CHECK: ELF.CreateSection @buffer.Constant.0.constant aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { - // CHECK: VPUASM.ConstBuffer 
@Declare0 !VPUASM.Buffer< "Constant"[0] <0> : memref<1024xui8> : swizzling(0)> = dense<1> : tensor<1024xui8> + // CHECK: VPUASM.ConstBuffer @Declare_0 !VPUASM.Buffer< "Constant"[0] <0> : memref<1024xui8> : swizzling(0)> = dense<1> : tensor<1024xui8> // CHECK: ELF.CreateLogicalSection @reg.Register.0 aligned(64) secType(SHT_NOBITS) secFlags("SHF_NONE") secLocation() { - // CHECK: VPUASM.DeclareBuffer @DeclareBuffer0 !VPUASM.Buffer< "Register"[0] <788594688> : memref<1024xui8, @Register> : swizzling(0)> + // CHECK: VPUASM.DeclareBuffer @DeclareBuffer_0 !VPUASM.Buffer< "Register"[0] <788594688> : memref<1024xui8, @Register> : swizzling(0)> // CHECK: ELF.CreateLogicalSection @program.metadata.cmx aligned(64) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() { // CHECK: VPUASM.DeclareTaskBuffer @DeclareTaskBuffer_DMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX.mlir index 48894adb1b..fced702744 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_40XX.mlir @@ -8,7 +8,7 @@ #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @nndma_4d_to_4d_with_single_shape inputsInfo : { @@ -46,7 +46,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @ConvertDMAWithF32ToF16 
inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_BDCAST_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_BDCAST_40XX+.mlir index 7a10a159b5..308a189b98 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_BDCAST_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_BDCAST_40XX+.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @dma_broadcast inputsInfo : { @@ -51,7 +51,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz } -//CHECK: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_0_0) input(@io.NetworkInput.0::@DeclareBuffer0) outputs([@buffer.CMX_NN.0::@DeclareBuffer4, @buffer.CMX_NN.2::@DeclareBuffer5, @buffer.CMX_NN.4::@DeclareBuffer6]) waits([]) updates([]) start_after(1) clean_after(0) dma_descriptor() acceleration_mode() tile_indexes([0, 2, 4]) -//CHECK: VPUASM.NNDMA @NNDMA_0_1_0 idx(!VPURegMapped.Index<0:1:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_0) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) input(@buffer.CMX_NN.0::@DeclareBuffer7) outputs([@io.NetworkOutput.0::@DeclareBuffer1]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() -//CHECK: VPUASM.NNDMA @NNDMA_0_1_1 idx(!VPURegMapped.Index<0:1:1>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.2::@DeclareBuffer8) outputs([@io.NetworkOutput.1::@DeclareBuffer2]) waits([]) updates([]) start_after(1) clean_after(1) 
dma_descriptor() acceleration_mode() -//CHECK: VPUASM.NNDMA @NNDMA_0_1_2 idx(!VPURegMapped.Index<0:1:2>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.4::@DeclareBuffer9) outputs([@io.NetworkOutput.2::@DeclareBuffer3]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() +//CHECK: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_0_0) input(@io.NetworkInput.0::@DeclareBuffer_0) outputs([@buffer.CMX_NN.0::@DeclareBuffer_4, @buffer.CMX_NN.2::@DeclareBuffer_5, @buffer.CMX_NN.4::@DeclareBuffer_6]) waits([]) updates([]) start_after(1) clean_after(0) dma_descriptor() acceleration_mode() tile_indexes([0, 2, 4]) +//CHECK: VPUASM.NNDMA @NNDMA_0_1_0 idx(!VPURegMapped.Index<0:1:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_0) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) input(@buffer.CMX_NN.0::@DeclareBuffer_7) outputs([@io.NetworkOutput.0::@DeclareBuffer_1]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() +//CHECK: VPUASM.NNDMA @NNDMA_0_1_1 idx(!VPURegMapped.Index<0:1:1>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.2::@DeclareBuffer_8) outputs([@io.NetworkOutput.1::@DeclareBuffer_2]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() +//CHECK: VPUASM.NNDMA @NNDMA_0_1_2 idx(!VPURegMapped.Index<0:1:2>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.4::@DeclareBuffer_9) outputs([@io.NetworkOutput.2::@DeclareBuffer_3]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_Indices_40XX+.mlir 
b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_Indices_40XX+.mlir index d176d3701d..8bf2146fdd 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_Indices_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_DMA_Indices_40XX+.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @dma_broadcast inputsInfo : { @@ -52,7 +52,7 @@ IE.TileResource 1 of @NCE at 6.000000e+02 MHz } -//CHECK: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_0_0) input(@io.NetworkInput.0::@DeclareBuffer0) outputs([@buffer.CMX_NN.0::@DeclareBuffer4, @buffer.CMX_NN.2::@DeclareBuffer5, @buffer.CMX_NN.4::@DeclareBuffer6]) waits([]) updates([]) start_after(1) clean_after(0) dma_descriptor() acceleration_mode() tile_indexes([0, 2, 4]) -//CHECK: VPUASM.NNDMA @NNDMA_0_1_0 idx(!VPURegMapped.Index<0:1:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_0) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) input(@buffer.CMX_NN.0::@DeclareBuffer7) outputs([@io.NetworkOutput.0::@DeclareBuffer1]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() indices(@buffer.CMX_NN.4::@DeclareBuffer10) -//CHECK: VPUASM.NNDMA @NNDMA_0_1_1 idx(!VPURegMapped.Index<0:1:1>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.2::@DeclareBuffer8) outputs([@io.NetworkOutput.1::@DeclareBuffer2]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() -//CHECK: VPUASM.NNDMA @NNDMA_0_1_2 
idx(!VPURegMapped.Index<0:1:2>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.4::@DeclareBuffer9) outputs([@io.NetworkOutput.2::@DeclareBuffer3]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() +//CHECK: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_0_0) input(@io.NetworkInput.0::@DeclareBuffer_0) outputs([@buffer.CMX_NN.0::@DeclareBuffer_4, @buffer.CMX_NN.2::@DeclareBuffer_5, @buffer.CMX_NN.4::@DeclareBuffer_6]) waits([]) updates([]) start_after(1) clean_after(0) dma_descriptor() acceleration_mode() tile_indexes([0, 2, 4]) +//CHECK: VPUASM.NNDMA @NNDMA_0_1_0 idx(!VPURegMapped.Index<0:1:0>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_0) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) input(@buffer.CMX_NN.0::@DeclareBuffer_7) outputs([@io.NetworkOutput.0::@DeclareBuffer_1]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() indices(@buffer.CMX_NN.4::@DeclareBuffer_10) +//CHECK: VPUASM.NNDMA @NNDMA_0_1_1 idx(!VPURegMapped.Index<0:1:1>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_1) links(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.2::@DeclareBuffer_8) outputs([@io.NetworkOutput.1::@DeclareBuffer_2]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() +//CHECK: VPUASM.NNDMA @NNDMA_0_1_2 idx(!VPURegMapped.Index<0:1:2>) taskLocation(@program.metadata.cmx::@DeclareTaskBuffer_DMA_0_1_2) input(@buffer.CMX_NN.4::@DeclareBuffer_9) outputs([@io.NetworkOutput.2::@DeclareBuffer_3]) waits([]) updates([]) start_after(1) clean_after(1) dma_descriptor() acceleration_mode() diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_40XX+.mlir 
b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_40XX+.mlir deleted file mode 100644 index fed68852df..0000000000 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_40XX+.mlir +++ /dev/null @@ -1,145 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s -// REQUIRES: arch-NPU40XX - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @oneDma inputsInfo : { - DataInfo "input" : tensor<1x2x3x4xf16> - } outputsInfo : { - DataInfo "output" : tensor<1x2x3x4xf16> - } - func.func @oneDma() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %1 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, {order = #NHWC}, @DDR> - %2 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, {order = #NHWC}, @DDR> - %3 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%1 : memref<1x2x3x4xf16, {order = #NHWC}, @DDR>) outputs(%2 : memref<1x2x3x4xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - VPUMI40XX.MappedInference dmas((%3) : (!VPURegMapped.Index<0:0:0>)) dmaCount([[1, 0]]) invariantCount([0]) variantCount([0]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(0) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -//CHECK: ELF.Main 
@ELFMain - -//CHECK-DAG: ELF.CreateLogicalSection [[MetadataTaskSec:@.*]] aligned(64) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() -//CHECK-NEXT: VPUASM.DeclareTaskBuffer {{.*}} idx(!VPURegMapped.Index<0:0:0>) - -//CHECK-DAG: ELF.CreateLogicalSection [[NetworkInput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USERINPUT") secLocation() -//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkInput"[0] - -//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() -//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[0] - -//CHECK-DAG: ELF.CreateSection [[DMA0SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { -//CHECK-NEXT: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) - -//CHECK-DAG: ELF.CreateSection [[MappedInferenceSection:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() -//CHECK-NEXT: VPUASM.MappedInference - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @twoDma inputsInfo : { - DataInfo "input_0" : tensor<1x16x16x16xf16> - } outputsInfo : { - DataInfo "output_0" : tensor<1x16x16x16xf16> - DataInfo "output_1" : tensor<1x16x16x16xf16> - } - func.func @twoDma() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> - %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> - %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:0> - %4 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:1> - %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:2> - %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> 
memref<1x16x16x16xf16, {order = #NHWC}, @DDR> - %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, {order = #NHWC}, @DDR> - %8 = VPURT.DeclareBuffer [1] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, {order = #NHWC}, @DDR> - %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> - %10 = VPURT.DeclareBuffer [1] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]> - %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> - %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> - %13 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - %14 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%1 : !VPURegMapped.Index<0:0:1>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> - %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:2>) inputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> - %16 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:1:0>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:0> - %17 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:1:1>) inputs(%6 : 
memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:1> - %18 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:1:2>) inputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) outputs(%8 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:2> - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - VPUMI40XX.MappedInference dmas((%13, %16) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:1:0>)) barriers(%11: !VPURegMapped.Index<0:0:0>) dmaCount([[3, 3]]) invariantCount([0]) variantCount([0]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>)-> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -//CHECK: ELF.Main @ELFMain { - -//CHECK-DAG: ELF.CreateLogicalSection [[MetadataSec:@.*]] aligned(64) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF00:@.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF01:@.*]] idx(!VPURegMapped.Index<0:0:1>) -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF02:@.*]] idx(!VPURegMapped.Index<0:0:2>) -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF10:@.*]] idx(!VPURegMapped.Index<0:1:0>) -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF11:@.*]] idx(!VPURegMapped.Index<0:1:1>) -//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF12:@.*]] idx(!VPURegMapped.Index<0:1:2>) - -//CHECK-DAG: ELF.CreateLogicalSection [[NetworkInput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USERINPUT") secLocation() -//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkInput"[0] 
- -//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput0:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() -//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[0] - -//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput1:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() -//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[1] - -//CHECK-DAG: ELF.CreateLogicalSection [[NNCMX0:@.*]] aligned(64) secType(VPU_SHT_CMX_WORKSPACE) secFlags("SHF_NONE") secLocation() { -//CHECK-NEXT: VPUASM.DeclareBuffer [[BUFF0:@.*]] !VPUASM.Buffer< "CMX_NN"[0] - -//CHECK-DAG: ELF.CreateLogicalSection [[NNCMX1:@.*]] aligned(64) secType(VPU_SHT_CMX_WORKSPACE) secFlags("SHF_NONE") secLocation() { -//CHECK-NEXT: VPUASM.DeclareBuffer [[BUFF1:@.*]] !VPUASM.Buffer< "CMX_NN"[1] - -//CHECK-DAG: ELF.CreateSection [[BARRSEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { -//CHECK-NEXT: VPUASM.ConfigureBarrier [[BARR0:@.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK-NEXT: VPUASM.ConfigureBarrier [[BARR1:@.*]] idx(!VPURegMapped.Index<0:0:1>) - -//CHECK-DAG: ELF.CreateSection [[DMA0SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() -//CHECK-NEXT: VPUASM.NNDMA [[DMA00:@.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation([[MetadataSec]]::[[DMATASKBUFF00]]) - //CHECK-SAME: outputs([ - //CHECK-SAME: [[NNCMX0]]::[[BUFF0]]]) - -//CHECK-NEXT: VPUASM.NNDMA [[DMA01:@.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation([[MetadataSec]]::[[DMATASKBUFF01]]) - //CHECK-SAME: outputs([ - //CHECK-SAME: [[NNCMX0]]::[[BUFF0]]]) - -//CHECK-NEXT: VPUASM.NNDMA [[DMA02:@.*]] idx(!VPURegMapped.Index<0:0:2>) taskLocation([[MetadataSec]]::[[DMATASKBUFF02]]) - //CHECK-SAME: input([[NNCMX0]]::[[BUFF0]]) - -//CHECK-DAG: ELF.CreateSection [[DMA1SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() -//CHECK-NEXT: 
VPUASM.NNDMA [[DMA10:@.*]] idx(!VPURegMapped.Index<0:1:0>) taskLocation([[MetadataSec]]::[[DMATASKBUFF10]]) - //CHECK-SAME: outputs([ - //CHECK-SAME: [[NNCMX1]]::[[BUFF1]]]) - -//CHECK-NEXT: VPUASM.NNDMA [[DMA11:@.*]] idx(!VPURegMapped.Index<0:1:1>) taskLocation([[MetadataSec]]::[[DMATASKBUFF11]]) - //CHECK-SAME: outputs([ - //CHECK-SAME: [[NNCMX1]]::[[BUFF1]]]) - -//CHECK-NEXT: VPUASM.NNDMA [[DMA12:@.*]] idx(!VPURegMapped.Index<0:1:2>) taskLocation([[MetadataSec]]::[[DMATASKBUFF12]]) - //CHECK-SAME: input([[NNCMX1]]::[[BUFF1]]) - -//CHECK-DAG: ELF.CreateSection [[MappedInferenceSection:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { -//CHECK-NEXT: VPUASM.MappedInference @MappedInference - //CHECK-SAME: dmas([ - //CHECK-SAME: [ - //CHECK-SAME: [[DMA0SEC]]::[[DMA00]], [[DMA1SEC]]::[[DMA10]]]]) - //CHECK-SAME: barriers([[BARRSEC]]::[[BARR0]]) diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_nowlm_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_nowlm_40XX+.mlir new file mode 100644 index 0000000000..89b0a46946 --- /dev/null +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_move_ops_to_sections_nowlm_40XX+.mlir @@ -0,0 +1,137 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% workload-management-enable=false" --convert-VPUMI40XX-to-VPUASM="workload-management-enable=false" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +net.NetworkInfo entryPoint : @oneDma inputsInfo : { + DataInfo "input" : tensor<1x2x3x4xf16> +} outputsInfo : { + DataInfo "output" : tensor<1x2x3x4xf16> +} +func.func @oneDma() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %1 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, {order = #NHWC}, @DDR> + %2 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, {order = #NHWC}, @DDR> + %3 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%1 : memref<1x2x3x4xf16, {order = #NHWC}, @DDR>) outputs(%2 : memref<1x2x3x4xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + VPUMI40XX.MappedInference dmas((%3) : (!VPURegMapped.Index<0:0:0>)) dmaCount([[1, 0]]) invariantCount([0]) variantCount([0]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(0) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +//CHECK: ELF.Main @ELFMain + +//CHECK-DAG: ELF.CreateLogicalSection [[MetadataTaskSec:@.*]] aligned(64) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() +//CHECK-NEXT: VPUASM.DeclareTaskBuffer {{.*}} idx(!VPURegMapped.Index<0:0:0>) + +//CHECK-DAG: ELF.CreateLogicalSection [[NetworkInput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USERINPUT") secLocation() +//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} 
!VPUASM.Buffer< "NetworkInput"[0] + +//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() +//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[0] + +//CHECK-DAG: ELF.CreateSection [[DMA0SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { +//CHECK-NEXT: VPUASM.NNDMA @NNDMA_0_0_0 idx(!VPURegMapped.Index<0:0:0>) + +//CHECK-DAG: ELF.CreateSection [[MappedInferenceSection:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() +//CHECK-NEXT: VPUASM.MappedInference + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +net.NetworkInfo entryPoint : @twoDma inputsInfo : { + DataInfo "input_0" : tensor<1x16x16x16xf16> +} outputsInfo : { + DataInfo "output_0" : tensor<1x16x16x16xf16> + DataInfo "output_1" : tensor<1x16x16x16xf16> +} +func.func @twoDma() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:0> + %4 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:1> + %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:1:2> + %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, {order = #NHWC}, @DDR> + %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, {order = #NHWC}, @DDR> + %8 = VPURT.DeclareBuffer [1] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, {order = #NHWC}, @DDR> + %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPURT.DeclareBuffer [1] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> + %12 = VPUMI40XX.ConfigureBarrier 
{consumer_count = 2 : ui8, producer_count = 2 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> + %13 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %14 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%1 : !VPURegMapped.Index<0:0:1>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:2>) inputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + %16 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:1:0>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:0> + %17 = VPUMI40XX.NNDMA {HardLinkedAttrName, port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:1:1>) inputs(%6 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:1> + %18 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:1:2>) inputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) outputs(%8 : memref<1x16x16x16xf16, {order = #NHWC}, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:2> + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + VPUMI40XX.MappedInference 
dmas((%13, %16) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:1:0>)) barriers(%11: !VPURegMapped.Index<0:0:0>) dmaCount([[3, 3]]) invariantCount([0]) variantCount([0]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>)-> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +//CHECK: ELF.Main @ELFMain { + +//CHECK-DAG: ELF.CreateLogicalSection [[MetadataSec:@.*]] aligned(64) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF00:@.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF01:@.*]] idx(!VPURegMapped.Index<0:0:1>) +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF02:@.*]] idx(!VPURegMapped.Index<0:0:2>) +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF10:@.*]] idx(!VPURegMapped.Index<0:1:0>) +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF11:@.*]] idx(!VPURegMapped.Index<0:1:1>) +//CHECK-NEXT: VPUASM.DeclareTaskBuffer [[DMATASKBUFF12:@.*]] idx(!VPURegMapped.Index<0:1:2>) + +//CHECK-DAG: ELF.CreateLogicalSection [[NetworkInput:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USERINPUT") secLocation() +//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkInput"[0] + +//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput0:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() +//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[0] + +//CHECK-DAG: ELF.CreateLogicalSection [[NetworkOutput1:@.*]] aligned(64) secType(SHT_NOBITS) secFlags("SHF_WRITE|SHF_ALLOC|VPU_SHF_USEROUTPUT") secLocation() +//CHECK-NEXT: VPUASM.DeclareBuffer {{.*}} !VPUASM.Buffer< "NetworkOutput"[1] + +//CHECK-DAG: ELF.CreateLogicalSection [[NNCMX0:@.*]] aligned(64) secType(VPU_SHT_CMX_WORKSPACE) 
secFlags("SHF_NONE") secLocation() { +//CHECK-NEXT: VPUASM.DeclareBuffer [[BUFF0:@.*]] !VPUASM.Buffer< "CMX_NN"[0] + +//CHECK-DAG: ELF.CreateLogicalSection [[NNCMX1:@.*]] aligned(64) secType(VPU_SHT_CMX_WORKSPACE) secFlags("SHF_NONE") secLocation() { +//CHECK-NEXT: VPUASM.DeclareBuffer [[BUFF1:@.*]] !VPUASM.Buffer< "CMX_NN"[1] + +//CHECK-DAG: ELF.CreateSection [[BARRSEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { +//CHECK-NEXT: VPUASM.ConfigureBarrier [[BARR0:@.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK-NEXT: VPUASM.ConfigureBarrier [[BARR1:@.*]] idx(!VPURegMapped.Index<0:0:1>) + +//CHECK-DAG: ELF.CreateSection [[DMA0SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() +//CHECK-NEXT: VPUASM.NNDMA [[DMA00:@.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation([[MetadataSec]]::[[DMATASKBUFF00]]) + //CHECK-SAME: outputs([ + //CHECK-SAME: [[NNCMX0]]::[[BUFF0]]]) + +//CHECK-NEXT: VPUASM.NNDMA [[DMA01:@.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation([[MetadataSec]]::[[DMATASKBUFF01]]) + //CHECK-SAME: outputs([ + //CHECK-SAME: [[NNCMX0]]::[[BUFF0]]]) + +//CHECK-NEXT: VPUASM.NNDMA [[DMA02:@.*]] idx(!VPURegMapped.Index<0:0:2>) taskLocation([[MetadataSec]]::[[DMATASKBUFF02]]) + //CHECK-SAME: input([[NNCMX0]]::[[BUFF0]]) + +//CHECK-DAG: ELF.CreateSection [[DMA1SEC:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() +//CHECK-NEXT: VPUASM.NNDMA [[DMA10:@.*]] idx(!VPURegMapped.Index<0:1:0>) taskLocation([[MetadataSec]]::[[DMATASKBUFF10]]) + //CHECK-SAME: outputs([ + //CHECK-SAME: [[NNCMX1]]::[[BUFF1]]]) + +//CHECK-NEXT: VPUASM.NNDMA [[DMA11:@.*]] idx(!VPURegMapped.Index<0:1:1>) taskLocation([[MetadataSec]]::[[DMATASKBUFF11]]) + //CHECK-SAME: outputs([ + //CHECK-SAME: [[NNCMX1]]::[[BUFF1]]]) + +//CHECK-NEXT: VPUASM.NNDMA [[DMA12:@.*]] idx(!VPURegMapped.Index<0:1:2>) taskLocation([[MetadataSec]]::[[DMATASKBUFF12]]) + //CHECK-SAME: input([[NNCMX1]]::[[BUFF1]]) + +//CHECK-DAG: ELF.CreateSection 
[[MappedInferenceSection:@.*]] aligned(64) secType(SHT_PROGBITS) secFlags(SHF_ALLOC) secLocation() { +//CHECK-NEXT: VPUASM.MappedInference @MappedInference + //CHECK-SAME: dmas([ + //CHECK-SAME: [ + //CHECK-SAME: [[DMA0SEC]]::[[DMA00]], [[DMA1SEC]]::[[DMA10]]]]) + //CHECK-SAME: barriers([[BARRSEC]]::[[BARR0]]) diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_multi_tile_wlm_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_multi_tile_wlm_40XX+.mlir index 0e6873d184..450c211b96 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_multi_tile_wlm_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_multi_tile_wlm_40XX+.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI40XX-to-VPUASM="workload-management-enable=true" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s // REQUIRES: arch-NPU40XX @@ -23,13 +23,13 @@ module @"resnet-320-pytorch" { } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo {inferenceTiming = 18466 : i64} entryPoint : @main inputsInfo : { DataInfo 
"result.1" tensorNames = ["result.1"] : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_platform-info_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_platform-info_40XX+.mlir index 7bc883da55..fc54a42b45 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_platform-info_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_platform-info_40XX+.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_40XX+.mlir deleted file mode 100644 index 6ddeb0f71d..0000000000 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_40XX+.mlir +++ /dev/null @@ -1,451 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s -// REQUIRES: arch-NPU40XX - -module attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @oneDma inputsInfo : { - DataInfo "input" : tensor<1x2x3x4xf16> - } outputsInfo : { - DataInfo "output" : tensor<1x2x3x4xf16> - } - - func.func @oneDma() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %1 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, @DDR> - %2 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, @DDR> - %3 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%1 : memref<1x2x3x4xf16, @DDR>) outputs(%2 : memref<1x2x3x4xf16, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - - %4 = VPUMI40XX.MappedInference dmas((%3) : (!VPURegMapped.Index<0:0:0>)) dmaCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(0) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -// CHECK: func.func @oneDma() -// CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned -// CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TB0:.*]] idx(!VPURegMapped.Index<0:0:0>) - -// CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned -// CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> - -// CHECK: ELF.CreateLogicalSection 
@[[SECOUT0:.*]] aligned -// CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> - -// CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned -// CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TB0]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECOUT0]]::@[[SYMBUF1]]]) - -// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ -// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA0]]]]) -// CHECK-SAME{LITERAL}: dmaCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) -// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(0) - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @twoDma inputsInfo : { - DataInfo "input_0" : tensor<1x16x16x16xf16> - } outputsInfo : { - DataInfo "output_0" : tensor<1x16x16x16xf16> - DataInfo "output_1" : tensor<1x16x16x16xf16> - } - - func.func @twoDma() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:0> - %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:1> - %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:2> - %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %4 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> - %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> - - %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> - %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> - %8 = VPURT.DeclareBuffer [1] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> - %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> - %10 = 
VPURT.DeclareBuffer [1] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]> - - %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> - %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> - - %13 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) updates(%11 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - %14 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:0:1>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) previousDMA(%13 : !VPURegMapped.Index<0:0:0>) waits(%11 : !VPURegMapped.Index<0:0:0>) updates(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> - %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:0:2>) inputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x16x16x16xf16, #NHWC, @DDR>) previousDMA(%14 : !VPURegMapped.Index<0:0:1>) waits(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> - %16 = VPUMI40XX.NNDMA {port = 1 : i64} taskLocation(%0 : !VPURegMapped.Index<1:0:0>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) updates(%11 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:0> - %17 = VPUMI40XX.NNDMA {port = 1 : i64} taskLocation(%1 : !VPURegMapped.Index<1:0:1>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) previousDMA(%16 : !VPURegMapped.Index<1:0:0>) waits(%11 : 
!VPURegMapped.Index<0:0:0>) updates(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:1> - %18 = VPUMI40XX.NNDMA {port = 1 : i64} taskLocation(%2 : !VPURegMapped.Index<1:0:2>) inputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) outputs(%8 : memref<1x16x16x16xf16, #NHWC, @DDR>) previousDMA(%17 : !VPURegMapped.Index<1:0:1>) waits(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:2> - - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - - %19 = VPUMI40XX.MappedInference dmas((%13), (%16) : (!VPURegMapped.Index<0:0:0>), (!VPURegMapped.Index<1:0:0>)) barriers(%11 : !VPURegMapped.Index<0:0:0>) dmaCount([[3, 0], [3, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -// CHECK: func.func @twoDma() - -//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TB100:.*]] idx(!VPURegMapped.Index<1:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TB101:.*]] idx(!VPURegMapped.Index<1:0:1>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TB102:.*]] idx(!VPURegMapped.Index<1:0:2>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TB000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TB001:.*]] idx(!VPURegMapped.Index<0:0:1>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TB002:.*]] idx(!VPURegMapped.Index<0:0:2>) - -//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> - -//CHECK: 
ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> - -//CHECK: ELF.CreateLogicalSection @[[SECOUT1:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF2:.*]] !VPUASM.Buffer< "NetworkOutput"[1] <0> - -//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> - -//CHECK: ELF.CreateLogicalSection @[[SECCMX1:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF4:.*]] !VPUASM.Buffer< "CMX_NN"[1] <0> - -//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned -//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARRIER0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(2 : 2) -//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARRIER1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(2 : 2) - -//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA000:.*]] idx(!VPURegMapped.Index<0:0:0>) - //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB000]]) links(@[[SECMETA]]::@[[TB001]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX0]]::@[[SYMBUF3]]]) waits([]) updates([0 : ui8]) start_after(0) - //CHECK-SAME: dma_descriptor() - -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA001:.*]] idx(!VPURegMapped.Index<0:0:1>) - //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB001]]) links(@[[SECMETA]]::@[[TB002]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX0]]::@[[SYMBUF3]]]) waits([0 : ui8]) updates([1 : ui8]) start_after(0) - //CHECK-SAME: dma_descriptor() - -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA010:.*]] idx(!VPURegMapped.Index<0:0:2>) - //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB002]]) input(@[[SECCMX0]]::@[[SYMBUF3]]) outputs([@[[SECOUT0]]::@[[SYMBUF1]]]) waits([1 : ui8]) updates([]) start_after(0) - //CHECK-SAME: dma_descriptor() - -//CHECK: ELF.CreateSection @[[SECDMA10:.*]] aligned -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA100:.*]] idx(!VPURegMapped.Index<1:0:0>) - //CHECK-SAME: 
taskLocation(@[[SECMETA]]::@[[TB100]]) links(@[[SECMETA]]::@[[TB101]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX1]]::@[[SYMBUF4]]]) waits([]) updates([0 : ui8]) start_after(0) - //CHECK-SAME: dma_descriptor() - -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA101:.*]] idx(!VPURegMapped.Index<1:0:1>) - //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB101]]) links(@[[SECMETA]]::@[[TB102]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX1]]::@[[SYMBUF4]]]) waits([0 : ui8]) updates([1 : ui8]) start_after(0) - //CHECK-SAME: dma_descriptor() - -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA110:.*]] idx(!VPURegMapped.Index<1:0:2>) - //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB102]]) input(@[[SECCMX1]]::@[[SYMBUF4]]) outputs([@[[SECOUT1]]::@[[SYMBUF2]]]) waits([1 : ui8]) updates([]) start_after(0) - //CHECK-SAME: dma_descriptor() - -// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ -// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA000]]], [@[[SECDMA10]]::@[[SYMDMA100]]]]) barriers(@[[SECBAR]]::@[[SYMBARRIER0]]) -// CHECK-SAME{LITERAL}: dmaCount([[3, 0], [3, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) -// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @maxpool_f16_f16 inputsInfo : { - DataInfo "input_0" : tensor<1x64x16x16xf16> - } outputsInfo : { - DataInfo "output_0" : tensor<1x64x8x8xf16> - } - - func.func @maxpool_f16_f16() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - - %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> - %4 = 
VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> - %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> - - %cst = const.Declare memref<64x1x1x4xsi32, #NHWC, @DDR> = dense<1> : tensor<64x1x1x4xsi32>, [#const.Reorder<#NHWC>] - %cst_0 = const.Declare memref<1x1x1x16xui8, #NHWC, @DDR> = dense<[[[[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]]> : tensor<1x1x1x16xui8>, [#const.Reorder<#NHWC>] - - %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x64x16x16xf16, #NHWC, @DDR> - %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x64x8x8xf16, #NHWC, @DDR> - - %8 = VPURT.DeclareBuffer [0] <8192> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> - %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]> - %10 = VPURT.DeclareBuffer [0] <8192> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> - %11 = VPURT.DeclareBuffer [0] <0> -> memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]> - %12 = VPURT.DeclareBuffer [0] <40960> -> memref<1x1x1x16xui8, #NHWC, [@CMX_NN, 0]> - %13 = VPURT.DeclareBuffer [0] <40976> -> memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]> - - %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 3 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> - %15 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> - - %16 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x64x16x16xf16, #NHWC, @DDR>) outputs(%8 : memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) updates(%14 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - %17 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:1>) inputs(%cst_0 : memref<1x1x1x16xui8, #NHWC, @DDR>) outputs(%12 : memref<1x1x1x16xui8, #NHWC, [@CMX_NN, 0]>) previousDMA(%16 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) 
acceleration_mode() -> !VPURegMapped.Index<0:0:1> - %18 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:0:2>) inputs(%cst : memref<64x1x1x4xsi32, #NHWC, @DDR>) outputs(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) previousDMA(%17 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> - %19 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:0:3>) inputs(%9 : memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x64x8x8xf16, #NHWC, @DDR>) previousDMA(%18 : !VPURegMapped.Index<0:0:2>) waits(%15 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:3> - - %20 = VPUMI40XX.DPUInvariant {clean_after = 0 : ui64, kernel_padding = #VPU.Padding, kernel_size = [2, 2], kernel_strides = [2, 2], mpe_frequent_mode = #VPU.mpe_mode, start_after = 0 : ui64, nce_task_type = #VPUIP.nce_task_type} taskLocation(%1 : !VPURegMapped.Index<0:0:0>) input(%8 : memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) outputs(%9 : memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]>) waits(%14 : !VPURegMapped.Index<0:0:0>) updates(%15 : !VPURegMapped.Index<0:0:1>) -> <0:0:0> PPE : { - VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} - } - - %21 = VPUMI40XX.DPUVariant taskLocation(%0 : !VPURegMapped.Index<0:0:0>) calls(%20 : !VPURegMapped.Index<0:0:0>) weight_table(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) {inStart = [0, 0, 0], inEnd = [15, 15, 15], end = [7, 7, 63], mpe_mode = #VPU.mpe_mode, pad = #VPU.Padding, start = [0, 0, 0], nce_task_type = #VPUIP.nce_task_type} -> !VPURegMapped.Index<0:0:0> - - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - - %22 = VPUMI40XX.MappedInference dmas((%16, %19) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:3>)) invariants(%20 : !VPURegMapped.Index<0:0:0>) 
variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%14 : !VPURegMapped.Index<0:0:0>) dmaCount([[3, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([1, 0, 0, 0, 0, 0]) variantCount([1, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -//CHECK: func.func @maxpool_f16_f16 - -//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TBVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBIVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBD000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBD001:.*]] idx(!VPURegMapped.Index<0:0:1>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBD002:.*]] idx(!VPURegMapped.Index<0:0:2>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBD003:.*]] idx(!VPURegMapped.Index<0:0:3>) - -//CHECK: ELF.CreateSection @[[SECCONST:.*]] aligned -//CHECK-NEXT: VPUASM.ConstBuffer @[[SYMCONST0:.*]] !VPUASM.Buffer< "Constant"[0] <0> -//CHECK: VPUASM.ConstBuffer @[[SYMCONST1:.*]] !VPUASM.Buffer< "Constant"[0] <0> - -//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> -//CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> - -//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] !VPUASM.Buffer< "CMX_NN"[0] <8192> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF4:.*]] !VPUASM.Buffer< "CMX_NN"[0] 
<8192> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF5:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF6:.*]] !VPUASM.Buffer< "CMX_NN"[0] <40960> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF7:.*]] !VPUASM.Buffer< "CMX_NN"[0] <40976> - -//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned -//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(3 : 1) -//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(1 : 1) - -//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA_0_0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBD000]]) - //CHECK-SAME: links(@[[SECMETA]]::@[[TBD001]]) input(@[[SECIN0]]::@[[SYMBUFF0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) waits([]) updates([0 : ui8]) - -//CHECK: VPUASM.NNDMA @[[SYMDMA_0_1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBD001]]) - // CHECK-SAME: links(@[[SECMETA]]::@[[TBD002]]) input(@[[SECCONST]]::@[[SYMCONST1]]) outputs([@[[SECCMX0]]::@[[SYMBUFF6]]]) waits([]) updates([0 : ui8]) - -//CHECK: VPUASM.NNDMA @[[SYMDMA_0_2:.*]] idx(!VPURegMapped.Index<0:0:2>) taskLocation(@[[SECMETA]]::@[[TBD002]]) - // CHECK-SAME: links(@[[SECMETA]]::@[[TBD003]]) input(@[[SECCONST]]::@[[SYMCONST0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF7]]]) waits([]) updates([0 : ui8]) - -//CHECK: VPUASM.NNDMA @[[SYMDMA_0_3:.*]] idx(!VPURegMapped.Index<0:0:3>) taskLocation(@[[SECMETA]]::@[[TBD003]]) - // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF3]]) outputs([@[[SECOUT0]]::@[[SYMBUFF1]]]) waits([1 : ui8]) updates([]) - -//CHECK: ELF.CreateSection @[[SECINV:.*]] aligned -//CHECK-NEXT: VPUASM.DPUInvariant @[[SYMINV0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBIVAR000]]) - // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF2]]) weight_table(@[[SECCMX0]]::@[[SYMBUFF7]]) - // CHECK-SAME: output(@[[SECCMX0]]::@[[SYMBUFF3]]) waits([0 : ui8]) updates([1 : ui8]) - 
-//CHECK: ELF.CreateSection @[[SECVAR:.*]] aligned -//CHECK-NEXT: VPUASM.DPUVariant @[[SYMVAR0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBVAR000]]) - // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR000]] - // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF7]]) - -// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ -// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA_0_0]], @[[SECDMA00]]::@[[SYMDMA_0_3]]]]) invariants([@[[SECINV]]::@[[SYMINV0]]]) variants([@[[SECVAR]]::@[[SYMVAR0]]]) barriers(@[[SECBAR]]::@[[SYMBARR0]]) -// CHECK-SAME{LITERAL}: dmaCount([[3, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) -// CHECK-SAME: invariantCount([1, 0, 0, 0, 0, 0]) variantCount([1, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) - -// ----- - -module attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @single_hswish inputsInfo : { - DataInfo "input" : tensor<1x1000xf16> - } outputsInfo : { - DataInfo "hswish" : tensor<1x1000xf16> - } - VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096] - module @VPU.SW { - func.func private @builtin_hswish(memref<*xf16>, memref<*xf16>) attributes {VPU.kernel_code = "activation_hswish.cpp", VPU.kernel_entry = "activation_hswish"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - func.func @single_hswish() { - %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> - %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> - - %4 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x1x1x1000xf16> - %5 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x1x1x1000xf16> - %6 = VPURT.DeclareBuffer [0] <0> -> 
memref<1x1x1x1000xf16, [@CMX_NN, 0]> - %7 = VPURT.DeclareBuffer [0] <2000> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> - - %8 = VPUMI40XX.DeclareKernelText kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> - %9 = VPUMI40XX.DeclareKernelEntry kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> - %10 = VPUMI40XX.DeclareKernelArgs kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> - %11 = VPUMI40XX.KernelParams inputs(%6 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs(%7 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) kernel_type("activation_hswish") kernel_params(dense<[0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 33, 67, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 33, 67, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]> : vector<72xui8>) -> !VPURegMapped.Index<0:0:0> - - %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> - %13 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> - - %14 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:0>) inputs(%4 : memref<1x1x1x1000xf16>) outputs(%6 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) updates(%12 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> - %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:1>) inputs(%7 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs(%5 : memref<1x1x1x1000xf16>) previousDMA(%14 : !VPURegMapped.Index<0:0:0>) waits(%13 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> - - %16 = VPUMI40XX.ActKernelRange taskLocation(%0 : !VPURegMapped.Index<0:0:0>) kernel_text_index(%8 : !VPURegMapped.Index<0:0:0>) kernel_args_index(%10 : !VPURegMapped.Index<0:0:0>) kernel_entry_index(%9 : !VPURegMapped.Index<0:0:0>) 
kernelTaskType(@COMPUTE) -> !VPURegMapped.Index<0:0:0> - - %17 = VPUMI40XX.ActKernelInvocation taskLocation(%1 : !VPURegMapped.Index<0:0:0>) range_index(%16 : <0:0:0>) kernel_params(%11 : <0:0:0>) waits(%12 : !VPURegMapped.Index<0:0:0>) updates(%13 : !VPURegMapped.Index<0:0:1>) tile(0) start_after(0) clean_after(0) -> !VPURegMapped.Index<0:0:0> - - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - - %18 = VPUMI40XX.MappedInference dmas((%14, %15) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:1>)) actKernelRanges((%16) : (!VPURegMapped.Index<0:0:0>)) actKernelInvocations((%17) : (!VPURegMapped.Index<0:0:0>)) barriers(%12 : !VPURegMapped.Index<0:0:0>) dmaCount([[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } -} - -//CHECK: func.func @single_hswish - -//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TBRANGE:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBINVO:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBDMA000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBDMA001:.*]] idx(!VPURegMapped.Index<0:0:1>) - -//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer - -//CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer - -//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] 
!VPUASM.Buffer -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer - -//CHECK: ELF.CreateSection @[[SECSHVTEXT:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareKernelText @[[SYMTEXT0:.*]] : "activation_hswish" - -//CHECK: VPUASM.DeclareKernelEntry @[[SYMENTRY0:.*]] : "activation_hswish" - -//CHECK: ELF.CreateSection @[[SECSHVDATA:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareKernelData @[[SYMDATA0:.*]] : "activation_hswish" - -//CHECK: ELF.CreateSection @[[SECSHVPARAMS:.*]] aligned -//CHECK-NEXT: VPUASM.KernelParams @[[SYMPARAMS0:.*]] inputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) outputs([@[[SECCMX0]]::@[[SYMBUFF3]]]) dynamicInputShapes([]) dynamicOutputShapes([]) kernel_type("activation_hswish") - -//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned -//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(1 : 1) -//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(1 : 1) - -//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned -//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBDMA000]]) - //CHECK-SAME: links(@[[SECMETA]]::@[[TBDMA001]]) input(@[[SECIN0]]::@[[SYMBUFF0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) waits([]) updates([0 : ui8]) -//CHECK: VPUASM.NNDMA @[[SYMDMA1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBDMA001]]) - //CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF3]]) outputs([@[[SECOUT0]]::@[[SYMBUFF1]]]) waits([1 : ui8]) updates([]) - -//CHECK: ELF.CreateSection @[[SECSHVRANGE:.*]] aligned -//CHECK: VPUASM.ActKernelRange @[[SYMACTRANGE0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBRANGE]]) - //CHECK-SAME: calls @[[SECSHVTEXT]]::@[[SYMTEXT0]] : @[[SYMENTRY0]] - -//CHECK: ELF.CreateSection @[[SECSHVINVOCATION:.*]] aligned -//CHECK-NEXT: VPUASM.ActKernelInvocation @[[SYMACTINVO0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBINVO]]) - //CHECK-SAME: 
-> @[[SECMETA]]::@[[TBRANGE]](kernel_data : @[[SECSHVDATA]]::@[[SYMDATA0]], kernel_params : @[[SECSHVPARAMS]]::@[[SYMPARAMS0]]) waits([0 : ui8]) updates([1 : ui8]) - -// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ -// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA0]], @[[SECDMA00]]::@[[SYMDMA1]]]]) actKernelRanges([@[[SECSHVRANGE]]::@[[SYMACTRANGE0]]]) actKernelInvocations([@[[SECSHVINVOCATION]]::@[[SYMACTINVO0]]]) barriers(@[[SECBAR]]::@[[SYMBARR0]]) -// CHECK-SAME{LITERAL}: dmaCount([[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) -// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([1, 0, 0, 0, 0, 0]) actKernelInvocationsCount([1, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) - -// ----- - -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @continued_conv_f16_f16_f16 inputsInfo : { - DataInfo "input_0" : tensor<1x16384x1x1xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> - } outputsInfo : { - DataInfo "output_0" : tensor<1x16x1x1xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> - } - func.func @continued_conv_f16_f16_f16() { - %0 = VPUMI40XX.DeclareTaskBuffer {offset = 0 : ui64} -> !VPURegMapped.Index<0:0:0> - %1 = VPUMI40XX.DeclareTaskBuffer {offset = 352 : ui64} -> !VPURegMapped.Index<0:0:1> - %2 = VPUMI40XX.DeclareTaskBuffer {offset = 22528 : ui64} -> !VPURegMapped.Index<0:0:0> - %3 = VPUMI40XX.DeclareTaskBuffer {offset = 22752 : ui64} -> !VPURegMapped.Index<0:0:1> - %14 = VPURT.DeclareBuffer [0] <96> -> memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %15 = VPURT.DeclareBuffer [0] <33376> -> memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %16 = VPURT.DeclareBuffer [0] <16480> -> memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 
0]> - %17 = VPURT.DeclareBuffer [0] <295520> -> memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %18 = VPURT.DeclareBuffer [0] <64> -> memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %19 = VPURT.DeclareBuffer [0] <32864> -> memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %20 = VPURT.DeclareBuffer [0] <33120> -> memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> - %21 = VPURT.DeclareBuffer [0] <32> -> memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @Register> - %23 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> - %26 = VPUMI40XX.DPUInvariant {clean_after = 1 : ui64, is_continued, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 2 : ui64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) input(%14 : memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weights(%15 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%19 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) outputs(%21 : memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @Register>) updates(%23 : !VPURegMapped.Index<0:0:0>) -> <0:0:0> PPE : { - } - %27 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%1 : !VPURegMapped.Index<0:0:1>) previousTask(%26 : !VPURegMapped.Index<0:0:0>) input(%16 : memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weights(%17 : memref<16x8192x1x1xf16, 
affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%20 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) outputs(%18 : memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) waits(%23 : !VPURegMapped.Index<0:0:0>) -> <0:0:1> PPE : { - } - %28 = VPUMI40XX.DPUVariant taskLocation(%2 : !VPURegMapped.Index<0:0:0>) calls(%26 : <0:0:0>) weights(%15 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%19 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) {end = [0, 0, 15], inEnd = [0, 0, 8191], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> - %29 = VPUMI40XX.DPUVariant taskLocation(%3 : !VPURegMapped.Index<0:0:1>) previousTask(%28 : !VPURegMapped.Index<0:0:0>) calls(%27 : <0:0:1>) weights(%17 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%20 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) {HardLinkedAttrName, end = [0, 0, 15], inEnd = [0, 0, 8191], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> - VPURegMapped.TaskBufferLayout {ActKernelInvocation = [[#VPURegMapped.TaskGroup]], ActKernelRange = [[#VPURegMapped.TaskGroup]], DMA = [[#VPURegMapped.TaskGroup, #VPURegMapped.TaskGroup]], DPUInvariant = [[#VPURegMapped.TaskGroup]], DPUVariant = [[#VPURegMapped.TaskGroup]], M2I = [[#VPURegMapped.TaskGroup]]} - %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> - %36 = VPUMI40XX.MappedInference invariants(%26 : !VPURegMapped.Index<0:0:0>) variants(%28 : !VPURegMapped.Index<0:0:0>) barriers(%23 : !VPURegMapped.Index<0:0:0>) dmaCount([[0, 0]]) invariantCount([2]) variantCount([2]) 
actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(4) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges types([#VPURegMapped.task_type, #VPURegMapped.task_type]) begins(%26, %28 : !VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:0>) ends(%27, %29 : !VPURegMapped.Index<0:0:1>, !VPURegMapped.Index<0:0:1>) - } -} - - -//CHECK: func.func @continued_conv_f16_f16_f16 - -//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned -//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TBIVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBIVAR001:.*]] idx(!VPURegMapped.Index<0:0:1>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) -//CHECK: VPUASM.DeclareTaskBuffer @[[TBVAR001:.*]] idx(!VPURegMapped.Index<0:0:1>) - -//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer< "CMX_NN"[0] <96> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer< "CMX_NN"[0] <33376> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] !VPUASM.Buffer< "CMX_NN"[0] <16480> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <295520> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF4:.*]] !VPUASM.Buffer< "CMX_NN"[0] <64> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF5:.*]] !VPUASM.Buffer< "CMX_NN"[0] <32864> -//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF6:.*]] !VPUASM.Buffer< "CMX_NN"[0] <33120> - -//CHECK-NOT: VPUASM.DeclareBuffer @[[SYMBUFF7:.*]] !VPUASM.Buffer< "MAC_Accumulators"[0] <32> - -//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(1 : 1) - -//CHECK: VPUASM.DPUInvariant @[[SYMINV0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBIVAR000]]) - // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF0]]) weights(@[[SECCMX0]]::@[[SYMBUFF1]]) 
weight_table(@[[SECCMX0]]::@[[SYMBUFF5]]) - // CHECK-NOT: output( - // CHECK-SAME: updates([0 : ui8]) - // CHECK-SAME: is_continued - // CHECK-SAME: output_type_continued = !VPUASM.Buffer< "MAC_Accumulators"[0] <32> - -//CHECK: VPUASM.DPUInvariant @[[SYMINV1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBIVAR001]]) - // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF2]]) weights(@[[SECCMX0]]::@[[SYMBUFF3]]) weight_table(@[[SECCMX0]]::@[[SYMBUFF6]]) - // CHECK-SAME: output(@[[SECCMX0]]::@[[SYMBUFF4]]) waits([0 : ui8]) - -//CHECK: VPUASM.DPUVariant @[[SYMVAR0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBVAR000]]) - // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR000]] - // CHECK-SAME: weights(@[[SECCMX0]]::@[[SYMBUFF1]]) - // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF5]]) - -//CHECK: VPUASM.DPUVariant @[[SYMVAR1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBVAR001]]) - // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR001]] - // CHECK-SAME: weights(@[[SECCMX0]]::@[[SYMBUFF3]]) - // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF6]]) diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_nowlm_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_nowlm_40XX+.mlir new file mode 100644 index 0000000000..14b6ccbe0f --- /dev/null +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single-tile_nowlm_40XX+.mlir @@ -0,0 +1,431 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% workload-management-enable=false" --convert-VPUMI40XX-to-VPUASM="workload-management-enable=false" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +net.NetworkInfo entryPoint : @oneDma inputsInfo : { + DataInfo "input" : tensor<1x2x3x4xf16> +} outputsInfo : { + DataInfo "output" : tensor<1x2x3x4xf16> +} + +func.func @oneDma() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %1 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, @DDR> + %2 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x2x3x4xf16, @DDR> + %3 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) inputs(%1 : memref<1x2x3x4xf16, @DDR>) outputs(%2 : memref<1x2x3x4xf16, @DDR>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + + %4 = VPUMI40XX.MappedInference dmas((%3) : (!VPURegMapped.Index<0:0:0>)) dmaCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(0) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +// CHECK: func.func @oneDma() +// CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned +// CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TB0:.*]] idx(!VPURegMapped.Index<0:0:0>) + +// CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned +// CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> + +// CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned +// CHECK-NEXT: 
VPUASM.DeclareBuffer @[[SYMBUF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> + +// CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned +// CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TB0]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECOUT0]]::@[[SYMBUF1]]]) + +// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ +// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA0]]]]) +// CHECK-SAME{LITERAL}: dmaCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) +// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(0) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +net.NetworkInfo entryPoint : @twoDma inputsInfo : { + DataInfo "input_0" : tensor<1x16x16x16xf16> +} outputsInfo : { + DataInfo "output_0" : tensor<1x16x16x16xf16> + DataInfo "output_1" : tensor<1x16x16x16xf16> +} + +func.func @twoDma() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:0> + %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:1> + %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<1:0:2> + %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %4 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + + %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> + %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> + %8 = VPURT.DeclareBuffer [1] <0> {swizzlingKey = 0 : i64} -> memref<1x16x16x16xf16, #NHWC, @DDR> + %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPURT.DeclareBuffer [1] <0> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]> + + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<0, -1> -> 
!VPURegMapped.Index<0:0:0> + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> + + %13 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) updates(%11 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %14 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:0:1>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) previousDMA(%13 : !VPURegMapped.Index<0:0:0>) waits(%11 : !VPURegMapped.Index<0:0:0>) updates(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:0:2>) inputs(%9 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x16x16x16xf16, #NHWC, @DDR>) previousDMA(%14 : !VPURegMapped.Index<0:0:1>) waits(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + %16 = VPUMI40XX.NNDMA {port = 1 : i64} taskLocation(%0 : !VPURegMapped.Index<1:0:0>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) updates(%11 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:0> + %17 = VPUMI40XX.NNDMA {port = 1 : i64} taskLocation(%1 : !VPURegMapped.Index<1:0:1>) inputs(%6 : memref<1x16x16x16xf16, #NHWC, @DDR>) outputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) previousDMA(%16 : !VPURegMapped.Index<1:0:0>) waits(%11 : !VPURegMapped.Index<0:0:0>) updates(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:1> + %18 = VPUMI40XX.NNDMA {port = 1 : i64} 
taskLocation(%2 : !VPURegMapped.Index<1:0:2>) inputs(%10 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 1]>) outputs(%8 : memref<1x16x16x16xf16, #NHWC, @DDR>) previousDMA(%17 : !VPURegMapped.Index<1:0:1>) waits(%12 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:2> + + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + + %19 = VPUMI40XX.MappedInference dmas((%13), (%16) : (!VPURegMapped.Index<0:0:0>), (!VPURegMapped.Index<1:0:0>)) barriers(%11 : !VPURegMapped.Index<0:0:0>) dmaCount([[3, 0], [3, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +// CHECK: func.func @twoDma() + +//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TB100:.*]] idx(!VPURegMapped.Index<1:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TB101:.*]] idx(!VPURegMapped.Index<1:0:1>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TB102:.*]] idx(!VPURegMapped.Index<1:0:2>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TB000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TB001:.*]] idx(!VPURegMapped.Index<0:0:1>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TB002:.*]] idx(!VPURegMapped.Index<0:0:2>) + +//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECOUT1:.*]] aligned 
+//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF2:.*]] !VPUASM.Buffer< "NetworkOutput"[1] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECCMX1:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUF4:.*]] !VPUASM.Buffer< "CMX_NN"[1] <0> + +//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned +//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARRIER0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(2 : 2) +//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARRIER1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(2 : 2) + +//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA000:.*]] idx(!VPURegMapped.Index<0:0:0>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB000]]) links(@[[SECMETA]]::@[[TB001]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX0]]::@[[SYMBUF3]]]) waits([]) updates([0 : ui8]) start_after(0) + //CHECK-SAME: dma_descriptor() + +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA001:.*]] idx(!VPURegMapped.Index<0:0:1>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB001]]) links(@[[SECMETA]]::@[[TB002]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX0]]::@[[SYMBUF3]]]) waits([0 : ui8]) updates([1 : ui8]) start_after(0) + //CHECK-SAME: dma_descriptor() + +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA010:.*]] idx(!VPURegMapped.Index<0:0:2>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB002]]) input(@[[SECCMX0]]::@[[SYMBUF3]]) outputs([@[[SECOUT0]]::@[[SYMBUF1]]]) waits([1 : ui8]) updates([]) start_after(0) + //CHECK-SAME: dma_descriptor() + +//CHECK: ELF.CreateSection @[[SECDMA10:.*]] aligned +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA100:.*]] idx(!VPURegMapped.Index<1:0:0>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB100]]) links(@[[SECMETA]]::@[[TB101]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX1]]::@[[SYMBUF4]]]) waits([]) updates([0 : ui8]) start_after(0) + //CHECK-SAME: 
dma_descriptor() + +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA101:.*]] idx(!VPURegMapped.Index<1:0:1>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB101]]) links(@[[SECMETA]]::@[[TB102]]) input(@[[SECIN0]]::@[[SYMBUF0]]) outputs([@[[SECCMX1]]::@[[SYMBUF4]]]) waits([0 : ui8]) updates([1 : ui8]) start_after(0) + //CHECK-SAME: dma_descriptor() + +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA110:.*]] idx(!VPURegMapped.Index<1:0:2>) + //CHECK-SAME: taskLocation(@[[SECMETA]]::@[[TB102]]) input(@[[SECCMX1]]::@[[SYMBUF4]]) outputs([@[[SECOUT1]]::@[[SYMBUF2]]]) waits([1 : ui8]) updates([]) start_after(0) + //CHECK-SAME: dma_descriptor() + +// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ +// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA000]]], [@[[SECDMA10]]::@[[SYMDMA100]]]]) barriers(@[[SECBAR]]::@[[SYMBARRIER0]]) +// CHECK-SAME{LITERAL}: dmaCount([[3, 0], [3, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) +// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +net.NetworkInfo entryPoint : @maxpool_f16_f16 inputsInfo : { + DataInfo "input_0" : tensor<1x64x16x16xf16> +} outputsInfo : { + DataInfo "output_0" : tensor<1x64x8x8xf16> +} + +func.func @maxpool_f16_f16() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + + %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %4 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %5 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %cst = const.Declare memref<64x1x1x4xsi32, #NHWC, @DDR> = dense<1> : tensor<64x1x1x4xsi32>, [#const.Reorder<#NHWC>] + %cst_0 = const.Declare memref<1x1x1x16xui8, #NHWC, @DDR> = dense<[[[[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0]]]]> : tensor<1x1x1x16xui8>, [#const.Reorder<#NHWC>] + + %6 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x64x16x16xf16, #NHWC, @DDR> + %7 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x64x8x8xf16, #NHWC, @DDR> + + %8 = VPURT.DeclareBuffer [0] <8192> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <0> -> memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPURT.DeclareBuffer [0] <8192> -> memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]> + %11 = VPURT.DeclareBuffer [0] <0> -> memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]> + %12 = VPURT.DeclareBuffer [0] <40960> -> memref<1x1x1x16xui8, #NHWC, [@CMX_NN, 0]> + %13 = VPURT.DeclareBuffer [0] <40976> -> memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]> + + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 3 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> + %15 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> + + %16 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:0>) inputs(%6 : memref<1x64x16x16xf16, #NHWC, @DDR>) outputs(%8 : memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) updates(%14 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %17 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:1>) inputs(%cst_0 : memref<1x1x1x16xui8, #NHWC, @DDR>) outputs(%12 : memref<1x1x1x16xui8, #NHWC, [@CMX_NN, 0]>) previousDMA(%16 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %18 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%4 : !VPURegMapped.Index<0:0:2>) inputs(%cst : memref<64x1x1x4xsi32, #NHWC, @DDR>) outputs(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) previousDMA(%17 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:0>) 
start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + %19 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%5 : !VPURegMapped.Index<0:0:3>) inputs(%9 : memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]>) outputs(%7 : memref<1x64x8x8xf16, #NHWC, @DDR>) previousDMA(%18 : !VPURegMapped.Index<0:0:2>) waits(%15 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:3> + + %20 = VPUMI40XX.DPUInvariant {clean_after = 0 : ui64, kernel_padding = #VPU.Padding, kernel_size = [2, 2], kernel_strides = [2, 2], mpe_frequent_mode = #VPU.mpe_mode, start_after = 0 : ui64, nce_task_type = #VPUIP.nce_task_type} taskLocation(%1 : !VPURegMapped.Index<0:0:0>) input(%8 : memref<1x64x16x16xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) outputs(%9 : memref<1x64x8x8xf16, #NHWC, [@CMX_NN, 0]>) waits(%14 : !VPURegMapped.Index<0:0:0>) updates(%15 : !VPURegMapped.Index<0:0:1>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + + %21 = VPUMI40XX.DPUVariant taskLocation(%0 : !VPURegMapped.Index<0:0:0>) calls(%20 : !VPURegMapped.Index<0:0:0>) weight_table(%13 : memref<64x1x1x4xsi32, #NHWC, [@CMX_NN, 0]>) {inStart = [0, 0, 0], inEnd = [15, 15, 15], end = [7, 7, 63], mpe_mode = #VPU.mpe_mode, pad = #VPU.Padding, start = [0, 0, 0], nce_task_type = #VPUIP.nce_task_type} -> !VPURegMapped.Index<0:0:0> + + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + + %22 = VPUMI40XX.MappedInference dmas((%16, %19) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:3>)) invariants(%20 : !VPURegMapped.Index<0:0:0>) variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%14 : !VPURegMapped.Index<0:0:0>) dmaCount([[3, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([1, 0, 0, 0, 0, 0]) variantCount([1, 0, 0, 0, 0, 0]) actKernelRangesCount([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[0, 0], [0, 0], [0, 0], [0, 0], 
[0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +//CHECK: func.func @maxpool_f16_f16 + +//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TBVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBIVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBD000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBD001:.*]] idx(!VPURegMapped.Index<0:0:1>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBD002:.*]] idx(!VPURegMapped.Index<0:0:2>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBD003:.*]] idx(!VPURegMapped.Index<0:0:3>) + +//CHECK: ELF.CreateSection @[[SECCONST:.*]] aligned +//CHECK-NEXT: VPUASM.ConstBuffer @[[SYMCONST0:.*]] !VPUASM.Buffer< "Constant"[0] <0> +//CHECK: VPUASM.ConstBuffer @[[SYMCONST1:.*]] !VPUASM.Buffer< "Constant"[0] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer< "NetworkInput"[0] <0> +//CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer< "NetworkOutput"[0] <0> + +//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] !VPUASM.Buffer< "CMX_NN"[0] <8192> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF4:.*]] !VPUASM.Buffer< "CMX_NN"[0] <8192> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF5:.*]] !VPUASM.Buffer< "CMX_NN"[0] <0> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF6:.*]] !VPUASM.Buffer< "CMX_NN"[0] <40960> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF7:.*]] !VPUASM.Buffer< "CMX_NN"[0] <40976> + +//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned +//CHECK-NEXT: VPUASM.ConfigureBarrier 
@[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(3 : 1) +//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(1 : 1) + +//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA_0_0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBD000]]) + //CHECK-SAME: links(@[[SECMETA]]::@[[TBD001]]) input(@[[SECIN0]]::@[[SYMBUFF0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) waits([]) updates([0 : ui8]) + +//CHECK: VPUASM.NNDMA @[[SYMDMA_0_1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBD001]]) + // CHECK-SAME: links(@[[SECMETA]]::@[[TBD002]]) input(@[[SECCONST]]::@[[SYMCONST1]]) outputs([@[[SECCMX0]]::@[[SYMBUFF6]]]) waits([]) updates([0 : ui8]) + +//CHECK: VPUASM.NNDMA @[[SYMDMA_0_2:.*]] idx(!VPURegMapped.Index<0:0:2>) taskLocation(@[[SECMETA]]::@[[TBD002]]) + // CHECK-SAME: links(@[[SECMETA]]::@[[TBD003]]) input(@[[SECCONST]]::@[[SYMCONST0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF7]]]) waits([]) updates([0 : ui8]) + +//CHECK: VPUASM.NNDMA @[[SYMDMA_0_3:.*]] idx(!VPURegMapped.Index<0:0:3>) taskLocation(@[[SECMETA]]::@[[TBD003]]) + // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF3]]) outputs([@[[SECOUT0]]::@[[SYMBUFF1]]]) waits([1 : ui8]) updates([]) + +//CHECK: ELF.CreateSection @[[SECINV:.*]] aligned +//CHECK-NEXT: VPUASM.DPUInvariant @[[SYMINV0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBIVAR000]]) + // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF2]]) weight_table(@[[SECCMX0]]::@[[SYMBUFF7]]) + // CHECK-SAME: output(@[[SECCMX0]]::@[[SYMBUFF3]]) waits([0 : ui8]) updates([1 : ui8]) + +//CHECK: ELF.CreateSection @[[SECVAR:.*]] aligned +//CHECK-NEXT: VPUASM.DPUVariant @[[SYMVAR0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBVAR000]]) + // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR000]] + // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF7]]) + +// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : 
dmas([[ +// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA_0_0]], @[[SECDMA00]]::@[[SYMDMA_0_3]]]]) invariants([@[[SECINV]]::@[[SYMINV0]]]) variants([@[[SECVAR]]::@[[SYMVAR0]]]) barriers(@[[SECBAR]]::@[[SYMBARR0]]) +// CHECK-SAME{LITERAL}: dmaCount([[3, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) +// CHECK-SAME: invariantCount([1, 0, 0, 0, 0, 0]) variantCount([1, 0, 0, 0, 0, 0]) actKernelRangesCount([0, 0, 0, 0, 0, 0]) actKernelInvocationsCount([0, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) + +// ----- + +net.NetworkInfo entryPoint : @single_hswish inputsInfo : { + DataInfo "input" : tensor<1x1000xf16> +} outputsInfo : { + DataInfo "hswish" : tensor<1x1000xf16> +} +VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096] +module @VPU.SW { + func.func private @builtin_hswish(memref<*xf16>, memref<*xf16>) attributes {VPU.kernel_code = "activation_hswish.cpp", VPU.kernel_entry = "activation_hswish"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +func.func @single_hswish() { + %0 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %1 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %2 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %3 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + + %4 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x1x1x1000xf16> + %5 = VPURT.DeclareBuffer [0] <0> {swizzlingKey = 0 : i64} -> memref<1x1x1x1000xf16> + %6 = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <2000> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + + %8 = VPUMI40XX.DeclareKernelText kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> + %9 = VPUMI40XX.DeclareKernelEntry kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> + %10 = VPUMI40XX.DeclareKernelArgs kernel_path("activation_hswish") -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.KernelParams inputs(%6 : memref<1x1x1x1000xf16, 
[@CMX_NN, 0]>) outputs(%7 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) kernel_type("activation_hswish") kernel_params(dense<[0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 33, 67, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 33, 67, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]> : vector<72xui8>) -> !VPURegMapped.Index<0:0:0> + + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<0, -1> -> !VPURegMapped.Index<0:0:0> + %13 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}<1, -1> -> !VPURegMapped.Index<0:0:1> + + %14 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%2 : !VPURegMapped.Index<0:0:0>) inputs(%4 : memref<1x1x1x1000xf16>) outputs(%6 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) updates(%12 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %15 = VPUMI40XX.NNDMA {port = 0 : i64} taskLocation(%3 : !VPURegMapped.Index<0:0:1>) inputs(%7 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs(%5 : memref<1x1x1x1000xf16>) previousDMA(%14 : !VPURegMapped.Index<0:0:0>) waits(%13 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + + %16 = VPUMI40XX.ActKernelRange taskLocation(%0 : !VPURegMapped.Index<0:0:0>) kernel_text_index(%8 : !VPURegMapped.Index<0:0:0>) kernel_args_index(%10 : !VPURegMapped.Index<0:0:0>) kernel_entry_index(%9 : !VPURegMapped.Index<0:0:0>) kernelTaskType(@COMPUTE) -> !VPURegMapped.Index<0:0:0> + + %17 = VPUMI40XX.ActKernelInvocation taskLocation(%1 : !VPURegMapped.Index<0:0:0>) range_index(%16 : <0:0:0>) kernel_params(%11 : <0:0:0>) waits(%12 : !VPURegMapped.Index<0:0:0>) updates(%13 : !VPURegMapped.Index<0:0:1>) tile(0) start_after(0) clean_after(0) -> !VPURegMapped.Index<0:0:0> + + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + + %18 = VPUMI40XX.MappedInference dmas((%14, 
%15) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:1>)) actKernelRanges((%16) : (!VPURegMapped.Index<0:0:0>)) actKernelInvocations((%17) : (!VPURegMapped.Index<0:0:0>)) barriers(%12 : !VPURegMapped.Index<0:0:0>) dmaCount([[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) actKernelInvocationsCount([[1, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) mediaCount(0) barrierCount(2) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges +} + +//CHECK: func.func @single_hswish + +//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareTaskBuffer @[[TBRANGE:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBINVO:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBDMA000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBDMA001:.*]] idx(!VPURegMapped.Index<0:0:1>) + +//CHECK: ELF.CreateLogicalSection @[[SECIN0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer + +//CHECK: ELF.CreateLogicalSection @[[SECOUT0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer + +//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] !VPUASM.Buffer +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer + +//CHECK: ELF.CreateSection @[[SECSHVTEXT:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareKernelText @[[SYMTEXT0:.*]] : "activation_hswish" + +//CHECK: VPUASM.DeclareKernelEntry @[[SYMENTRY0:.*]] : "activation_hswish" + +//CHECK: ELF.CreateSection @[[SECSHVDATA:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareKernelData @[[SYMDATA0:.*]] : "activation_hswish" + +//CHECK: ELF.CreateSection @[[SECSHVPARAMS:.*]] aligned 
+//CHECK-NEXT: VPUASM.KernelParams @[[SYMPARAMS0:.*]] inputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) outputs([@[[SECCMX0]]::@[[SYMBUFF3]]]) dynamicInputShapes([]) dynamicOutputShapes([]) kernel_type("activation_hswish") + +//CHECK: ELF.CreateSection @[[SECBAR:.*]] aligned +//CHECK-NEXT: VPUASM.ConfigureBarrier @[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(1 : 1) +//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR1:.*]] idx(!VPURegMapped.Index<0:0:1>) (1) => (-1) counts(1 : 1) + +//CHECK: ELF.CreateSection @[[SECDMA00:.*]] aligned +//CHECK-NEXT: VPUASM.NNDMA @[[SYMDMA0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBDMA000]]) + //CHECK-SAME: links(@[[SECMETA]]::@[[TBDMA001]]) input(@[[SECIN0]]::@[[SYMBUFF0]]) outputs([@[[SECCMX0]]::@[[SYMBUFF2]]]) waits([]) updates([0 : ui8]) +//CHECK: VPUASM.NNDMA @[[SYMDMA1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBDMA001]]) + //CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF3]]) outputs([@[[SECOUT0]]::@[[SYMBUFF1]]]) waits([1 : ui8]) updates([]) + +//CHECK: ELF.CreateSection @[[SECSHVRANGE:.*]] aligned +//CHECK: VPUASM.ActKernelRange @[[SYMACTRANGE0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBRANGE]]) + //CHECK-SAME: calls @[[SECSHVTEXT]]::@[[SYMTEXT0]] : @[[SYMENTRY0]] + +//CHECK: ELF.CreateSection @[[SECSHVINVOCATION:.*]] aligned +//CHECK-NEXT: VPUASM.ActKernelInvocation @[[SYMACTINVO0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBINVO]]) + //CHECK-SAME: -> @[[SECMETA]]::@[[TBRANGE]](kernel_data : @[[SECSHVDATA]]::@[[SYMDATA0]], kernel_params : @[[SECSHVPARAMS]]::@[[SYMPARAMS0]]) waits([0 : ui8]) updates([1 : ui8]) + +// CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[ +// CHECK-SAME: @[[SECDMA00]]::@[[SYMDMA0]], @[[SECDMA00]]::@[[SYMDMA1]]]]) actKernelRanges([@[[SECSHVRANGE]]::@[[SYMACTRANGE0]]]) actKernelInvocations([@[[SECSHVINVOCATION]]::@[[SYMACTINVO0]]]) barriers(@[[SECBAR]]::@[[SYMBARR0]]) +// 
CHECK-SAME{LITERAL}: dmaCount([[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]) +// CHECK-SAME: invariantCount([0, 0, 0, 0, 0, 0]) variantCount([0, 0, 0, 0, 0, 0]) actKernelRangesCount([1, 0, 0, 0, 0, 0]) actKernelInvocationsCount([1, 0, 0, 0, 0, 0]) mediaCount(0) barrierCount(2) + +// ----- + +net.NetworkInfo entryPoint : @continued_conv_f16_f16_f16 inputsInfo : { + DataInfo "input_0" : tensor<1x16384x1x1xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> +} outputsInfo : { + DataInfo "output_0" : tensor<1x16x1x1xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> +} +func.func @continued_conv_f16_f16_f16() { + %0 = VPUMI40XX.DeclareTaskBuffer {offset = 0 : ui64} -> !VPURegMapped.Index<0:0:0> + %1 = VPUMI40XX.DeclareTaskBuffer {offset = 352 : ui64} -> !VPURegMapped.Index<0:0:1> + %2 = VPUMI40XX.DeclareTaskBuffer {offset = 22528 : ui64} -> !VPURegMapped.Index<0:0:0> + %3 = VPUMI40XX.DeclareTaskBuffer {offset = 22752 : ui64} -> !VPURegMapped.Index<0:0:1> + %14 = VPURT.DeclareBuffer [0] <96> -> memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %15 = VPURT.DeclareBuffer [0] <33376> -> memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %16 = VPURT.DeclareBuffer [0] <16480> -> memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %17 = VPURT.DeclareBuffer [0] <295520> -> memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %18 = VPURT.DeclareBuffer [0] <64> -> memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %19 = VPURT.DeclareBuffer [0] <32864> -> memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %20 = VPURT.DeclareBuffer [0] <33120> -> memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> + %21 = VPURT.DeclareBuffer [0] <32> -> memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, 
d3, d1)>, @Register> + %23 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %26 = VPUMI40XX.DPUInvariant {clean_after = 1 : ui64, is_continued, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 2 : ui64} taskLocation(%0 : !VPURegMapped.Index<0:0:0>) input(%14 : memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weights(%15 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%19 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) outputs(%21 : memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @Register>) updates(%23 : !VPURegMapped.Index<0:0:0>) -> <0:0:0> PPE : { + } + %27 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%1 : !VPURegMapped.Index<0:0:1>) previousTask(%26 : !VPURegMapped.Index<0:0:0>) input(%16 : memref<1x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weights(%17 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%20 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) outputs(%18 : memref<1x16x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) waits(%23 : !VPURegMapped.Index<0:0:0>) -> <0:0:1> PPE : { + } + %28 = VPUMI40XX.DPUVariant taskLocation(%2 : !VPURegMapped.Index<0:0:0>) calls(%26 : <0:0:0>) weights(%15 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%19 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, 
[@CMX_NN, 0]>) {end = [0, 0, 15], inEnd = [0, 0, 8191], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %29 = VPUMI40XX.DPUVariant taskLocation(%3 : !VPURegMapped.Index<0:0:1>) previousTask(%28 : !VPURegMapped.Index<0:0:0>) calls(%27 : <0:0:1>) weights(%17 : memref<16x8192x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) weight_table(%20 : memref<16x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]>) {HardLinkedAttrName, end = [0, 0, 15], inEnd = [0, 0, 8191], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + VPURegMapped.TaskBufferLayout {ActKernelInvocation = [[#VPURegMapped.TaskGroup]], ActKernelRange = [[#VPURegMapped.TaskGroup]], DMA = [[#VPURegMapped.TaskGroup, #VPURegMapped.TaskGroup]], DPUInvariant = [[#VPURegMapped.TaskGroup]], DPUVariant = [[#VPURegMapped.TaskGroup]], M2I = [[#VPURegMapped.TaskGroup]]} + %miV = VPUMI40XX.MappedInferenceVersion(11 _ 4 _ 10) -> !VPURegMapped.Index<0:0:0> + %36 = VPUMI40XX.MappedInference invariants(%26 : !VPURegMapped.Index<0:0:0>) variants(%28 : !VPURegMapped.Index<0:0:0>) barriers(%23 : !VPURegMapped.Index<0:0:0>) dmaCount([[0, 0]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(4) mappedInferenceVersion(%miV : !VPURegMapped.Index<0:0:0>) -> !VPURegMapped.Index<0:0:0> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges types([#VPURegMapped.task_type, #VPURegMapped.task_type]) begins(%26, %28 : !VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:0:0>) ends(%27, %29 : !VPURegMapped.Index<0:0:1>, !VPURegMapped.Index<0:0:1>) +} + + +//CHECK: func.func @continued_conv_f16_f16_f16 + +//CHECK: ELF.CreateLogicalSection @[[SECMETA:.*]] aligned +//CHECK-NEXT: VPUASM.DeclareTaskBuffer 
@[[TBIVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBIVAR001:.*]] idx(!VPURegMapped.Index<0:0:1>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBVAR000:.*]] idx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.DeclareTaskBuffer @[[TBVAR001:.*]] idx(!VPURegMapped.Index<0:0:1>) + +//CHECK: ELF.CreateLogicalSection @[[SECCMX0:.*]] aligned +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF0:.*]] !VPUASM.Buffer< "CMX_NN"[0] <96> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF1:.*]] !VPUASM.Buffer< "CMX_NN"[0] <33376> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF2:.*]] !VPUASM.Buffer< "CMX_NN"[0] <16480> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF3:.*]] !VPUASM.Buffer< "CMX_NN"[0] <295520> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF4:.*]] !VPUASM.Buffer< "CMX_NN"[0] <64> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF5:.*]] !VPUASM.Buffer< "CMX_NN"[0] <32864> +//CHECK: VPUASM.DeclareBuffer @[[SYMBUFF6:.*]] !VPUASM.Buffer< "CMX_NN"[0] <33120> + +//CHECK-NOT: VPUASM.DeclareBuffer @[[SYMBUFF7:.*]] !VPUASM.Buffer< "MAC_Accumulators"[0] <32> + +//CHECK: VPUASM.ConfigureBarrier @[[SYMBARR0:.*]] idx(!VPURegMapped.Index<0:0:0>) (0) => (-1) counts(1 : 1) + +//CHECK: VPUASM.DPUInvariant @[[SYMINV0:.*]] idx(!VPURegMapped.Index<0:0:0>) taskLocation(@[[SECMETA]]::@[[TBIVAR000]]) + // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF0]]) weights(@[[SECCMX0]]::@[[SYMBUFF1]]) weight_table(@[[SECCMX0]]::@[[SYMBUFF5]]) + // CHECK-NOT: output( + // CHECK-SAME: updates([0 : ui8]) + // CHECK-SAME: is_continued + // CHECK-SAME: output_type_continued = !VPUASM.Buffer< "MAC_Accumulators"[0] <32> + +//CHECK: VPUASM.DPUInvariant @[[SYMINV1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBIVAR001]]) + // CHECK-SAME: input(@[[SECCMX0]]::@[[SYMBUFF2]]) weights(@[[SECCMX0]]::@[[SYMBUFF3]]) weight_table(@[[SECCMX0]]::@[[SYMBUFF6]]) + // CHECK-SAME: output(@[[SECCMX0]]::@[[SYMBUFF4]]) waits([0 : ui8]) + +//CHECK: VPUASM.DPUVariant @[[SYMVAR0:.*]] idx(!VPURegMapped.Index<0:0:0>) 
taskLocation(@[[SECMETA]]::@[[TBVAR000]]) + // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR000]] + // CHECK-SAME: weights(@[[SECCMX0]]::@[[SYMBUFF1]]) + // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF5]]) + +//CHECK: VPUASM.DPUVariant @[[SYMVAR1:.*]] idx(!VPURegMapped.Index<0:0:1>) taskLocation(@[[SECMETA]]::@[[TBVAR001]]) + // CHECK-SAME: calls @[[SECMETA]]::@[[TBIVAR001]] + // CHECK-SAME: weights(@[[SECCMX0]]::@[[SYMBUFF3]]) + // CHECK-SAME: weight_table(@[[SECCMX0]]::@[[SYMBUFF6]]) diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single_tile_wlm_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single_tile_wlm_40XX+.mlir index de0b5bb265..f772faee1c 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single_tile_wlm_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_single_tile_wlm_40XX+.mlir @@ -3,37 +3,33 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUMI40XX-to-VPUASM="workload-management-enable=true" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPU.arch = #VPU.arch_kind} { - IE.ExecutorResource 1 of @DMA_NN - IE.TileResource 1 of @NCE at 6.000000e+02 MHz - net.NetworkInfo entryPoint : @twoDma inputsInfo : { - DataInfo "input_0" : tensor<1x16x16x16xf16> - } outputsInfo : { - DataInfo "output_0" : tensor<1x16x16x16xf16> - DataInfo "output_1" : tensor<1x16x16x16xf16> - } - func.func @twoDma() { - %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> - %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8} <1, -1> -> 
!VPURegMapped.Index<0:0:1> - %19 = VPUMI40XX.Bootstrap inputs(%11 : <0:0:0>) -> !VPURegMapped.Index<0:0:0> - %20 = VPUMI40XX.Bootstrap inputs(%12 : <0:0:1>) -> !VPURegMapped.Index<0:0:1> - ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} - VPUMI40XX.OpRanges - } +net.NetworkInfo entryPoint : @twoDma inputsInfo : { + DataInfo "input_0" : tensor<1x16x16x16xf16> +} outputsInfo : { + DataInfo "output_0" : tensor<1x16x16x16xf16> + DataInfo "output_1" : tensor<1x16x16x16xf16> +} +func.func @twoDma() { + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8} <1, -1> -> !VPURegMapped.Index<0:0:1> + %19 = VPUMI40XX.Bootstrap inputs(%11 : <0:0:0>) -> !VPURegMapped.Index<0:0:0> + %20 = VPUMI40XX.Bootstrap inputs(%12 : <0:0:1>) -> !VPURegMapped.Index<0:0:1> + ELF.ABIVersion(1 _ 0 _ 0) {sym_name = "LoaderABIVersion"} + VPUMI40XX.OpRanges } -//CHECK: VPUASM.Bootstrap @Bootstrap_0_0 {barrier_id = 0 : ui32} -//CHECK: VPUASM.Bootstrap @Bootstrap_0_1 {barrier_id = 1 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_0 {barrier_id = 0 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_1 {barrier_id = 1 : ui32} // ----- #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Convolution attributes {config.compilationMode = #config.compilation_mode} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -41,13 +37,13 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + 
IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -76,7 +72,7 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Convolution attributes {config.compilationMode = #config.compilation_mode} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -84,13 +80,13 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -160,26 +156,26 @@ module 
@Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi } } -//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0 idx(!VPURegMapped.Index<0:0:0>) workItemIdx(!VPURegMapped.Index<0:0:0>) +//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0_0 idx(!VPURegMapped.Index<0:0:0>) workItemIdx(!VPURegMapped.Index<0:0:0>) //CHECK-SAME: work_item_count = 2 : ui32 -//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_1 idx(!VPURegMapped.Index<0:0:1>) -//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_2 idx(!VPURegMapped.Index<0:0:2>) -//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_3 idx(!VPURegMapped.Index<0:0:3>) -//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_4 idx(!VPURegMapped.Index<0:0:4>) +//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0_1 idx(!VPURegMapped.Index<0:0:1>) +//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0_2 idx(!VPURegMapped.Index<0:0:2>) +//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0_3 idx(!VPURegMapped.Index<0:0:3>) +//CHECK: VPUASM.ManagedBarrier @ConfigureBarrier_0_0_4 idx(!VPURegMapped.Index<0:0:4>) //CHECK: VPUASM.WorkItem @[[Enqueue0:.*]] idx(!VPURegMapped.Index<0:0:0>) real_task_index(!VPURegMapped.Index<0:0:0>) next_workitem_idx(!VPURegMapped.Index<0:0:1>) task_type() first_task(@program.metadata.cmx::@DeclareTaskBuffer_DPUVariant_0_0_0) task_count(1) //CHECK: VPUASM.WorkItem @[[Enqueue1:.*]] idx(!VPURegMapped.Index<0:0:1>) real_task_index(!VPURegMapped.Index<0:0:1>) task_type() first_task(@program.metadata.cmx::@DeclareTaskBuffer_DPUVariant_0_0_1) task_count(1) -//CHECK: VPUASM.Bootstrap @Bootstrap_0_0 {barrier_id = 0 : ui32} -//CHECK: VPUASM.Bootstrap @Bootstrap_0_1 {barrier_id = 1 : ui32} -//CHECK: VPUASM.Bootstrap @Bootstrap_0_2 {barrier_id = 2 : ui32} -//CHECK: VPUASM.Bootstrap @Bootstrap_0_3 {barrier_id = 3 : ui32} -//CHECK: VPUASM.Bootstrap @Bootstrap_0_4 {barrier_id = 4 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_0 {barrier_id = 0 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_1 {barrier_id = 1 : 
ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_2 {barrier_id = 2 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_3 {barrier_id = 3 : ui32} +//CHECK: VPUASM.Bootstrap @Bootstrap_0_0_4 {barrier_id = 4 : ui32} //CHECK{LITERAL}: VPUASM.MappedInference @MappedInference : dmas([[@task.dma.0.0::@NNDMA_0_0_0, @task.dma.0.1::@NNDMA_0_1_0]]) //CHECK-SAME: managedMappedInference(@program.mapped_inference::@MappedInference_managed) //CHECK{LITERAL}: VPUASM.ManagedMappedInference @MappedInference_managed //CHECK-SAME: workItems(@program.workItem::@[[Enqueue0]]) -//CHECK-SAME: bootstrapBarriers(@program.bootstrap::@Bootstrap_0_0) +//CHECK-SAME: bootstrapBarriers(@program.bootstrap::@Bootstrap_0_0_0) //CHECK-SAME: nnrtConfig(@program.nnrt_config::@MappedInference_nnrtConfigManaged) //CHECK-SAME: actshv_used = 0 //CHECK-SAME: dma_from_cmx_used = 1 @@ -191,9 +187,10 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi //CHECK: VPUASM.nnrtConfig {isActKernelInvocations} @MappedInference_nnrtConfigManaged // ----- + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @BarrierProgramming attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @BarrierProgramming attributes {config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.FP16CompressedConv : false config.Option @VPU.ReduceSupported : false @@ -210,13 +207,13 @@ module @BarrierProgramming attributes {VPU.arch = #VPU.arch_kind, confi } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT 
IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_write_sections_40XX+.mlir b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_write_sections_40XX+.mlir index f42acfd0a3..aa4bacdf42 100644 --- a/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_write_sections_40XX+.mlir +++ b/tests/lit/NPU/conversion/passes/VPUMI40XX2VPUASM/convert_VPUMI40XX_to_VPUASM_write_sections_40XX+.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUMI40XX-to-VPUASM %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @Test attributes {VPU.arch = #VPU.arch_kind} { +module @Test attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/mi_relocs_40XX_no_wlm.mlir b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/mi_relocs_40XX_no_wlm.mlir index 87d03f0216..70cd414b45 100644 --- a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/mi_relocs_40XX_no_wlm.mlir +++ b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/mi_relocs_40XX_no_wlm.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true 
workload-management-enable=false" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s // REQUIRES: arch-NPU40XX // diff --git a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/one_dma_40XX_no_wlm.mlir b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/one_dma_40XX_no_wlm.mlir index 83716ffc4f..0929a141a7 100644 --- a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/one_dma_40XX_no_wlm.mlir +++ b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/one_dma_40XX_no_wlm.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true workload-management-enable=false" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s // REQUIRES: arch-NPU40XX module @OneDMAWithoutAttributes { diff --git a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/two_dmas_40XX_no_wlm.mlir b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/two_dmas_40XX_no_wlm.mlir index ed41c0783e..d1a2f5f763 100644 --- a/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/two_dmas_40XX_no_wlm.mlir +++ b/tests/lit/NPU/conversion/pipelines/VPUIP2ELF/two_dmas_40XX_no_wlm.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true workload-management-enable=false" --lower-VPUIP-to-ELF="workload-management-enable=false" %s | FileCheck %s // REQUIRES: arch-NPU40XX module @OneDMAWithoutAttributes { diff --git a/tests/lit/NPU/data/network_GRUSequence_37XX.mlir.txt b/tests/lit/NPU/data/network_GRUSequence_37XX.mlir.txt index a8271b065c..17b8e2963c 100644 --- 
a/tests/lit/NPU/data/network_GRUSequence_37XX.mlir.txt +++ b/tests/lit/NPU/data/network_GRUSequence_37XX.mlir.txt @@ -6,7 +6,7 @@ #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #loc = loc(unknown) #loc1 = loc("profiling_result") -module @dumpsubgraph attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @dumpsubgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] loc(#loc) module @VPU.SW { func.func private @cache_flush_invalidate() attributes {VPU.task_type = @CACHE_FLUSH_INVALIDATE} loc(#loc) @@ -22,13 +22,13 @@ module @dumpsubgraph attributes {VPU.arch = #VPU.arch_kind, config.comp } loc(#loc) } loc(#loc) IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware loc(#loc) - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} loc(#loc) + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} loc(#loc) IE.ExecutorResource 2 of @SHAVE_ACT loc(#loc) IE.ExecutorResource 1 of @SHAVE_NN loc(#loc) IE.ExecutorResource 1 of @DPU loc(#loc) } loc(#loc) IE.ExecutorResource 2 of @DMA_NN loc(#loc) - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} loc(#loc) + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} loc(#loc) net.NetworkInfo {inferenceTiming = 27805 : i64} entryPoint : @main inputsInfo : { DataInfo "Parameter_214" : tensor<1x768xf32> loc(#loc) } outputsInfo : { diff --git a/tests/lit/NPU/data/network_source_GRUSequence_37XX.txt b/tests/lit/NPU/data/network_source_GRUSequence_37XX.txt index c8cc7a4a1e..3367db3e83 100644 --- a/tests/lit/NPU/data/network_source_GRUSequence_37XX.txt +++ 
b/tests/lit/NPU/data/network_source_GRUSequence_37XX.txt @@ -2,6 +2,7 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // + #loc = loc(unknown) module @dumpsubgraph { net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/data/network_source_MVN_37XX.txt b/tests/lit/NPU/data/network_source_MVN_37XX.txt index 0d409583d1..6bd4058df3 100644 --- a/tests/lit/NPU/data/network_source_MVN_37XX.txt +++ b/tests/lit/NPU/data/network_source_MVN_37XX.txt @@ -2,6 +2,7 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // + #loc0 = loc(unknown) module @MVN_case1 { net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/data/profiling-37XX.mlir.txt b/tests/lit/NPU/data/profiling-37XX.mlir.txt index c0c60047e6..0ba6ea122e 100644 --- a/tests/lit/NPU/data/profiling-37XX.mlir.txt +++ b/tests/lit/NPU/data/profiling-37XX.mlir.txt @@ -2,12 +2,13 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // + #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> #loc = loc(unknown) #loc3 = loc("profiling_result") -module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @age_gender attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] loc(#loc) module @VPU.SW { func.func private @builtin_Convert(memref<*xf32, @CMX_NN>, memref<*xf16, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert", VPU.task_type = @COMPUTE} loc(#loc) @@ -20,13 +21,13 @@ module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compil } loc(#loc) } loc(#loc) IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware loc(#loc) - 
IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} loc(#loc) + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} loc(#loc) IE.ExecutorResource 2 of @SHAVE_ACT loc(#loc) IE.ExecutorResource 1 of @SHAVE_NN loc(#loc) IE.ExecutorResource 1 of @DPU loc(#loc) } loc(#loc) IE.ExecutorResource 2 of @DMA_NN loc(#loc) - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} loc(#loc) + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} loc(#loc) net.NetworkInfo {inferenceTiming = 104492 : i64} entryPoint : @main inputsInfo : { DataInfo "data" : tensor<1x3x62x62xf32> loc(#loc42) } outputsInfo : { diff --git a/tests/lit/NPU/data/profiling-40XX.mlir.txt b/tests/lit/NPU/data/profiling-40XX.mlir.txt index f708af604d..92ab9ca5d6 100644 --- a/tests/lit/NPU/data/profiling-40XX.mlir.txt +++ b/tests/lit/NPU/data/profiling-40XX.mlir.txt @@ -8,7 +8,7 @@ #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> #loc = loc(unknown) #loc3 = loc("profiling_result") -module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @age_gender attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] loc(#loc) module @VPU.SW { func.func private @builtin_Convert(memref<*xf16, @CMX_NN>, memref<*xf32, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert", VPU.task_type = @COMPUTE} loc(#loc) @@ -21,13 +21,13 @@ module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compil } loc(#loc) } loc(#loc) IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware loc(#loc) - IE.MemoryResource 
1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} loc(#loc) + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} loc(#loc) IE.ExecutorResource 2 of @SHAVE_ACT loc(#loc) IE.ExecutorResource 1 of @DPU loc(#loc) } loc(#loc) IE.ExecutorResource 1 of @M2I loc(#loc) IE.ExecutorResource 2 of @DMA_NN loc(#loc) - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} loc(#loc) + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} loc(#loc) net.NetworkInfo {inferenceTiming = 93066 : i64} entryPoint : @main inputsInfo : { DataInfo "data" : tensor<1x3x62x62xf32> loc(#loc73) } outputsInfo : { diff --git a/tests/lit/NPU/dialect/ELF/handle_alignment_requirements_40XX.mlir b/tests/lit/NPU/dialect/ELF/handle_alignment_requirements_40XX.mlir index d4bc388d8a..3a174798a7 100644 --- a/tests/lit/NPU/dialect/ELF/handle_alignment_requirements_40XX.mlir +++ b/tests/lit/NPU/dialect/ELF/handle_alignment_requirements_40XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU40XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @read_after_write_act_dma_f16_f16 inputsInfo : { diff --git a/tests/lit/NPU/dialect/ELF/set_elf_op_offsets_40XX+.mlir b/tests/lit/NPU/dialect/ELF/set_elf_op_offsets_40XX+.mlir index ddea86bff7..e8779969e1 100644 --- a/tests/lit/NPU/dialect/ELF/set_elf_op_offsets_40XX+.mlir +++ b/tests/lit/NPU/dialect/ELF/set_elf_op_offsets_40XX+.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --convert-VPUASM-to-NPUReg40XX --convert-VPUIPDPU-to-NPUReg40XX --set-elf-op-offsets %s | FileCheck %s // REQUIRES: 
arch-NPU40XX -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @tests inputsInfo : { diff --git a/tests/lit/NPU/dialect/ELF/update_elf_section_flags_40XX+.mlir b/tests/lit/NPU/dialect/ELF/update_elf_section_flags_40XX+.mlir index 9a01cc46f0..0fe6360652 100644 --- a/tests/lit/NPU/dialect/ELF/update_elf_section_flags_40XX+.mlir +++ b/tests/lit/NPU/dialect/ELF/update_elf_section_flags_40XX+.mlir @@ -7,7 +7,7 @@ // RUN: vpux-opt --vpu-arch=%arch% --update-ELF-section-flags %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @mainModule attributes {VPU.arch = #VPU.arch_kind} { +module @mainModule attributes {config.arch = #config.arch_kind} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { diff --git a/tests/lit/NPU/dialect/HostExec/passes/convert_to_llvm_umd_40XX.mlir b/tests/lit/NPU/dialect/HostExec/passes/convert_to_llvm_umd_40XX.mlir index 7608b55c8d..31a6044b2f 100644 --- a/tests/lit/NPU/dialect/HostExec/passes/convert_to_llvm_umd_40XX.mlir +++ b/tests/lit/NPU/dialect/HostExec/passes/convert_to_llvm_umd_40XX.mlir @@ -3,127 +3,159 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --mlir-elide-elementsattrs-if-larger 8 --convert-to-llvm-umd-calls %s | FileCheck %s +// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --mlir-elide-elementsattrs-if-larger 8 --convert-to-llvm-umd-calls %s | FileCheck %s // REQUIRES: arch-NPU40XX -// CHECK-LABEL: @OneInputOneOutput -module @OneInputOneOutput { - -// CHECK-DAG: llvm.func @[[npu_level_zero_alloc:.+]](i64, !llvm.ptr) -> !llvm.ptr -// CHECK-DAG: llvm.func @[[npu_level_zero_append_memory_copy:.+]](!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -// CHECK: func.func @main([[arg0:%.+]]: 
memref<1x3x60x60xf16>, [[arg1:%.+]]: memref<1x3x60x60xf16> - - func.func @main(%arg0: memref<1x3x60x60xf16>, %arg1: memref<1x3x60x60xf16>) -> memref<1x3x60x60xf16> { - %alloc = memref.alloc() : memref<1x3x60x60xf16> - memref.copy %alloc, %arg1 : memref<1x3x60x60xf16> to memref<1x3x60x60xf16> - return %arg1 : memref<1x3x60x60xf16> - -// CHECK: [[op0:%.+]] = builtin.unrealized_conversion_cast [[arg1]] : memref<1x3x60x60xf16> to !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-DAG: [[op1:%.+]] = llvm.mlir.constant(1 : index) : i64 -// CHECK-DAG: [[op2:%.+]] = llvm.mlir.constant(3 : index) : i64 -// CHECK-DAG: [[op3:%.+]] = llvm.mlir.constant(60 : index) : i64 -// CHECK-DAG: [[op4:%.+]] = llvm.mlir.constant(60 : index) : i64 -// CHECK-DAG: [[op5:%.+]] = llvm.mlir.constant(1 : index) : i64 -// CHECK-DAG: [[op6:%.+]] = llvm.mlir.constant(3600 : index) : i64 -// CHECK-DAG: [[op7:%.+]] = llvm.mlir.constant(10800 : index) : i64 -// CHECK-DAG: [[op8:%.+]] = llvm.mlir.constant(10800 : index) : i64 -// CHECK-DAG: [[op9:%.+]] = llvm.mlir.zero : !llvm.ptr -// CHECK: [[op10:%.+]] = llvm.getelementptr [[op9]][[[op8]]] : (!llvm.ptr, i64) -> !llvm.ptr, f16 -// CHECK-NEXT: [[op11:%.+]] = llvm.ptrtoint [[op10]] : !llvm.ptr to i64 -// CHECK-NEXT: [[op12:%.+]] = llvm.call @[[npu_level_zero_alloc]]([[op11]], %arg2) : (i64, !llvm.ptr) -> !llvm.ptr -// CHECK: [[op13:%.+]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op14:%.+]] = llvm.insertvalue [[op12]], [[op13]][0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op15:%.+]] = llvm.insertvalue [[op12]], [[op14]][1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK: [[op16:%.+]] = llvm.mlir.constant(0 : index) : i64 -// CHECK-NEXT: [[op17:%.+]] = llvm.insertvalue [[op16]], [[op15]][2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op18:%.+]] = llvm.insertvalue [[op1]], 
[[op17]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op19:%.+]] = llvm.insertvalue [[op2]], [[op18]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op20:%.+]] = llvm.insertvalue [[op3]], [[op19]][3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op21:%.+]] = llvm.insertvalue [[op4]], [[op20]][3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op22:%.+]] = llvm.insertvalue [[op7]], [[op21]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op23:%.+]] = llvm.insertvalue [[op6]], [[op22]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op24:%.+]] = llvm.insertvalue [[op4]], [[op23]][4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK-NEXT: [[op25:%.+]] = llvm.insertvalue [[op5]], [[op24]][4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> -// CHECK: [[op26:%.+]] = llvm.intr.stacksave : !llvm.ptr -// CHECK-DAG: [[op27:%.+]] = llvm.mlir.constant(4 : i64) : i64 -// CHECK-DAG: [[op28:%.+]] = llvm.mlir.constant(1 : index) : i64 -// CHECK-NEXT: [[op29:%.+]] = llvm.alloca [[op28]] x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr -// CHECK-NEXT: llvm.store [[op25]], [[op29]] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr -// CHECK: [[op30:%.+]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> -// CHECK-NEXT: [[op31:%.+]] = llvm.insertvalue [[op27]], [[op30]][0] : !llvm.struct<(i64, ptr)> -// CHECK-NEXT: [[op32:%.+]] = llvm.insertvalue [[op29]], [[op31]][1] : !llvm.struct<(i64, ptr)> -// CHECK-DAG: [[op33:%.+]] = llvm.mlir.constant(4 : i64) : i64 -// CHECK-DAG: [[op34:%.+]] = llvm.mlir.constant(1 : index) : i64 -// CHECK-NEXT: [[op35:%.+]] = llvm.alloca [[op34]] x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> 
!llvm.ptr -// CHECK-NEXT: llvm.store [[op0]], [[op35]] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr -// CHECK: [[op36:%.+]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> -// CHECK-NEXT: [[op37:%.+]] = llvm.insertvalue [[op33]], [[op36]][0] : !llvm.struct<(i64, ptr)> -// CHECK-NEXT: [[op38:%.+]] = llvm.insertvalue [[op35]], [[op37]][1] : !llvm.struct<(i64, ptr)> -// CHECK: [[op39:%.+]] = llvm.mlir.constant(1 : index) : i64 -// CHECK-NEXT: [[op40:%.+]] = llvm.alloca [[op39]] x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr -// CHECK-DAG: llvm.store [[op32]], [[op40]] : !llvm.struct<(i64, ptr)>, !llvm.ptr -// CHECK-DAG: [[op41:%.+]] = llvm.alloca [[op39]] x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr -// CHECK-NEXT: llvm.store [[op38]], [[op41]] : !llvm.struct<(i64, ptr)>, !llvm.ptr -// CHECK: [[op42:%.+]] = llvm.mlir.zero : !llvm.ptr -// CHECK-NEXT: [[op43:%.+]] = llvm.getelementptr [[op42]][1] : (!llvm.ptr) -> !llvm.ptr, f16 -// CHECK-NEXT: [[op44:%.+]] = llvm.ptrtoint [[op43]] : !llvm.ptr to i64 -// CHECK-NEXT: llvm.call @npu_level_zero_append_memory_copy([[op40]], [[op41]], [[op44]], %arg5) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () -// CHECK-NEXT: llvm.intr.stackrestore [[op26]] : !llvm.ptr - } +module @StaticEltwiseNHWC attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { + //CHECK: llvm.mlir.global internal constant @main1_kernel + config.PipelineOptions @Options { + config.Option @VPU.EnableExtraStaticShapeOps : true + config.Option @VPU.EnableAdaptiveStripping : false + config.Option @VPU.EnableSEPtrsOperations : false + config.Option @VPU.EnableExperimentalSEPtrsOperations : false + config.Option @VPU.EnableVPUNNPreSplit : false + config.Option @VPU.FP16CompressedConv : false + config.Option @VPU.EnableDCIM : true + config.Option @VPU.ReduceSupported : false + config.Option @VPU.AutoPaddingODU : false + config.Option @VPU.AutoPaddingIDU : 
false + config.Option @VPU.SprLUTEnabled : false + config.Option @VPU.FragmentationAvoidRatioPipeliningLargeWeights : 4.500000e-01 : f32 + config.Option @VPU.UseDedicatedFifoPerShaveEngine : false + config.Option @VPU.BarrierMaxVariantSum : 64 : ui64 + config.Option @VPU.BarrierMaxVariantCount : 128 : ui64 + config.Option @VPU.MetadataMaxVariantCount : 128 : ui64 + config.Option @VPU.MetadataMaxInvariantCount : 64 : ui64 + config.Option @VPU.MetadataMaxKernelInvocationCount : 64 : ui64 + config.Option @VPU.MetadataMaxKernelRangeCount : 64 : ui64 + config.Option @VPU.MetadataMaxMediaCount : 4 : ui64 + config.Option @VPU.MaxKernelSize : 11 : si64 + } + IE.TileResource 6 of @NCE at 1.850000e+03 MHz { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 2 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x720x1000xf16> + DataInfo "input2" : tensor<1x16x720x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x720x1000xf16> + } + HostExec.Binary @OneDMAWithoutAttributes { + HostExec.BinaryData @serialized_main1 + func.func private @main1(memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + } + func.func @main(%arg0: memref<1x720x1000x16xf16>, %arg1: memref<1x720x1000x16xf16>) -> memref<1x720x1000x16xf16> { + %0 = llvm.mlir.constant(0 : index) : i64 + %1 = llvm.mlir.constant(720 : index) : i64 + %2 = llvm.mlir.constant(90 : index) : i64 + %3 = builtin.unrealized_conversion_cast %2 : i64 to index + %4 = builtin.unrealized_conversion_cast %1 : i64 to index + %5 = builtin.unrealized_conversion_cast %0 : i64 to index + %6 = 
llvm.sdiv %1, %2 : i64 + %7 = builtin.unrealized_conversion_cast %6 : i64 to index + %8 = async.create_group %7 : !async.group + scf.for %arg2 = %5 to %4 step %3 { + %subview = memref.subview %arg0[0, %arg2, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %9 = builtin.unrealized_conversion_cast %subview : memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %subview_0 = memref.subview %arg1[0, %arg2, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %10 = builtin.unrealized_conversion_cast %subview_0 : memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %token, %bodyResults = async.execute -> !async.value> { + %13 = Core.NestedCall @OneDMAWithoutAttributes::@main1(%9, %10) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + async.yield %10 : memref<1x90x1000x16xf16> + } + %11 = async.add_to_group %token, %8 : !async.token + %12 = async.await %bodyResults : !async.value> + } + async.await_all %8 + return %arg1 : memref<1x720x1000x16xf16> + } + //CHECK-NOT: async.create_group + //CHECK-NOT: async.add_to_group + //CHECK-NOT: async.await_all + //CHECK-NOT: async.await + //CHECK-NOT: async.execute + //CHECK: llvm.call @npu_level_zero_reset_commandlist + //CHECK: llvm.call @npu_level_zero_execute_graph + //CHECK: llvm.call @npu_level_zero_submit_commandlist } // ----- -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @StaticEltwiseNHWC attributes {config.compilationMode = #config.compilation_mode} { +module @Add attributes {config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "input1" : tensor<1x16x720x1000xf16> - DataInfo "input2" : tensor<1x16x720x1000xf16> - } outputsInfo : { - 
DataInfo "output1" : tensor<1x16x720x1000xf16> - } - - module @Module0 { - func.func private @main_func0(%arg0: memref<1x16x90x1000xf16>, %arg1: memref<1x16x90x1000xf16>, %arg2: memref<1x16x90x1000xf16>) -> memref<1x16x90x1000xf16> { - %7 = VPUIP.Copy inputs(%arg0 : memref<1x16x90x1000xf16>) outputs(%arg2 : memref<1x16x90x1000xf16>) -> memref<1x16x90x1000xf16> - return %7 : memref<1x16x90x1000xf16> - } + DataInfo "input1" : tensor<1x16x720x1000xf16> + DataInfo "input2" : tensor<1x16x720x1000xf16> + } outputsInfo : { + DataInfo "Add_3" friendlyName = "output" : tensor<1x16x720x1000xf16> + } + HostExec.Binary @Module0 { + HostExec.BinaryData @serialized_main_func0 + func.func private @main_func0(memref<1x16x720x1000xf16, @DDR>, memref<1x16x720x1000xf16, @DDR>, memref<1x720x1000x16xf16, @DDR>, memref<1x720x1000x16xf16, @DDR>) -> (memref<1x720x1000x16xf16, @DDR>, memref<1x720x1000x16xf16, @DDR>) + } + HostExec.Binary @Module1 { + HostExec.BinaryData @serialized_main_func1 + func.func private @main_func1(memref<1x720x1000x16xf16, @DDR>, memref<1x16x720x1000xf16, @DDR>) -> memref<1x16x720x1000xf16, @DDR> + } + HostExec.Binary @Module2 { + HostExec.BinaryData @serialized_main_func2 + func.func private @main_func2(memref<1x80x1000x16xf16, @DDR>, memref<1x80x1000x16xf16, @DDR>, memref<1x80x1000x16xf16, @DDR>) -> memref<1x80x1000x16xf16, @DDR> } func.func @main(%arg0: memref<1x16x720x1000xf16>, %arg1: memref<1x16x720x1000xf16>, %arg2: memref<1x16x720x1000xf16>) -> memref<1x16x720x1000xf16> { - %c90 = llvm.mlir.constant(90 : index) : i64 - %c720 = llvm.mlir.constant(720 : index) : i64 - %c0 = llvm.mlir.constant(0 : index) : i64 - %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x16x720x1000xf16> - memref.copy %arg1, %alloc : memref<1x16x720x1000xf16> to memref<1x16x720x1000xf16> - %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x16x720x1000xf16> - memref.copy %arg0, %alloc_0 : memref<1x16x720x1000xf16> to memref<1x16x720x1000xf16> - %alloc_1 = 
memref.alloc() {alignment = 64 : i64} : memref<1x16x720x1000xf16> - %0 = llvm.icmp "slt" %c720, %c0 : i64 - %i1 = llvm.sdiv %c720, %c90 : i64 - %1 = builtin.unrealized_conversion_cast %i1 : i64 to index - %c0i = builtin.unrealized_conversion_cast %c0 : i64 to index - %c720i = builtin.unrealized_conversion_cast %c720 : i64 to index - %c90i = builtin.unrealized_conversion_cast %c90 : i64 to index - %2 = async.create_group %1 : !async.group - scf.for %count = %c0i to %c720i step %c90i { - %alloc1 = memref.alloc() {alignment = 64 : i64} : memref<1x16x90x1000xf16> - %alloc2 = memref.alloc() {alignment = 64 : i64} : memref<1x16x90x1000xf16> - %alloc_3 = memref.alloc() : memref<1x16x90x1000xf16> - %token, %bodyResults = async.execute -> !async.value> { - %8 = Core.NestedCall @Module0::@main_func0(%alloc1, %alloc2, %alloc_3) : (memref<1x16x90x1000xf16>, memref<1x16x90x1000xf16>, memref<1x16x90x1000xf16>) -> memref<1x16x90x1000xf16> - async.yield %8 : memref<1x16x90x1000xf16> + %0 = llvm.mlir.constant(80 : index) : i64 + %1 = builtin.unrealized_conversion_cast %0 : i64 to index + %2 = llvm.mlir.constant(720 : index) : i64 + %3 = builtin.unrealized_conversion_cast %2 : i64 to index + %4 = llvm.mlir.constant(0 : index) : i64 + %5 = builtin.unrealized_conversion_cast %4 : i64 to index + %alloc = memref.alloc() : memref<1x720x1000x16xf16> + %alloc_0 = memref.alloc() : memref<1x720x1000x16xf16> + %token, %bodyResults:2 = async.execute -> (!async.value>, !async.value>) { + %12:2 = Core.NestedCall @Module0::@main_func0(%arg0, %arg1, %alloc, %alloc_0) : (memref<1x16x720x1000xf16>, memref<1x16x720x1000xf16>, memref<1x720x1000x16xf16>, memref<1x720x1000x16xf16>) -> (memref<1x720x1000x16xf16, @DDR>, memref<1x720x1000x16xf16, @DDR>) + async.yield %alloc, %alloc_0 : memref<1x720x1000x16xf16>, memref<1x720x1000x16xf16> + } + %6 = async.await %bodyResults#0 : !async.value> + %7 = async.await %bodyResults#1 : !async.value> + %alloc_1 = memref.alloc() {alignment = 64 : i64} : 
memref<1x720x1000x16xf16> + %8 = llvm.sdiv %2, %0 : i64 + %9 = builtin.unrealized_conversion_cast %8 : i64 to index + %10 = async.create_group %9 : !async.group + scf.for %arg3 = %5 to %3 step %1 { + %subview = memref.subview %6[0, %arg3, 0, 0] [1, 80, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %subview_5 = memref.subview %7[0, %arg3, 0, 0] [1, 80, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %12 = builtin.unrealized_conversion_cast %subview : memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x80x1000x16xf16> + %13 = builtin.unrealized_conversion_cast %subview_5 : memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x80x1000x16xf16> + %subview_6 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 80, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %14 = builtin.unrealized_conversion_cast %subview_6 : memref<1x80x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x80x1000x16xf16> + %token_7, %bodyResults_8 = async.execute -> !async.value> { + %17 = Core.NestedCall @Module2::@main_func2(%12, %13, %14) : (memref<1x80x1000x16xf16>, memref<1x80x1000x16xf16>, memref<1x80x1000x16xf16>) -> memref<1x80x1000x16xf16, @DDR> + async.yield %14 : memref<1x80x1000x16xf16> } - %6 = async.add_to_group %token, %2 : !async.token - %7 = async.await %bodyResults : !async.value> + %15 = async.add_to_group %token_7, %10 : !async.token + %16 = async.await %bodyResults_8 : !async.value> + } + async.await_all %10 + %alloc_2 = memref.alloc() : memref<1x16x720x1000xf16> + %token_3, %bodyResults_4 = async.execute -> !async.value> { + %12 = Core.NestedCall @Module1::@main_func1(%alloc_1, %alloc_2) : (memref<1x720x1000x16xf16>, memref<1x16x720x1000xf16>) -> memref<1x16x720x1000xf16, 
@DDR> + async.yield %alloc_2 : memref<1x16x720x1000xf16> } - async.await_all %2 - %3 = VPUIP.Copy inputs(%alloc_1 : memref<1x16x720x1000xf16>) outputs(%arg2 : memref<1x16x720x1000xf16>) -> memref<1x16x720x1000xf16> - return %3 : memref<1x16x720x1000xf16> + %11 = async.await %bodyResults_4 : !async.value> + memref.copy %11, %arg2 : memref<1x16x720x1000xf16> to memref<1x16x720x1000xf16> + return %arg2 : memref<1x16x720x1000xf16> } //CHECK-NOT: async.create_group //CHECK-NOT: async.add_to_group //CHECK-NOT: async.await_all //CHECK-NOT: async.await + //CHECK-NOT: async.execute //CHECK: llvm.call @npu_level_zero_reset_commandlist + //CHECK: llvm.call @npu_level_zero_execute_graph //CHECK: llvm.call @npu_level_zero_submit_commandlist } + diff --git a/tests/lit/NPU/dialect/HostExec/passes/optimize_memref_copies.mlir b/tests/lit/NPU/dialect/HostExec/passes/optimize_memref_copies.mlir index b52d40dd63..5a77870dcf 100644 --- a/tests/lit/NPU/dialect/HostExec/passes/optimize_memref_copies.mlir +++ b/tests/lit/NPU/dialect/HostExec/passes/optimize_memref_copies.mlir @@ -87,3 +87,121 @@ func.func @main(%arg0: memref<1x90x1000x16xf16>, %arg1: memref<1x90x1000x16xf16> // CHECK: func.func @main([[ARG0:%.+]]: memref<1x90x1000x16xf16>, [[ARG1:%.+]]: memref<1x90x1000x16xf16>, [[ARG2:%.+]]: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { // CHECK: [[CALL:%.+]] = call @main_func0([[ARG0]], [[ARG1]], [[ARG2]]) // CHECK: return [[ARG2]] : memref<1x90x1000x16xf16> + +// ----- + +module @ScheduleCopyFunction { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x1000xf16> + DataInfo "input2" : tensor<1x16x?x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x1000xf16> + } + func.func @main_func0_static(%arg0: memref<1x90x1000x16xf16>, %arg1: memref<1x90x1000x16xf16>, %arg2: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + %1 = VPUIP.Copy inputs(%arg1 : memref<1x90x1000x16xf16>) outputs(%arg2 : memref<1x90x1000x16xf16>) -> 
memref<1x90x1000x16xf16> + return %1 : memref<1x90x1000x16xf16> + } + func.func @main(%arg0: memref<1x?x1000x16xf16>, %arg1: memref<1x?x1000x16xf16>, %arg2: memref<1x?x1000x16xf16>) -> memref<1x?x1000x16xf16> { + %c90 = arith.constant 90 : index + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x1000x1000x16xf16> + scf.for %arg3 = %c0 to %c1000 step %c90 { + %input0_subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + %input1_subview = memref.subview %arg1[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + %3 = builtin.unrealized_conversion_cast %input0_subview : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %4 = builtin.unrealized_conversion_cast %input1_subview : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %tmp_buffer = memref.alloc() : memref<1x90x1000x16xf16> + %5 = func.call @main_func0_static(%3, %4, %tmp_buffer) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + %output_buf_subview = memref.subview %alloc[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x1000x1000x16xf16> to memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> + memref.copy %5, %output_buf_subview : memref<1x90x1000x16xf16> to memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> + } + memref.copy %alloc, %arg2 : memref<1x1000x1000x16xf16> to memref<1x?x1000x16xf16> + return %arg2 : memref<1x?x1000x16xf16> + } + + // CHECK: func.func [[MAIN_FUNC0:@.+]]([[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + + // CHECK: 
func.func @main([[ARG0:%.+]]: memref<1x?x1000x16xf16>, [[ARG1:%.+]]: memref<1x?x1000x16xf16>, [[ARG2:%.+]]: memref<1x?x1000x16xf16>) -> memref<1x?x1000x16xf16> { + // CHECK: [[STEP:%.+]] = arith.constant 90 : index + // CHECK: [[LOOP_START:%.+]] = arith.constant 0 : index + // CHECK: [[LOOP_END:%.+]] = arith.constant 1000 : index + + // CHECK: scf.for [[ITER:%.+]] = [[LOOP_START]] to [[LOOP_END]] step [[STEP]] { + // CHECK: [[SUBVIEW0:%.+]] = memref.subview [[ARG0]] + // CHECK: [[SUBVIEW1:%.+]] = memref.subview [[ARG1]] + // CHECK: [[CAST0:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW0]] + // CHECK: [[CAST1:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW1]] + // CHECK: [[SUBVIEW2:%.+]] = memref.subview [[ARG2]] + // CHECK: [[CAST2:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW2]] + // CHECK: [[CALL:%.+]] = func.call [[MAIN_FUNC0]]([[CAST0]], [[CAST1]], [[CAST2]]) +} + +// ----- + +module @ScheduleCopyAndEltwiseFunctions { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x1000xf16> + DataInfo "input2" : tensor<1x16x?x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x1000xf16> + } + func.func @main_func0_static(%arg0: memref<1x90x1000x16xf16>, %arg1: memref<1x90x1000x16xf16>, %arg2: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + %1 = VPUIP.Copy inputs(%arg1 : memref<1x90x1000x16xf16>) outputs(%arg2 : memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + return %1 : memref<1x90x1000x16xf16> + } + func.func @main_func1_static(%arg0: memref<1x90x1000x16xf16>, %arg1: memref<1x90x1000x16xf16>, %arg2: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + %1 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Add + inputs(%arg0 as %input_0: memref<1x90x1000x16xf16>, %arg1 as %input_1: memref<1x90x1000x16xf16>) + outputs(%arg2 as %output: memref<1x90x1000x16xf16>) on tile 0 -> memref<1x90x1000x16xf16> { + VPUIP.SW.Kernel.run (%input_0, %input_1, %output) : 
memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16> + } + return %1 : memref<1x90x1000x16xf16> + } + func.func @main(%arg0: memref<1x?x1000x16xf16>, %arg1: memref<1x?x1000x16xf16>, %arg2: memref<1x?x1000x16xf16>) -> memref<1x?x1000x16xf16> { + %c90 = arith.constant 90 : index + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x1000x1000x16xf16> + scf.for %arg3 = %c0 to %c1000 step %c90 { + %input0_subview = memref.subview %arg0[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + %input1_subview = memref.subview %arg1[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + %3 = builtin.unrealized_conversion_cast %input0_subview : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %4 = builtin.unrealized_conversion_cast %input1_subview : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %tmp_buffer = memref.alloc() : memref<1x90x1000x16xf16> + %5 = func.call @main_func0_static(%3, %4, %tmp_buffer) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + %eltwise_input = memref.subview %alloc[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x1000x1000x16xf16> to memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> + %eltwise_input_static = builtin.unrealized_conversion_cast %eltwise_input : memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %dim = memref.dim %alloc, %c0 : memref<1x1000x1000x16xf16> + %6 = func.call @main_func1_static(%5, %eltwise_input_static, %tmp_buffer) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> 
+ %subview_3 = memref.subview %alloc[0, %arg3, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x1000x1000x16xf16> to memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> + memref.copy %6, %subview_3 : memref<1x90x1000x16xf16> to memref<1x90x1000x16xf16, strided<[16000000, 16000, 16, 1], offset: ?>> + } + + memref.copy %alloc, %arg2 : memref<1x1000x1000x16xf16> to memref<1x?x1000x16xf16> + return %arg2 : memref<1x?x1000x16xf16> + } + + // CHECK: func.func [[MAIN_FUNC0:@.+]]([[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + + // CHECK: func.func [[MAIN_FUNC1:@.+]]([[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>, [[_:%.+]]: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + + // CHECK: func.func @main([[ARG0:%.+]]: memref<1x?x1000x16xf16>, [[ARG1:%.+]]: memref<1x?x1000x16xf16>, [[ARG2:%.+]]: memref<1x?x1000x16xf16>) -> memref<1x?x1000x16xf16> { + // CHECK: [[STEP:%.+]] = arith.constant 90 : index + // CHECK: [[LOOP_START:%.+]] = arith.constant 0 : index + // CHECK: [[LOOP_END:%.+]] = arith.constant 1000 : index + + // CHECK: scf.for [[ITER:%.+]] = [[LOOP_START]] to [[LOOP_END]] step [[STEP]] { + // CHECK: [[IN0_SUBVIEW:%.+]] = memref.subview [[ARG0]][0, [[ITER]], 0, 0] + // CHECK: [[IN1_SUBVIEW:%.+]] = memref.subview [[ARG1]][0, [[ITER]], 0, 0] + // CHECK: [[IN0_SUBVIEW_STATIC:%.+]] = builtin.unrealized_conversion_cast [[IN0_SUBVIEW]] + // CHECK: [[IN1_SUBVIEW_STATIC:%.+]] = builtin.unrealized_conversion_cast [[IN1_SUBVIEW]] + // CHECK: [[ALLOC:%.+]] = memref.alloc() + // CHECK: [[COPY_OUTPUT:%.+]] = func.call [[MAIN_FUNC0]]([[IN0_SUBVIEW_STATIC]], [[IN1_SUBVIEW_STATIC]], [[ALLOC]]) + // CHECK: [[ELTWISE_IN0_SUBVIEW:%.+]] = memref.subview [[ARG2]][0, [[ITER]], 0, 0] + // CHECK: [[ELTWISE_IN0_STATIC:%.+]] = builtin.unrealized_conversion_cast [[ELTWISE_IN0_SUBVIEW]] + // CHECK: [[DIM:%.+]] = memref.dim [[ARG2]] + // CHECK: 
[[OUTPUT_SUBVIEW:%.+]] = memref.subview [[ARG2]][0, [[ITER]], 0, 0] + // CHECK: [[OUTPUT_STATIC:%.+]] = builtin.unrealized_conversion_cast [[OUTPUT_SUBVIEW]] + // CHECK: [[CALL0:%.+]] = func.call [[MAIN_FUNC1]]([[COPY_OUTPUT]], [[ELTWISE_IN0_STATIC]], [[OUTPUT_STATIC]]) +} diff --git a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_37XX.mlir b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_37XX.mlir index 24d4b8b96c..e15b5fa278 100644 --- a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_37XX.mlir +++ b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_37XX.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU37XX // CHECK-LABEL: @OneInputOneOutput -module @OneInputOneOutput attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @OneInputOneOutput attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.FP16CompressedConv : false config.Option @VPU.ReduceSupported : false @@ -20,19 +20,19 @@ module @OneInputOneOutput attributes {VPU.arch = #VPU.arch_kind, config } IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : 
tensor<1x3x60x60xf16> } outputsInfo : { DataInfo "output" : tensor<1x3x60x60xf16> } - module @module0 attributes {VPU.arch = #VPU.arch_kind} { + module @module0 attributes {config.arch = #config.arch_kind} { net.NetworkInfo {inferenceTiming = 2282 : i64} entryPoint : @dma_copy inputsInfo : { DataInfo "input" : tensor<1x3x60x60xf16> } outputsInfo : { @@ -66,7 +66,7 @@ module @OneInputOneOutput attributes {VPU.arch = #VPU.arch_kind, config VPUMI37XX.MappedInferenceVersion(7 _ 0 _ 4) } %12 = ELFNPU37XX.CreateSection secType(VPU_SHT_PLATFORM_INFO) secFlags("SHF_NONE") {secAddrAlign = 8 : i64, secInfo = 0 : i64, secName = ".meta.PlatformInfo"} -> !ELFNPU37XX.Section { - VPUMI37XX.PlatformInfo {archKind = #VPU.arch_kind} + VPUMI37XX.PlatformInfo {archKind = #config.arch_kind} } %13 = ELFNPU37XX.Symbol %5 name("sym_dmaSection0") : !ELFNPU37XX.Section %14 = ELFNPU37XX.Symbol %6 name("sym_barrierSection") : !ELFNPU37XX.Section @@ -137,7 +137,7 @@ module @OneInputOneOutput attributes {VPU.arch = #VPU.arch_kind, config } IE.TileResource {activity_factor = 0.000000e+00 : f64} 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU @@ -153,7 +153,7 @@ module @OneInputOneOutput attributes {VPU.arch = #VPU.arch_kind, config config.Option @VPU.MaxKernelSize : 11 } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} } func.func @main(%arg0: memref<1x3x60x60xf16, @DDR>, %arg1: memref<1x3x60x60xf16>) -> 
memref<1x3x60x60xf16> { %alloc = memref.alloc() : memref<1x3x60x60xf16, @DDR> diff --git a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_40XX+.mlir b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_40XX+.mlir index ab7bbb8ec4..300d7ba76f 100644 --- a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_40XX+.mlir +++ b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_40XX+.mlir @@ -9,7 +9,7 @@ // CHECK-LABEL: @StaticEltwiseNHWC #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { +module @StaticEltwiseNHWC attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.EnableExtraStaticShapeOps : true config.Option @VPU.EnableAdaptiveStripping : false @@ -35,20 +35,20 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<1x16x720x1000xf16> DataInfo "input2" : tensor<1x16x720x1000xf16> } outputsInfo : { DataInfo "output" : tensor<1x16x720x1000xf16> } - module 
@OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { + module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.EnableExtraStaticShapeOps : true config.Option @VPU.EnableAdaptiveStripping : false @@ -74,7 +74,7 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main1 inputsInfo : { DataInfo "input_0" : tensor<1x90x1000x16xf16> } outputsInfo : { @@ -82,7 +82,7 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU builtin.module @ReservedMemory { diff --git a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_invalid.mlir b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_invalid.mlir index 943ba2e376..73bd3c8186 100644 --- a/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_invalid.mlir +++ b/tests/lit/NPU/dialect/HostExec/passes/serialize_elf_binary_invalid.mlir @@ -11,7 +11,7 @@ // expected-error@+1 {{Failed to get FuncType: 'main1'}} // expected-error@+2 {{Failed to serialize '@OneDMAWithoutAttributes::@main1'}} 
#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { +module @StaticEltwiseNHWC attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.EnableExtraStaticShapeOps : true config.Option @VPU.EnableAdaptiveStripping : false @@ -37,20 +37,20 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<1x16x720x1000xf16> DataInfo "input2" : tensor<1x16x720x1000xf16> } outputsInfo : { DataInfo "output" : tensor<1x16x720x1000xf16> } - module @OneDMAWithoutAttributes attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { + module @OneDMAWithoutAttributes attributes {config.arch = #config.arch_kind, config.revisionID = #config.revision_id, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.EnableExtraStaticShapeOps : true config.Option @VPU.EnableAdaptiveStripping : false @@ -76,7 
+76,7 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main1 inputsInfo : { DataInfo "input_0" : tensor<1x90x1000x16xf16> } outputsInfo : { @@ -84,7 +84,7 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re } IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU builtin.module @ReservedMemory { @@ -92,7 +92,7 @@ module @StaticEltwiseNHWC attributes {VPU.arch = #VPU.arch_kind, VPU.re IE.MemoryResource 512 bytes of @CMX_NN offset 0 } } - } + } func.func @main1() { ELF.Main @ELFMain { ELF.CreateLogicalSection @program.metadata.cmx aligned(1) secType(VPU_SHT_CMX_METADATA) secFlags("SHF_NONE") secLocation() { diff --git a/tests/lit/NPU/dialect/IE/ops/copy.mlir b/tests/lit/NPU/dialect/IE/ops/copy.mlir deleted file mode 100644 index 5be1fd2a15..0000000000 --- a/tests/lit/NPU/dialect/IE/ops/copy.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --canonicalize %s | FileCheck %s -// REQUIRES: arch-NPU37XX || arch-NPU40XX - -// CHECK-LABEL: @Fold -func.func @Fold(%arg0: tensor<1x3x16x16xf32>) -> tensor<1x3x16x16xf32> { - %0 = IE.Copy(%arg0) : tensor<1x3x16x16xf32> -> tensor<1x3x16x16xf32> - return %0 : tensor<1x3x16x16xf32> - - // CHECK: return %arg0 : tensor<1x3x16x16xf32> -} - -// ----- - -// CHECK-LABEL: @FuseCopies -func.func @FuseCopies(%arg0: tensor<1x3x16x16xf32>) -> tensor<1x3x16x16xf32> { - %0 = IE.Copy(%arg0) {out_mem_space = @CMX_NN} : tensor<1x3x16x16xf32> -> tensor<1x3x16x16xf32, {mem_space = @CMX_NN}> - %1 = IE.Copy(%0) : tensor<1x3x16x16xf32, {mem_space = @CMX_NN}> -> tensor<1x3x16x16xf32> - return %1 : tensor<1x3x16x16xf32> - - // CHECK: return %arg0 : tensor<1x3x16x16xf32> -} diff --git a/tests/lit/NPU/dialect/IE/ops/mvn.mlir b/tests/lit/NPU/dialect/IE/ops/mvn.mlir index b809ca9d31..eb0d691ff9 100644 --- a/tests/lit/NPU/dialect/IE/ops/mvn.mlir +++ b/tests/lit/NPU/dialect/IE/ops/mvn.mlir @@ -15,18 +15,3 @@ func.func @LegalizeEps(%arg0 : tensor<1x100x512x1xf32>) -> tensor<1x100x512x1xf3 // CHECK: [[MVN:%.+]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 1.1920928955078125E-7 : f64, normalize_variance = true} : tensor<1x100x512x1xf32> -> tensor<1x100x512x1xf32> // CHECK: return [[MVN]] } - -// CHECK-LABEL: @ReshapeBatched -// CHECK-SAME: ([[INPUT:%.+]]: tensor<32x16x64x64xf32>) -func.func @ReshapeBatched(%arg0 : tensor<32x16x64x64xf32>) -> tensor<32x16x64x64xf32> { - %0 = IE.MVN(%arg0) {across_channels = true, eps = 9.999999960041972E-13 : f64, normalize_variance = true} : tensor<32x16x64x64xf32> -> tensor<32x16x64x64xf32> - return %0 : tensor<32x16x64x64xf32> - - // CHECK: [[RESHAPE_1:%.+]] = IE.AffineReshape([[INPUT]]) - // CHECK-SAME{LITERAL}: {dim_mapping = [[0, 1], [2], [3], [3]], shape_value = [1, 32, 16, 4096]} : tensor<32x16x64x64xf32> -> tensor<1x32x16x4096xf32> - 
// CHECK: [[MVN:%.+]] = IE.MVN([[RESHAPE_1]]) - // CHECK-SAME{LITERAL}: {across_channels = false, eps = 1.1920928955078125E-7 : f64, normalize_variance = true} : tensor<1x32x16x4096xf32> -> tensor<1x32x16x4096xf32> - // CHECK: [[RESHAPE_2:%.+]] = IE.AffineReshape([[MVN]]) - // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1], [2, 3]], shape_value = [32, 16, 64, 64]} : tensor<1x32x16x4096xf32> -> tensor<32x16x64x64xf32> - // CHECK: return [[RESHAPE_2]] -} diff --git a/tests/lit/NPU/dialect/IE/passes/adjust_fake_qdq_params.mlir b/tests/lit/NPU/dialect/IE/passes/adjust_fake_qdq_params.mlir new file mode 100644 index 0000000000..fc3afdb227 --- /dev/null +++ b/tests/lit/NPU/dialect/IE/passes/adjust_fake_qdq_params.mlir @@ -0,0 +1,913 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --adjust-fake-qdq-params %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + + +// This is single FQ op check, important because it also checks that when we reach top of the +// graph we can insert a multiply to an input argument. +// The defining op of an input argument is nullptr and needs to be handled correctly. 
+// CHECK-LABEL: @AdjustSingleFakeQuantizeOp +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustSingleFakeQuantizeOp(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %1 = IE.FakeQuantize(%arg0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + return %1 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + + // CHECK: [[MUL0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ1:%.+]] = IE.FakeQuantize([[MUL0]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL1:%.+]] = IE.Multiply([[FQ1]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> 
tensor<1x128x32x64xf32> + +} + + + +// ----- + +// This test checks propagation of multiply operations all the way to the top and bottom. +// The traversal also encounters a subtract and an add operation. +// CHECK-LABEL: @AdjustFQBetweenSubAndAdd +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFQBetweenSubAndAdd(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Add(%1, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + + // CHECK-NEXT: [[MUL0:%.+]] = IE.Multiply([[INPUT_1]], 
[[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[MUL1:%.+]] = IE.Multiply([[INPUT_1]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[MUL2:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[SUB3:%.+]] = IE.Subtract([[MUL2]], [[MUL1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ4:%.+]] = IE.FakeQuantize([[SUB3]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[ADD5:%.+]] = IE.Add([[FQ4]], [[MUL0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL6:%.+]] = IE.Multiply([[ADD5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + +// ----- + +// Test to check propagation through multiply ops with non-identical inputs both in the up and down directions. 
+// CHECK-LABEL: @AdjustFQMul +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFQMul(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : 
tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + +} + +// ----- + + +// Test to check propagation of through a multiply op at the top with identical inputs +// CHECK-LABEL: @AdjustFQMulSquareUp +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulSquareUp(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, 
tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + + +// ----- + +// Test to check propagation of multiply op down with square inputs +// CHECK-LABEL: @AdjustFQMulSquareDown +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulSquareDown(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : 
tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32>, [#const.Rescale<0.20441406965255737 : f64>] + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.FakeQuantize([[OUT0]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Multiply([[OUT1]], [[OUT1]]) 
{auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + +} + +// ----- + +// Test to check propagation of multiply op down with square inputs and fusion of multiply factor in +// up propagation in particular propagated mul does not reach all the way to subtract. +// CHECK-LABEL: @AdjustFQMulUpFuseConst +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulUpFuseConst(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Subtract(%arg0, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.Multiply(%0, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.FakeQuantize(%1, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %3 = IE.Multiply(%2, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %3 : tensor<1x128x32x64xf32> + + // 
CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32>, [#const.Rescale<0.20441406965255737 : f64>] + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32> + // CHECK: [[OUT0:%.+]] = IE.Subtract(%arg0, [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[OUT0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + + +} + + + +// ----- + +// Test to check propagation of multiply ops with square inputs both up and down +// CHECK-LABEL: @AdjustFQMulSquareUpDown +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulSquareUpDown(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> 
isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> 
-> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + + +} + +// ----- + +// Mul propagation test with an add at bottom which creates an upward and downward mul. +// CHECK-LABEL: @AdjustFQMulDownAdd +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulDownAdd(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.Add(%2, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = 
dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[OUT0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Multiply([[INPUT_0]], [[OUT1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.FakeQuantize([[OUT2]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.Add([[OUT4]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT6:%.+]] = IE.Multiply([[OUT5]], %cst) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + + + return %3 : tensor<1x128x32x64xf32> + +} + + +// ----- + + +// Test IE.multiply with mul propagation up and add with identical inputs propagation of Mul down. +// This tests add with identical inputs DOES NOT propagate a mul upward. Therefore FQ is updated only once. 
+// CHECK-LABEL: @AdjustFQMulUpAddDownSq +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulUpAddDownSq(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Add(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %2 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> 
tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Add([[OUT2]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + + +// ----- + + +// Test propagation through an identical input Add with identical input multiply at the top. +// CHECK-LABEL: @AdjustFQMulDownAddSq +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulDownAddSq(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = 
IE.Add(%2, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %3 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Add([[OUT3]], [[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.Multiply([[OUT4]], %cst) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + +} + + + + + +// ----- + +// Test propagation through identical input add with identical input multiplies at top and bottom +// CHECK-LABEL: @AdjustFQMulSquareUpDownAdd +// CHECK-SAME: [[INPUT_0:%.+]]: 
tensor<1x128x32x64xf32> +func.func @AdjustFQMulSquareUpDownAdd(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x128x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.Multiply(%arg0, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.Add(%2, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %3 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[OUT0]]) {auto_broadcast = 
#IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Add([[OUT3]], [[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.Multiply([[OUT4]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + + +} + +// ----- + +// CHECK-LABEL: @AdjustFakeQuantizeWithFQ +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQuantizeWithFQ(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32> { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + 
%fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %add_cst = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.ReduceMean(%3) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + %5 = IE.FakeQuantize(%4, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + %6 = IE.Add(%5, %add_cst) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32> 
-> tensor<1x128x1x1xf32> + + return %6 : tensor<1x128x1x1xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<504.549164> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_6:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT_0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT_1:%.+]] = IE.Multiply([[INPUT_0]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_2:%.+]] = IE.Subtract([[OUT_1]], [[OUT_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_3:%.+]] = IE.FakeQuantize([[OUT_2]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_4:%.+]] = IE.Multiply([[OUT_3]], [[OUT_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_5:%.+]] = 
IE.FakeQuantize([[OUT_4]], [[CST_5]], [[CST_0]], [[CST_5]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_6:%.+]] = IE.Multiply([[OUT_5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_7:%.+]] = IE.ReduceMean([[OUT_6]]) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT_8:%.+]] = IE.FakeQuantize([[OUT_7]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT_9:%.+]] = IE.Add([[OUT_8]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + + +} + +// ----- + +// This tests propagation of muls all the way up and bottom. 
+// CHECK-LABEL: @AdjustFakeQuantizeWithFQ2 +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQuantizeWithFQ2(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %add_cst = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, 
tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.Multiply(%3, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %5 = IE.FakeQuantize(%4, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %6 = IE.Add(%5, %add_cst) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + return %6 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<103.136948> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32>, [#const.Rescale<0.20441406965255737 : f64>] + // CHECK-DAG: [[CST_6:%.+]] = const.Declare 
tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Subtract([[OUT1]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.FakeQuantize([[OUT2]], [[CST_4]], [[CST_3]], [[CST_4]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT2]], [[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.FakeQuantize([[OUT4]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT6:%.+]] = IE.Multiply([[OUT5]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT7:%.+]] = IE.FakeQuantize([[OUT6]], [[CST_2]], [[CST_0]], [[CST_2]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT8:%.+]] = IE.Add([[OUT7]], [[CST_5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, 
tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT9:%.+]] = IE.Multiply([[OUT8]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + +} + +// ----- + +// Test to check propagation of multiply op down with square inputs and fusion of multiply factor in +// up propagation in particular propagated mul does not reach all the way to subtract. +// Test fusion to bias. +// CHECK-LABEL: @AdjustFQMulUpFuseConstWithBias +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32> +func.func @AdjustFQMulUpFuseConstWithBias(%arg0 : tensor<1x128x32x64xf32>) -> tensor<1x1x32x64xf32> { + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x128x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x128x1x1xf32> isSplat + %bias_in = const.Declare tensor<1x128x1x1xf32> = dense <9.537536814401392e-06> : tensor<1x128x1x1xf32> isSplat + + + %0 = IE.Subtract(%arg0, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.Convolution(%0, %mul_in, %bias_in) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1], auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32>, tensor<1x128x1x1xf32> -> tensor<1x1x32x64xf32> + %2 = IE.FakeQuantize(%1, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x1x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> 
tensor<1x1x32x64xf32> + %3 = IE.Multiply(%2, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x32x64xf32>, tensor<1x1x32x64xf32> -> tensor<1x1x32x64xf32> + return %3 : tensor<1x1x32x64xf32> + + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x128x1x1xf32> = dense<8.53753681E-6> : tensor<1x128x1x1xf32>, [#const.Rescale<0.20441406965255737 : f64>] + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x128x1x1xf32> = dense<8.53753681E-6> : tensor<1x128x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x128x1x1xf32> = dense<9.53753715E-6> : tensor<1x128x1x1xf32>, [#const.Rescale<0.20441406965255737 : f64>] + // CHECK: [[OUT0:%.+]] = IE.Subtract([[INPUT_0]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Convolution([[OUT0]], [[CST_2]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32>, tensor<1x128x1x1xf32> -> tensor<1x1x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x1x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x1x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[OUT2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x32x64xf32>, tensor<1x1x32x64xf32> -> tensor<1x1x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[CST]]) {auto_broadcast = 
#IE.auto_broadcast_type} : tensor<1x1x32x64xf32>, tensor<1xf32> -> tensor<1x1x32x64xf32> + +} + + +// ----- + + +// Testing propagation of mul to quantized weights, a case seen in PSD7 model for example. +// CHECK-LABEL: @AFQDQConvQuantizedWts +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x256x256x256xf32> +func.func @AFQDQConvQuantizedWts(%arg0 : tensor<1x256x256x256xf32>) -> tensor<1x256x256x256xf32> { + %cst_1340 = const.Declare tensor<256x256x3x3xf32> = dense<100> : tensor<256x256x3x3xui8>, [#const.CastElemType] + %cst_1341 = const.Declare tensor<1x1x1x1xf32> = dense<4.163311> : tensor<1x1x1x1xf32> + %cst_1342 = const.Declare tensor<1x1x1x1xf32> = dense<-3.87944865> : tensor<1x1x1x1xf32> + %cst_1383 = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + %cst_1384 = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + // %16 and %564 are from before AFQDQ pass in psd7. 
+ %16 = IE.FakeQuantize(%cst_1340, %cst_1383, %cst_1384, %cst_1342, %cst_1341) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<256x256x3x3xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<256x256x3x3xf32> + %564 = IE.Convolution(%arg0, %16) {dilations = [1, 1], pads_begin = [1, 1], pads_end = [1, 1], strides = [1, 1]} : tensor<1x256x256x256xf32>, tensor<256x256x3x3xf32> -> tensor<1x256x256x256xf32> + %2000 = IE.FakeQuantize(%564, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x256x256x256xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x256x256x256xf32> + return %2000 : tensor<1x256x256x256xf32> + + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<256x256x3x3xf32> = dense<100> : tensor<256x256x3x3xui8>, [#const.CastElemType] + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-0.79301387> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.85103935> : tensor<1x1x1x1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.FakeQuantize([[CST_1]], [[CST_2]], [[CST_3]], [[CST_4]], [[CST_5]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<256x256x3x3xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<256x256x3x3xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Convolution([[INPUT_0]], [[OUT0]]) {dilations = [1, 1], pads_begin = [1, 
1], pads_end = [1, 1], strides = [1, 1]} : tensor<1x256x256x256xf32>, tensor<256x256x3x3xf32> -> tensor<1x256x256x256xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.FakeQuantize([[OUT1]], [[CST_2]], [[CST_0]], [[CST_2]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x256x256x256xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x256x256x256xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.Multiply([[OUT2]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x256x256x256xf32>, tensor<1xf32> -> tensor<1x256x256x256xf32> +} + +// ----- + +// Tests FQ prop to multiple users. +// CHECK-LABEL: @AdjustFakeQuantizeWithMultipleUsers +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQuantizeWithMultipleUsers(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> (tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32>) { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = 
dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.ReduceMean(%3) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + %5 = IE.FakeQuantize(%4, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + %6 = IE.Multiply(%3, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %7 = IE.FakeQuantize(%6, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, 
tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + return %5, %7 : tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32> + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32>, [#const.Rescale<4.892031192779541 : f64>] + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<504.549164> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_6:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Subtract([[OUT1]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.FakeQuantize([[OUT2]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, 
tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.FakeQuantize([[OUT4]], [[CST_5]], [[CST_0]], [[CST_5]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT6:%.+]] = IE.Multiply([[OUT5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT7:%.+]] = IE.ReduceMean([[OUT6]]) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT8:%.+]] = IE.FakeQuantize([[OUT7]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT9:%.+]] = IE.Multiply([[OUT5]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT10:%.+]] = IE.FakeQuantize([[OUT9]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + +} + + +// ----- + +// Tests FQ prop to multiple users. +// Further tests down prop from an FQ and fusion with mul_in in the down direction. 
+// CHECK-LABEL: @AdjustFakeQuantizeWithMultipleUsers2 +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQuantizeWithMultipleUsers2(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> (tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32>) { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 
65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.ReduceMean(%3) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + %5 = IE.FakeQuantize(%4, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + %6 = IE.Multiply(%1, %mul_in) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %7 = IE.FakeQuantize(%6, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + return %5, %7 : tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32> + + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<8.53753681E-6> : tensor<1x1x1x1xf32>, 
[#const.Rescale<4.892031192779541 : f64>] + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<504.549164> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_6:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Subtract([[OUT1]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.FakeQuantize([[OUT2]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], [[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.FakeQuantize([[OUT4]], [[CST_5]], [[CST_0]], [[CST_5]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT6:%.+]] = IE.Multiply([[OUT5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT7:%.+]] = IE.ReduceMean([[OUT6]]) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> 
+ // CHECK-NEXT: [[OUT8:%.+]] = IE.FakeQuantize([[OUT7]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT9:%.+]] = IE.Multiply([[OUT3]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT10:%.+]] = IE.FakeQuantize([[OUT9]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> +} + +// ----- + +// Tests FQ prop to multiple users. +// Further tests down prop from an FQ and update of another FQ in the down direction. +// CHECK-LABEL: @AdjustFakeQuantizeWithMultipleUsers3 +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQuantizeWithMultipleUsers3(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> (tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32>) { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> 
isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %mul_in = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.FakeQuantize(%0, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Multiply(%1, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.ReduceMean(%3) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + %5 = IE.FakeQuantize(%4, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + %7 = IE.FakeQuantize(%1, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = 
#IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + return %5, %7 : tensor<1x128x1x1xf32>, tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<103.136948> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<504.549164> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_6:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT0:%.+]] = IE.Multiply([[INPUT_1]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x1x1xf32>, tensor<1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT1:%.+]] = IE.Multiply([[INPUT_0]], [[CST_6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT2:%.+]] = IE.Subtract([[OUT1]], [[OUT0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT3:%.+]] = IE.FakeQuantize([[OUT2]], [[CST_3]], [[CST_2]], [[CST_3]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT4:%.+]] = IE.Multiply([[OUT3]], 
[[OUT3]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT5:%.+]] = IE.FakeQuantize([[OUT4]], [[CST_5]], [[CST_1]], [[CST_5]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT6:%.+]] = IE.Multiply([[OUT5]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT7:%.+]] = IE.ReduceMean([[OUT6]]) {axes_value = [2, 3], keep_dims} : tensor<1x128x32x64xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT8:%.+]] = IE.FakeQuantize([[OUT7]], [[CST_5]], [[CST_4]], [[CST_5]], [[CST_4]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x1x1xf32> + // CHECK-NEXT: [[OUT9:%.+]] = IE.FakeQuantize([[OUT3]], [[CST_5]], [[CST]], [[CST_5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT10:%.+]] = IE.Multiply([[OUT9]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + +} + + + +// ----- + +// CHECK-LABEL: @AdjustFakeQDQComplexAdd +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQDQComplexAdd(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense 
<160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.And(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.And(%0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Add(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.Add(%3, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %4 : tensor<1x128x32x64xf32> + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + + // CHECK-NEXT: [[AND0:%.+]] = IE.And([[INPUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL1:%.+]] = IE.Multiply([[AND0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL2:%.+]] = IE.Multiply([[MUL1]], [[CST_2]]) 
{auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[AND3:%.+]] = IE.And([[AND0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL4:%.+]] = IE.Multiply([[AND3]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL5:%.+]] = IE.Multiply([[MUL4]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[ADD6:%.+]] = IE.Add([[MUL2]], [[MUL5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ7:%.+]] = IE.FakeQuantize([[ADD6]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[ADD8:%.+]] = IE.Add([[FQ7]], [[ADD6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL9:%.+]] = IE.Multiply([[ADD8]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + +// ----- + +// CHECK-LABEL: @AdjustFakeQDQComplexSub +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQDQComplexSub(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + 
%fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.And(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.And(%0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Subtract(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.Add(%3, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + return %4 : tensor<1x128x32x64xf32> + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + + // CHECK-NEXT: [[AND0:%.+]] = IE.And([[INPUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL1:%.+]] = IE.Multiply([[AND0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL2:%.+]] = IE.Multiply([[MUL1]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : 
tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[AND3:%.+]] = IE.And([[AND0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL4:%.+]] = IE.Multiply([[AND3]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL5:%.+]] = IE.Multiply([[MUL4]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[SUB6:%.+]] = IE.Subtract([[MUL2]], [[MUL5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ7:%.+]] = IE.FakeQuantize([[SUB6]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[ADD8:%.+]] = IE.Add([[FQ7]], [[SUB6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL9:%.+]] = IE.Multiply([[ADD8]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + + +// ----- + +// CHECK-LABEL: @AdjustFakeQDQComplexSub2 +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFakeQDQComplexSub2(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare 
tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + + + %0 = IE.And(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.And(%0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.Subtract(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.Subtract(%3, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + + return %4 : tensor<1x128x32x64xf32> + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + + // CHECK-NEXT: [[AND0:%.+]] = IE.And([[INPUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL1:%.+]] = IE.Multiply([[AND0]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL2:%.+]] = IE.Multiply([[MUL1]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, 
tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[AND3:%.+]] = IE.And([[AND0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL4:%.+]] = IE.Multiply([[AND3]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL5:%.+]] = IE.Multiply([[MUL4]], [[CST_2]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[SUB6:%.+]] = IE.Subtract([[MUL2]], [[MUL5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ7:%.+]] = IE.FakeQuantize([[SUB6]], [[CST_1]], [[CST_0]], [[CST_1]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[SUB8:%.+]] = IE.Subtract([[FQ7]], [[SUB6]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x32x64xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL9:%.+]] = IE.Multiply([[SUB8]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + + +// ----- + + +// Sandwich an overflowing FQ in between two non-overflowing FQs and make sure propagation works properly +// CHECK-LABEL: @AdjustFQSandwichNF_OF_NF +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFQSandwichNF_OF_NF(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense 
<392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %add_cst = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.And(%0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.FakeQuantize(%1, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %3 = IE.FakeQuantize(%2, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, 
tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %4 = IE.FakeQuantize(%3, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + + return %4 : tensor<1x128x32x64xf32> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<103.136948> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK-NEXT: [[OUT_0:%.+]] = IE.Subtract([[INPUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_1:%.+]] = IE.And([[OUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_2:%.+]] = IE.Multiply([[OUT_1]], [[CST_5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_3:%.+]] = IE.FakeQuantize([[OUT_2]], [[CST_4]], [[CST_3]], [[CST_4]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> 
tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_4:%.+]] = IE.FakeQuantize([[OUT_3]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_5:%.+]] = IE.FakeQuantize([[OUT_4]], [[CST_2]], [[CST_0]], [[CST_2]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[OUT_6:%.+]] = IE.Multiply([[OUT_5]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + +// Sandwich + Added Reshape ops to check propagation +// CHECK-LABEL: @AdjustFQSandwichNF_OF_NF_Reshape +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x128x32x64xf32>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x128x1x1xf32> +func.func @AdjustFQSandwichNF_OF_NF_Reshape(%arg0 : tensor<1x128x32x64xf32>, %arg1 : tensor<1x128x1x1xf32>) -> tensor<1x128x32x64xf32> { + %fq1_in_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq1_out_low = const.Declare tensor<1x1x1x1xf32> = dense <-182.1645050048828> : tensor<1x1x1x1xf32> isSplat + %fq1_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <392.4757080078125> : tensor<1x1x1x1xf32> isSplat + %fq2_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : tensor<1x1x1x1xf32> isSplat + %fq2_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq2_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <160302.078125> : 
tensor<1x1x1x1xf32> isSplat + %fq3_in_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_in_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %fq3_out_low = const.Declare tensor<1x1x1x1xf32> = dense <0.000000e+00> : tensor<1x1x1x1xf32> isSplat + %fq3_out_hi = const.Declare tensor<1x1x1x1xf32> = dense <504.5491638183594> : tensor<1x1x1x1xf32> isSplat + %add_cst = const.Declare tensor<1x1x1x1xf32> = dense <8.537536814401392e-06> : tensor<1x1x1x1xf32> isSplat + + %0 = IE.Subtract(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %1 = IE.And(%0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + %2 = IE.FakeQuantize(%1, %fq1_in_low, %fq1_in_hi, %fq1_out_low, %fq1_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %3 = IE.Reshape(%2) { shape_value = [1, 128, 16, 128] } : tensor<1x128x32x64xf32> -> tensor<1x128x16x128xf32> + %4 = IE.Reshape(%3) { shape_value = [1, 128, 32, 64] } : tensor<1x128x16x128xf32> -> tensor<1x128x32x64xf32> + + %5 = IE.FakeQuantize(%4, %fq2_in_low, %fq2_in_hi, %fq2_out_low, %fq2_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + %6 = IE.FakeQuantize(%5, %fq3_in_low, %fq3_in_hi, %fq3_out_low, %fq3_out_hi) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + + + return %6 : tensor<1x128x32x64xf32> + // CHECK-DAG: [[CST:%.+]] = 
const.Declare tensor<1xf32> = dense<4.89203119> : tensor<1xf32> + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<103.136948> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<3.276800e+04> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<80.2275543> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-37.2369881> : tensor<1x1x1x1xf32> + // CHECK-DAG: [[CST_5:%.+]] = const.Declare tensor<1xf32> = dense<0.20441407> : tensor<1xf32> + // CHECK: [[SUB:%.+]] = IE.Subtract([[INPUT_0]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[AND1:%.+]] = IE.And([[SUB]], [[INPUT_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1x128x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL2:%.+]] = IE.Multiply([[AND1]], [[CST_5]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ3:%.+]] = IE.FakeQuantize([[MUL2]], [[CST_4]], [[CST_3]], [[CST_4]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[RS4:%.+]] = IE.Reshape([[FQ3]]) {shape_value = [1, 128, 16, 128]} : tensor<1x128x32x64xf32> -> tensor<1x128x16x128xf32> + // CHECK-NEXT: [[RS5:%.+]] = IE.Reshape([[RS4]]) {shape_value = [1, 128, 32, 64]} : tensor<1x128x16x128xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ6:%.+]] = IE.FakeQuantize([[RS5]], [[CST_2]], [[CST_1]], [[CST_2]], [[CST_1]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : 
tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[FQ7:%.+]] = IE.FakeQuantize([[FQ6]], [[CST_2]], [[CST_0]], [[CST_2]], [[CST_0]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 65536 : i64} : tensor<1x128x32x64xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x128x32x64xf32> + // CHECK-NEXT: [[MUL8:%.+]] = IE.Multiply([[FQ7]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x128x32x64xf32>, tensor<1xf32> -> tensor<1x128x32x64xf32> +} + diff --git a/tests/lit/NPU/dialect/IE/passes/adjust_input_shape.mlir b/tests/lit/NPU/dialect/IE/passes/adjust_input_shape.mlir index c1ae8e708c..6927cf1525 100644 --- a/tests/lit/NPU/dialect/IE/passes/adjust_input_shape.mlir +++ b/tests/lit/NPU/dialect/IE/passes/adjust_input_shape.mlir @@ -1417,3 +1417,53 @@ func.func @AdjustInputShapeWithMinimalDimChange(%arg0: tensor<1x4x1600x2560xf16, // CHECK: [[EXPAND_OUTPUT:%.+]] = IE.Expand([[CAST_OUTPUT]]) {pads_begin = [0, 0, 0, 0], pads_end = [0, 12, 0, 0]} : tensor<1x4x1600x2560xf16, {order = #NHWC}> -> tensor<1x16x1600x2560xf16, {order = #NHWC}> // CHECK: return [[EXPAND_OUTPUT]] } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCWH = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)> + +// CHECK-LABEL: @AdjustInputShapeForMemPermuteWithDimN +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<4x75x16x64xf16> +func.func @AdjustInputShapeForMemPermuteWithDimN(%arg0: tensor<4x75x16x64xf16>) -> tensor<4x16x64x75xf16> { + %0 = IE.MemPermute(%arg0) {dstElemType = f16, dst_order = #NCHW, mem_perm = #NHWC} : + tensor<4x75x16x64xf16> -> tensor<4x16x64x75xf16> + + return %0 : tensor<4x16x64x75xf16> + + // CHECK: [[SHAPECAST_IN:%.+]] = IE.ShapeCast {shape = [1, 4, 75, 1024]} inputs([[INPUT]] : tensor<4x75x16x64xf16>) + // CHECK-SAME: -> 
tensor<1x4x75x1024xf16> + // CHECK: [[MEM_PERMUTE:%.+]] = IE.MemPermute([[SHAPECAST_IN]]) {dst_order = #NCHW, mem_perm = #NCWH} : tensor<1x4x75x1024xf16> + // CHECK-SAME: -> tensor<1x4x1024x75xf16> + // CHECK: [[SHAPECAST_OUT:%.+]] = IE.ShapeCast {shape = [4, 16, 64, 75]} inputs([[MEM_PERMUTE]] : tensor<1x4x1024x75xf16>) + // CHECK-SAME: -> tensor<4x16x64x75xf16> + + // CHECK: return [[SHAPECAST_OUT]] : tensor<4x16x64x75xf16> +} + +// ----- + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform + +// CHECK-LABEL: @NotExpandGroupConvToShapeCastForAxisQuantizedType +// CHECK-SAME: [[INPUT:%arg0]]: tensor<1x3x32x32x!qElemType> +func.func @NotExpandGroupConvToShapeCastForAxisQuantizedType(%arg0: tensor<1x3x32x32x!qElemType>) -> tensor<1x6x32x32x!qElemType1> { + %filters = const.Declare tensor<6x1x1x1xf32> = dense<1.0> : tensor<1x1x1x1xf32>, [#const.Broadcast<0 : i64, 6 : i64>] + %bias = const.Declare tensor<1x6x1x1xf32> = dense<0.0> : tensor<1x1x1x1xf32>, [#const.Broadcast<1 : i64, 6 : i64>] + %0 = IE.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [0, 3, 0, 0]} : tensor<1x3x32x32x!qElemType> -> tensor<1x6x32x32x!qElemType1> + %1 = IE.GroupConvolution(%0, %filters, %bias) {dilations = [1, 1], groups = 6 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x6x32x32x!qElemType1>, tensor<6x1x1x1xf32>, tensor<1x6x1x1xf32> -> tensor<1x6x32x32x!qElemType1> + return %1 : tensor<1x6x32x32x!qElemType1> + + // CHECK-DAG: [[BIAS:%.+]] = const.Declare tensor<1x6x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32>, [#const.Broadcast<1 : i64, 6 : i64>] + // CHECK-DAG: [[FILTER:%.+]] = const.Declare tensor<6x1x1x1xf32> = dense<1.000000e+00> : tensor<1x1x1x1xf32>, [#const.Broadcast<0 : i64, 6 : i64>] + + // CHECK-NOT: IE.ShapeCast + // CHECK: [[EXPAND:%.+]] = IE.Expand([[INPUT]]) {pads_begin = [0, 0, 0, 0], pads_end = [0, 3, 0, 0]} : tensor<1x3x32x32x!qElemType> -> tensor<1x6x32x32x!qElemType1> + // CHECK: [[GROUP_CONV:%.+]] = 
IE.GroupConvolution([[EXPAND]], [[FILTER]], [[BIAS]]) + // CHECK-SAME: dilations = [1, 1], groups = 6 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1] + // CHECK-SAME: -> tensor<1x6x32x32x!qElemType1> + + // CHECK: return [[GROUP_CONV]] +} diff --git a/tests/lit/NPU/dialect/IE/passes/adjust_layouts.mlir b/tests/lit/NPU/dialect/IE/passes/adjust_layouts.mlir index 83416fe406..6d30a0108f 100644 --- a/tests/lit/NPU/dialect/IE/passes/adjust_layouts.mlir +++ b/tests/lit/NPU/dialect/IE/passes/adjust_layouts.mlir @@ -377,30 +377,6 @@ func.func @MvnLayoutForReshapeFuse(%arg0: tensor<1x512x128x128xf16>) -> tensor<1 // ----- -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -// CHECK-LABEL: @MvnWithAffineReshapeLayoutForReshapeFuse -func.func @MvnWithAffineReshapeLayoutForReshapeFuse(%arg0: tensor<1x512x128x128xf16>) -> tensor<1x512x128x128xf16> { - - %cst_0 = const.Declare tensor<512x512x3x3xf16> = dense<1.0> : tensor<512x512x3x3xf16> - %cst_1 = const.Declare tensor<1x512x1x1xf16> = dense<1.5> : tensor<1x512x1x1xf16> - %cst_2 = const.Declare tensor<512x1x1x1xf16> = dense<2.0> : tensor<1x512x1x1xf16>, [#const.Reshape<[512, 1, 1, 1]>] - %cst_3 = const.Declare tensor<1x512x1x1xf16> = dense<2.5> : tensor<1x512x1x1xf16> - - %103 = IE.Convolution(%arg0, %cst_0, %cst_1) {dilations = [1, 1], pads_begin = [1, 1], pads_end = [1, 1], strides = [1, 1]} : tensor<1x512x128x128xf16>, tensor<512x512x3x3xf16>, tensor<1x512x1x1xf16> -> tensor<1x512x128x128xf16> - %104 = IE.AffineReshape(%103) {dim_mapping = [[0], [1, 2], [3], [3]], shape_value = [1, 32, 16, 16384]} : tensor<1x512x128x128xf16> -> tensor<1x32x16x16384xf16> - %105 = IE.MVN(%104) {across_channels = false, eps = 9.9999999747524271E-7 : f64, normalize_variance = true} : tensor<1x32x16x16384xf16> -> tensor<1x32x16x16384xf16> - %106 = IE.AffineReshape(%105) {dim_mapping = [[0], [1], [1], [2, 3]], shape_value = [1, 512, 128, 128]} : tensor<1x32x16x16384xf16> -> tensor<1x512x128x128xf16> - %107 = 
IE.GroupConvolution(%106, %cst_2, %cst_3) {dilations = [1, 1], groups = 512 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x512x128x128xf16>, tensor<512x1x1x1xf16>, tensor<1x512x1x1xf16> -> tensor<1x512x128x128xf16> - - return %107 : tensor<1x512x128x128xf16> - - // CHECK: [[MVN_OUT:%.+]] = IE.MVN - // CHECK-SAME: -> tensor<1x32x16x16384xf16, {order = #NHWC}> -} - -// ----- - #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)> diff --git a/tests/lit/NPU/dialect/IE/passes/convert_convbackpropdata_to_transposedconv.mlir b/tests/lit/NPU/dialect/IE/passes/convert_convbackpropdata_to_transposedconv.mlir index aa4e01b13e..be12f06efe 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_convbackpropdata_to_transposedconv.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_convbackpropdata_to_transposedconv.mlir @@ -396,3 +396,21 @@ func.func @Convert3DConvBackpropDataWithNonConstFilterToTransposedConv(%input0: // CHECK: return [[OUTPUT]] } + +// ----- + +// CHECK-LABEL: @ConvertGroupConvBackpropDataWithNonConstFilterToGroupTransposedConv +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x128x32x32xf16>, [[ARG1:%.+]]: tensor<128x1x1x4x4xf16>) +func.func @ConvertGroupConvBackpropDataWithNonConstFilterToGroupTransposedConv(%arg0: tensor<1x128x32x32xf16>, %arg1: tensor<128x1x1x4x4xf16>) -> tensor<1x128x64x64xf16> { + %0 = IE.GroupConvolutionBackpropData(%arg0, %arg1) {dilations = [1, 1], pads_begin = [1, 1], pads_end = [1, 1], spatial_output_padding = [0, 0], strides = [2, 2]} : tensor<1x128x32x32xf16>, tensor<128x1x1x4x4xf16> -> tensor<1x128x64x64xf16> + return %0 : tensor<1x128x64x64xf16> + + // CHECK-NOT: IE.GroupConvolutionBackpropData + // CHECK: [[REVERSE:%.+]] = IE.Reverse([[ARG1]]) {axis_value = [3, 4], mode = #IE.reverse_mode} : tensor<128x1x1x4x4xf16> -> tensor<128x1x1x4x4xf16> + // CHECK: [[OUTPUT:%.+]] = IE.GroupTransposedConvolution([[ARG0]], [[REVERSE]]) { + // CHECK-SAME: dilations = 
[1, 1], pads_begin = [1, 1], pads_end = [1, 1], spatial_output_padding = [0, 0], strides = [2, 2] + // CHECK-SAME: } : tensor<1x128x32x32xf16>, tensor<128x1x1x4x4xf16> -> tensor<1x128x64x64xf16> + + // CHECK: return [[OUTPUT]] + +} diff --git a/tests/lit/NPU/dialect/IE/passes/convert_dynamic_dequantize_to_dequantize.mlir b/tests/lit/NPU/dialect/IE/passes/convert_dynamic_dequantize_to_dequantize.mlir index 6ea537aae7..821fcfabd5 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_dynamic_dequantize_to_dequantize.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_dynamic_dequantize_to_dequantize.mlir @@ -385,3 +385,30 @@ func.func @NotConvertForMultiAxes(%arg0: tensor<4096x4096x!qElemType>, %arg1: te // CHECK: [[FC:%.+]] = IE.FullyConnected // CHECK: return [[FC]] : tensor<1x4096xf16> } + +// ----- + +!qElemType = !quant.uniform +// CHECK-DAG: [[QELEMTYPE_OUT:!.+]] = !quant.uniform + +// Note that "CHECK-LABEL" directive is deliberately skipped here because it resets previously captured variables +// CHECK: @RescaleForI8WeightsAsInputs +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1024x8960xf16>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1536x8960xsi8>, +// CHECK-SAME: [[INPUT_2:%.+]]: tensor<1536x1xf16> +func.func @RescaleForI8WeightsAsInputs(%arg0: tensor<1024x8960xf16>, %arg1: tensor<1536x8960xsi8>, %arg2: tensor<1536x1xf16>) -> tensor<1024x1536xf16> { + %0 = IE.QuantizeCast(%arg1) {dstElemType = !qElemType} : tensor<1536x8960xsi8> -> tensor<1536x8960x!qElemType> + %1 = IE.DynamicDequantize(%0, %arg2) {dstElemType = f16} : tensor<1536x8960x!qElemType>, tensor<1536x1xf16> -> tensor<1536x8960xf16> + %2 = IE.FullyConnected(%arg0, %1) : tensor<1024x8960xf16>, tensor<1536x8960xf16> -> tensor<1024x1536xf16> + + return %2 : tensor<1024x1536xf16> + + // CHECK: [[CONST0:%.+]] = const.Declare tensor<1xf16> = dense<1.600000e+01> : tensor<1xf16> + // CHECK: [[RESHAPE0:%.+]] = IE.Reshape([[INPUT_2]]) {shape_value = [1, 1536]} : tensor<1536x1xf16> -> tensor<1x1536xf16> + // CHECK: 
[[QUANTIZECAST0:%.+]] = IE.QuantizeCast([[INPUT_1]]) {dstElemType = [[QELEMTYPE_OUT]]} : tensor<1536x8960xsi8> -> tensor<1536x8960x[[QELEMTYPE_OUT]]> + // CHECK: [[MULTIPLY0:%.+]] = IE.Multiply([[RESHAPE0]], [[CONST0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1536xf16>, tensor<1xf16> -> tensor<1x1536xf16> + // CHECK: [[DEQUANTIZE0:%.+]] = IE.Dequantize([[QUANTIZECAST0]]) {dstElemType = f16} : tensor<1536x8960x[[QELEMTYPE_OUT]]> -> tensor<1536x8960xf16> + // CHECK: [[FULLYCONNECTED0:%.+]] = IE.FullyConnected([[INPUT_0]], [[DEQUANTIZE0]]) : tensor<1024x8960xf16>, tensor<1536x8960xf16> -> tensor<1024x1536xf16> + // CHECK: [[MULTIPLY1:%.+]] = IE.Multiply([[FULLYCONNECTED0]], [[MULTIPLY0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1024x1536xf16>, tensor<1x1536xf16> -> tensor<1024x1536xf16> + // CHECK: return [[MULTIPLY1]] : tensor<1024x1536xf16> +} diff --git a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d.mlir b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d.mlir index a7f6b3db8d..722b0326b6 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d.mlir @@ -1846,6 +1846,24 @@ func.func @ConvertTileWith5DInput5DRepeats(%arg0: tensor<1x2x4x1536x1xf16>) -> ( // ----- +// CHECK-LABEL: func.func @ConvertTileWith6DInput6DRepeats +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x19947x1x4x1x2xf16> +func.func @ConvertTileWith6DInput6DRepeats(%arg0: tensor<1x19947x1x4x1x2xf16>) -> (tensor<1x19947x8x4x4x2xf16>) { + %0 = IE.Tile(%arg0) {repeats_values = [1, 1, 8, 1, 4, 1]} : tensor<1x19947x1x4x1x2xf16> -> tensor<1x19947x8x4x4x2xf16> + return %0 : tensor<1x19947x8x4x4x2xf16> + + // CHECK: [[AFFINERESHAPE0:%.+]] = IE.AffineReshape([[INPUT]]) + // CHECK{LITERAL}: {dim_mapping = [[0], [0], [1], [2], [3], [4]], shape_value = [19947, 1, 4, 1, 2]} : tensor<1x19947x1x4x1x2xf16> -> tensor<19947x1x4x1x2xf16> + // CHECK: [[TILE:%.+]] = IE.Tile([[AFFINERESHAPE0]]) + // CHECK{LITERAL}: 
{repeats_values = [1, 8, 1, 4, 1]} : tensor<19947x1x4x1x2xf16> -> tensor<19947x8x4x4x2xf16> + // CHECK: [[AFFINERESHAPE1:%.+]] = IE.AffineReshape([[TILE]]) + // CHECK{LITERAL}: {dim_mapping = [[0, 1], [2], [3], [4], [5]], shape_value = [1, 19947, 8, 4, 4, 2]} : tensor<19947x8x4x4x2xf16> -> tensor<1x19947x8x4x4x2xf16> + + // CHECK: return [[AFFINERESHAPE1]] : tensor<1x19947x8x4x4x2xf16> +} + +// ----- + // CHECK-LABEL: func.func @ConvertSequeezedTileWith5DInput5DRepeats // CHECK-SAME: [[INPUT:%.+]]: tensor<1024x1x1x1x128xf16> func.func @ConvertSequeezedTileWith5DInput5DRepeats(%arg0: tensor<1024x1x1x1x128xf16>) -> (tensor<1024x1x1x16x128xf16>) { @@ -3006,3 +3024,175 @@ func.func @QuantizedAdd(%arg0: tensor<2x4x!qElemType>) -> tensor<2x4x!qElemType> // CHECK-SAME: tensor<1x1x2x4x[[Q_ELEM_TYPE_1]]> -> tensor<2x4x[[Q_ELEM_TYPE_0]]> // CHECK: return [[RESULT]] : tensor<2x4x[[Q_ELEM_TYPE_0]]> } + +// ----- + +!dynType = tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> + +// CHECK-LABEL: @SemiDynamicAdd5D +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x1x1x128xf16> +func.func @SemiDynamicAdd5D(%arg0: !dynType, %arg1: tensor<1x1x1x1x128xf16>) -> !dynType { + %0 = IE.Add(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : !dynType, tensor<1x1x1x1x128xf16> -> !dynType + return %0 : !dynType + + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<5xsi32> = dense<[1, 1, -1, 1, 128]> : tensor<5xsi32> + // CHECK: [[SHAPE_IN:%.+]] = const.Declare tensor<4xsi32> = dense<[1, -1, 1, 128]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN_0:%.+]] = IE.DynamicReshape([[INPUT_0]], [[SHAPE_IN]]) {only_set_shape, output_bounds = [1, 500, 1, 128], output_shape = [1, -9223372036854775808, 1, 128]} : tensor<1x1x?x1x128xf16, {bounds = 
#const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}>, tensor<4xsi32> -> tensor<1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 1, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_IN_1:%.+]] = IE.AffineReshape([[INPUT_1]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 1, 1, 128]} : tensor<1x1x1x1x128xf16> -> tensor<1x1x1x128xf16> + // CHECK: [[ADD:%.+]] = IE.Add([[RESHAPE_IN_0]], [[RESHAPE_IN_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 1, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x1x128xf16> -> tensor<1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 1, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[ADD]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [1, 1, 500, 1, 128], output_shape = [1, 1, -9223372036854775808, 1, 128]} : tensor<1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 1, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<5xsi32> -> tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> +} + +// ----- + +!dynType = tensor : tensor<5xsi64>, order = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> + +// CHECK-LABEL: @SemiMultiaxisDynamicAdd5D +// CHECK-SAME: [[INPUT_0:%.+]]: tensor : tensor<5xsi64>, order = #NCDHW}> +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x1x1x128xf16> +func.func @SemiMultiaxisDynamicAdd5D(%arg0: !dynType, %arg1: tensor<1x1x1x1x128xf16>) -> !dynType { + %0 = IE.Add(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : !dynType, tensor<1x1x1x1x128xf16> -> !dynType + return %0 : !dynType + + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<5xsi32> = dense<[-1, -1, -1, -1, 128]> 
: tensor<5xsi32> + // CHECK: [[SHAPE_IN:%.+]] = const.Declare tensor<4xsi32> = dense<[1, -1, -1, 128]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN_0:%.+]] = IE.DynamicReshape([[INPUT_0]], [[SHAPE_IN]]) {only_set_shape, output_bounds = [1, 6, 20, 128], output_shape = [1, -9223372036854775808, -9223372036854775808, 128]} : tensor : tensor<5xsi64>, order = #NCDHW}>, tensor<4xsi32> -> tensor<1x?x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 6, 20, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_IN_1:%.+]] = IE.AffineReshape([[INPUT_1]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 1, 1, 128]} : tensor<1x1x1x1x128xf16> -> tensor<1x1x1x128xf16> + // CHECK: [[ADD:%.+]] = IE.Add([[RESHAPE_IN_0]], [[RESHAPE_IN_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x?x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 6, 20, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x1x128xf16> -> tensor<1x?x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 6, 20, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[ADD]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [2, 3, 4, 5, 128], output_shape = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, 128]} : tensor<1x?x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 6, 20, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<5xsi32> -> tensor : tensor<5xsi64>, order = #NCDHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor : tensor<5xsi64>, order = #NCDHW}> +} + +// ----- + +!dynType1 = tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> +!dynType2 = tensor<1x1x?x1x1xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 1]> : tensor<5xsi64>, order = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> + +// CHECK-LABEL: @FullyDynamicAdd5D +// CHECK-SAME: [[INPUT_0:%.+]]: 
tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x?x1x1xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 1]> : tensor<5xsi64>, order = #NCDHW}> +func.func @FullyDynamicAdd5D(%arg0: !dynType1, %arg1: !dynType2) -> !dynType1 { + %0 = IE.Add(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : !dynType1, !dynType2 -> !dynType1 + return %0 : !dynType1 + + // CHECK: [[SHAPE_IN_1:%.+]] = const.Declare tensor<4xsi32> = dense<[1, 1, -1, 1]> : tensor<4xsi32> + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<5xsi32> = dense<[1, 1, -1, 1, 128]> : tensor<5xsi32> + // CHECK: [[SHAPE_IN_0:%.+]] = const.Declare tensor<4xsi32> = dense<[1, 1, -1, 128]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN_0:%.+]] = IE.DynamicReshape([[INPUT_0]], [[SHAPE_IN_0]]) {only_set_shape, output_bounds = [1, 1, 500, 128], output_shape = [1, 1, -9223372036854775808, 128]} : tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}>, tensor<4xsi32> -> tensor<1x1x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_IN_1:%.+]] = IE.DynamicReshape([[INPUT_1]], [[SHAPE_IN_1]]) {only_set_shape, output_bounds = [1, 1, 500, 1], output_shape = [1, 1, -9223372036854775808, 1]} : tensor<1x1x?x1x1xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 1]> : tensor<5xsi64>, order = #NCDHW}>, tensor<4xsi32> -> tensor<1x1x?x1xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[ADD:%.+]] = IE.Add([[RESHAPE_IN_0]], [[RESHAPE_IN_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x?x1xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x1x?x128xf16, {bounds = 
#const.OpaqueI64Elements<[1, 1, 500, 128]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[ADD]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [1, 1, 500, 1, 128], output_shape = [1, 1, -9223372036854775808, 1, 128]} : tensor<1x1x?x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<5xsi32> -> tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor<1x1x?x1x128xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 1, 128]> : tensor<5xsi64>, order = #NCDHW}> +} + +// ----- + +!dynType = tensor : tensor<3xsi64>, order = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> + +// CHECK-LABEL: @SemiDynamicAdd3D +// CHECK-SAME: [[INPUT_0:%.+]]: tensor : tensor<3xsi64>, order = #CHW}> +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x768xf16> +func.func @SemiDynamicAdd3D(%arg0: !dynType, %arg1: tensor<1x1x768xf16>) -> !dynType { + %0 = IE.Add(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : !dynType, tensor<1x1x768xf16> -> !dynType + return %0 : !dynType + + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<3xsi32> = dense<[-1, -1, 768]> : tensor<3xsi32> + // CHECK: [[SHAPE_IN:%.+]] = const.Declare tensor<4xsi32> = dense<[1, -1, -1, 768]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN_1:%.+]] = IE.AffineReshape([[INPUT_1]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [1, 2], [3]], shape_value = [1, 1, 1, 768]} : tensor<1x1x768xf16> -> tensor<1x1x1x768xf16> + // CHECK: [[RESHAPE_IN_0:%.+]] = IE.DynamicReshape([[INPUT_0]], [[SHAPE_IN]]) {only_set_shape, output_bounds = [1, 10, 10, 768], output_shape = [1, -9223372036854775808, -9223372036854775808, 768]} : tensor : tensor<3xsi64>, order = #CHW}>, tensor<4xsi32> -> tensor<1x?x?x768xf16, {bounds = #const.OpaqueI64Elements<[1, 10, 10, 768]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[ADD:%.+]] = IE.Add([[RESHAPE_IN_0]], 
[[RESHAPE_IN_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x?x?x768xf16, {bounds = #const.OpaqueI64Elements<[1, 10, 10, 768]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x1x768xf16> -> tensor<1x?x?x768xf16, {bounds = #const.OpaqueI64Elements<[1, 10, 10, 768]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[ADD]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [10, 10, 768], output_shape = [-9223372036854775808, -9223372036854775808, 768]} : tensor<1x?x?x768xf16, {bounds = #const.OpaqueI64Elements<[1, 10, 10, 768]> : tensor<4xsi64>, order = #NCHW}>, tensor<3xsi32> -> tensor : tensor<3xsi64>, order = #CHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor : tensor<3xsi64>, order = #CHW}> +} + +// ----- + +!dynType = tensor : tensor<5xsi64>, order = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> + +// CHECK-LABEL: @DynamicFakeQuantize5D +// CHECK-SAME: [[INPUT:%.+]]: tensor : tensor<5xsi64>, order = #NCDHW}> +func.func @DynamicFakeQuantize5D(%arg0: !dynType) -> !dynType { + %cst = const.Declare tensor<1x1x1x1x1xf16> = dense<1.0> : tensor<1x1x1x1x1xf16> + %0 = IE.FakeQuantize(%arg0, %cst, %cst, %cst, %cst) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : !dynType, tensor<1x1x1x1x1xf16>, tensor<1x1x1x1x1xf16>, tensor<1x1x1x1x1xf16>, tensor<1x1x1x1x1xf16> -> !dynType + return %0 : !dynType + + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<5xsi32> = dense<[-1, 1, 1, 2, 64]> : tensor<5xsi32> + // CHECK: [[FQ_CONST:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<1.000000e+00> : tensor<1x1x1x1x1xf16>, [#const.Reshape<[1, 1, 1, 1]>] + // CHECK: [[SHAPE_IN:%.+]] = const.Declare tensor<4xsi32> = dense<[1, -1, 2, 64]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN:%.+]] = IE.DynamicReshape([[INPUT]], [[SHAPE_IN]]) {only_set_shape, output_bounds = [1, 500, 2, 64], output_shape = [1, -9223372036854775808, 2, 64]} : tensor : tensor<5xsi64>, order = #NCDHW}>, tensor<4xsi32> -> tensor<1x?x2x64xf16, 
{bounds = #const.OpaqueI64Elements<[1, 500, 2, 64]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[FQ:%.+]] = IE.FakeQuantize([[RESHAPE_IN]], [[FQ_CONST]], [[FQ_CONST]], [[FQ_CONST]], [[FQ_CONST]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x?x2x64xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2, 64]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> -> tensor<1x?x2x64xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2, 64]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[FQ]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [500, 1, 1, 2, 64], output_shape = [-9223372036854775808, 1, 1, 2, 64]} : tensor<1x?x2x64xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2, 64]> : tensor<4xsi64>, order = #NCHW}>, tensor<5xsi32> -> tensor : tensor<5xsi64>, order = #NCDHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor : tensor<5xsi64>, order = #NCDHW}> +} + +// ----- + +!dynType = tensor<1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2]> : tensor<3xsi64>, order = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> + +// CHECK-LABEL: @DynamicFakeQuantize3D +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2]> : tensor<3xsi64>, order = #CHW}> +func.func @DynamicFakeQuantize3D(%arg0: !dynType) -> !dynType { + %cst = const.Declare tensor<1x1x1xf16> = dense<1.0> : tensor<1x1x1xf16> + %0 = IE.FakeQuantize(%arg0, %cst, %cst, %cst, %cst) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : !dynType, tensor<1x1x1xf16>, tensor<1x1x1xf16>, tensor<1x1x1xf16>, tensor<1x1x1xf16> -> !dynType + return %0 : !dynType + + // CHECK: [[SHAPE_OUT:%.+]] = const.Declare tensor<3xsi32> = dense<[1, -1, 2]> : tensor<3xsi32> + // CHECK: [[FQ_CONST:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<1.000000e+00> : tensor<1x1x1xf16>, [#const.Reshape<[1, 1, 1, 1]>] + // CHECK: [[SHAPE_IN:%.+]] = const.Declare 
tensor<4xsi32> = dense<[1, 1, -1, 2]> : tensor<4xsi32> + + // CHECK: [[RESHAPE_IN:%.+]] = IE.DynamicReshape([[INPUT]], [[SHAPE_IN]]) {only_set_shape, output_bounds = [1, 1, 500, 2], output_shape = [1, 1, -9223372036854775808, 2]} : tensor<1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2]> : tensor<3xsi64>, order = #CHW}>, tensor<4xsi32> -> tensor<1x1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 2]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[FQ:%.+]] = IE.FakeQuantize([[RESHAPE_IN]], [[FQ_CONST]], [[FQ_CONST]], [[FQ_CONST]], [[FQ_CONST]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 2]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> -> tensor<1x1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 2]> : tensor<4xsi64>, order = #NCHW}> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.DynamicReshape([[FQ]], [[SHAPE_OUT]]) {only_set_shape, output_bounds = [1, 500, 2], output_shape = [1, -9223372036854775808, 2]} : tensor<1x1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 1, 500, 2]> : tensor<4xsi64>, order = #NCHW}>, tensor<3xsi32> -> tensor<1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2]> : tensor<3xsi64>, order = #CHW}> + + // CHECK: return [[RESHAPE_OUT]] : tensor<1x?x2xf16, {bounds = #const.OpaqueI64Elements<[1, 500, 2]> : tensor<3xsi64>, order = #CHW}> +} + +// ----- + +// CHECK-LABEL: @Convert3DNormalizeL2 +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x801x28xf16> +func.func @Convert3DNormalizeL2(%arg0: tensor<1x801x28xf16>) -> tensor<1x801x28xf16> { + %0 = IE.NormalizeL2(%arg0) {axes_value = [2], eps = 9.999999960041972E-13 : f64, eps_mode = #IE.eps_mode} : tensor<1x801x28xf16> -> tensor<1x801x28xf16> + + return %0 : tensor<1x801x28xf16> + + // CHECK: [[RESHAPE_IN:%.+]] = IE.AffineReshape([[INPUT]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0, 1], [2], [3]], shape_value = [1, 1, 801, 
28]} : tensor<1x801x28xf16> -> tensor<1x1x801x28xf16> + // CHECK: [[NORMALIZE:%.+]] = IE.NormalizeL2([[RESHAPE_IN]]) {axes_value = [3], eps = 9.999999960041972E-13 : f64, eps_mode = #IE.eps_mode} : tensor<1x1x801x28xf16> -> tensor<1x1x801x28xf16> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.AffineReshape([[NORMALIZE]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [0], [1], [2]], shape_value = [1, 801, 28]} : tensor<1x1x801x28xf16> -> tensor<1x801x28xf16> + + // CHECK: return [[RESHAPE_OUT]] : tensor<1x801x28xf16> +} + +// ----- + +// CHECK-LABEL: @Convert5DNormalizeL2 +// CHECK-SAME: [[INPUT:%.+]]: tensor<384x768x7x7x7xf16> +func.func @Convert5DNormalizeL2(%arg0: tensor<384x768x7x7x7xf16>) -> tensor<384x768x7x7x7xf16> { + %0 = IE.NormalizeL2(%arg0) {axes_value = [1], eps = 9.999999960041972E-13 : f64, eps_mode = #IE.eps_mode} : tensor<384x768x7x7x7xf16> -> tensor<384x768x7x7x7xf16> + return %0 : tensor<384x768x7x7x7xf16> + + // CHECK: [[RESHAPE_IN:%.+]] = IE.AffineReshape([[INPUT]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0, 1], [2], [3], [3], [3]], shape_value = [1, 384, 768, 343]} : tensor<384x768x7x7x7xf16> -> tensor<1x384x768x343xf16> + // CHECK: [[NORMALIZE:%.+]] = IE.NormalizeL2([[RESHAPE_IN]]) {axes_value = [2], eps = 9.999999960041972E-13 : f64, eps_mode = #IE.eps_mode} : tensor<1x384x768x343xf16> -> tensor<1x384x768x343xf16> + // CHECK: [[RESHAPE_OUT:%.+]] = IE.AffineReshape([[NORMALIZE]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [0], [1], [2, 3, 4]], shape_value = [384, 768, 7, 7, 7]} : tensor<1x384x768x343xf16> -> tensor<384x768x7x7x7xf16> + + // CHECK: return [[RESHAPE_OUT]] : tensor<384x768x7x7x7xf16> +} diff --git a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_37XX.mlir b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_37XX.mlir index 33b44ec45e..79aa83b7f2 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_37XX.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_37XX.mlir @@ -57,7 +57,7 @@ func.func 
@Convert2dTopKPositiveAxis(%arg0: tensor<80x77xsi32>) -> (tensor<80x1x return %output_values, %target_shape : tensor<80x1xsi32>, tensor<80x1xsi32> // CHECK: [[RESHAPE_INPUT:%.*]] = IE.AffineReshape(%arg0) { - // CHECK-SMAE: shape_value = [1, 1, 80, 77] + // CHECK-SAME: shape_value = [1, 1, 80, 77] // CHECK-SAME: } : tensor<80x77xsi32> -> tensor<1x1x80x77xsi32> // CHECK: [[VALUE:%.*]], [[SHAPE:%.*]] = IE.TopK([[RESHAPE_INPUT]]) @@ -83,7 +83,7 @@ func.func @Convert2dTopKNegativeAxis(%arg0: tensor<80x77xsi32>) -> (tensor<1x77x return %output_values, %target_shape : tensor<1x77xsi32>, tensor<1x77xsi32> // CHECK: [[RESHAPE_INPUT:%.*]] = IE.AffineReshape(%arg0) { - // CHECK-SMAE: shape_value = [1, 1, 80, 77] + // CHECK-SAME: shape_value = [1, 1, 80, 77] // CHECK-SAME: } : tensor<80x77xsi32> -> tensor<1x1x80x77xsi32> // CHECK: [[VALUE:%.*]], [[SHAPE:%.*]] = IE.TopK([[RESHAPE_INPUT]]) @@ -181,11 +181,11 @@ func.func @ConvertGather3DInTo4D(%arg0: tensor<2x468x2xf16>) -> tensor<2x71x2xf1 // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1x71x1x1xsi32> = dense<1> : tensor<71xsi32>, [#const.Reshape<[1, 71, 1, 1]>] // CHECK: [[RESHAPE_IN:%.+]] = IE.AffineReshape([[INPUT]]) { - // CHECK-SMAE: dim_mapping = [[0, 1], [2], [3]], shape_value = [1, 1, 468, 2]} : tensor<2x468x2xf16> -> tensor<1x2x468x2xf16> + // CHECK-SAME{LITERAL}: dim_mapping = [[0, 1], [2], [3]], shape_value = [1, 2, 468, 2]} : tensor<2x468x2xf16> -> tensor<1x2x468x2xf16> // CHECK: [[GATHER:%.+]] = IE.Gather([[RESHAPE_IN]], [[CST]]) { - // CHECK-SMAE: axis_value = 2 : i64, batch_dims = 1 : i64, indices_rank = 2 : i64} : tensor<1x2x468x2xf16>, tensor<1x71x1x1xsi32> -> tensor<1x2x71x2xf16> + // CHECK-SAME: axis_value = 2 : i64, batch_dims = 1 : i64, indices_rank = 2 : i64} : tensor<1x2x468x2xf16>, tensor<1x71x1x1xsi32> -> tensor<1x2x71x2xf16> // CHECK: [[RESHAPE_OUT:%.+]] = IE.AffineReshape([[GATHER]]) { - // CHECK-SMAE: dim_mapping = [[0], [0], [1], [2]], shape_value = [1, 71, 2]} : tensor<1x2x71x2xf16> -> 
tensor<2x71x2xf16> + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [0], [1], [2]], shape_value = [2, 71, 2]} : tensor<1x2x71x2xf16> -> tensor<2x71x2xf16> // CHECK: return [[RESHAPE_OUT]] } @@ -201,10 +201,10 @@ func.func @ConvertGather5DOutTo4D(%arg0: tensor<2x5x8xf16>) -> tensor<2x5x3x1x4x // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<2x12x1x1xsi32> = dense<1> : tensor<2x3x1x4xsi32>, [#const.Reshape<[2, 12, 1, 1]>] // CHECK: [[RESHAPE_IN:%.+]] = IE.AffineReshape([[INPUT]]) { - // CHECK-SMAE: dim_mapping = [[0], [1], [2, 3]], shape_value = [2, 5, 8, 1]} : tensor<2x5x8xf16> -> tensor<2x5x8x1xf16> + // CHECK-SAME{LITERAL} dim_mapping = [[0], [1], [2, 3]], shape_value = [2, 5, 8, 1]} : tensor<2x5x8xf16> -> tensor<2x5x8x1xf16> // CHECK: [[GATHER:%.+]] = IE.Gather([[RESHAPE_IN]], [[CST]]) { - // CHECK-SMAE: axis_value = 2 : i64, batch_dims = 1 : i64, indices_rank = 2 : i64} : tensor<2x5x8x1xf16>, tensor<2x12x1x1xsi32> -> tensor<2x5x12x1xf16> + // CHECK-SAME: axis_value = 2 : i64, batch_dims = 1 : i64, indices_rank = 2 : i64} : tensor<2x5x8x1xf16>, tensor<2x12x1x1xsi32> -> tensor<2x5x12x1xf16> // CHECK: [[RESHAPE_OUT:%.+]] = IE.Reshape([[GATHER]]) { - // CHECK-SMAE: shape_value = [2, 5, 3, 1, 4]} : tensor<2x5x12x1xf16> -> tensor<2x5x3x1x4xf16> + // CHECK-SAME: shape_value = [2, 5, 3, 1, 4]} : tensor<2x5x12x1xf16> -> tensor<2x5x3x1x4xf16> // CHECK: return [[RESHAPE_OUT]] } diff --git a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_40XX+.mlir b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_40XX+.mlir index cd810b03c6..96882bcf7c 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_40XX+.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_shape_to_4d_40XX+.mlir @@ -14,7 +14,7 @@ func.func @Convert2dTopKPositiveAxis(%arg0: tensor<80x77xsi32>) -> (tensor<80x1x return %output_values, %target_shape : tensor<80x1xsi32>, tensor<80x1xsi32> // CHECK: [[RESHAPE_INPUT:%.*]] = IE.AffineReshape(%arg0) { - // CHECK-SMAE: shape_value = [1, 1, 80, 77] + // 
CHECK-SAME: shape_value = [1, 1, 80, 77] // CHECK-SAME: } : tensor<80x77xsi32> -> tensor<1x1x80x77xsi32> // CHECK: [[VALUE:%.*]], [[SHAPE:%.*]] = IE.TopK([[RESHAPE_INPUT]]) @@ -40,7 +40,7 @@ func.func @Convert2dTopKNegativeAxis(%arg0: tensor<80x77xsi32>) -> (tensor<1x77x return %output_values, %target_shape : tensor<1x77xsi32>, tensor<1x77xsi32> // CHECK: [[RESHAPE_INPUT:%.*]] = IE.AffineReshape(%arg0) { - // CHECK-SMAE: shape_value = [1, 1, 80, 77] + // CHECK-SAME: shape_value = [1, 1, 80, 77] // CHECK-SAME: } : tensor<80x77xsi32> -> tensor<1x1x80x77xsi32> // CHECK: [[VALUE:%.*]], [[SHAPE:%.*]] = IE.TopK([[RESHAPE_INPUT]]) diff --git a/tests/lit/NPU/dialect/IE/passes/convert_subtract_to_negative_add.mlir b/tests/lit/NPU/dialect/IE/passes/convert_subtract_to_negative_add.mlir index b3536faa98..f0e0bea055 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_subtract_to_negative_add.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_subtract_to_negative_add.mlir @@ -516,11 +516,11 @@ func.func @ConvertSubtractWithFQConstantMultiplyInput(%arg0: tensor<1x1x1x1xf16> // CHECK-LABEL: @DynamicSubtract // CHECK-SAME: [[INPUT:%.+]]: tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> func.func @DynamicSubtract(%arg0: tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}>) -> tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> { - %cst = const.Declare tensor<1x1x1x1xf32> = dense_resource : tensor<1x1x1x1xf32> isSplat + %cst = const.Declare tensor<1x1x1x1xf32> = dense_resource : tensor<1x1x1x1xf32> isSplat %0 = IE.Subtract(%cst, %arg0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x1x1xf32>, tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x1x1x?xf32, {bounds = 
#const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> return %0 : tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense_resource : tensor<1x1x1x1xf32> isSplat + // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense_resource : tensor<1x1x1x1xf32> isSplat // CHECK: [[SUB:%.+]] = IE.Subtract([[CST]], [[INPUT]]) {auto_broadcast = #IE.auto_broadcast_type} : // CHECK-SAME: tensor<1x1x1x1xf32>, tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> // CHECK: return [[SUB]] : tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 1, 64]> : tensor<4xsi64>, order = #NCHW}> @@ -530,7 +530,7 @@ func.func @DynamicSubtract(%arg0: tensor<1x1x1x?xf32, {bounds = #const.OpaqueI64 {-# dialect_resources: { builtin: { - ov: "0x1000000000004040" + vpux_ow_1: "0x1000000000004040" } } #-} diff --git a/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision.mlir b/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision.mlir index 38fd0cdf1a..1af577c228 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision.mlir +++ b/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision.mlir @@ -894,31 +894,3 @@ func.func @MixedPrecisionGroupMatmul(%arg0: tensor<1x8x1x64xf16>) -> tensor<1x8x //CHECK: return [[VAL2]] } -// ----- - -!qElemType = !quant.uniform - - func.func @AvoidMixedPrecisionForConvWithPostOpReluAndNegativeScales(%arg0: tensor<1x3x448x448xf32>) -> tensor<1x32x224x224xf32> { - %cst = const.Declare tensor<1x32x1x1xf16> = dense<1.0> : tensor<1x32x1x1xf16>, [#const.CastElemType] - %cst_0 = const.Declare tensor<32x3x3x3x!qElemType> = dense<1> : tensor<32x3x3x3xsi8>, [#const.CastElemType, #const.CastElemType] - %0 = IE.Dequantize(%cst_0) {dstElemType = f16} : 
tensor<32x3x3x3x!qElemType> -> tensor<32x3x3x3xf16> - %1 = IE.Convert(%arg0) {dstElemType = f16} : tensor<1x3x448x448xf32> -> tensor<1x3x448x448xf16> - %2 = IE.Convolution(%1, %0, %cst) { - dilations = [1, 1], - pads_begin = [1, 1], - pads_end = [0, 0], - post_op = #IE.Relu<>, - strides = [2, 2] - } : tensor<1x3x448x448xf16>, tensor<32x3x3x3xf16>, tensor<1x32x1x1xf16> -> tensor<1x32x224x224xf16> - %3 = IE.Convert(%2) {dstElemType = f32} : tensor<1x32x224x224xf16> -> tensor<1x32x224x224xf32> - return %3 : tensor<1x32x224x224xf32> - - //CHECK: [[CST:%.+]] = const.Declare tensor<1x32x1x1xf16> = dense<1.000000e+00> : tensor<1x32x1x1xf16>, [#const.CastElemType] - //CHECK: [[CST_0:%.+]] = const.Declare tensor<32x3x3x3x!qElemType> = dense<1> : tensor<32x3x3x3xsi8>, [#const.CastElemType, #const.CastElemType] - - //CHECK: [[VAL0:%.+]] = IE.Dequantize([[CST_0]]) {dstElemType = f16} : tensor<32x3x3x3x!qElemType> -> tensor<32x3x3x3xf16> - //CHECK: [[VAL1:%.+]] = IE.Convert(%arg0) {dstElemType = f16} : tensor<1x3x448x448xf32> -> tensor<1x3x448x448xf16> - //CHECK: [[VAL2:%.+]] = IE.Convolution([[VAL1]], [[VAL0]], [[CST]]) {dilations = [1, 1], pads_begin = [1, 1], pads_end = [0, 0], post_op = #IE.Relu<>, strides = [2, 2]} : tensor<1x3x448x448xf16>, tensor<32x3x3x3xf16>, tensor<1x32x1x1xf16> -> tensor<1x32x224x224xf16> - //CHECK: [[VAL3:%.+]] = IE.Convert([[VAL2]]) {dstElemType = f32} : tensor<1x32x224x224xf16> -> tensor<1x32x224x224xf32> - //CHECK: return [[VAL3]] : tensor<1x32x224x224xf32> -} diff --git a/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX.mlir b/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX_40XX.mlir similarity index 55% rename from tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX.mlir rename to tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX_40XX.mlir index c6f0cbf619..739ffc5ed2 100644 --- a/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX.mlir +++ 
b/tests/lit/NPU/dialect/IE/passes/convert_to_mixed_precision_37XX_40XX.mlir @@ -4,7 +4,7 @@ // // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --convert-to-mixed-precision %s | FileCheck %s -// REQUIRES: arch-NPU37XX +// REQUIRES: arch-NPU37XX || arch-NPU40XX !qElemType = !quant.uniform @@ -66,3 +66,34 @@ func.func @MixedPrecisionConvQuantile(%arg0: tensor<1x16x1x1xf16>) -> tensor<1x1 //CHECK: [[CONV:%.+]] = IE.Convolution([[QUANT]], [[CST]]) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x16x1x1x!qElemType1>, tensor<16x16x1x1x!qElemType> -> tensor<1x16x1x1xf16> //CHECK: return [[CONV]] } + +// ----- + +!qElemType = !quant.uniform + +// CHECK-LABEL: @AvoidMixedPrecisionForConvWithPostOpReluAndNegativeScales +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x3x448x448xf32>) +func.func @AvoidMixedPrecisionForConvWithPostOpReluAndNegativeScales(%arg0: tensor<1x3x448x448xf32>) -> tensor<1x32x224x224xf32> { + %cst = const.Declare tensor<1x32x1x1xf16> = dense<1.0> : tensor<1x32x1x1xf16>, [#const.CastElemType] + %cst_0 = const.Declare tensor<32x3x3x3x!qElemType> = dense<1> : tensor<32x3x3x3xsi8>, [#const.CastElemType, #const.CastElemType] + %0 = IE.Dequantize(%cst_0) {dstElemType = f16} : tensor<32x3x3x3x!qElemType> -> tensor<32x3x3x3xf16> + %1 = IE.Convert(%arg0) {dstElemType = f16} : tensor<1x3x448x448xf32> -> tensor<1x3x448x448xf16> + %2 = IE.Convolution(%1, %0, %cst) { + dilations = [1, 1], + pads_begin = [1, 1], + pads_end = [0, 0], + post_op = #IE.Relu<>, + strides = [2, 2] + } : tensor<1x3x448x448xf16>, tensor<32x3x3x3xf16>, tensor<1x32x1x1xf16> -> tensor<1x32x224x224xf16> + %3 = IE.Convert(%2) {dstElemType = f32} : tensor<1x32x224x224xf16> -> tensor<1x32x224x224xf32> + return %3 : tensor<1x32x224x224xf32> + + //CHECK: [[CST:%.+]] = const.Declare tensor<1x32x1x1xf16> = dense<1.000000e+00> : tensor<1x32x1x1xf16>, [#const.CastElemType] + //CHECK: [[CST_0:%.+]] = const.Declare tensor<32x3x3x3x!qElemType> = dense<1> 
: tensor<32x3x3x3xsi8>, [#const.CastElemType, #const.CastElemType] + + //CHECK: [[VAL0:%.+]] = IE.Dequantize([[CST_0]]) {dstElemType = f16} : tensor<32x3x3x3x!qElemType> -> tensor<32x3x3x3xf16> + //CHECK: [[VAL1:%.+]] = IE.Convert([[ARG0]]) {dstElemType = f16} : tensor<1x3x448x448xf32> -> tensor<1x3x448x448xf16> + //CHECK: [[VAL2:%.+]] = IE.Convolution([[VAL1]], [[VAL0]], [[CST]]) {dilations = [1, 1], pads_begin = [1, 1], pads_end = [0, 0], post_op = #IE.Relu<>, strides = [2, 2]} : tensor<1x3x448x448xf16>, tensor<32x3x3x3xf16>, tensor<1x32x1x1xf16> -> tensor<1x32x224x224xf16> + //CHECK: [[VAL3:%.+]] = IE.Convert([[VAL2]]) {dstElemType = f32} : tensor<1x32x224x224xf16> -> tensor<1x32x224x224xf32> + //CHECK: return [[VAL3]] : tensor<1x32x224x224xf32> +} diff --git a/tests/lit/NPU/dialect/IE/passes/decompose_lstm_sequence_40XX+.mlir b/tests/lit/NPU/dialect/IE/passes/decompose_lstm_sequence_40XX+.mlir index b9fda2afd0..1266aee825 100644 --- a/tests/lit/NPU/dialect/IE/passes/decompose_lstm_sequence_40XX+.mlir +++ b/tests/lit/NPU/dialect/IE/passes/decompose_lstm_sequence_40XX+.mlir @@ -114,12 +114,12 @@ func.func @DecomposeDynamicLSTMSequence(%arg0: tensor<1x?x512xf32, {bounds = #co %outputHiddenValues, %outputHiddenState, %outputCellState = IE.LSTMSequence(%arg0, %arg1, %arg2, %cst_1, %cst_2, %cst_3) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32>, tensor<1x512x512xf32>, tensor<1x512x128xf32>, tensor<1x512xf32> -> tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> return %outputHiddenValues, %outputHiddenState, %outputCellState : tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> - // CHECK: [[CST:%.+]] = 
const.Declare tensor<1xsi64> = dense<[1, 1, 35, 512]> : tensor<4xsi64>, [#const.SubView<[3], [1]>] - // CHECK: [[CST_0:%.+]] = const.Declare tensor<2xsi64> = dense<[1, 1, 512, 512]> : tensor<4xsi64>, [#const.SubView<[0], [2]>] - // CHECK: [[BIAS:%.+]] = const.Declare tensor<1x1x512xf32> = dense<0.000000e+00> : tensor<1x512xf32>, [#const.Reshape<[1, 1, 512]>] - // CHECK: [[CST_1:%.+]] = const.Declare tensor<1x1x512x512xf32> = dense<0.000000e+00> : tensor<1x512x512xf32>, [#const.Reshape<[1, 1, 512, 512]>] - // CHECK: [[CST_2:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> - // CHECK: [[CST_3:%.+]] = const.Declare tensor<1x512x128xf32> = dense<0.000000e+00> : tensor<1x512x128xf32> + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1xsi64> = dense<[1, 1, 512, 512]> : tensor<4xsi64>, [#const.SubView<[0], [1]>] + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1xsi64> = dense<[1, 1, 35, 512]> : tensor<4xsi64>, [#const.SubView<[3], [1]>] + // CHECK-DAG: [[CST_1:%.+]] = const.Declare tensor<1x1x512x512xf32> = dense<0.000000e+00> : tensor<1x512x512xf32>, [#const.Reshape<[1, 1, 512, 512]>] + // CHECK-DAG: [[CST_2:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> + // CHECK-DAG: [[CST_3:%.+]] = const.Declare tensor<1x512x128xf32> = dense<0.000000e+00> : tensor<1x512x128xf32> + // CHECK-DAG: [[CST_4:%.+]] = const.Declare tensor<1x1x512xf32> = dense<0.000000e+00> : tensor<1x512xf32>, [#const.Reshape<[1, 1, 512]>] // CHECK: [[SHAPE_OF_0:%.+]] = IE.ShapeOf([[ARG_0]]) {dstElemType = si64} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}> -> tensor<3xsi64> // CHECK: [[SLICE_0:%.+]] = IE.Slice [[SHAPE_OF_0]] [0] [1] : tensor<3xsi64> to tensor<1xsi64> // CHECK: [[SLICE_1:%.+]] = IE.Slice [[SHAPE_OF_0]] [1] [2] : tensor<3xsi64> to tensor<2xsi64> @@ -129,65 +129,8 @@ func.func @DecomposeDynamicLSTMSequence(%arg0: tensor<1x?x512xf32, {bounds = #co // CHECK: [[MAT_MUL:%.+]] = 
IE.MatMul([[DYN_EXPAND]], [[CST_1]]) {transpose_b} : tensor<1x1x35x512xf32>, tensor<1x1x512x512xf32> -> tensor<1x1x35x512xf32> // CHECK: [[SHAPE_OF_1:%.+]] = IE.ShapeOf([[DYN_RESHAPE_0]]) {dstElemType = si64} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> -> tensor<4xsi64> // CHECK: [[SLICE_3:%.+]] = IE.Slice [[SHAPE_OF_1]] [2] [1] : tensor<4xsi64> to tensor<1xsi64> - // CHECK: [[CONCAT_1:%.+]] = IE.Concat([[CST_0]], [[SLICE_3]], [[CST]]) {per_axis = #IE.Concat} : tensor<2xsi64>, tensor<1xsi64>, tensor<1xsi64> -> tensor<4xsi64> + // CHECK: [[CONCAT_1:%.+]] = IE.Concat([[CST]], [[CST_2]], [[SLICE_3]], [[CST_0]]) {per_axis = #IE.Concat} : tensor<1xsi64>, tensor<1xsi64>, tensor<1xsi64>, tensor<1xsi64> -> tensor<4xsi64> // CHECK: [[DYN_RESHAPE_1:%.+]] = IE.DynamicReshape([[MAT_MUL]], [[CONCAT_1]]) {output_bounds = [1, 1, 35, 512], output_shape = [1, 1, -9223372036854775808, 512]} : tensor<1x1x35x512xf32>, tensor<4xsi64> -> tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[OUT_HV:%.+]], [[OUT_HS:%.+]], [[OUT_CS:%.+]] = IE.LSTMSequence([[DYN_RESHAPE_1]], [[ARG_1]], [[ARG_2]], [[CST_3]], [[BIAS]]) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32>, tensor<1x512x128xf32>, tensor<1x1x512xf32> -> tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> + // CHECK: [[OUT_HV:%.+]], [[OUT_HS:%.+]], [[OUT_CS:%.+]] = IE.LSTMSequence([[DYN_RESHAPE_1]], [[ARG_1]], [[ARG_2]], [[CST_3]], [[CST_4]]) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, 
tensor<1x1x128xf32>, tensor<1x512x128xf32>, tensor<1x1x512xf32> -> tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> // CHECK: return [[OUT_HV]], [[OUT_HS]], [[OUT_CS]] : tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> } - -// ----- - -#CHW = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -// CHECK-LABEL: func.func @TestDecomposeLSTMSequence( -// CHECK-SAME: [[ARG_0:.+]]: tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, [[ARG_1:.+]]: tensor<1x2x128xf32>, [[ARG_2:.+]]: tensor<1x2x128xf32>) -> (tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32>) -func.func @TestDecomposeLSTMSequence(%arg0: tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, %arg1: tensor<1x2x128xf32>, %arg2: tensor<1x2x128xf32>) -> (tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32>) { - %cst = const.Declare tensor<2x512xf32> = dense<0.000000e+00> : tensor<2x512xf32> - %cst_0 = const.Declare tensor<2x512x128xf32> = dense<0.000000e+00> : tensor<2x512x128xf32> - %cst_1 = const.Declare tensor<2x512x512xf32> = dense<0.000000e+00> : tensor<2x512x512xf32> - %outputHiddenValues, %outputHiddenState, %outputCellState = IE.LSTMSequence(%arg0, %arg1, %arg2, %cst_1, %cst_0, %cst) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32>, tensor<2x512x512xf32>, tensor<2x512x128xf32>, tensor<2x512xf32> -> 
tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32> - return %outputHiddenValues, %outputHiddenState, %outputCellState : tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32> - - // CHECK: [[BIAS1:%.+]] = const.Declare tensor<1x1x512xf32> = dense<0.000000e+00> : tensor<2x512xf32>, [#const.SubView<[1, 0], [1, 512]>, #const.Reshape<[1, 1, 512]>] - // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x512x512xf32> = dense<0.000000e+00> : tensor<2x512x512xf32>, [#const.SubView<[1, 0, 0], [1, 512, 512]>, #const.Reshape<[1, 1, 512, 512]>] - // CHECK: [[CST_0:%.+]] = const.Declare tensor<1xsi64> = dense<[1, 1, 35, 512]> : tensor<4xsi64>, [#const.SubView<[3], [1]>] - // CHECK: [[CST_1:%.+]] = const.Declare tensor<2xsi64> = dense<[1, 1, 512, 512]> : tensor<4xsi64>, [#const.SubView<[0], [2]>] - // CHECK: [[BIAS2:%.+]] = const.Declare tensor<1x1x512xf32> = dense<0.000000e+00> : tensor<2x512xf32>, [#const.SubView<[0, 0], [1, 512]>, #const.Reshape<[1, 1, 512]>] - // CHECK: [[CST_2:%.+]] = const.Declare tensor<1x1x512x512xf32> = dense<0.000000e+00> : tensor<2x512x512xf32>, [#const.SubView<[0, 0, 0], [1, 512, 512]>, #const.Reshape<[1, 1, 512, 512]>] - // CHECK: [[CST_3:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> - // CHECK: [[CST_4:%.+]] = const.Declare tensor<1x512x128xf32> = dense<0.000000e+00> : tensor<2x512x128xf32>, [#const.SubView<[1, 0, 0], [1, 512, 128]>] - // CHECK: [[CST_5:%.+]] = const.Declare tensor<1x512x128xf32> = dense<0.000000e+00> : tensor<2x512x128xf32>, [#const.SubView<[0, 0, 0], [1, 512, 128]>] - // CHECK: [[SLICE_0:%.+]] = IE.Slice [[ARG_1]] [0, 0, 0] [1, 1, 128] : tensor<1x2x128xf32> to tensor<1x1x128xf32> - // CHECK: [[SLICE_1:%.+]] = IE.Slice [[ARG_1]] [0, 1, 0] [1, 1, 128] : tensor<1x2x128xf32> to tensor<1x1x128xf32> - // CHECK: [[SLICE_2:%.+]] = 
IE.Slice [[ARG_2]] [0, 0, 0] [1, 1, 128] : tensor<1x2x128xf32> to tensor<1x1x128xf32> - // CHECK: [[SLICE_3:%.+]] = IE.Slice [[ARG_2]] [0, 1, 0] [1, 1, 128] : tensor<1x2x128xf32> to tensor<1x1x128xf32> - // CHECK: [[SHAPE_OF_0:%.+]] = IE.ShapeOf([[ARG_0]]) {dstElemType = si64} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}> -> tensor<3xsi64> - // CHECK: [[SLICE_4:%.+]] = IE.Slice [[SHAPE_OF_0]] [0] [1] : tensor<3xsi64> to tensor<1xsi64> - // CHECK: [[SLICE_5:%.+]] = IE.Slice [[SHAPE_OF_0]] [1] [2] : tensor<3xsi64> to tensor<2xsi64> - // CHECK: [[CONCAT_0:%.+]] = IE.Concat([[SLICE_4]], [[CST_3]], [[SLICE_5]]) {per_axis = #IE.Concat} : tensor<1xsi64>, tensor<1xsi64>, tensor<2xsi64> -> tensor<4xsi64> - // CHECK: [[DYN_RESHAPE_0:%.+]] = IE.DynamicReshape([[ARG_0]], [[CONCAT_0]]) {output_bounds = [1, 1, 35, 512], output_shape = [1, 1, -9223372036854775808, 512]} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, tensor<4xsi64> -> tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[EXPAND_0:%.+]] = IE.DynamicExpand([[DYN_RESHAPE_0]]) : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x1x35x512xf32> - // CHECK: [[MATMUL_0:%.+]] = IE.MatMul([[EXPAND_0]], [[CST_2]]) {transpose_b} : tensor<1x1x35x512xf32>, tensor<1x1x512x512xf32> -> tensor<1x1x35x512xf32> - // CHECK: [[SHAPE_OF_1:%.+]] = IE.ShapeOf([[DYN_RESHAPE_0]]) {dstElemType = si64} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> -> tensor<4xsi64> - // CHECK: [[SLICE_6:%.+]] = IE.Slice [[SHAPE_OF_1]] [2] [1] : tensor<4xsi64> to tensor<1xsi64> - // CHECK: [[CONCAT_1:%.+]] = IE.Concat([[CST_1]], [[SLICE_6]], [[CST_0]]) {per_axis = #IE.Concat} : tensor<2xsi64>, tensor<1xsi64>, tensor<1xsi64> -> tensor<4xsi64> - // 
CHECK: [[DYN_RESHAPE_1:%.+]] = IE.DynamicReshape([[MATMUL_0]], [[CONCAT_1]]) {output_bounds = [1, 1, 35, 512], output_shape = [1, 1, -9223372036854775808, 512]} : tensor<1x1x35x512xf32>, tensor<4xsi64> -> tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[OUT_HV_0:%.+]], [[OUT_HS_0:%.+]], [[OUT_CS_0:%.+]] = IE.LSTMSequence([[DYN_RESHAPE_1]], [[SLICE_0]], [[SLICE_2]], [[CST_5]], [[BIAS2]]) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32>, tensor<1x512x128xf32>, tensor<1x1x512xf32> -> tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> - // CHECK: [[SHAPE_OF_2:%.+]] = IE.ShapeOf([[ARG_0]]) {dstElemType = si64} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}> -> tensor<3xsi64> - // CHECK: [[SLICE_7:%.+]] = IE.Slice [[SHAPE_OF_2]] [0] [1] : tensor<3xsi64> to tensor<1xsi64> - // CHECK: [[SLICE_8:%.+]] = IE.Slice [[SHAPE_OF_2]] [1] [2] : tensor<3xsi64> to tensor<2xsi64> - // CHECK: [[CONCAT_2:%.+]] = IE.Concat([[SLICE_7]], [[CST_3]], [[SLICE_8]]) {per_axis = #IE.Concat} : tensor<1xsi64>, tensor<1xsi64>, tensor<2xsi64> -> tensor<4xsi64> - // CHECK: [[DYN_RESHAPE_2:%.+]] = IE.DynamicReshape([[ARG_0]], [[CONCAT_2]]) {output_bounds = [1, 1, 35, 512], output_shape = [1, 1, -9223372036854775808, 512]} : tensor<1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 35, 512]> : tensor<3xsi64>, order = #CHW}>, tensor<4xsi64> -> tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[EXPAND_1:%.+]] = IE.DynamicExpand([[DYN_RESHAPE_2]]) : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order 
= #NCHW}> -> tensor<1x1x35x512xf32> - // CHECK: [[MATMUL_1:%.+]] = IE.MatMul([[EXPAND_1]], [[CST]]) {transpose_b} : tensor<1x1x35x512xf32>, tensor<1x1x512x512xf32> -> tensor<1x1x35x512xf32> - // CHECK: [[SHAPE_OF_3:%.+]] = IE.ShapeOf([[DYN_RESHAPE_2]]) {dstElemType = si64} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> -> tensor<4xsi64> - // CHECK: [[SLICE_9:%.+]] = IE.Slice [[SHAPE_OF_3]] [2] [1] : tensor<4xsi64> to tensor<1xsi64> - // CHECK: [[CONCAT_3:%.+]] = IE.Concat([[CST_1]], [[SLICE_9]], [[CST_0]]) {per_axis = #IE.Concat} : tensor<2xsi64>, tensor<1xsi64>, tensor<1xsi64> -> tensor<4xsi64> - // CHECK: [[DYN_RESHAPE_3:%.+]] = IE.DynamicReshape([[MATMUL_1]], [[CONCAT_3]]) {output_bounds = [1, 1, 35, 512], output_shape = [1, 1, -9223372036854775808, 512]} : tensor<1x1x35x512xf32>, tensor<4xsi64> -> tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[OUT_HV_1:%.+]], [[OUT_HS_1:%.+]], [[OUT_CS_1:%.+]] = IE.LSTMSequence([[DYN_RESHAPE_3]], [[SLICE_1]], [[SLICE_3]], [[CST_4]], [[BIAS1]]) {direction = #IE.rnn_seq_direction, operandSegmentSizes = array} : tensor<1x1x?x512xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 512]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32>, tensor<1x512x128xf32>, tensor<1x1x512xf32> -> tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x128xf32>, tensor<1x1x128xf32> - // CHECK: [[CONCAT_4:%.+]] = IE.Concat([[OUT_HV_0]], [[OUT_HV_1]]) {per_axis = #IE.Concat} : tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 1, 35, 128]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}> - // CHECK: [[CONCAT_5:%.+]] 
= IE.Concat([[OUT_HS_0]], [[OUT_HS_1]]) {per_axis = #IE.Concat} : tensor<1x1x128xf32>, tensor<1x1x128xf32> -> tensor<1x2x128xf32> - // CHECK: [[CONCAT_6:%.+]] = IE.Concat([[OUT_CS_0]], [[OUT_CS_1]]) {per_axis = #IE.Concat} : tensor<1x1x128xf32>, tensor<1x1x128xf32> -> tensor<1x2x128xf32> - // CHECK: return [[CONCAT_4]], [[CONCAT_5]], [[CONCAT_6]] : tensor<1x2x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 2, 35, 128]> : tensor<4xsi64>, order = #NCHW}>, tensor<1x2x128xf32>, tensor<1x2x128xf32> -} diff --git a/tests/lit/NPU/dialect/IE/passes/dump_statistics_of_ie_ops.mlir b/tests/lit/NPU/dialect/IE/passes/dump_statistics_of_ie_ops.mlir new file mode 100644 index 0000000000..37cb1b189c --- /dev/null +++ b/tests/lit/NPU/dialect/IE/passes/dump_statistics_of_ie_ops.mlir @@ -0,0 +1,154 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: env OV_NPU_LOG_LEVEL=LOG_INFO env IE_NPU_LOG_FILTER=dump-statistics-of-ie-ops vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --dump-statistics-of-ie-ops -o /dev/null %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +!qElemType = !quant.uniform + +module @NonComputationalOps { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x1xui8> + DataInfo "vpux_ow_2" : tensor<2x1xf16> + } outputsInfo : { + DataInfo "vpux_tw_1" : tensor<2x1xui8> + DataInfo "vpux_tw_2" : tensor<1x2xf16> + } + + func.func @init(%ov1: tensor<2x1xui8>, %ov2: tensor<2x1xf16>) -> (tensor<2x1xui8>, tensor<1x2xf16>) { + %0 = IE.QuantizeCast(%ov1) {dstElemType = !qElemType} : tensor<2x1xui8> -> tensor<2x1x!qElemType> + %res0 = IE.QuantizeCast(%0) {dstElemType = ui8} : tensor<2x1x!qElemType> -> tensor<2x1xui8> + %res1 = IE.Reshape(%ov2) {shape_value = [1, 2]} : tensor<2x1xf16> -> tensor<1x2xf16> + return %res0, %res1 : tensor<2x1xui8>, tensor<1x2xf16> + } + + // CHECK: IE dialect statistics: + // CHECK: IE - 3 + // CHECK: Non-computational - 3 (100.00%) + 
// CHECK: IE.QuantizeCast - 2 (66.67%) + // CHECK: IE.Reshape - 1 (33.33%) +} + +// ----- + +module @ComputationalOps { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x1xf32> + DataInfo "vpux_ow_2" : tensor<2x1xf16> + } outputsInfo : { + DataInfo "vpux_tw_1" : tensor<10x1xf16> + DataInfo "vpux_tw_2" : tensor<2x1xf16> + } + + func.func @init(%ov1: tensor<2x1xf32>, %ov2: tensor<2x1xf16>) -> (tensor<10x1xf16>, tensor<2x1xf16>) { + %0 = IE.Convert(%ov1) {dstElemType = f16} : tensor<2x1xf32> -> tensor<2x1xf16> + %fourty_two = const.Declare tensor<1xf16> = dense<42.0> : tensor<1xf16> + %1 = IE.Add(%0, %fourty_two) {auto_broadcast = #IE.auto_broadcast_type} + : tensor<2x1xf16>, tensor<1xf16> -> tensor<2x1xf16> + %res0 = IE.Pad(%1) { + mode = #IE.pad_mode, pad_value_attr = 0.0 : f64, pads_begin_attr = [0, 0], + pads_end_attr = [8, 0] + } : tensor<2x1xf16> -> tensor<10x1xf16> + %two = const.Declare tensor<1xf16> = dense<2.0> : tensor<1xf16> + %res1 = IE.Divide(%ov2, %two) {auto_broadcast = #IE.auto_broadcast_type} + : tensor<2x1xf16>, tensor<1xf16> -> tensor<2x1xf16> + return %res0, %res1 : tensor<10x1xf16>, tensor<2x1xf16> + } + + // CHECK: IE dialect statistics: + // CHECK: IE - 4 + // CHECK: Computational - 4 (100.00%) + // CHECK: IE.Convert - 1 (25.00%) + // CHECK: f32 -> f16 - 1 (25.00%) + // CHECK: IE.Add - 1 (25.00%) + // CHECK: IE.Divide - 1 (25.00%) + // CHECK: IE.Pad - 1 (25.00%) +} + +// ----- + +!qElemType1 = !quant.uniform +!qElemType2 = !quant.uniform + +module @WeightsSeparation_InterQuantizedConvert { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x1xf16> + } outputsInfo : { + DataInfo "vpux_tw_1" : tensor<2x1xui8> + } + + func.func @init(%ov1: tensor<2x1xf16>) -> tensor<2x1xui8> { + %1 = IE.Convert(%ov1) {dstElemType = i8} : tensor<2x1xf16> -> tensor<2x1xi8> + %2 = IE.QuantizeCast(%1) {dstElemType = !qElemType1} : tensor<2x1xi8> -> tensor<2x1x!qElemType1> + %3 = IE.Reshape(%2) 
{shape_value = [1, 1, 2, 1]} : tensor<2x1x!qElemType1> -> tensor<1x1x2x1x!qElemType1> + %4 = IE.AvgPool(%3) { + exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], + rounding_type = #IE.rounding_type, strides = [1, 1] + } : tensor<1x1x2x1x!qElemType1> -> tensor<1x1x2x1x!qElemType2> + %5 = IE.Reshape(%4) {shape_value = [2, 1]} : tensor<1x1x2x1x!qElemType2> -> tensor<2x1x!qElemType2> + %res = IE.QuantizeCast(%5) {dstElemType = ui8} : tensor<2x1x!qElemType2> -> tensor<2x1xui8> + return %res : tensor<2x1xui8> + } + + // CHECK: IE dialect statistics: + // CHECK: IE - 6 + // CHECK: Non-computational - 4 (66.67%) + // CHECK: IE.QuantizeCast - 2 (33.33%) + // CHECK: IE.Reshape - 2 (33.33%) + // CHECK: Computational - 2 (33.33%) + // CHECK: IE.Convert - 1 (16.67%) + // CHECK: f16 -> i8 - 1 (16.67%) + // CHECK: IE.AvgPool - 1 (16.67%) + // CHECK: qtype -> qtype - 1 (16.67%) +} + +// ----- + +// same scales, same zero points +!qElemType_simple = !quant.uniform +// scales = {0.84, 1.5} +!qElemType_diff_scales = !quant.uniform +// zero points = {1, 42} +!qElemType_diff_zps = !quant.uniform + +module @PerAxisConvert { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x1xf16> + } outputsInfo : { + DataInfo "out1" : tensor<2x1xi8> + DataInfo "out2" : tensor<2x1xi8> + DataInfo "out3" : tensor<2x1xi8> + } + + func.func @init(%ov1: tensor<2x1xf16>) -> (tensor<2x1xi8>, tensor<2x1xi8>, tensor<2x1xi8>) { + %1 = IE.Convert(%ov1) {dstElemType = i8} : tensor<2x1xf16> -> tensor<2x1xi8> + + %simple = IE.Convert(%1) {dstElemType = !qElemType_simple} + : tensor<2x1xi8> -> tensor<2x1x!qElemType_simple> + %res0 = IE.QuantizeCast(%simple) {dstElemType = i8} + : tensor<2x1x!qElemType_simple> -> tensor<2x1xi8> + + %diff_scales = IE.Convert(%1) {dstElemType = !qElemType_diff_scales} + : tensor<2x1xi8> -> tensor<2x1x!qElemType_diff_scales> + %res1 = IE.QuantizeCast(%diff_scales) {dstElemType = i8} + : tensor<2x1x!qElemType_diff_scales> -> 
tensor<2x1xi8> + + %diff_zps = IE.Convert(%1) {dstElemType = !qElemType_diff_zps} + : tensor<2x1xi8> -> tensor<2x1x!qElemType_diff_zps> + %res2 = IE.QuantizeCast(%diff_zps) {dstElemType = i8} + : tensor<2x1x!qElemType_diff_zps> -> tensor<2x1xi8> + + return %res0, %res1, %res2 : tensor<2x1xi8>, tensor<2x1xi8>, tensor<2x1xi8> + } + + // CHECK: IE dialect statistics: + // CHECK: IE - 7 + // CHECK: Non-computational - 3 (42.86%) + // CHECK: IE.QuantizeCast - 3 (42.86%) + // CHECK: Computational - 4 (57.14%) + // CHECK: IE.Convert - 4 (57.14%) + // CHECK: f16 -> i8 - 1 (14.29%) + // CHECK: i8 -> qtype - 3 (42.86%) +} diff --git a/tests/lit/NPU/dialect/IE/passes/expand_activation_channels_37XX_40XX.mlir b/tests/lit/NPU/dialect/IE/passes/expand_activation_channels_37XX_40XX.mlir index df09be6b3a..b3c2984f1a 100644 --- a/tests/lit/NPU/dialect/IE/passes/expand_activation_channels_37XX_40XX.mlir +++ b/tests/lit/NPU/dialect/IE/passes/expand_activation_channels_37XX_40XX.mlir @@ -740,3 +740,27 @@ func.func @ExpandConvolutionChannelsWithSoftMaxAfter(%arg0: tensor<1x512x56x56xf // CHECK: return [[SLICE]] : tensor<1x510x56x56xf16, {order = #NHWC}> } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @ExpandConvolutionChannelsDynamic +// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x3x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 3, 720, 1280]> : tensor<4xsi64>, order = #NHWC}>) +func.func @ExpandConvolutionChannelsDynamic(%arg0: tensor<1x3x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 3, 720, 1280]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x32x360x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 360, 640]> : tensor<4xsi64>, order = #NHWC}> { + + %cst = const.Declare tensor<32x3x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x3x3x3xf16>, [#const.Reorder<#NHWC>] + %0 = IE.Convolution(%arg0, %cst) { + dilations = [1, 1], pads_begin = [1, 1], pads_end = [0, 0], strides = [2, 2] + } : tensor<1x3x720x?xf16, {bounds = 
#const.OpaqueI64Elements<[1, 3, 720, 1280]> : tensor<4xsi64>, order = #NHWC}>, tensor<32x3x3x3xf16, {order = #NHWC}> -> tensor<1x32x360x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 360, 640]> : tensor<4xsi64>, order = #NHWC}> + + return %0 : tensor<1x32x360x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 360, 640]> : tensor<4xsi64>, order = #NHWC}> + + // CHECK: [[CST:%.+]] = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x3x3x3xf16>, [#const.Reorder<#NHWC>, #const.PadWithZero<[0, 0, 0, 0], [0, 13, 0, 0]>] + // CHECK: [[EXPAND:%.+]] = IE.Expand([[INPUT]]) {pads_begin = [0, 0, 0, 0], pads_end = [0, 13, 0, 0]} : tensor<1x3x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 3, 720, 1280]> : tensor<4xsi64>, order = #NHWC}> -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1280]> : tensor<4xsi64>, order = #NHWC}> + + // CHECK: [[CONV:%.+]] = IE.Convolution([[EXPAND]], [[CST]]) + // CHECK-SAME: -> tensor<1x32x360x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 360, 640]> : tensor<4xsi64>, order = #NHWC}> + + // CHECK: return [[CONV]] : tensor<1x32x360x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 360, 640]> : tensor<4xsi64>, order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/IE/passes/fuse_activation_ops_37XX_40XX.mlir b/tests/lit/NPU/dialect/IE/passes/fuse_activation_ops_37XX_40XX.mlir index 01a6e93a78..c5fb1e3491 100644 --- a/tests/lit/NPU/dialect/IE/passes/fuse_activation_ops_37XX_40XX.mlir +++ b/tests/lit/NPU/dialect/IE/passes/fuse_activation_ops_37XX_40XX.mlir @@ -123,7 +123,7 @@ func.func @AvgPoolWithLeakyReluFuseTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1 pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1], - rounding_type = #IE.rounding_type + rounding_type = #IE.rounding_type } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -138,7 +138,7 @@ func.func @AvgPoolWithLeakyReluFuseTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1 // CHECK-SAME: pads_begin = [0, 0], // CHECK-SAME: 
pads_end = [0, 0], // CHECK-SAME: post_op = #IE.LeakyRelu - // CHECK-SAME: rounding_type = #IE.rounding_type, + // CHECK-SAME: rounding_type = #IE.rounding_type, // CHECK-SAME: strides = [1, 1] // CHECK-SAME: } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -222,7 +222,7 @@ func.func @SkipMaxPoolWithReluTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x3 pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1], - rounding_type = #IE.rounding_type + rounding_type = #IE.rounding_type } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -234,7 +234,7 @@ func.func @SkipMaxPoolWithReluTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x3 // CHECK-SAME: kernel_size = [2, 2], // CHECK-SAME: pads_begin = [0, 0], // CHECK-SAME: pads_end = [0, 0], - // CHECK-SAME: rounding_type = #IE.rounding_type, + // CHECK-SAME: rounding_type = #IE.rounding_type, // CHECK-SAME: strides = [1, 1] // CHECK-SAME: } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -253,7 +253,7 @@ func.func @SkipMaxPoolWithLeakyReluTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1 pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1], - rounding_type = #IE.rounding_type + rounding_type = #IE.rounding_type } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -265,7 +265,7 @@ func.func @SkipMaxPoolWithLeakyReluTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1 // CHECK-SAME: kernel_size = [2, 2], // CHECK-SAME: pads_begin = [0, 0], // CHECK-SAME: pads_end = [0, 0], - // CHECK-SAME: rounding_type = #IE.rounding_type, + // CHECK-SAME: rounding_type = #IE.rounding_type, // CHECK-SAME: strides = [1, 1] // CHECK-SAME: } : tensor<1x16x4x4xf16> -> tensor<1x16x3x3xf16> @@ -403,3 +403,28 @@ func.func @SkipMaxPoolWithClampTest(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x // CHECK: return [[CLAMP]] : tensor<1x16x3x3xf16> } + + +// ----- + +// CHECK-LABEL: @NotFuseClampI32 +// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x50x1x1xsi32>) +func.func @NotFuseClampI32(%arg0: tensor<1x50x1x1xsi32>) -> tensor<1x50x1x1xsi32> { + %0 = 
IE.AvgPool(%arg0) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x50x1x1xsi32> -> tensor<1x50x1x1xsi32> + %1 = IE.Clamp(%0) {max = 1.000000e+00 : f64, min = -1.000000e+00 : f64} : tensor<1x50x1x1xsi32> -> tensor<1x50x1x1xsi32> + return %1 : tensor<1x50x1x1xsi32> + + // CHECK: [[AvgPool:%.+]] = IE.AvgPool([[INPUT]]) { + // CHECK-SAME: exclude_pads + // CHECK-SAME: kernel_size = [1, 1], + // CHECK-SAME: pads_begin = [0, 0], + // CHECK-SAME: pads_end = [0, 0], + // CHECK-SAME: rounding_type = #IE.rounding_type, + // CHECK-SAME: strides = [1, 1] + // CHECK-SAME: } : tensor<1x50x1x1xsi32> -> tensor<1x50x1x1xsi32> + // CHECK: [[Clamp:%.+]] = IE.Clamp([[AvgPool]]) { + // CHECK-SAME: max = 1.000000e+00 : f64, + // CHECK-SAME: min = -1.000000e+00 : f64 + // CHECK-SAME: } : tensor<1x50x1x1xsi32> -> tensor<1x50x1x1xsi32> + +} diff --git a/tests/lit/NPU/dialect/IE/passes/fuse_reshape_mvn.mlir b/tests/lit/NPU/dialect/IE/passes/fuse_reshape_mvn.mlir index 15af63a982..8f7edba1fc 100644 --- a/tests/lit/NPU/dialect/IE/passes/fuse_reshape_mvn.mlir +++ b/tests/lit/NPU/dialect/IE/passes/fuse_reshape_mvn.mlir @@ -9,8 +9,8 @@ #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -// CHECK-LABEL: @FuseReshapeWithMVN -module @FuseReshapeWithMVN { +// CHECK-LABEL: @FuseReorderWithMVN1 +module @FuseReorderWithMVN1 { // CHECK-LABEL: @main // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x256x256x256xf16, {order = #NHWC}>) @@ -24,33 +24,7 @@ module @FuseReshapeWithMVN { %8 = IE.Reorder(%7) {dstOrder = #NHWC} : tensor<1x256x256x256xf16> -> tensor<1x256x256x256xf16, {order = #NHWC}> return %8 : tensor<1x256x256x256xf16, {order = #NHWC}> - // CHECK: [[VAR0:%.+]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 5.000000e-01 : f64, internal_reshape = [1, 32, 524288, 1], normalize_variance = true} : tensor<1x256x256x256xf16, {order = #NHWC}> -> 
tensor<1x256x256x256xf16, {order = #NHWC}> - // CHECK: return [[VAR0]] : tensor<1x256x256x256xf16, {order = #NHWC}> - } - -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -// CHECK-LABEL: @FuseAffineReshapeWithMVN -module @FuseAffineReshapeWithMVN { - -// CHECK-LABEL: @main -// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x256x256x256xf16, {order = #NHWC}>) - func.func @main(%arg0: tensor<1x256x256x256xf16, {order = #NHWC}>) -> tensor<1x256x256x256xf16, {order = #NHWC}> { - %2 = IE.Reorder(%arg0) {dstOrder = #NCHW} : tensor<1x256x256x256xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16> - %3 = IE.AffineReshape(%2) {dim_mapping = [[0], [1, 2], [3], [3]], shape_value = [1, 32, 8, 65536]} : tensor<1x256x256x256xf16> -> tensor<1x32x8x65536xf16> - %4 = IE.Reorder(%3) {dstOrder = #NHWC} : tensor<1x32x8x65536xf16> -> tensor<1x32x8x65536xf16, {order = #NHWC}> - %5 = IE.MVN(%4) {across_channels = false, eps = 5.000000e-01 : f64, normalize_variance = true} : tensor<1x32x8x65536xf16, {order = #NHWC}> -> tensor<1x32x8x65536xf16, {order = #NHWC}> - %6 = IE.Reorder(%5) {dstOrder = #NCHW} : tensor<1x32x8x65536xf16, {order = #NHWC}> -> tensor<1x32x8x65536xf16> - %7 = IE.AffineReshape(%6) {dim_mapping = [[0], [1], [1], [2, 3]], shape_value = [1, 256, 256, 256]} : tensor<1x32x8x65536xf16> -> tensor<1x256x256x256xf16> - %8 = IE.Reorder(%7) {dstOrder = #NHWC} : tensor<1x256x256x256xf16> -> tensor<1x256x256x256xf16, {order = #NHWC}> - return %8 : tensor<1x256x256x256xf16, {order = #NHWC}> - - // CHECK: [[VAR0:%.+]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 5.000000e-01 : f64, internal_reshape = [1, 32, 8, 65536], normalize_variance = true} : tensor<1x256x256x256xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> + // CHECK: [[VAR0:%.*]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 5.000000e-01 : f64, internal_reshape = [1, 32, 524288, 1], normalize_variance = true} : 
tensor<1x256x256x256xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> // CHECK: return [[VAR0]] : tensor<1x256x256x256xf16, {order = #NHWC}> } @@ -82,9 +56,9 @@ module @MoveGroupConvPostFuseReorderWithMVN { return %12 : tensor<1x256x256x256xf16, {order = #NHWC}> - // CHECK: [[CST:%.+]] = const.Declare tensor<256x1x1x1xf16, {order = #NHWC}> = dense<9.000000e+00> : tensor<256x1x1x1xf16, {order = #NHWC}> - // CHECK: [[VAR0:%.+]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 5.000000e-01 : f64, internal_reshape = [1, 32, 524288, 1], normalize_variance = true} : tensor<1x256x256x256xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> - // CHECK: [[VAR1:%.+]] = IE.GroupConvolution([[VAR0]], [[CST]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x256x256xf16, {order = #NHWC}>, tensor<256x1x1x1xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> + // CHECK: [[CST:%.*]] = const.Declare tensor<256x1x1x1xf16, {order = #NHWC}> = dense<9.000000e+00> : tensor<256x1x1x1xf16, {order = #NHWC}> + // CHECK: [[VAR0:%.*]] = IE.MVN([[INPUT]]) {across_channels = false, eps = 5.000000e-01 : f64, internal_reshape = [1, 32, 524288, 1], normalize_variance = true} : tensor<1x256x256x256xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> + // CHECK: [[VAR1:%.*]] = IE.GroupConvolution([[VAR0]], [[CST]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x256x256xf16, {order = #NHWC}>, tensor<256x1x1x1xf16, {order = #NHWC}> -> tensor<1x256x256x256xf16, {order = #NHWC}> // CHECK: return [[VAR1]] : tensor<1x256x256x256xf16, {order = #NHWC}> } diff --git a/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX.mlir b/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX.mlir index cb28b43c87..f5c46bf283 100644 --- 
a/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX.mlir +++ b/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX.mlir @@ -1,5 +1,5 @@ // -// Copyright (C) 2025 Intel Corporation. +// Copyright (C) 2022-2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // @@ -7,6 +7,244 @@ // REQUIRES: arch-NPU37XX +// CHECK-LABEL: @MapBilinearPytorchHalfPixelInterpolateOnDPU +func.func @MapBilinearPytorchHalfPixelInterpolateOnDPU(%arg0: tensor<1x128x72x72xf16>) -> tensor<1x128x140x140xf16> { + %0 = IE.Interpolate(%arg0) {attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + antialias = false, + pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], + cube_coeff = -7.500000e-01 : f64>, + axes_attr = [0, 1, 2, 3], + operandSegmentSizes = array, + scales_attr = [1.0000100135803223, 1.0000100135803223, 1.9444544315338135, 1.9444544315338135], sizes_attr = [1, 128, 140, 140] + } : tensor<1x128x72x72xf16> -> tensor<1x128x140x140xf16> + + return %0 : tensor<1x128x140x140xf16> + + // CHECK-NOT: IE.Interpolate + // Vertical scale + // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 128, 1, 72] : tensor<1x128x72x72xf16> to tensor<1x128x1x72xf16> + // CHECK: [[VAVGPOOL0:%.*]] = IE.AvgPool([[VSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x1x72xf16> -> tensor<1x128x1x72xf16> + // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 128, 2, 72] : tensor<1x128x72x72xf16> to tensor<1x128x2x72xf16> + // CHECK-DAG: [[VCST:%.*]] = const.Declare tensor<128x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], 
[2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], 
[[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], 
[2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]]]> : tensor<128x1x2x1xf16> + // CHECK: [[VGROUP_CONV:%.*]] = IE.GroupConvolution([[VSLICE1]], [[VCST]]) {dilations = [1, 1], groups = 128 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x128x2x72xf16>, tensor<128x1x2x1xf16> -> tensor<1x128x1x72xf16> + // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<128x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], 
[7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], 
[[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]]]> : tensor<128x1x2x1xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 71, 0] [1, 128, 1, 72] : tensor<1x128x72x72xf16> to tensor<1x128x1x72xf16> + // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x1x72xf16> -> tensor<1x128x1x72xf16> + // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat + + // Horizontal scale + // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 128, 140, 1] : tensor<1x128x140x72xf16> to tensor<1x128x140x1xf16> + // CHECK: [[HAVGPOOL0:%.*]] = 
IE.AvgPool([[HSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x140x1xf16> -> tensor<1x128x140x1xf16> + // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 128, 140, 2] : tensor<1x128x140x72xf16> to tensor<1x128x140x2xf16> + // CHECK-DAG: [[HCST:%.*]] = const.Declare tensor<128x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 
2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], 
[[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]]]> : tensor<128x1x1x2xf16> + // CHECK: [[HGROUP_CONV:%.*]] = IE.GroupConvolution([[HSLICE1]], [[HCST]]) {dilations = [1, 1], groups = 128 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x128x140x2xf16>, tensor<128x1x1x2xf16> -> tensor<1x128x140x1xf16> + // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<128x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 
7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], 
[[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]]]> : tensor<128x1x1x2xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[HSLICELAST:%.*]]= IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 71] [1, 128, 140, 1] : tensor<1x128x140x72xf16> to tensor<1x128x140x1xf16> + // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST:%.*]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : 
tensor<1x128x140x1xf16> -> tensor<1x128x140x1xf16> + // CHECK: [[HORIZONTALCONCAT:%.*]] = IE.Concat + + // CHECK: return [[HORIZONTALCONCAT]] : tensor<1x128x140x140xf16> +} + +// ----- + +// CHECK-LABEL: @MapBilinearAsymmetricInterpolateOnDPU +func.func @MapBilinearAsymmetricInterpolateOnDPU(%arg0: tensor<1x21x65x65xf16>) -> tensor<1x21x513x513xf16> { + %0 = IE.Interpolate(%arg0) { + attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + antialias = false, + pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], + cube_coeff = -7.500000e-01 : f64>, + axes_attr = [0, 1, 2, 3], + operandSegmentSizes = array, + scales_attr = [1.000000e+00, 1.000000e+00, 7.8923077583312988, 7.8923077583312988], + sizes_attr = [1, 21, 513, 513] + } : tensor<1x21x65x65xf16> -> tensor<1x21x513x513xf16> + + return %0 : tensor<1x21x513x513xf16> + + // CHECK-NOT: IE.Interpolate + // Vertical scale + // CHECK: [[EXPAND:%.*]] = IE.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [0, 11, 0, 0]} : tensor<1x21x65x65xf16> -> tensor<1x32x65x65xf16> + // CHECK: [[VSLICE0:%.*]] = IE.Slice [[EXPAND]] [0, 0, 0, 0] [1, 32, 2, 65] : tensor<1x32x65x65xf16> to tensor<1x32x2x65xf16> + // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<32x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], 
[0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]]]> : tensor<32x1x2x1xf16> + // CHECK: [[VGROUP_CONV0:%.*]] = IE.GroupConvolution([[VSLICE0]], [[VCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x2x65xf16>, tensor<32x1x2x1xf16> -> tensor<1x32x1x65xf16> + // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<32x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]]]> : tensor<32x1x2x1xf16> 
+ + // A lot of other Slice -> Const -> GroupConvolution + + // CHECK: [[VSLICELAST:%.*]] = IE.Slice [[EXPAND:%.*]] [0, 0, 64, 0] [1, 32, 1, 65] : tensor<1x32x65x65xf16> to tensor<1x32x1x65xf16> + // CHECK: [[VAVGPOOL0:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x65xf16> -> tensor<1x32x1x65xf16> + // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x65xf16> -> tensor<1x32x1x65xf16> + // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat + + // Horizontal scale + // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 32, 513, 2] : tensor<1x32x513x65xf16> to tensor<1x32x513x2xf16> + // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<32x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 
0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]]]> : tensor<32x1x1x2xf16> + // CHECK: [[HGROUP_CONV0:%.*]] IE.GroupConvolution([[HSLICE0]], [[HCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x513x2xf16>, tensor<32x1x1x2xf16> -> tensor<1x32x513x1xf16> + // CHECK: [[HCST1:%.*]] = const.Declare tensor<32x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]]]> : tensor<32x1x1x2xf16> + + // A lot of other Slice -> Const -> GroupConvolution + + // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 64] [1, 32, 513, 1] : tensor<1x32x513x65xf16> to tensor<1x32x513x1xf16> + // CHECK: [[HAVGPOOL0:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x513x1xf16> -> 
tensor<1x32x513x1xf16> + // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x513x1xf16> -> tensor<1x32x513x1xf16> + + // CHECK: [[HORIZONALCONCAT:%.*]] = IE.Concat + // CHECK: [[FINALSLICE:%.*]] = IE.Slice [[HORIZONALCONCAT]] [0, 0, 0, 0] [1, 21, 513, 513] : tensor<1x32x513x513xf16> to tensor<1x21x513x513xf16> + + // CHECK: return [[FINALSLICE]] : tensor<1x21x513x513xf16> +} + +// ----- + +// CHECK-LABEL: @MapBilinearAlignCornersInterpolateOnDPU +func.func @MapBilinearAlignCornersInterpolateOnDPU(%arg0: tensor<1x32x180x320xf16>) -> tensor<1x32x92x120xf16> { + %0 = IE.Interpolate(%arg0) { + attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + antialias = false, + pads_begin = [0, 0, 0, 0], + pads_end = [0, 0, 0, 0], + cube_coeff = -7.500000e-01 : f64>, + axes_attr = [2, 3], operandSegmentSizes = array, + scales_attr = [0.51111114025115967, 5.000000e-01], + sizes_attr = [92, 120]} : tensor<1x32x180x320xf16> -> tensor<1x32x92x120xf16> + return %0 : tensor<1x32x92x120xf16> + + // CHECK-NOT: IE.Interpolate + // Vertical scale + // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 32, 2, 320] : tensor<1x32x180x320xf16> to tensor<1x32x2x320xf16> + // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<32x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], 
[0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]]]> : tensor<32x1x2x1xf16> + // CHECK: [[VGROUP_CONV0:%.*]] = IE.GroupConvolution([[VSLICE0]], [[VCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x2x320xf16>, tensor<32x1x2x1xf16> -> tensor<1x32x1x320xf16> + // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 1, 0] [1, 32, 2, 320] : tensor<1x32x180x320xf16> to tensor<1x32x2x320xf16> + // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<32x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], 
[[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]]]> : tensor<32x1x2x1xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 179, 0] [1, 32, 1, 320] : tensor<1x32x180x320xf16> to tensor<1x32x1x320xf16> + // CHECK: [[VAVGPOOL:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x320xf16> -> tensor<1x32x1x320xf16> + // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat + + // Vertical scale + // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 32, 92, 2] : tensor<1x32x92x320xf16> to tensor<1x32x92x2xf16> + // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<32x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], 
[[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]]]> : tensor<32x1x1x2xf16> + // CHECK: [[HGROUP_CONV0:%.*]] = IE.GroupConvolution([[HSLICE0]], [[HCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x92x2xf16>, tensor<32x1x1x2xf16> -> tensor<1x32x92x1xf16> + // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 2] [1, 32, 92, 2] : tensor<1x32x92x320xf16> to tensor<1x32x92x2xf16> + // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<32x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]]]> : tensor<32x1x1x2xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 319] [1, 32, 92, 1] : tensor<1x32x92x320xf16> to tensor<1x32x92x1xf16> + // CHECK: [[HAVGPOOL:%.*]] IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size 
= [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x92x1xf16> -> tensor<1x32x92x1xf16> + // CHECK: [[HERTICALCONCAT:%.*]] = IE.Concat + + // CHECK: return [[HERTICALCONCAT]] : tensor<1x32x92x120xf16> +} + +// ----- + +// CHECK-LABEL: @MapBilinearInterpolateOnDPUHalfPixel +func.func @MapBilinearInterpolateOnDPUHalfPixel(%arg0: tensor<1x256x69x69xf16>) -> tensor<1x256x138x138xf16> { + %0 = IE.Interpolate(%arg0) { + attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + antialias = false, + pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], + cube_coeff = -7.500000e-01 : f64>, + axes_attr = [0, 1, 2, 3], + operandSegmentSizes = array, + scales_attr = [1.0000100135803223, 1.0000100135803223, 2.0000100135803223, 2.0000100135803223], + sizes_attr = [1, 256, 138, 138] + } : tensor<1x256x69x69xf16> -> tensor<1x256x138x138xf16> + + return %0 : tensor<1x256x138x138xf16> + + // CHECK-NOT: IE.Interpolate + // Vertical scale + // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 256, 1, 69] : tensor<1x256x69x69xf16> to tensor<1x256x1x69xf16> + // CHECK: [[VAVGPOOL:%.*]] = IE.AvgPool([[VSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x1x69xf16> -> tensor<1x256x1x69xf16> + // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 256, 2, 69] : tensor<1x256x69x69xf16> to tensor<1x256x2x69xf16> + // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<256x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], 
[2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], 
[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], 
[2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], 
[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], 
[2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]]]> : tensor<256x1x2x1xf16> + // CHECK: [[VGROUPCONV0:%.*]] = IE.GroupConvolution([[VSLICE1]], [[VCST0]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x2x69xf16>, tensor<256x1x2x1xf16> -> tensor<1x256x1x69xf16> + // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<256x1x2x1xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], 
[7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], 
[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], 
[7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], 
[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], 
[7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]]]> : tensor<256x1x2x1xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 68, 0] [1, 256, 1, 69] : tensor<1x256x69x69xf16> to tensor<1x256x1x69xf16> + // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x1x69xf16> -> tensor<1x256x1x69xf16> + // CHECK: [[VCONCAT:%.*]] = IE.Concat + + // Horizontal scale + + // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 0] [1, 256, 138, 1] : tensor<1x256x138x69xf16> to tensor<1x256x138x1xf16> + // CHECK: [[HAVGPOOL0:%.*]] = IE.AvgPool([[HSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x138x1xf16> -> tensor<1x256x138x1xf16> + // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 0] [1, 256, 138, 2] : tensor<1x256x138x69xf16> to tensor<1x256x138x2xf16> + // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<256x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 
2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], 
[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 
2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], 
[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 
2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]]]> : tensor<256x1x1x2xf16> + // CHECK: [[HGROUPCONV0:%.*]] = IE.GroupConvolution([[HSLICE1]], [[HCST0]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x138x2xf16>, tensor<256x1x1x2xf16> -> tensor<1x256x138x1xf16> + // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<256x1x1x2xf16> = + // CHECK-SAME{LITERAL}: dense<[[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], 
[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 
7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], 
[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 
7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]]]> : tensor<256x1x1x2xf16> + + // More Slice -> Const -> GroupConvolution + + // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 68] [1, 256, 138, 1] : tensor<1x256x138x69xf16> to tensor<1x256x138x1xf16> + // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x138x1xf16> -> tensor<1x256x138x1xf16> + // CHECK: [[HCONCAT:%.*]] = IE.Concat + + // return [[HCONCAT]] : tensor<1x256x138x138xf16> +} + 
+ +// ----- + +// CHECK-LABEL: @DoNotMapBilinearInterpolateOnDPUBecauseFitInCMX +func.func @DoNotMapBilinearInterpolateOnDPUBecauseFitInCMX(%arg0: tensor<1x32x45x60xf16>) -> tensor<1x32x23x30xf16> { + %0 = IE.Interpolate(%arg0) { + attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + antialias = false, + pads_begin = [0, 0, 0, 0], + pads_end = [0, 0, 0, 0], + cube_coeff = -7.500000e-01 : f64>, + axes_attr = [2, 3], + operandSegmentSizes = array, + scales_attr = [0.51111114025115967, 5.000000e-01], + sizes_attr = [23, 30]} : tensor<1x32x45x60xf16> -> tensor<1x32x23x30xf16> + return %0 : tensor<1x32x23x30xf16> + + // CHECK: [[INTERP:%.*]] = IE.Interpolate(%arg0) { + // CHECK-SAME: attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , + // CHECK-SAME: antialias = false, + // CHECK-SAME: pads_begin = [0, 0, 0, 0], + // CHECK-SAME: pads_end = [0, 0, 0, 0], + // CHECK-SAME: cube_coeff = -7.500000e-01 : f64>, + // CHECK-SAME: axes_attr = [2, 3], + // CHECK-SAME: operandSegmentSizes = array, + // CHECK-SAME: scales_attr = [0.51111114025115967, 5.000000e-01], + // CHECK-SAME: sizes_attr = [23, 30]} : tensor<1x32x45x60xf16> -> tensor<1x32x23x30xf16> + // return [[INTERP]] : return %0 : tensor<1x32x23x30xf16> +} + +// ----- + // CHECK-LABEL: @DoNotMapBilinearAlignCornersInterpolateOnDPUBecauseSmallChannel // CHECK-SAME: [[INPUT:%.+]]: tensor<1x3x1024x1024xf16> func.func @DoNotMapBilinearAlignCornersInterpolateOnDPUBecauseSmallChannel(%arg0: tensor<1x3x1024x1024xf16>) -> tensor<1x3x512x512xf16> { diff --git a/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX_extended.mlir b/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX_extended.mlir deleted file mode 100644 index 7afae721aa..0000000000 --- a/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_37XX_extended.mlir +++ /dev/null @@ -1,244 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --mlir-print-elementsattrs-with-hex-if-larger 8192 --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --map-bilinear-interpolate-on-dpu %s | FileCheck %s -// REQUIRES: arch-NPU37XX - - -// CHECK-LABEL: @MapBilinearPytorchHalfPixelInterpolateOnDPU -func.func @MapBilinearPytorchHalfPixelInterpolateOnDPU(%arg0: tensor<1x128x72x72xf16>) -> tensor<1x128x140x140xf16> { - %0 = IE.Interpolate(%arg0) {attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - antialias = false, - pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], - cube_coeff = -7.500000e-01 : f64>, - axes_attr = [0, 1, 2, 3], - operandSegmentSizes = array, - scales_attr = [1.0000100135803223, 1.0000100135803223, 1.9444544315338135, 1.9444544315338135], sizes_attr = [1, 128, 140, 140] - } : tensor<1x128x72x72xf16> -> tensor<1x128x140x140xf16> - - return %0 : tensor<1x128x140x140xf16> - - // CHECK-NOT: IE.Interpolate - // Vertical scale - // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 128, 1, 72] : tensor<1x128x72x72xf16> to tensor<1x128x1x72xf16> - // CHECK: [[VAVGPOOL0:%.*]] = IE.AvgPool([[VSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x1x72xf16> -> tensor<1x128x1x72xf16> - // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 128, 2, 72] : tensor<1x128x72x72xf16> to tensor<1x128x2x72xf16> - // CHECK-DAG: [[VCST:%.*]] = const.Declare tensor<128x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], 
[2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], 
[[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], 
[2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]], [[[7.285150e-01], [2.714840e-01]]]]> : tensor<128x1x2x1xf16> - // CHECK: [[VGROUP_CONV:%.*]] = IE.GroupConvolution([[VSLICE1]], [[VCST]]) {dilations = [1, 1], groups = 128 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x128x2x72xf16>, tensor<128x1x2x1xf16> -> tensor<1x128x1x72xf16> - // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<128x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], 
[7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], 
[[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]], [[[2.142330e-01], [7.856440e-01]]]]> : tensor<128x1x2x1xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 71, 0] [1, 128, 1, 72] : tensor<1x128x72x72xf16> to tensor<1x128x1x72xf16> - // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x1x72xf16> -> tensor<1x128x1x72xf16> - // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat - - // Horizontal scale - // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 128, 140, 1] : tensor<1x128x140x72xf16> to tensor<1x128x140x1xf16> - // CHECK: [[HAVGPOOL0:%.*]] = IE.AvgPool([[HSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], 
pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x140x1xf16> -> tensor<1x128x140x1xf16> - // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 128, 140, 2] : tensor<1x128x140x72xf16> to tensor<1x128x140x2xf16> - // CHECK-DAG: [[HCST:%.*]] = const.Declare tensor<128x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 
2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], 
[[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]], [[[7.285150e-01, 2.714840e-01]]]]> : tensor<128x1x1x2xf16> - // CHECK: [[HGROUP_CONV:%.*]] = IE.GroupConvolution([[HSLICE1]], [[HCST]]) {dilations = [1, 1], groups = 128 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x128x140x2xf16>, tensor<128x1x1x2xf16> -> tensor<1x128x140x1xf16> - // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<128x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 
7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], 
[[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]], [[[2.142330e-01, 7.856440e-01]]]]> : tensor<128x1x1x2xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[HSLICELAST:%.*]]= IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 71] [1, 128, 140, 1] : tensor<1x128x140x72xf16> to tensor<1x128x140x1xf16> - // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST:%.*]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x128x140x1xf16> -> tensor<1x128x140x1xf16> - // CHECK: [[HORIZONTALCONCAT:%.*]] = 
IE.Concat - - // CHECK: return [[HORIZONTALCONCAT]] : tensor<1x128x140x140xf16> -} - -// ----- - -// CHECK-LABEL: @MapBilinearAsymmetricInterpolateOnDPU -func.func @MapBilinearAsymmetricInterpolateOnDPU(%arg0: tensor<1x21x65x65xf16>) -> tensor<1x21x513x513xf16> { - %0 = IE.Interpolate(%arg0) { - attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - antialias = false, - pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], - cube_coeff = -7.500000e-01 : f64>, - axes_attr = [0, 1, 2, 3], - operandSegmentSizes = array, - scales_attr = [1.000000e+00, 1.000000e+00, 7.8923077583312988, 7.8923077583312988], - sizes_attr = [1, 21, 513, 513] - } : tensor<1x21x65x65xf16> -> tensor<1x21x513x513xf16> - - return %0 : tensor<1x21x513x513xf16> - - // CHECK-NOT: IE.Interpolate - // Vertical scale - // CHECK: [[EXPAND:%.*]] = IE.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [0, 11, 0, 0]} : tensor<1x21x65x65xf16> -> tensor<1x32x65x65xf16> - // CHECK: [[VSLICE0:%.*]] = IE.Slice [[EXPAND]] [0, 0, 0, 0] [1, 32, 2, 65] : tensor<1x32x65x65xf16> to tensor<1x32x2x65xf16> - // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<32x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], 
[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]]]> : tensor<32x1x2x1xf16> - // CHECK: [[VGROUP_CONV0:%.*]] = IE.GroupConvolution([[VSLICE0]], [[VCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x2x65xf16>, tensor<32x1x2x1xf16> -> tensor<1x32x1x65xf16> - // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<32x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]], [[[8.735350e-01], [1.267090e-01]]]]> : tensor<32x1x2x1xf16> - - // A lot of other Slice -> Const -> GroupConvolution - - // CHECK: [[VSLICELAST:%.*]] 
= IE.Slice [[EXPAND:%.*]] [0, 0, 64, 0] [1, 32, 1, 65] : tensor<1x32x65x65xf16> to tensor<1x32x1x65xf16> - // CHECK: [[VAVGPOOL0:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x65xf16> -> tensor<1x32x1x65xf16> - // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x65xf16> -> tensor<1x32x1x65xf16> - // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat - - // Horizontal scale - // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 32, 513, 2] : tensor<1x32x513x65xf16> to tensor<1x32x513x2xf16> - // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<32x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]]]> : 
tensor<32x1x1x2xf16> - // CHECK: [[HGROUP_CONV0:%.*]] IE.GroupConvolution([[HSLICE0]], [[HCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x513x2xf16>, tensor<32x1x1x2xf16> -> tensor<1x32x513x1xf16> - // CHECK: [[HCST1:%.*]] = const.Declare tensor<32x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]], [[[8.735350e-01, 1.267090e-01]]]]> : tensor<32x1x1x2xf16> - - // A lot of other Slice -> Const -> GroupConvolution - - // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 64] [1, 32, 513, 1] : tensor<1x32x513x65xf16> to tensor<1x32x513x1xf16> - // CHECK: [[HAVGPOOL0:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x513x1xf16> -> tensor<1x32x513x1xf16> - // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, 
kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x513x1xf16> -> tensor<1x32x513x1xf16> - - // CHECK: [[HORIZONALCONCAT:%.*]] = IE.Concat - // CHECK: [[FINALSLICE:%.*]] = IE.Slice [[HORIZONALCONCAT]] [0, 0, 0, 0] [1, 21, 513, 513] : tensor<1x32x513x513xf16> to tensor<1x21x513x513xf16> - - // CHECK: return [[FINALSLICE]] : tensor<1x21x513x513xf16> -} - -// ----- - -// CHECK-LABEL: @MapBilinearAlignCornersInterpolateOnDPU -func.func @MapBilinearAlignCornersInterpolateOnDPU(%arg0: tensor<1x32x180x320xf16>) -> tensor<1x32x92x120xf16> { - %0 = IE.Interpolate(%arg0) { - attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - antialias = false, - pads_begin = [0, 0, 0, 0], - pads_end = [0, 0, 0, 0], - cube_coeff = -7.500000e-01 : f64>, - axes_attr = [2, 3], operandSegmentSizes = array, - scales_attr = [0.51111114025115967, 5.000000e-01], - sizes_attr = [92, 120]} : tensor<1x32x180x320xf16> -> tensor<1x32x92x120xf16> - return %0 : tensor<1x32x92x120xf16> - - // CHECK-NOT: IE.Interpolate - // Vertical scale - // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 32, 2, 320] : tensor<1x32x180x320xf16> to tensor<1x32x2x320xf16> - // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<32x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], 
[[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]], [[[1.000000e+00], [0.000000e+00]]]]> : tensor<32x1x2x1xf16> - // CHECK: [[VGROUP_CONV0:%.*]] = IE.GroupConvolution([[VSLICE0]], [[VCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x2x320xf16>, tensor<32x1x2x1xf16> -> tensor<1x32x1x320xf16> - // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 1, 0] [1, 32, 2, 320] : tensor<1x32x180x320xf16> to tensor<1x32x2x320xf16> - // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<32x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], 
[9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]], [[[3.295900e-02], [9.667960e-01]]]]> : tensor<32x1x2x1xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 179, 0] [1, 32, 1, 320] : tensor<1x32x180x320xf16> to tensor<1x32x1x320xf16> - // CHECK: [[VAVGPOOL:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x32x1x320xf16> -> tensor<1x32x1x320xf16> - // CHECK: [[VERTICALCONCAT:%.*]] = IE.Concat - - // Vertical scale - // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 0] [1, 32, 92, 2] : tensor<1x32x92x320xf16> to tensor<1x32x92x2xf16> - // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<32x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 0.000000e+00]]], [[[1.000000e+00, 
0.000000e+00]]]]> : tensor<32x1x1x2xf16> - // CHECK: [[HGROUP_CONV0:%.*]] = IE.GroupConvolution([[HSLICE0]], [[HCST0]]) {dilations = [1, 1], groups = 32 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x32x92x2xf16>, tensor<32x1x1x2xf16> -> tensor<1x32x92x1xf16> - // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 2] [1, 32, 92, 2] : tensor<1x32x92x320xf16> to tensor<1x32x92x2xf16> - // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<32x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]], [[[3.193360e-01, 6.806640e-01]]]]> : tensor<32x1x1x2xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VERTICALCONCAT]] [0, 0, 0, 319] [1, 32, 92, 1] : tensor<1x32x92x320xf16> to tensor<1x32x92x1xf16> - // CHECK: [[HAVGPOOL:%.*]] IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, 
strides = [1, 1]} : tensor<1x32x92x1xf16> -> tensor<1x32x92x1xf16> - // CHECK: [[HERTICALCONCAT:%.*]] = IE.Concat - - // CHECK: return [[HERTICALCONCAT]] : tensor<1x32x92x120xf16> -} - -// ----- - -// CHECK-LABEL: @MapBilinearInterpolateOnDPUHalfPixel -func.func @MapBilinearInterpolateOnDPUHalfPixel(%arg0: tensor<1x256x69x69xf16>) -> tensor<1x256x138x138xf16> { - %0 = IE.Interpolate(%arg0) { - attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - antialias = false, - pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0], - cube_coeff = -7.500000e-01 : f64>, - axes_attr = [0, 1, 2, 3], - operandSegmentSizes = array, - scales_attr = [1.0000100135803223, 1.0000100135803223, 2.0000100135803223, 2.0000100135803223], - sizes_attr = [1, 256, 138, 138] - } : tensor<1x256x69x69xf16> -> tensor<1x256x138x138xf16> - - return %0 : tensor<1x256x138x138xf16> - - // CHECK-NOT: IE.Interpolate - // Vertical scale - // CHECK: [[VSLICE0:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 256, 1, 69] : tensor<1x256x69x69xf16> to tensor<1x256x1x69xf16> - // CHECK: [[VAVGPOOL:%.*]] = IE.AvgPool([[VSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x1x69xf16> -> tensor<1x256x1x69xf16> - // CHECK: [[VSLICE1:%.*]] = IE.Slice %arg0 [0, 0, 0, 0] [1, 256, 2, 69] : tensor<1x256x69x69xf16> to tensor<1x256x2x69xf16> - // CHECK-DAG: [[VCST0:%.*]] = const.Declare tensor<256x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], 
[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], 
[2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], 
[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], 
[2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], 
[[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]], [[[7.500000e-01], [2.500000e-01]]]]> : tensor<256x1x2x1xf16> - // CHECK: [[VGROUPCONV0:%.*]] = IE.GroupConvolution([[VSLICE1]], [[VCST0]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x2x69xf16>, tensor<256x1x2x1xf16> -> tensor<1x256x1x69xf16> - // CHECK-DAG: [[VCST1:%.*]] = const.Declare tensor<256x1x2x1xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], 
[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], 
[7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], 
[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], 
[7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], 
[[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]], [[[2.500000e-01], [7.500000e-01]]]]> : tensor<256x1x2x1xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[VSLICELAST:%.*]] = IE.Slice %arg0 [0, 0, 68, 0] [1, 256, 1, 69] : tensor<1x256x69x69xf16> to tensor<1x256x1x69xf16> - // CHECK: [[VAVGPOOL1:%.*]] = IE.AvgPool([[VSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x1x69xf16> -> tensor<1x256x1x69xf16> - // CHECK: [[VCONCAT:%.*]] = IE.Concat - - // Horizontal scale - - // CHECK: [[HSLICE0:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 0] [1, 256, 138, 1] : tensor<1x256x138x69xf16> to tensor<1x256x138x1xf16> - // CHECK: [[HAVGPOOL0:%.*]] = IE.AvgPool([[HSLICE0]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x138x1xf16> -> tensor<1x256x138x1xf16> - // CHECK: [[HSLICE1:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 0] [1, 256, 138, 2] : tensor<1x256x138x69xf16> to tensor<1x256x138x2xf16> - // CHECK-DAG: [[HCST0:%.*]] = const.Declare tensor<256x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], 
[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 
2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], 
[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 
2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], 
[[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]], [[[7.500000e-01, 2.500000e-01]]]]> : tensor<256x1x1x2xf16> - // CHECK: [[HGROUPCONV0:%.*]] = IE.GroupConvolution([[HSLICE1]], [[HCST0]]) {dilations = [1, 1], groups = 256 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x256x138x2xf16>, tensor<256x1x1x2xf16> -> tensor<1x256x138x1xf16> - // CHECK-DAG: [[HCST1:%.*]] = const.Declare tensor<256x1x1x2xf16> = - // CHECK-SAME{LITERAL}: dense<[[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 
7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], 
[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 
7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], 
[[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]], [[[2.500000e-01, 7.500000e-01]]]]> : tensor<256x1x1x2xf16> - - // More Slice -> Const -> GroupConvolution - - // CHECK: [[HSLICELAST:%.*]] = IE.Slice [[VCONCAT]] [0, 0, 0, 68] [1, 256, 138, 1] : tensor<1x256x138x69xf16> to tensor<1x256x138x1xf16> - // CHECK: [[HAVGPOOL1:%.*]] = IE.AvgPool([[HSLICELAST]]) {exclude_pads, kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x256x138x1xf16> -> tensor<1x256x138x1xf16> - // CHECK: [[HCONCAT:%.*]] = IE.Concat - - // return [[HCONCAT]] : tensor<1x256x138x138xf16> -} - - -// ----- - -// CHECK-LABEL: @DoNotMapBilinearInterpolateOnDPUBecauseFitInCMX 
-func.func @DoNotMapBilinearInterpolateOnDPUBecauseFitInCMX(%arg0: tensor<1x32x45x60xf16>) -> tensor<1x32x23x30xf16> { - %0 = IE.Interpolate(%arg0) { - attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - antialias = false, - pads_begin = [0, 0, 0, 0], - pads_end = [0, 0, 0, 0], - cube_coeff = -7.500000e-01 : f64>, - axes_attr = [2, 3], - operandSegmentSizes = array, - scales_attr = [0.51111114025115967, 5.000000e-01], - sizes_attr = [23, 30]} : tensor<1x32x45x60xf16> -> tensor<1x32x23x30xf16> - return %0 : tensor<1x32x23x30xf16> - - // CHECK: [[INTERP:%.*]] = IE.Interpolate(%arg0) { - // CHECK-SAME: attr = #IE.Interpolate, shape_calc_mode = , coord_mode = , nearest_mode = , - // CHECK-SAME: antialias = false, - // CHECK-SAME: pads_begin = [0, 0, 0, 0], - // CHECK-SAME: pads_end = [0, 0, 0, 0], - // CHECK-SAME: cube_coeff = -7.500000e-01 : f64>, - // CHECK-SAME: axes_attr = [2, 3], - // CHECK-SAME: operandSegmentSizes = array, - // CHECK-SAME: scales_attr = [0.51111114025115967, 5.000000e-01], - // CHECK-SAME: sizes_attr = [23, 30]} : tensor<1x32x45x60xf16> -> tensor<1x32x23x30xf16> - // return [[INTERP]] : return %0 : tensor<1x32x23x30xf16> -} diff --git a/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_40XX.mlir b/tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_40XX+.mlir similarity index 100% rename from tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_40XX.mlir rename to tests/lit/NPU/dialect/IE/passes/map_bilinear_interpolate_on_DPU_40XX+.mlir diff --git a/tests/lit/NPU/dialect/IE/passes/merge_fully_connected.mlir b/tests/lit/NPU/dialect/IE/passes/merge_fully_connected.mlir index 04a4c675a8..469b757cb0 100644 --- a/tests/lit/NPU/dialect/IE/passes/merge_fully_connected.mlir +++ b/tests/lit/NPU/dialect/IE/passes/merge_fully_connected.mlir @@ -233,6 +233,61 @@ func.func @MergeMatMulForDQPatternWithDequantize(%arg0: tensor<1x1x256xf16>, %ar !qElemType = !quant.uniform +// CHECK-LABEL: 
@MergeMatMulForDQPatternWithDequantizeAndInputIsShared +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x1x256xf16>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<2x2560x128xsi4> +func.func @MergeMatMulForDQPatternWithDequantizeAndInputIsShared(%arg0: tensor<1x1x256xf16>, %arg1: tensor<2x2560x128xsi4>) +-> (tensor<1x2x1x2560xf16>, tensor<1x128xf16>) { + %0 = IE.AffineReshape(%arg0) {dim_mapping = [[0], [0], [1]], shape_value = [1, 256]} : tensor<1x1x256xf16> -> tensor<1x256xf16> + %1 = IE.QuantizeCast(%arg1) {dstElemType = !qElemType} : tensor<2x2560x128xsi4> -> tensor<2x2560x128x!qElemType> + + %2 = IE.Slice %1 [0, 0, 0] [1, 2560, 128] : tensor<2x2560x128x!qElemType> to tensor<1x2560x128x!qElemType> + %3 = IE.Slice %1 [1, 0, 0] [1, 2560, 128] : tensor<2x2560x128x!qElemType> to tensor<1x2560x128x!qElemType> + %4 = IE.Dequantize(%2) {dstElemType = f16} : tensor<1x2560x128x!qElemType> -> tensor<1x2560x128xf16> + %5 = IE.Dequantize(%3) {dstElemType = f16} : tensor<1x2560x128x!qElemType> -> tensor<1x2560x128xf16> + %6 = IE.Reshape(%4) {shape_value = [2560, 128]} : tensor<1x2560x128xf16> -> tensor<2560x128xf16> + %7 = IE.Reshape(%5) {shape_value = [2560, 128]} : tensor<1x2560x128xf16> -> tensor<2560x128xf16> + + %8 = IE.Slice %0 [0, 0] [1, 128] : tensor<1x256xf16> to tensor<1x128xf16> + %9 = IE.Slice %0 [0, 128] [1, 128] : tensor<1x256xf16> to tensor<1x128xf16> + + %10 = IE.FullyConnected(%8, %6) : tensor<1x128xf16>, tensor<2560x128xf16> -> tensor<1x2560xf16> + %11 = IE.FullyConnected(%9, %7) : tensor<1x128xf16>, tensor<2560x128xf16> -> tensor<1x2560xf16> + + %12 = IE.Reshape(%10) {shape_value = [1, 1, 1, 2560]} : tensor<1x2560xf16> -> tensor<1x1x1x2560xf16> + %13 = IE.Reshape(%11) {shape_value = [1, 1, 1, 2560]} : tensor<1x2560xf16> -> tensor<1x1x1x2560xf16> + + %14 = IE.Concat(%12, %13) {per_axis = #IE.Concat} : tensor<1x1x1x2560xf16>, tensor<1x1x1x2560xf16> -> tensor<1x2x1x2560xf16> + + %15 = IE.Slice %0 [0, 0] [1, 128] : tensor<1x256xf16> to tensor<1x128xf16> + + return %14, %15 
: tensor<1x2x1x2560xf16>, tensor<1x128xf16> + + // CHECK: [[SOURCE:%.+]] = IE.AffineReshape([[INPUT_0]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1]], shape_value = [1, 256]} : tensor<1x1x256xf16> -> tensor<1x256xf16> + // CHECK: [[RESHAPE:%.+]] = IE.Reshape([[SOURCE]]) {shape_value = [2, 128]} : tensor<1x256xf16> -> tensor<2x128xf16> + + // CHECK: [[WEIGHTS:%.+]] = IE.QuantizeCast([[INPUT_1]]) {dstElemType = !qElemType} : tensor<2x2560x128xsi4> -> tensor<2x2560x128x!qElemType> + // CHECK: [[WEIGHTS_DQ:%.+]] = IE.Dequantize([[WEIGHTS]]) {dstElemType = f16} : tensor<2x2560x128x!qElemType> -> tensor<2x2560x128xf16> + // CHECK: [[WEIGHTS_RESHAPE:%.+]] = IE.AffineReshape([[WEIGHTS_DQ]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1]], shape_value = [5120, 128]} : tensor<2x2560x128xf16> -> tensor<5120x128xf16> + + // CHECK: [[MATMUL:%.+]] = IE.FullyConnected([[RESHAPE]], [[WEIGHTS_RESHAPE]]) : tensor<2x128xf16>, tensor<5120x128xf16> -> tensor<2x5120xf16> + + // CHECK: [[SLICE_0:%.+]] = IE.Slice [[MATMUL]] [0, 0] [1, 2560] : tensor<2x5120xf16> to tensor<1x2560xf16> + // CHECK: [[RESHAPE_0:%.+]] = IE.Reshape([[SLICE_0]]) {shape_value = [1, 1, 1, 2560]} : tensor<1x2560xf16> -> tensor<1x1x1x2560xf16> + // CHECK: [[SLICE_1:%.+]] = IE.Slice [[MATMUL]] [1, 2560] [1, 2560] : tensor<2x5120xf16> to tensor<1x2560xf16> + // CHECK: [[RESHAPE_1:%.+]] = IE.Reshape([[SLICE_1]]) {shape_value = [1, 1, 1, 2560]} : tensor<1x2560xf16> -> tensor<1x1x1x2560xf16> + + // CHECK: [[CONCAT:%.+]] = IE.Concat([[RESHAPE_0]], [[RESHAPE_1]]) {per_axis = #IE.Concat} : tensor<1x1x1x2560xf16>, tensor<1x1x1x2560xf16> -> tensor<1x2x1x2560xf16> + // CHECK: [[OUT_SLICE:%.+]] = IE.Slice [[SOURCE]] [0, 0] [1, 128] : tensor<1x256xf16> to tensor<1x128xf16> + // CHECK: return [[CONCAT]], [[OUT_SLICE]] : tensor<1x2x1x2560xf16>, tensor<1x128xf16> +} + +// ----- + +!qElemType = !quant.uniform + // CHECK-LABEL: @MergeMatMulForDQPatternWithDequantizeAndRegroup // CHECK-SAME: [[INPUT_0:%.+]]: 
tensor<1x1x1536xf16>, // CHECK-SAME: [[INPUT_1:%.+]]: tensor<12x2560x128xsi4> diff --git a/tests/lit/NPU/dialect/IE/passes/optimize_reorders.mlir b/tests/lit/NPU/dialect/IE/passes/optimize_reorders.mlir index 573e801dc0..fcf7cec2f4 100644 --- a/tests/lit/NPU/dialect/IE/passes/optimize_reorders.mlir +++ b/tests/lit/NPU/dialect/IE/passes/optimize_reorders.mlir @@ -2630,3 +2630,113 @@ func.func @NotOptReorderWithGroupConvForPerChannelPostOp(%arg0: tensor<1x32x1152 // CHECK: [[IN_REORDER:%.+]] = IE.Reorder([[INPUT0]]) } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform + +// CHECK-LABEL: @ReorderAddQuantCastSlice +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x2x1x1024xf16> +func.func @ReorderAddQuantCastSlice(%arg0: tensor<1x2x1x1024xf16>) -> (tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}>) { + %0 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + %1 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + %2 = IE.Add(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType1, {order = #NHWC}> + %3 = IE.QuantizeCast(%2) {dstElemType = !qElemType} : tensor<1x2x1x1024x!qElemType1, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType, {order = #NHWC}> + %4 = IE.Slice %3 [0, 0, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + %5 = IE.Slice %3 [0, 1, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + return %4, %5 : tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + + // CHECK: [[LAYOUTCAST_0:%.+]] 
= IE.LayoutCast([[INPUT]]) {dst_order = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + // CHECK: [[LAYOUTCAST_1:%.+]] = IE.LayoutCast([[INPUT]]) {dst_order = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + // CHECK: [[ADD:%.+]] = IE.Add([[LAYOUTCAST_0]], [[LAYOUTCAST_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType1, {order = #NHWC}> + // CHECK: [[LAYOUTCAST_2:%.+]] = IE.LayoutCast([[ADD]]) {dst_order = #NCHW} : tensor<1x2x1x1024x!qElemType1, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType1> + // CHECK: [[QUANTIZECAST:%.+]] = IE.QuantizeCast([[LAYOUTCAST_2]]) {dstElemType = !qElemType} : tensor<1x2x1x1024x!qElemType1> -> tensor<1x2x1x1024x!qElemType> + // CHECK: [[SLICE_0:%.+]] = IE.Slice [[QUANTIZECAST]] [0, 0, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType> to tensor<1x1x1x1024x!qElemType> + // CHECK: [[REORDER_0:%.+]] = IE.Reorder([[SLICE_0]]) {dstOrder = #NHWC} : tensor<1x1x1x1024x!qElemType> -> tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + // CHECK: [[SLICE_1:%.+]] = IE.Slice [[QUANTIZECAST]] [0, 1, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType> to tensor<1x1x1x1024x!qElemType> + // CHECK: [[REORDER_1:%.+]] = IE.Reorder([[SLICE_1]]) {dstOrder = #NHWC} : tensor<1x1x1x1024x!qElemType> -> tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + // CHECK: return [[REORDER_0]], [[REORDER_1]] : tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform + +// CHECK-LABEL: @ReorderSameInputsAddSlice +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x2x1x1024xf16> +func.func @ReorderSameInputsAddSlice(%arg0: tensor<1x2x1x1024xf16>) -> (tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, 
tensor<1x1x1x1024x!qElemType, {order = #NHWC}>) { + %0 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + %1 = IE.Add(%0, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType, {order = #NHWC}> + %2 = IE.Slice %1 [0, 0, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + %3 = IE.Slice %1 [0, 1, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + return %2, %3 : tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + + // CHECK: [[LAYOUTCAST_0:%.+]] = IE.LayoutCast([[INPUT]]) {dst_order = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + // CHECK: [[ADD:%.+]] = IE.Add([[LAYOUTCAST_0]], [[LAYOUTCAST_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType, {order = #NHWC}> + // CHECK: [[LAYOUTCAST_1:%.+]] = IE.LayoutCast([[ADD]]) {dst_order = #NCHW} : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType> + // CHECK: [[SLICE_0:%.+]] = IE.Slice [[LAYOUTCAST_1]] [0, 0, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType> to tensor<1x1x1x1024x!qElemType> + // CHECK: [[REORDER_0:%.+]] = IE.Reorder([[SLICE_0]]) {dstOrder = #NHWC} : tensor<1x1x1x1024x!qElemType> -> tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + // CHECK: [[SLICE_1:%.+]] = IE.Slice [[LAYOUTCAST_1]] [0, 1, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType> to tensor<1x1x1x1024x!qElemType> + // CHECK: [[REORDER_1:%.+]] = IE.Reorder([[SLICE_1]]) {dstOrder = #NHWC} : tensor<1x1x1x1024x!qElemType> -> tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + // CHECK: return [[REORDER_0]], [[REORDER_1]] : 
tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform + +// CHECK-LABEL: @NotOptReorderAddSliceDueToBranch +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x2x1x1024xf16> +func.func @NotOptReorderAddSliceDueToBranch(%arg0: tensor<1x2x1x1024xf16>) -> (tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}>) { + %0 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x2x1x1024xf16> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + %1 = IE.Add(%0, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024x!qElemType, {order = #NHWC}> + %2 = IE.Slice %1 [0, 0, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + %3 = IE.Slice %1 [0, 1, 0, 0] [1, 1, 1, 1024] : tensor<1x2x1x1024x!qElemType, {order = #NHWC}> to tensor<1x1x1x1024x!qElemType, {order = #NHWC}> + %4 = IE.Add(%0, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1x1024xf16, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> -> tensor<1x2x1x1024xf16, {order = #NHWC}> + return %2, %3, %4 : tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x1x1x1024x!qElemType, {order = #NHWC}>, tensor<1x2x1x1024xf16, {order = #NHWC}> + + // CHECK: IE.Reorder([[INPUT]]) +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform + +// CHECK-LABEL: @NotOptReorderAddSliceDueToNonTrivialReorder +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x1x1024xf16> +func.func @NotOptReorderAddSliceDueToNonTrivialReorder(%arg0: tensor<1x16x1x1024xf16>) -> (tensor<1x8x1x1024x!qElemType, {order = #NHWC}>, 
tensor<1x8x1x1024x!qElemType, {order = #NHWC}>) { + %0 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x16x1x1024xf16> -> tensor<1x16x1x1024xf16, {order = #NHWC}> + %1 = IE.Add(%0, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x16x1x1024xf16, {order = #NHWC}>, tensor<1x16x1x1024xf16, {order = #NHWC}> -> tensor<1x16x1x1024x!qElemType, {order = #NHWC}> + %2 = IE.Slice %1 [0, 0, 0, 0] [1, 8, 1, 1024] : tensor<1x16x1x1024x!qElemType, {order = #NHWC}> to tensor<1x8x1x1024x!qElemType, {order = #NHWC}> + %3 = IE.Slice %1 [0, 8, 0, 0] [1, 8, 1, 1024] : tensor<1x16x1x1024x!qElemType, {order = #NHWC}> to tensor<1x8x1x1024x!qElemType, {order = #NHWC}> + return %2, %3 : tensor<1x8x1x1024x!qElemType, {order = #NHWC}>, tensor<1x8x1x1024x!qElemType, {order = #NHWC}> + + // CHECK: IE.Reorder([[INPUT]]) +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform + +// CHECK-LABEL: @NotOptReorderAddSliceDueToWorseSlice +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1024x1x2xf16> +func.func @NotOptReorderAddSliceDueToWorseSlice(%arg0: tensor<1x1024x1x2xf16>) -> (tensor<1x1024x1x1x!qElemType, {order = #NHWC}>, tensor<1x1024x1x1x!qElemType, {order = #NHWC}>) { + %0 = IE.Reorder(%arg0) {dstOrder = #NHWC} : tensor<1x1024x1x2xf16> -> tensor<1x1024x1x2xf16, {order = #NHWC}> + %1 = IE.Add(%0, %0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1024x1x2xf16, {order = #NHWC}>, tensor<1x1024x1x2xf16, {order = #NHWC}> -> tensor<1x1024x1x2x!qElemType, {order = #NHWC}> + %2 = IE.Slice %1 [0, 0, 0, 0] [1, 1024, 1, 1] : tensor<1x1024x1x2x!qElemType, {order = #NHWC}> to tensor<1x1024x1x1x!qElemType, {order = #NHWC}> + %3 = IE.Slice %1 [0, 0, 0, 1] [1, 1024, 1, 1] : tensor<1x1024x1x2x!qElemType, {order = #NHWC}> to tensor<1x1024x1x1x!qElemType, {order = #NHWC}> + return %2, %3 : tensor<1x1024x1x1x!qElemType, {order = #NHWC}>, tensor<1x1024x1x1x!qElemType, {order = #NHWC}> + + // CHECK: 
IE.Reorder([[INPUT]]) +} diff --git a/tests/lit/NPU/dialect/IE/passes/optimize_slice_expand.mlir b/tests/lit/NPU/dialect/IE/passes/optimize_slice_expand.mlir index faf32bf2f0..e1f60b411a 100644 --- a/tests/lit/NPU/dialect/IE/passes/optimize_slice_expand.mlir +++ b/tests/lit/NPU/dialect/IE/passes/optimize_slice_expand.mlir @@ -1210,3 +1210,31 @@ func.func @notFuseSliceSoftmaxExpandWithOffsetOfSliceNotAllZero(%arg0: tensor<1x // CHECK: return [[OUTPUT]] } } + +// ----- + + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform +!qElemType2 = !quant.uniform + +// CHECK-LABEL: @OptimizeSliceConcatExpandForQuantizedType +module @OptimizeSliceConcatExpandForQuantizedType { +// CHECK-LABEL: @OptimizeSliceConcatExpandForQuantizedType +// CHECK-SAME: [[INPUT1:%arg0]]: tensor<1x6x32x56x!qElemType>, +// CHECK-SAME: [[INPUT2:%arg1]]: tensor<1x6x32x56x!qElemType>) -> tensor<2x6x32x56x!qElemType> +func.func @OptimizeSliceConcatExpandForQuantizedType(%arg0: tensor<1x6x32x56x!qElemType>, %arg1: tensor<1x6x32x56x!qElemType>) -> tensor<2x6x32x56x!qElemType> { + + %0 = IE.Slice %arg0 [0, 0, 0, 0] [1, 3, 32, 56] : tensor<1x6x32x56x!qElemType> to tensor<1x3x32x56x!qElemType1> + %1 = IE.Slice %arg1 [0, 0, 0, 0] [1, 3, 32, 56] : tensor<1x6x32x56x!qElemType> to tensor<1x3x32x56x!qElemType1> + %2 = IE.Concat(%0, %1) {per_axis = #IE.Concat} : tensor<1x3x32x56x!qElemType1>, tensor<1x3x32x56x!qElemType1> -> tensor<2x3x32x56x!qElemType1> + %3 = IE.Expand(%2) {pads_begin = [0, 0, 0, 0], pads_end = [0, 3, 0, 0]} : tensor<2x3x32x56x!qElemType1> -> tensor<2x6x32x56x!qElemType2> + %4 = IE.QuantizeCast(%3) {dstElemType = !qElemType} : tensor<2x6x32x56x!qElemType2> -> tensor<2x6x32x56x!qElemType> + return %4 : tensor<2x6x32x56x!qElemType> + + // CHECK: [[VAR0:%.+]] = IE.Concat([[INPUT1]], [[INPUT2]]) + // CHECK-SAME: tensor<1x6x32x56x!qElemType>, tensor<1x6x32x56x!qElemType> -> tensor<2x6x32x56x!qElemType> + // CHECK: return [[VAR0]] : tensor<2x6x32x56x!qElemType> + +} +} diff --git 
a/tests/lit/NPU/dialect/IE/passes/optimize_slice_with_stride.mlir b/tests/lit/NPU/dialect/IE/passes/optimize_slice_with_stride.mlir index e9c2fa6dbc..30cfe44f9e 100644 --- a/tests/lit/NPU/dialect/IE/passes/optimize_slice_with_stride.mlir +++ b/tests/lit/NPU/dialect/IE/passes/optimize_slice_with_stride.mlir @@ -275,6 +275,44 @@ func.func @OptimizeSliceConcat(%arg0: tensor<1x1024x32x32xf16, {order = #NHWC}>) #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @NotOptimizeSliceConcatWithTwoUsers +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1024x32x32xf16, {order = #NHWC} +func.func @NotOptimizeSliceConcatWithTwoUsers(%arg0: tensor<1x1024x32x32xf16, {order = #NHWC}>) + -> (tensor<1x512x32x32xf16, {order = #NHWC}>, tensor<1x511x16x64xf16, {order = #NHWC}>) { + %WEIGHTS = const.Declare tensor<512x1024x3x3xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<512x1024x3x3xf16>, [#const.Reorder<#NHWC>] + %CST_0 = const.Declare tensor<1x1x32x32xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<1x1x32x32xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %WEIGHTS2 = const.Declare tensor<512x512x3x3xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<512x512x3x3xf16>, [#const.Reorder<#NHWC>] + %CONV = IE.Convolution(%arg0, %WEIGHTS) { + dilations = [1, 1], + pads_begin = [1, 1], pads_end = [1, 1], + strides = [1, 1] + } : tensor<1x1024x32x32xf16, {order = #NHWC}>, tensor<512x1024x3x3xf16, {order = #NHWC}> -> tensor<1x512x32x32xf16, {order = #NHWC}> + %SLICE = IE.Slice %CONV [0, 0, 0, 0] [1, 511, 32, 32] : tensor<1x512x32x32xf16, {order = #NHWC}> to tensor<1x511x32x32xf16, {order = #NHWC}> + %CONCAT = IE.Concat(%SLICE, %CST_0) { + static_offsets = [[0, 0, 0, 0], [0, 511, 0, 0]]} : tensor<1x511x32x32xf16, {order = #NHWC}>, tensor<1x1x32x32xf16, {order = #NHWC}> -> tensor<1x512x32x32xf16, {order = #NHWC}> + %RESHAPE = IE.AffineReshape(%SLICE) { + dim_mapping = [[0], [1], [2, 3], [3]], shape_value = [1, 511, 16, 64] } : tensor<1x511x32x32xf16, {order = 
#NHWC}> -> tensor<1x511x16x64xf16, {order = #NHWC}> + + return %CONCAT, %RESHAPE : tensor<1x512x32x32xf16, {order = #NHWC}>, tensor<1x511x16x64xf16, {order = #NHWC}> + + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<512x1024x3x3xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<512x1024x3x3xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[CST_0:%.+]] = const.Declare tensor<1x1x32x32xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<1x1x32x32xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + // CHECK: [[CONV:%.+]] = IE.Convolution([[INPUT]], [[WEIGHTS]]) { + // CHECK-SAME: dilations = [1, 1], pads_begin = [1, 1], pads_end = [1, 1], strides = [1, 1] + // CHECK: [[SLICE:%.+]] = IE.Slice [[CONV]] [0, 0, 0, 0] [1, 511, 32, 32] : tensor<1x512x32x32xf16, {order = #NHWC}> to tensor<1x511x32x32xf16, {order = #NHWC}> + + // CHECK: [[CONCAT:%.+]] = IE.Concat([[SLICE]], [[CST_0]]) { + // CHECK-SAME{LITERAL}: static_offsets = [[0, 0, 0, 0], [0, 511, 0, 0]]} : tensor<1x511x32x32xf16, {order = #NHWC}>, tensor<1x1x32x32xf16, {order = #NHWC}> -> tensor<1x512x32x32xf16, {order = #NHWC}> + // CHECK: [[RESHAPE:%.+]] = IE.AffineReshape([[SLICE]]) { + // CHECK-SAME{LITERAL}: dim_mapping = [[0], [1], [2, 3], [3]], shape_value = [1, 511, 16, 64]} : tensor<1x511x32x32xf16, {order = #NHWC}> -> tensor<1x511x16x64xf16, {order = #NHWC}> + + // CHECK: return [[CONCAT]], [[RESHAPE]] : tensor<1x512x32x32xf16, {order = #NHWC}>, tensor<1x511x16x64xf16, {order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + // CHECK-LABEL: @NotOptimizeSliceConcatIfNotLowestDim func.func @NotOptimizeSliceConcatIfNotLowestDim(%arg0: tensor<1x1024x32x32xf16, {order = #NHWC}>) -> tensor<1x512x32x32xf16, {order = #NHWC}> { %WEIGHTS = const.Declare tensor<512x1024x3x3xf16, {order = #NHWC}> = dense<1.250000e-01> : tensor<512x1024x3x3xf16>, [#const.Reorder<#NHWC>] diff --git a/tests/lit/NPU/dialect/IE/passes/pad_dynamic_inputs.mlir 
b/tests/lit/NPU/dialect/IE/passes/pad_dynamic_inputs.mlir index 32737e058b..e16a088683 100644 --- a/tests/lit/NPU/dialect/IE/passes/pad_dynamic_inputs.mlir +++ b/tests/lit/NPU/dialect/IE/passes/pad_dynamic_inputs.mlir @@ -520,8 +520,8 @@ func.func @NoStridedSliceAfterStaticSubgraph( {-# dialect_resources: { builtin: { - ov: "0x1000000000000000", - ov_2: "0x10000000000000AB" + vpux_ow_1: "0x1000000000000000", + vpux_ow_2: "0x10000000000000AB" } } #-} @@ -532,8 +532,8 @@ func.func @PadTwoDynamicInputsSubgraph( ) -> tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }> { %cst = const.Declare tensor<1xsi64> = dense<128> : tensor<1xsi64> %cst_0 = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> - %cst_1 = const.Declare tensor<2x128xf32> = dense_resource : tensor<2x128xf32> - %cst_2 = const.Declare tensor<8x128xf32> = dense_resource : tensor<8x128xf32> + %cst_1 = const.Declare tensor<2x128xf32> = dense_resource : tensor<2x128xf32> + %cst_2 = const.Declare tensor<8x128xf32> = dense_resource : tensor<8x128xf32> %0 = IE.Gather(%cst_2, %IN) {axis_value = 0 : i64, batch_dims = 0 : i64, indices_rank = 2 : i64} : tensor<8x128xf32>, tensor<1x?xsi64, {bounds = #const.OpaqueI64Elements<[1, 64]> : tensor<2xsi64>, order = #NC}> -> tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }> %1 = IE.Gather(%cst_1, %IN) {axis_value = 0 : i64, batch_dims = 0 : i64, indices_rank = 2 : i64} : tensor<2x128xf32>, tensor<1x?xsi64, {bounds = #const.OpaqueI64Elements<[1, 64]> : tensor<2xsi64>, order = #NC}> -> tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }> %2 = IE.Add(%0, %1) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }>, tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }> -> 
tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW }> @@ -546,8 +546,8 @@ func.func @PadTwoDynamicInputsSubgraph( // CHECK: [[IN:%.+]]: tensor<1x?xsi64, {bounds = #const.OpaqueI64Elements<[1, 64]> : tensor<2xsi64>, order = #NC}> // CHECK: [[CST:%.+]] = const.Declare tensor<1xsi64> = dense<128> : tensor<1xsi64> // CHECK: [[CST_0:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> - // CHECK: [[CST_1:%.+]] = const.Declare tensor<2x128xf32> = dense_resource : tensor<2x128xf32> - // CHECK: [[CST_2:%.+]] = const.Declare tensor<8x128xf32> = dense_resource : tensor<8x128xf32> + // CHECK: [[CST_1:%.+]] = const.Declare tensor<2x128xf32> = dense_resource : tensor<2x128xf32> + // CHECK: [[CST_2:%.+]] = const.Declare tensor<8x128xf32> = dense_resource : tensor<8x128xf32> // CHECK: [[GATHER_0:%.+]] = IE.Gather([[CST_2]], [[IN]]) {axis_value = 0 : i64, batch_dims = 0 : i64, indices_rank = 2 : i64} : tensor<8x128xf32>, tensor<1x?xsi64, {bounds = #const.OpaqueI64Elements<[1, 64]> : tensor<2xsi64>, order = #NC}> -> tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW}> // CHECK: [[GATHER_1:%.+]] = IE.Gather([[CST_1]], [[IN]]) {axis_value = 0 : i64, batch_dims = 0 : i64, indices_rank = 2 : i64} : tensor<2x128xf32>, tensor<1x?xsi64, {bounds = #const.OpaqueI64Elements<[1, 64]> : tensor<2xsi64>, order = #NC}> -> tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW}> // CHECK: [[EXPAND_0:%.+]] = IE.DynamicExpand([[GATHER_0]]) : tensor<1x?x128xf32, {bounds = #const.OpaqueI64Elements<[1, 64, 128]> : tensor<3xsi64>, order = #CHW}> -> tensor<1x64x128xf32> diff --git a/tests/lit/NPU/dialect/IE/passes/process_asymmetric_zero_points_for_matmul.mlir b/tests/lit/NPU/dialect/IE/passes/process_asymmetric_zero_points_for_matmul.mlir index 6ae629c7dc..2399dca8ea 100644 --- 
a/tests/lit/NPU/dialect/IE/passes/process_asymmetric_zero_points_for_matmul.mlir +++ b/tests/lit/NPU/dialect/IE/passes/process_asymmetric_zero_points_for_matmul.mlir @@ -27,7 +27,7 @@ func.func @FixZeroPointForMatmul(%arg0: tensor<2x16x32xf16>) -> tensor<2x16x64xf // CHECK: [[CST_DIFF:%.+]] = const.Declare tensor<1x64x1x1xf16> = dense<4.000000e+00> : tensor<1x64x1x1xf16> // CHECK: [[CST0:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<0.000000e+00> : tensor<1x1x1x1xf16> - // CHECK: [[CST1:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<2.550000e+02> : tensor<1x1x1x1xf16> + // CHECK: [[CST1:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<2.550000e+02> : tensor<1x1x1x1xf16> // CHECK: [[CST2:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<-2.560000e+02> : tensor<1x1x1x1xf16> // CHECK: [[CST3:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<2.540000e+02> : tensor<1x1x1x1xf16> // CHECK: [[CST4:%.+]] = const.Declare tensor<1x1x32x64xf16> = dense<1.000000e+00> : tensor<32x64xf16>, [#const.Reshape<[1, 1, 32, 64]>] @@ -37,17 +37,17 @@ func.func @FixZeroPointForMatmul(%arg0: tensor<2x16x32xf16>) -> tensor<2x16x64xf // CHECK-SAME: tensor<1x1x32x64xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> // CHECK-SAME: -> tensor<1x1x32x64xf16> - // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[FQ]]) {order_value = #NCWH} : tensor<1x1x32x64xf16> -> tensor<1x1x64x32xf16> - // CHECK: [[RESHAPE1:%.+]] = IE.AffineReshape([[ARG0]]) + // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[FQ]]) {order_value = #NCWH} : tensor<1x1x32x64xf16> -> tensor<1x1x64x32xf16> + // CHECK: [[RESHAPE1:%.+]] = IE.AffineReshape([[ARG0]]) // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1, 2, 3]], shape_value = [32, 32, 1, 1]} : tensor<2x16x32xf16> -> tensor<32x32x1x1xf16> - // CHECK: [[RESHAPE2:%.+]] = IE.AffineReshape([[TRANSPOSE]]) + // CHECK: [[RESHAPE2:%.+]] = IE.AffineReshape([[TRANSPOSE]]) // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [0], [1, 2, 3]], 
shape_value = [64, 32, 1, 1]} : tensor<1x1x64x32xf16> -> tensor<64x32x1x1xf16> - // CHECK: [[TRANSPOSE2:%.+]] = IE.Transpose([[RESHAPE1]]) {order_value = #map} : tensor<32x32x1x1xf16> -> tensor<1x32x32x1xf16> - // CHECK: [[RESHAPE3:%.+]] = IE.AffineReshape([[TRANSPOSE2]]) + // CHECK: [[TRANSPOSE2:%.+]] = IE.Transpose([[RESHAPE1]]) {order_value = #map} : tensor<32x32x1x1xf16> -> tensor<1x32x32x1xf16> + // CHECK: [[RESHAPE3:%.+]] = IE.AffineReshape([[TRANSPOSE2]]) // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2, 3], [3]], shape_value = [1, 32, 8, 4]} : tensor<1x32x32x1xf16> -> tensor<1x32x8x4xf16> - // CHECK: [[REDUCE_SUM:%.+]] = IE.ReduceSum([[RESHAPE3]]) {axes_value = [1], keep_dims} : tensor<1x32x8x4xf16> -> tensor<1x1x8x4xf16> - // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[REDUCE_SUM]], [[CST_DIFF]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x8x4xf16>, tensor<1x64x1x1xf16> -> tensor<1x64x8x4xf16> + // CHECK: [[REDUCE_SUM:%.+]] = IE.ReduceSum([[RESHAPE3]]) {axes_value = [1], keep_dims} : tensor<1x32x8x4xf16> -> tensor<1x1x8x4xf16> + // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[REDUCE_SUM]], [[CST_DIFF]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x8x4xf16>, tensor<1x64x1x1xf16> -> tensor<1x64x8x4xf16> // CHECK: [[CONV:%.+]] = IE.Convolution([[RESHAPE3]], [[RESHAPE2]]) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], // CHECK-SAME{LITERAL}: strides = [1, 1]} : tensor<1x32x8x4xf16>, tensor<64x32x1x1xf16> -> tensor<1x64x8x4xf16> // CHECK: [[FIXED_CONV:%.+]] = IE.Add([[CONV]], [[MULTIPLY]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x64x8x4xf16>, tensor<1x64x8x4xf16> -> tensor<1x64x8x4xf16> @@ -116,3 +116,64 @@ func.func @FixZeroPointForMatmulPerChannelQuantized(%arg0: tensor<1x1x2048xf16>) // CHECK: return [[RESHAPE4]] : tensor<1x1x2xf16> } +// ----- + +!qElemType = !quant.uniform +// CHECK-LABEL: @FixZeroPointForMatmulFQToWeightsPattern +// CHECK-SAME: ([[ARG0:%.+]]: tensor<2x16x32xf16> +func.func 
@FixZeroPointForMatmulFQToWeightsPattern(%arg0: tensor<2x16x32xf16>) -> tensor<2x16x64xf16> { + %cst = const.Declare tensor<1x1x1x1xf16> = dense<0.000000e+00> : tensor<1x1x1x1xf16> + %cst_0 = const.Declare tensor<1x1x1x1xf16> = dense<2.550000e+02> : tensor<1x1x1x1xf16> + %cst_1 = const.Declare tensor<1x1x1x1xf16> = dense<-2.520000e+02> : tensor<1x1x1x1xf16> + %cst_2 = const.Declare tensor<1x1x1x1xf16> = dense<2.580000e+02> : tensor<1x1x1x1xf16> + %cst_3 = const.Declare tensor<64x32x1x1xf16> = dense<1.0> : tensor<64x32xf16>, [#const.Reshape<[64, 32, 1, 1]>] + %0 = IE.FakeQuantize(%cst_3, %cst, %cst_0, %cst_1, %cst_2) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} + : tensor<64x32x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> + -> tensor<64x32x1x1xf16> + %1 = IE.AffineReshape(%arg0) {dim_mapping = [[0], [0], [1, 2, 3]], shape_value = [32, 32, 1, 1]} + : tensor<2x16x32xf16> -> tensor<32x32x1x1xf16> + %2 = IE.Convolution(%1, %0) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} + : tensor<32x32x1x1xf16>, tensor<64x32x1x1xf16> -> tensor<32x64x1x1xf16> + %3 = IE.AffineReshape(%2) {dim_mapping = [[0], [1], [1], [1]], shape_value = [32, 64]} + : tensor<32x64x1x1xf16> -> tensor<32x64xf16> + %4 = IE.AffineReshape(%3) {dim_mapping = [[0, 1], [2]], shape_value = [2, 16, 64]} + : tensor<32x64xf16> -> tensor<2x16x64xf16> + return %4 : tensor<2x16x64xf16> + + // CHECK: [[CST_DIFF:%.+]] = const.Declare tensor<1x64x1x1xf16> = dense<4.000000e+00> : tensor<1x64x1x1xf16> + + // CHECK: [[CST0:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<0.000000e+00> : tensor<1x1x1x1xf16> + // CHECK: [[CST1:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<2.550000e+02> : tensor<1x1x1x1xf16> + // CHECK: [[CST2:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<-2.560000e+02> : tensor<1x1x1x1xf16> + // CHECK: [[CST3:%.+]] = const.Declare tensor<1x1x1x1xf16> = dense<2.540000e+02> : tensor<1x1x1x1xf16> + // 
CHECK: [[CST4:%.+]] = const.Declare tensor<64x32x1x1xf16> = dense<1.000000e+00> : tensor<64x32xf16>, [#const.Reshape<[64, 32, 1, 1]>] + + // CHECK: [[FQ:%.+]] = IE.FakeQuantize([[CST4]], [[CST0]], [[CST1]], [[CST2]], [[CST3]]) + // CHECK-SAME: {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : + // CHECK-SAME: tensor<64x32x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> + // CHECK-SAME: -> tensor<64x32x1x1xf16> + + // CHECK: [[RESHAPE1:%.+]] = IE.AffineReshape([[ARG0]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1, 2, 3]], shape_value = [32, 32, 1, 1]} : tensor<2x16x32xf16> -> tensor<32x32x1x1xf16> + + // CHECK: [[TRANSPOSE2:%.+]] = IE.Transpose([[RESHAPE1]]) {order_value = #map} : tensor<32x32x1x1xf16> -> tensor<1x32x32x1xf16> + // CHECK: [[RESHAPE3:%.+]] = IE.AffineReshape([[TRANSPOSE2]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2, 3], [3]], shape_value = [1, 32, 8, 4]} : tensor<1x32x32x1xf16> -> tensor<1x32x8x4xf16> + + // CHECK: [[REDUCE_SUM:%.+]] = IE.ReduceSum([[RESHAPE3]]) {axes_value = [1], keep_dims} : tensor<1x32x8x4xf16> -> tensor<1x1x8x4xf16> + // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[REDUCE_SUM]], [[CST_DIFF]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x8x4xf16>, tensor<1x64x1x1xf16> -> tensor<1x64x8x4xf16> + + // CHECK: [[CONV:%.+]] = IE.Convolution([[RESHAPE3]], [[FQ]]) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], + // CHECK-SAME{LITERAL}: strides = [1, 1]} : tensor<1x32x8x4xf16>, tensor<64x32x1x1xf16> -> tensor<1x64x8x4xf16> + // CHECK: [[FIXED_CONV:%.+]] = IE.Add([[CONV]], [[MULTIPLY]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x64x8x4xf16>, tensor<1x64x8x4xf16> -> tensor<1x64x8x4xf16> + + // CHECK: [[RESHAPE4:%.+]] = IE.AffineReshape([[FIXED_CONV]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2], [2, 3]], shape_value = [1, 64, 32, 1]} : tensor<1x64x8x4xf16> -> tensor<1x64x32x1xf16> + // CHECK: [[TRANSPOSE3:%.+]] = 
IE.Transpose([[RESHAPE4]]) {order_value = #map} : tensor<1x64x32x1xf16> -> tensor<32x64x1x1xf16> + // CHECK: [[RESHAPE5:%.+]] = IE.AffineReshape([[TRANSPOSE3]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [1], [1]], shape_value = [32, 64]} : tensor<32x64x1x1xf16> -> tensor<32x64xf16> + // CHECK: [[RESHAPE6:%.+]] = IE.AffineReshape([[RESHAPE5]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0, 1], [2]], shape_value = [2, 16, 64]} : tensor<32x64xf16> -> tensor<2x16x64xf16> + + // CHECK: return [[RESHAPE6]] : tensor<2x16x64xf16> +} diff --git a/tests/lit/NPU/dialect/IE/passes/propagate_mem_permute_through_eltwise.mlir b/tests/lit/NPU/dialect/IE/passes/propagate_mem_permute_through_eltwise.mlir index 3eee9ac8db..7d361ef4d0 100644 --- a/tests/lit/NPU/dialect/IE/passes/propagate_mem_permute_through_eltwise.mlir +++ b/tests/lit/NPU/dialect/IE/passes/propagate_mem_permute_through_eltwise.mlir @@ -562,9 +562,9 @@ func.func @PropagatePermuteAddPermuteReserveShapeCast(%arg0 : tensor<1x129x16x48 // CHECK: [[PERMUTE_0:%.+]] = IE.PermuteQuantize([[INPUT]]) {dstElemType = f16, dst_order = #NHWC, mem_perm = #NHWC, pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0]} : tensor<1x129x16x48xf16> -> tensor<1x129x16x48xf16, {order = #NHWC}> // CHECK: [[PERMUTE_1:%.+]] = IE.MemPermute([[PERMUTE_0]]) {dst_order = #NHWC, mem_perm = #map} : tensor<1x129x16x48xf16, {order = #NHWC}> -> tensor<129x48x1x16xf16, {order = #NHWC}> - // CHECK: [[SHAPE_CAST_0:%.+]] = IE.ShapeCast {shape = [1, 48, 129, 16]} inputs([[PERMUTE_1]] : tensor<129x48x1x16xf16, {order = #NHWC}>) -> tensor<1x48x129x16xf16, {order = #NHWC}> - // CHECK: [[ADD:%.+]] = IE.Add([[SHAPE_CAST_0]], [[SHAPE_CAST_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x48x129x16xf16, {order = #NHWC}>, tensor<1x48x129x16xf16, {order = #NHWC}> -> tensor<1x48x129x16x!qElemType, {order = #NHWC}> - // CHECK: [[SHAPE_CAST_1:%.+]] = IE.ShapeCast {shape = [129, 48, 1, 16]} inputs([[ADD]] : tensor<1x48x129x16x!qElemType, {order = #NHWC}>) 
-> tensor<129x48x1x16x!qElemType, {order = #NHWC}> + // CHECK: [[SHAPE_CAST_0:%.+]] = IE.ShapeCast {shape = [1, 6192, 1, 16]} inputs([[PERMUTE_1]] : tensor<129x48x1x16xf16, {order = #NHWC}>) -> tensor<1x6192x1x16xf16, {order = #NHWC}> + // CHECK: [[ADD:%.+]] = IE.Add([[SHAPE_CAST_0]], [[SHAPE_CAST_0]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x6192x1x16xf16, {order = #NHWC}>, tensor<1x6192x1x16xf16, {order = #NHWC}> -> tensor<1x6192x1x16x!qElemType, {order = #NHWC}> + // CHECK: [[SHAPE_CAST_1:%.+]] = IE.ShapeCast {shape = [129, 48, 1, 16]} inputs([[ADD]] : tensor<1x6192x1x16x!qElemType, {order = #NHWC}>) -> tensor<129x48x1x16x!qElemType, {order = #NHWC}> // CHECK: [[PERMUTE_2:%.+]] = IE.PermuteCast([[SHAPE_CAST_1]]) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<129x48x1x16x!qElemType, {order = #NHWC}> -> tensor<129x1x16x48x!qElemType> // CHECK: return [[PERMUTE_2]] : tensor<129x1x16x48x!qElemType> @@ -1939,3 +1939,130 @@ func.func @DoNotPropagateMemPermuteWithMultipleUsers(%arg0: tensor<1x4x1600x2560 // CHECK-SAME: : tensor<1x4x1600x2560xf16, {order = #NHWC}> -> tensor<1x4x1600x2560xf16> // CHECK: return [[OUT_MEM_PERMUTE]] : tensor<1x4x1600x2560xf16> } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +// CHECK-LABEL: @NotPropagatePermuteThroughMultiply +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x32x64x256xf16>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x16x128x256xf16> +func.func @NotPropagatePermuteThroughMultiply(%arg0 : tensor<1x32x64x256xf16>, %arg1 : tensor<1x16x128x256xf16>) -> tensor<1x16x128x256xf16> { + %LHS_MEM_PERMUTE = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #NHWC} : tensor<1x32x64x256xf16> -> tensor<1x64x256x32xf16> + %LHS_SHAPE_CAST = IE.ShapeCast {shape = [1, 512, 64, 16]} inputs(%LHS_MEM_PERMUTE : tensor<1x64x256x32xf16>) -> tensor<1x512x64x16xf16> + + %RHS_MEM_PERMUTE = 
IE.MemPermute(%arg1) {dst_order = #NCHW, mem_perm = #NHWC} : tensor<1x16x128x256xf16> -> tensor<1x128x256x16xf16> + %RHS_SHAPE_CAST = IE.ShapeCast {shape = [1, 512, 64, 16]} inputs(%RHS_MEM_PERMUTE : tensor<1x128x256x16xf16>) -> tensor<1x512x64x16xf16> + + %MULTIPLY = IE.Multiply(%LHS_SHAPE_CAST, %RHS_SHAPE_CAST) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x512x64x16xf16>, tensor<1x512x64x16xf16> -> tensor<1x512x64x16xf16> + %OUT_SHAPE_CAST = IE.ShapeCast {shape = [1, 128, 256, 16]} inputs(%MULTIPLY : tensor<1x512x64x16xf16>) -> tensor<1x128x256x16xf16> + %OUT_MEM_PERMUTE = IE.MemPermute(%OUT_SHAPE_CAST) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<1x128x256x16xf16> -> tensor<1x16x128x256xf16> + + return %OUT_MEM_PERMUTE : tensor<1x16x128x256xf16> + + // CHECK: [[LHS_IN_MEM_PERMUTE:%.+]] = IE.MemPermute + // CHECK: [[LHS_SHAPE_CAST:%.+]] = IE.ShapeCast + + // CHECK: [[RHS_IN_MEM_PERMUTE:%.+]] = IE.MemPermute + // CHECK: [[RHS_SHAPE_CAST:%.+]] = IE.ShapeCast + + // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[LHS_SHAPE_CAST]], [[RHS_SHAPE_CAST]]) + + // CHECK: [[OUT_SHAPE_CAST:%.+]] = IE.ShapeCast + // CHECK: [[OUT_MEM_PERMUTE:%.*]] = IE.MemPermute +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +// CHECK-LABEL: @PropagatePermuteWhenDimNIsNotOneNeedBroadcast +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<4x16x32x56xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<4x16x1x1xf16, {order = #NHWC}> +func.func @PropagatePermuteWhenDimNIsNotOneNeedBroadcast(%arg0 : tensor<4x16x32x56xf16, {order = #NHWC}>, %arg1 : tensor<4x16x1x1xf16, {order = #NHWC}>) -> tensor<4x16x32x56xf16, {order = #NHWC}> { + %0 = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x16x32x56xf16, {order = #NHWC}> -> tensor<4x16x32x56xf16> + %1 = IE.ShapeCast {shape = [1, 64, 32, 56]} inputs(%0 : tensor<4x16x32x56xf16>) -> 
tensor<1x64x32x56xf16> + %2 = IE.MemPermute(%arg1) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x16x1x1xf16, {order = #NHWC}> -> tensor<4x16x1x1xf16> + %3 = IE.ShapeCast {shape = [1, 64, 1, 1]} inputs(%2 : tensor<4x16x1x1xf16>) -> tensor<1x64x1x1xf16> + %4 = IE.Multiply(%1, %3) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x64x32x56xf16>, tensor<1x64x1x1xf16> -> tensor<1x64x32x56xf16> + %5 = IE.ShapeCast {shape = [4, 16, 32, 56]} inputs(%4 : tensor<1x64x32x56xf16>) -> tensor<4x16x32x56xf16> + %6 = IE.MemPermute(%5) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<4x16x32x56xf16> -> tensor<4x16x32x56xf16, {order = #NHWC}> + + return %6 : tensor<4x16x32x56xf16, {order = #NHWC}> + + // CHECK: [[PERMUTE_0:%.+]] = IE.MemPermute([[INPUT_0]]) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x16x32x56xf16, {order = #NHWC}> -> tensor<4x16x32x56xf16> + // CHECK: [[PERMUTE_1:%.+]] = IE.MemPermute([[INPUT_1]]) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x16x1x1xf16, {order = #NHWC}> -> tensor<4x16x1x1xf16> + // CHECK: [[PERMUTE_2:%.+]] = IE.MemPermute([[PERMUTE_0]]) {dst_order = #NCHW, mem_perm = #NHWC} : tensor<4x16x32x56xf16> -> tensor<4x32x56x16xf16> + // CHECK: [[SHAPE_CAST_0:%.+]] = IE.ShapeCast {shape = [1, 32, 56, 64]} inputs([[PERMUTE_2]] : tensor<4x32x56x16xf16>) -> tensor<1x32x56x64xf16> + // CHECK: [[PERMUTE_3:%.+]] = IE.MemPermute([[PERMUTE_1]]) {dst_order = #NCHW, mem_perm = #NHWC} : tensor<4x16x1x1xf16> -> tensor<4x1x1x16xf16> + // CHECK: [[SHAPE_CAST_1:%.+]] = IE.ShapeCast {shape = [1, 1, 1, 64]} inputs([[PERMUTE_3]] : tensor<4x1x1x16xf16>) -> tensor<1x1x1x64xf16> + // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[SHAPE_CAST_0]], [[SHAPE_CAST_1]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x32x56x64xf16>, tensor<1x1x1x64xf16> -> tensor<1x32x56x64xf16> + // CHECK: [[SHAPE_CAST_2:%.+]] = IE.ShapeCast {shape = [4, 32, 56, 16]} inputs([[MULTIPLY]] : tensor<1x32x56x64xf16>) -> tensor<4x32x56x16xf16> + // CHECK: [[PERMUTE_4:%.+]] = 
IE.PermuteCast([[SHAPE_CAST_2]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<4x32x56x16xf16> -> tensor<4x16x32x56xf16, {order = #NHWC}> + + // CHECK: return [[PERMUTE_4]] : tensor<4x16x32x56xf16, {order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +// CHECK-LABEL: @NotPropagatePermuteWhenDimNOfOneInputIsNotOne +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<4x1x32x56xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x56x32xf16, {order = #NHWC}> +func.func @NotPropagatePermuteWhenDimNOfOneInputIsNotOne(%arg0 : tensor<4x1x32x56xf16, {order = #NHWC}>, %arg1 : tensor<1x1x56x32xf16, {order = #NHWC}>) -> tensor<4x1x32x56xf16, {order = #NHWC}> { + %0 = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x1x32x56xf16, {order = #NHWC}> -> tensor<4x1x32x56xf16> + %1 = IE.ShapeCast {shape = [1, 4, 32, 56]} inputs(%0 : tensor<4x1x32x56xf16>) -> tensor<1x4x32x56xf16> + %2 = IE.MemPermute(%arg1) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<1x1x56x32xf16, {order = #NHWC}> -> tensor<1x1x56x32xf16> + %3 = IE.ShapeCast {shape = [1, 1, 32, 56]} inputs(%2 : tensor<1x1x56x32xf16>) -> tensor<1x1x32x56xf16> + %4 = IE.Multiply(%1, %3) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x4x32x56xf16>, tensor<1x1x32x56xf16> -> tensor<1x4x32x56xf16> + %5 = IE.ShapeCast {shape = [4, 1, 32, 56]} inputs(%4 : tensor<1x4x32x56xf16>) -> tensor<4x1x32x56xf16> + %6 = IE.MemPermute(%5) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<4x1x32x56xf16> -> tensor<4x1x32x56xf16, {order = #NHWC}> + + return %6 : tensor<4x1x32x56xf16, {order = #NHWC}> + + // CHECK: [[PERMUTE_0:%.+]] = IE.MemPermute([[INPUT_0]]) + // CHECK: [[SHAPE_CAST_0:%.+]] = IE.ShapeCast {shape = [1, 4, 32, 56]} + // CHECK: [[PERMUTE_1:%.+]] = IE.MemPermute([[INPUT_1]]) + // CHECK: [[SHAPE_CAST_1:%.+]] = IE.ShapeCast {shape = [1, 1, 32, 56]} + // CHECK: 
[[MULTIPLY:%.+]] = IE.Multiply([[SHAPE_CAST_0]], [[SHAPE_CAST_1]]) + // CHECK: [[SHAPE_CAST_2:%.+]] = IE.ShapeCast {shape = [4, 1, 32, 56]} + // CHECK: [[PERMUTE_2:%.+]] = IE.MemPermute + + // CHECK: return [[PERMUTE_2]] : tensor<4x1x32x56xf16, {order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +// CHECK-LABEL: @NotPropagatePermuteWhenDimNOfOneInputIsNotOneSameHW +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<4x1x32x56xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x32x56xf16, {order = #NHWC}> +func.func @NotPropagatePermuteWhenDimNOfOneInputIsNotOneSameHW(%arg0 : tensor<4x1x32x56xf16, {order = #NHWC}>, %arg1 : tensor<1x1x32x56xf16, {order = #NHWC}>) -> tensor<4x1x32x56xf16, {order = #NHWC}> { + %0 = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<4x1x32x56xf16, {order = #NHWC}> -> tensor<4x1x32x56xf16> + %1 = IE.ShapeCast {shape = [1, 4, 32, 56]} inputs(%0 : tensor<4x1x32x56xf16>) -> tensor<1x4x32x56xf16> + %2 = IE.MemPermute(%arg1) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<1x1x32x56xf16, {order = #NHWC}> -> tensor<1x1x32x56xf16> + %3 = IE.Multiply(%1, %2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x4x32x56xf16>, tensor<1x1x32x56xf16> -> tensor<1x4x32x56xf16> + %4 = IE.ShapeCast {shape = [4, 1, 32, 56]} inputs(%3 : tensor<1x4x32x56xf16>) -> tensor<4x1x32x56xf16> + %5 = IE.MemPermute(%4) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<4x1x32x56xf16> -> tensor<4x1x32x56xf16, {order = #NHWC}> + + return %5 : tensor<4x1x32x56xf16, {order = #NHWC}> + + // CHECK: [[PERMUTE_0:%.+]] = IE.MemPermute([[INPUT_0]]) + // CHECK: [[SHAPE_CAST_0:%.+]] = IE.ShapeCast {shape = [1, 4, 32, 56]} + // CHECK: [[PERMUTE_1:%.+]] = IE.MemPermute([[INPUT_1]]) + // CHECK: [[MULTIPLY:%.+]] = IE.Multiply([[SHAPE_CAST_0]], [[PERMUTE_1]]) + // CHECK: [[SHAPE_CAST_1:%.+]] = IE.ShapeCast {shape = 
[4, 1, 32, 56]} + // CHECK: [[PERMUTE_2:%.+]] = IE.MemPermute + + // CHECK: return [[PERMUTE_2]] : tensor<4x1x32x56xf16, {order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/IE/passes/propagate_op_through_batch_concat.mlir b/tests/lit/NPU/dialect/IE/passes/propagate_op_through_batch_concat.mlir index eb29da444e..effc13ee8a 100644 --- a/tests/lit/NPU/dialect/IE/passes/propagate_op_through_batch_concat.mlir +++ b/tests/lit/NPU/dialect/IE/passes/propagate_op_through_batch_concat.mlir @@ -154,6 +154,41 @@ func.func @PropagateNonConstantInputAddSoftmaxThroughBatchUnrolledMatmul( // ----- +// CHECK-LABEL: @PropagateNonConstantInputSqueezeAddSoftmaxThroughBatchUnrolledMatmul +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x1x1024x1024xf32> +func.func @PropagateNonConstantInputSqueezeAddSoftmaxThroughBatchUnrolledMatmul( + %arg0: tensor<1024x128xf32>, + %arg1: tensor<1024x128xf32>, + %arg2: tensor<1x1x1024x1024xf32>) -> tensor<1x2x1024x1024xf32> { + %cst = const.Declare tensor<1024x128xf32> = dense<1.000000e+00> : tensor<1024x128xf32> + %0 = IE.MatMul(%arg0, %cst) {transpose_b} : tensor<1024x128xf32>, tensor<1024x128xf32> -> tensor<1024x1024xf32> + %1 = IE.MatMul(%arg1, %cst) {transpose_b} : tensor<1024x128xf32>, tensor<1024x128xf32> -> tensor<1024x1024xf32> + %2 = IE.Reshape(%0) {shape_value = [1, 1, 1024, 1024]} : tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + %3 = IE.Reshape(%1) {shape_value = [1, 1, 1024, 1024]} : tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + %4 = IE.Concat(%2, %3) {per_axis = #IE.Concat} : tensor<1x1x1024x1024xf32>, tensor<1x1x1024x1024xf32> -> tensor<1x2x1024x1024xf32> + %5 = IE.Squeeze(%arg2) {axes_value = [0, 1]} : tensor<1x1x1024x1024xf32> -> tensor<1024x1024xf32> + %6 = IE.Add(%4, %5) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x1024x1024xf32>, tensor<1024x1024xf32> -> tensor<1x2x1024x1024xf32> + %7 = IE.SoftMax(%6) {axisInd = 3 : i64} : tensor<1x2x1024x1024xf32> -> tensor<1x2x1024x1024xf32> + + return %7 : 
tensor<1x2x1024x1024xf32> + + // CHECK: [[CST:%.+]] = const.Declare tensor<1024x128xf32> = dense<1.000000e+00> : tensor<1024x128xf32> + // CHECK: [[MATMUL_1:%.+]] = IE.MatMul + // CHECK: [[MATMUL_2:%.+]] = IE.MatMul + // CHECK: [[RESHAPE_1:%.+]] = IE.Reshape([[MATMUL_1]]) {shape_value = [1, 1, 1024, 1024]} : tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[RESHAPE_2:%.+]] = IE.Reshape([[MATMUL_2]]) {shape_value = [1, 1, 1024, 1024]} : tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[SQUEEZE:%.+]] = IE.Squeeze([[INPUT]]) {axes_value = [0, 1]} : tensor<1x1x1024x1024xf32> -> tensor<1024x1024xf32> + // CHECK: [[ADD_1:%.+]] = IE.Add([[RESHAPE_1]], [[SQUEEZE]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x1024x1024xf32>, tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[SOFTMAX_1:%.+]] = IE.SoftMax([[ADD_1]]) {axisInd = 3 : i64} : tensor<1x1x1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[ADD_2:%.+]] = IE.Add([[RESHAPE_2]], [[SQUEEZE]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x1x1024x1024xf32>, tensor<1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[SOFTMAX_2:%.+]] = IE.SoftMax([[ADD_2]]) {axisInd = 3 : i64} : tensor<1x1x1024x1024xf32> -> tensor<1x1x1024x1024xf32> + // CHECK: [[CONCAT:%.+]] = IE.Concat([[SOFTMAX_1]], [[SOFTMAX_2]]) {per_axis = #IE.Concat} : tensor<1x1x1024x1024xf32>, tensor<1x1x1024x1024xf32> -> tensor<1x2x1024x1024xf32> + + // CHECK: return [[CONCAT]] : tensor<1x2x1024x1024xf32> +} + +// ----- + // CHECK-LABEL: @NotPropagateAddSoftmaxThroughBatchUnrolledMatmulDueToNoDimLeftForSplit // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x1x1x1024xf32> func.func @NotPropagateAddSoftmaxThroughBatchUnrolledMatmulDueToNoDimLeftForSplit( @@ -236,6 +271,37 @@ func.func @NoPropagateAddSoftmaxWithInvalidAddSource(%arg0: tensor<16x2xf32>, %a // ----- +// CHECK-LABEL: @NotPropagateNonConstantSmallSizeInputAddSoftmaxThroughBatchUnrolledMatmul +// CHECK-SAME: [[INPUT:%arg[0-9]]]: 
tensor<1x1x77x77xf32> +func.func @NotPropagateNonConstantSmallSizeInputAddSoftmaxThroughBatchUnrolledMatmul( + %arg0: tensor<77x128xf32>, + %arg1: tensor<77x128xf32>, + %arg2: tensor<1x1x77x77xf32>) -> tensor<1x2x77x77xf32> { + %cst = const.Declare tensor<77x128xf32> = dense<1.000000e+00> : tensor<77x128xf32> + %0 = IE.MatMul(%arg0, %cst) {transpose_b} : tensor<77x128xf32>, tensor<77x128xf32> -> tensor<77x77xf32> + %1 = IE.MatMul(%arg1, %cst) {transpose_b} : tensor<77x128xf32>, tensor<77x128xf32> -> tensor<77x77xf32> + %2 = IE.Reshape(%0) {shape_value = [1, 1, 77, 77]} : tensor<77x77xf32> -> tensor<1x1x77x77xf32> + %3 = IE.Reshape(%1) {shape_value = [1, 1, 77, 77]} : tensor<77x77xf32> -> tensor<1x1x77x77xf32> + %4 = IE.Concat(%2, %3) {per_axis = #IE.Concat} : tensor<1x1x77x77xf32>, tensor<1x1x77x77xf32> -> tensor<1x2x77x77xf32> + %5 = IE.Add(%4, %arg2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x77x77xf32>, tensor<1x1x77x77xf32> -> tensor<1x2x77x77xf32> + %6 = IE.SoftMax(%5) {axisInd = 3 : i64} : tensor<1x2x77x77xf32> -> tensor<1x2x77x77xf32> + + return %6 : tensor<1x2x77x77xf32> + + // CHECK: [[CST:%.+]] = const.Declare tensor<77x128xf32> = dense<1.000000e+00> : tensor<77x128xf32> + // CHECK: [[MATMUL_1:%.+]] = IE.MatMul + // CHECK: [[MATMUL_2:%.+]] = IE.MatMul + // CHECK: [[RESHAPE_1:%.+]] = IE.Reshape([[MATMUL_1]]) {shape_value = [1, 1, 77, 77]} : tensor<77x77xf32> -> tensor<1x1x77x77xf32> + // CHECK: [[RESHAPE_2:%.+]] = IE.Reshape([[MATMUL_2]]) {shape_value = [1, 1, 77, 77]} : tensor<77x77xf32> -> tensor<1x1x77x77xf32> + // CHECK: [[CONCAT:%.+]] = IE.Concat([[RESHAPE_1]], [[RESHAPE_2]]) {per_axis = #IE.Concat} : tensor<1x1x77x77xf32>, tensor<1x1x77x77xf32> -> tensor<1x2x77x77xf32> + // CHECK: [[ADD:%.+]] = IE.Add([[CONCAT]], %arg2) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x2x77x77xf32>, tensor<1x1x77x77xf32> -> tensor<1x2x77x77xf32> + // CHECK: [[SOFTMAX:%.+]] = IE.SoftMax([[ADD]]) {axisInd = 3 : i64} : tensor<1x2x77x77xf32> -> 
tensor<1x2x77x77xf32> + + // CHECK: return [[SOFTMAX]] : tensor<1x2x77x77xf32> +} + +// ----- + // CHECK-LABEL: @PropagateReshapeThroughBatchUnrolledMatmul func.func @PropagateReshapeThroughBatchUnrolledMatmul(%arg0: tensor<16x2xf32>, %arg1: tensor<16x2xf32>) -> tensor<2x16x2xf32> { %cst = const.Declare tensor<2x2xf32> = dense<1.000000e+00> : tensor<2x2xf32> diff --git a/tests/lit/NPU/dialect/IE/passes/reduce_num_tiles_for_small_models_40XX.mlir b/tests/lit/NPU/dialect/IE/passes/reduce_num_tiles_for_small_models_40XX.mlir index 0d000982c2..8fef845328 100644 --- a/tests/lit/NPU/dialect/IE/passes/reduce_num_tiles_for_small_models_40XX.mlir +++ b/tests/lit/NPU/dialect/IE/passes/reduce_num_tiles_for_small_models_40XX.mlir @@ -10,7 +10,7 @@ module @NoMultiplyNumClustersRemained { IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } @@ -39,7 +39,7 @@ module @NoMultiplyNumClustersRemained { module @NoMatMulNumClustersRemained { IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } @@ -68,7 +68,7 @@ module @NoMatMulNumClustersRemained { module @NoSoftMaxNumClustersRemained { IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 
1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } @@ -98,7 +98,7 @@ module @NoSoftMaxNumClustersRemained { module @MatMulMultiplySoftMaxNumClustersReduced { IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } @@ -129,7 +129,7 @@ module @MatMulMultiplySoftMaxNumClustersReduced { module @BigShapesNumClustersRemained { IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/dialect/IE/passes/resolve_shaped_type_result_dims.mlir b/tests/lit/NPU/dialect/IE/passes/resolve_shaped_type_result_dims.mlir index eb14bdbb68..303c80ff11 100644 --- a/tests/lit/NPU/dialect/IE/passes/resolve_shaped_type_result_dims.mlir +++ b/tests/lit/NPU/dialect/IE/passes/resolve_shaped_type_result_dims.mlir @@ -105,8 +105,8 @@ func.func @ReifySoftmaxShape(%IN: !BoundedType) -> (!BoundedType, index) { func.func @ReifyMaxPoolShape(%IN: !BoundedType) -> (tensor<1x16x30x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 30, 64]> : tensor<4xsi64>, order = #NCHW}>, index) { // CHECK: [[IN:%.+]]: tensor<1x16x32x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 32, 64]> : tensor<4xsi64>, order = #NCHW}> %C3 = arith.constant 3 : index + 
// CHECK-DAG: [[C2:%.+]] = arith.constant -2 : index // CHECK-DAG: [[C3:%.+]] = arith.constant 3 : index - // CHECK-DAG: [[C2:%.+]] = arith.constant 2 : index %MAXPOOL = IE.MaxPool(%IN) { kernel_size = [3, 3], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1] @@ -115,10 +115,10 @@ func.func @ReifyMaxPoolShape(%IN: !BoundedType) -> (tensor<1x16x30x?xf16, {bound %DIM = tensor.dim %MAXPOOL, %C3 : tensor<1x16x30x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 30, 64]> : tensor<4xsi64>, order = #NCHW}> // CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C3]] - // CHECK: [[DIM_SUB:%.+]] = arith.subi [[DIM]], [[C2]] + // CHECK: [[OUTPUTSHAPE:%.+]] = arith.addi [[DIM]], [[C2]] : index return %MAXPOOL, %DIM : tensor<1x16x30x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 30, 64]> : tensor<4xsi64>, order = #NCHW}>, index - // CHECK: return [[MAXPOOL]], [[DIM_SUB]] + // CHECK: return [[MAXPOOL]], [[OUTPUTSHAPE]] } // ----- @@ -131,8 +131,8 @@ func.func @ReifyMaxPoolShape(%IN: !BoundedType) -> (tensor<1x16x30x?xf16, {bound func.func @ReifyMaxPoolShape(%IN: !InBoundedType) -> (!OutBoundedType, index) { // CHECK: [[IN:%.+]]: tensor<1x16x32x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 32, 64]> : tensor<4xsi64>, order = #NCHW}> %C3 = arith.constant 3 : index - // CHECK-DAG: [[C3:%.+]] = arith.constant 3 : index // CHECK-DAG: [[C2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[C3:%.+]] = arith.constant 3 : index %MAXPOOL = IE.MaxPool(%IN) { kernel_size = [2, 2], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [2, 2] @@ -141,10 +141,10 @@ func.func @ReifyMaxPoolShape(%IN: !InBoundedType) -> (!OutBoundedType, index) { %DIM = tensor.dim %MAXPOOL, %C3 : !OutBoundedType // CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C3]] - // CHECK: [[DIM_DIV:%.+]] = arith.divsi [[DIM]], [[C2]] + // CHECK: [[OUTPUTSHAPE:%.+]] = arith.divsi [[DIM]], [[C2]] : index return %MAXPOOL, %DIM : !OutBoundedType, index - // CHECK: 
return [[MAXPOOL]], [[DIM_DIV]] + // CHECK: return [[MAXPOOL]], [[OUTPUTSHAPE]] } // ----- @@ -166,10 +166,10 @@ func.func @ReifyMaxPoolShape(%IN: !InBoundedType) -> (!OutBoundedType, index) { %DIM = tensor.dim %MAXPOOL, %C2 : !OutBoundedType // CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C2]] - // CHECK: [[DIM_DIV:%.+]] = arith.divsi [[DIM]], [[C2]] + // CHECK: [[OUTPUTSHAPE:%.+]] = arith.divsi [[DIM]], [[C2]] return %MAXPOOL, %DIM : !OutBoundedType, index - // CHECK: return [[MAXPOOL]], [[DIM_DIV]] + // CHECK: return [[MAXPOOL]], [[OUTPUTSHAPE]] } // ----- @@ -192,7 +192,6 @@ func.func @ReifyConvShape(%IN: tensor<1x16x32x?xf16, {bounds = #const.OpaqueI64E %DIM = tensor.dim %CONV, %C3 : tensor<1x32x32x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 32, 64]> : tensor<4xsi64>, order = #NCHW}> // CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C3]] - return %CONV, %DIM : tensor<1x32x32x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 32, 64]> : tensor<4xsi64>, order = #NCHW}>, index // CHECK: return [[CONV]], [[DIM]] } @@ -206,8 +205,8 @@ func.func @ReifyMaxPoolConvReLUMaxPoolConvShape(%IN: tensor<1x16x64x?xf16, {boun -> (tensor<1x16x16x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 16, 64]> : tensor<4xsi64>, order = #NCHW}>, index) { // CHECK: [[IN:%.+]]: tensor<1x16x64x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 64, 64]> : tensor<4xsi64>, order = #NCHW}> %C3 = arith.constant 3 : index - // CHECK-DAG: [[C3:%.+]] = arith.constant 3 : index // CHECK-DAG: [[C2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[C3:%.+]] = arith.constant 3 : index %CST1 = const.Declare tensor<32x16x3x3xf16, {order = #NCHW}> = dense<1.000000e+00> : tensor<32x16x3x3xf16> %CST2 = const.Declare tensor<16x32x1x1xf16, {order = #NCHW}> = dense<1.000000e+00> : tensor<16x32x1x1xf16> // CHECK-DAG: [[CST1:%.+]] = const.Declare @@ -245,11 +244,11 @@ func.func @ReifyMaxPoolConvReLUMaxPoolConvShape(%IN: tensor<1x16x64x?xf16, {boun // CHECK: [[CONV2:%.+]] = IE.Convolution([[MAXPOOL2]], [[CST2]]) // 
CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C3]] - // CHECK: [[DIM_DIV_1:%.+]] = arith.divsi [[DIM]], [[C2]] - // CHECK: [[DIM_DIV_2:%.+]] = arith.divsi [[DIM_DIV_1]], [[C2]] + // CHECK: [[PADDED:%.+]] = arith.divsi [[DIM]], [[C2]] : index + // CHECK: [[SHAPE:%.+]] = arith.divsi [[PADDED]], [[C2]] : index return %CONV2, %DIM : tensor<1x16x16x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 16, 64]> : tensor<4xsi64>, order = #NCHW}>, index - // CHECK: return [[CONV2]], [[DIM_DIV_2]] + // CHECK: return [[CONV2]], [[SHAPE]] } // ----- @@ -964,7 +963,6 @@ func.func @ReifyFqConvFqShape(%IN: tensor<1x16x32x?xf16, {bounds = #const.Opaque %CST0 = const.Declare tensor<1x1x1x1xf16> = dense<0.000000e+00> : tensor<1x1x1x1xf16> %CST1 = const.Declare tensor<1x1x1x1xf16> = dense<5.000000e+00> : tensor<1x1x1x1xf16> - // CHECK: [[C3:%.+]] = arith.constant 3 : index // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<32x16x3x3xf16, {order = #NCHW}> = dense<1.000000e+00> @@ -995,7 +993,6 @@ func.func @ReifyFqConvFqShape(%IN: tensor<1x16x32x?xf16, {bounds = #const.Opaque // CHECK: [[FQ1:%.+]] = IE.FakeQuantize([[CONV]], [[CST0]], [[CST1]], [[CST0]], [[CST1]]) // CHECK: [[DIM:%.+]] = tensor.dim [[IN]], [[C3]] - return %FQ1, %DIM : tensor<1x32x32x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 32, 64]> : tensor<4xsi64>, order = #NCHW}>, index // CHECK: return [[FQ1]], [[DIM]] } diff --git a/tests/lit/NPU/dialect/IE/passes/shrink_matmul_groups.mlir b/tests/lit/NPU/dialect/IE/passes/shrink_matmul_groups.mlir index 44189ff204..5343fb42be 100644 --- a/tests/lit/NPU/dialect/IE/passes/shrink_matmul_groups.mlir +++ b/tests/lit/NPU/dialect/IE/passes/shrink_matmul_groups.mlir @@ -73,3 +73,43 @@ func.func @ShrinkMatmulGroupsWithTrivialD1(%arg0: tensor<1x8x1x64xf32>, %arg1: t // CHECK: return [[RESULT]] : tensor<1x8x1x1024xf32> } + +// ----- + +// CHECK-LABEL: @ShrinkForBeneficialGroupMatMul +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x24x16x64xf32>, +// CHECK-SAME: [[INPUT2:%.+]]: 
tensor<1x8x1x1024x64xf32> +func.func @ShrinkForBeneficialGroupMatMul(%arg0: tensor<1x24x16x64xf32>, %arg1: tensor<1x8x1x1024x64xf32>) -> tensor<1x24x16x1024xf32> { + %cst = const.Declare tensor<5xsi64> = dense<[1, 8, 3, 1024, 64]> : tensor<5xsi64> + + %0 = IE.Broadcast(%arg1, %cst) {mode = #IE.broadcast_type} : tensor<1x8x1x1024x64xf32>, tensor<5xsi64> -> tensor<1x8x3x1024x64xf32> + %1 = IE.AffineReshape(%0) {dim_mapping = [[0], [1], [1], [2], [3]], shape_value = [1, 24, 1024, 64]} : tensor<1x8x3x1024x64xf32> -> tensor<1x24x1024x64xf32> + %2 = IE.MatMul(%arg0, %1) {transpose_b} : tensor<1x24x16x64xf32>, tensor<1x24x1024x64xf32> -> tensor<1x24x16x1024xf32> + + return %2 : tensor<1x24x16x1024xf32> + + // CHECK: [[LHS:%.+]] = IE.Reshape([[INPUT1]]) {shape_value = [1, 8, 48, 64]} : tensor<1x24x16x64xf32> -> tensor<1x8x48x64xf32> + // CHECK: [[RHS:%.+]] = IE.Reshape([[INPUT2]]) {shape_value = [1, 8, 1024, 64]} : tensor<1x8x1x1024x64xf32> -> tensor<1x8x1024x64xf32> + // CHECK: [[MATMUL:%.+]] = IE.MatMul([[LHS]], [[RHS]]) {transpose_b} : tensor<1x8x48x64xf32>, tensor<1x8x1024x64xf32> -> tensor<1x8x48x1024xf32> + // CHECK: [[RESULT:%.+]] = IE.Reshape([[MATMUL]]) {shape_value = [1, 24, 16, 1024]} : tensor<1x8x48x1024xf32> -> tensor<1x24x16x1024xf32> + + // CHECK: return [[RESULT]] : tensor<1x24x16x1024xf32> +} + + +// ----- + +// CHECK-LABEL: @NotShrinkForUnbeneficialGroupMatMul +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x24x1024x64xf32>, +// CHECK-SAME: [[INPUT2:%.+]]: tensor<1x8x1x1024x64xf32> +func.func @NotShrinkForUnbeneficialGroupMatMul(%arg0: tensor<1x24x1024x64xf32>, %arg1: tensor<1x8x1x1024x64xf32>) -> tensor<1x24x1024x1024xf32> { + %cst = const.Declare tensor<5xsi64> = dense<[1, 8, 3, 1024, 64]> : tensor<5xsi64> + + %0 = IE.Broadcast(%arg1, %cst) {mode = #IE.broadcast_type} : tensor<1x8x1x1024x64xf32>, tensor<5xsi64> -> tensor<1x8x3x1024x64xf32> + %1 = IE.AffineReshape(%0) {dim_mapping = [[0], [1], [1], [2], [3]], shape_value = [1, 24, 1024, 64]} : 
tensor<1x8x3x1024x64xf32> -> tensor<1x24x1024x64xf32> + %2 = IE.MatMul(%arg0, %1) {transpose_b} : tensor<1x24x1024x64xf32>, tensor<1x24x1024x64xf32> -> tensor<1x24x1024x1024xf32> + + return %2 : tensor<1x24x1024x1024xf32> + + // CHECK: IE.Broadcast +} diff --git a/tests/lit/NPU/dialect/IE/passes/swap_convert_with_reshape_kind_ops.mlir b/tests/lit/NPU/dialect/IE/passes/swap_convert_with_reshape_kind_ops.mlir new file mode 100644 index 0000000000..064601bc22 --- /dev/null +++ b/tests/lit/NPU/dialect/IE/passes/swap_convert_with_reshape_kind_ops.mlir @@ -0,0 +1,254 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --swap-convert-with-reshape-kind-ops --canonicalize %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @SwapTransposeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { + %0 = IE.Convert(%arg0) {dstElemType = f16} + : tensor<1x70x1x28xui8> -> tensor<1x70x1x28xf16> + + %1 = IE.Transpose(%0) {order_value = #NHWC} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> + return %1 : tensor<1x1x28x70xf16> + + // CHECK: %[[TRANSPOSE:.*]] = IE.Transpose(%arg0) {order_value = #NHWC} + // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[TRANSPOSE]]) + // CHECK-SAME: {dstElemType = f16} + // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> + + // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @SwapReshapeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { + %0 = IE.Convert(%arg0) {dstElemType = f16} + : tensor<1x70x1x28xui8> -> tensor<1x70x1x28xf16> + + %1 = IE.Reshape(%0) {shape_value = [1, 1, 28, 70]} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> + return %1 : 
tensor<1x1x28x70xf16> + + // CHECK: %[[RESHAPE:.*]] = IE.Reshape(%arg0) {shape_value = [1, 1, 28, 70]} + // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[RESHAPE]]) + // CHECK-SAME: {dstElemType = f16} + // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> + + // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @SwapAffineReshapeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { + %0 = IE.Convert(%arg0) {dstElemType = f16} + : tensor<1x70x1x28xui8> -> tensor<1x70x1x28xf16> + + %1 = IE.AffineReshape(%0) {dim_mapping = [[0], [1], [2], [2], [3], [3]], shape_value = [1, 1, 28, 70]} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> + return %1 : tensor<1x1x28x70xf16> + + // CHECK: %[[AFFINERESHAPE:.*]] = IE.AffineReshape(%arg0) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2], [2], [3], [3]], shape_value = [1, 1, 28, 70]} + // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[AFFINERESHAPE]]) + // CHECK-SAME: {dstElemType = f16} + // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> + + // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @SwapSqueezeWithConvert(%arg0: tensor<1x1x70x28xui8>) -> tensor<70x28xf16> { + %0 = IE.Convert(%arg0) {dstElemType = f16} + : tensor<1x1x70x28xui8> -> tensor<1x1x70x28xf16> + + %1 = IE.Squeeze(%0) {axes_value = [0, 1]} : tensor<1x1x70x28xf16> -> tensor<70x28xf16> + return %1 : tensor<70x28xf16> + + // CHECK: %[[SQUEEZE:.*]] = IE.Squeeze(%arg0) {axes_value = [0, 1]} + // CHECK-SAME: : tensor<1x1x70x28xui8> -> tensor<70x28xui8> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[SQUEEZE]]) + // CHECK-SAME: {dstElemType = f16} + // CHECK-SAME: : tensor<70x28xui8> -> tensor<70x28xf16> + + // CHECK: return 
%[[CONVERT]] : tensor<70x28xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @SwapUnsqueezeWithConvert(%arg0: tensor<70x28xui8>) -> tensor<1x1x70x28xf16> { + %0 = IE.Convert(%arg0) {dstElemType = f16} + : tensor<70x28xui8> -> tensor<70x28xf16> + + %1 = IE.Unsqueeze(%0) {axes_value = [0, 1]} : tensor<70x28xf16> -> tensor<1x1x70x28xf16> + return %1 : tensor<1x1x70x28xf16> + + // CHECK: %[[UNSQUEEZE:.*]] = IE.Unsqueeze(%arg0) {axes_value = [0, 1]} + // CHECK-SAME: : tensor<70x28xui8> -> tensor<1x1x70x28xui8> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[UNSQUEEZE]]) + // CHECK-SAME: {dstElemType = f16} + // CHECK-SAME: : tensor<1x1x70x28xui8> -> tensor<1x1x70x28xf16> + + // CHECK: return %[[CONVERT]] : tensor<1x1x70x28xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!qElemType = !quant.uniform + +func.func @DoNotSwapTransposeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { + %0 = IE.QuantizeCast(%arg0) {dstElemType = !qElemType} : tensor<1x70x1x28xui8> -> tensor<1x70x1x28x!qElemType> + %1 = IE.Add(%0, %0) { auto_broadcast = #IE.auto_broadcast_type } : + tensor<1x70x1x28x!qElemType>, tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28x!qElemType> + %2 = IE.Convert(%1) {dstElemType = f16} : tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28xf16> + + %3 = IE.Transpose(%2) {order_value = #NHWC} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> + return %3 : tensor<1x1x28x70xf16> + + // CHECK: %[[VAR0:.*]] = IE.QuantizeCast(%arg0) {dstElemType = !qElemType} : + // CHECK-SAME: tensor<1x70x1x28xui8> -> tensor<1x70x1x28x!qElemType> + + // CHECK: %[[ADD:.*]] = IE.Add(%[[VAR0]], %[[VAR0]]) {auto_broadcast = #IE.auto_broadcast_type} + // CHECK-SAME: : tensor<1x70x1x28x!qElemType>, tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28x!qElemType> + + // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[ADD]]) {dstElemType = f16} : tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28xf16> + // CHECK: 
%[[TRANSPOSE:.*]] = IE.Transpose(%[[CONVERT]]) {order_value = #NHWC} + // CHECK-SAME: : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> + + // CHECK: return %[[TRANSPOSE]] : tensor<1x1x28x70xf16> +} + +// ----- + +// CHECK-LABEL: func @SwapConvertWithDepthToSpaceOutput +// CHECK-SAME: [[INPUT:%arg0]]: tensor<1x16x800x1279xf32> +func.func @SwapConvertWithDepthToSpaceOutput(%arg0: tensor<1x16x800x1279xf32>) -> tensor<1x4x1600x2558xui8> { + %in_low = const.Declare tensor<1x1x1x1xf32> = dense<-0.34410953521> : tensor<1x1x1x1xf32> + %in_high = const.Declare tensor<1x1x1x1xf32> = dense<1.1431435> : tensor<1x1x1x1xf32> + %out_low = const.Declare tensor<1x1x1x1xf32> = dense<0.0> : tensor<1x1x1x1xf32> + %out_high = const.Declare tensor<1x1x1x1xf32> = dense<255.0> : tensor<1x1x1x1xf32> + %0 = IE.FakeQuantize(%arg0, %in_low, %in_high, %out_low, %out_high) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + %1 = IE.Convert(%0) {dstElemType = ui8} : tensor<1x16x800x1279xf32> -> tensor<1x16x800x1279xui8> + %2 = IE.DepthToSpace(%1) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + return %2 : tensor<1x4x1600x2558xui8> + + // CHECK: [[IN_LOW:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-0.344109535> : tensor<1x1x1x1xf32> + // CHECK: [[IN_HIGH:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<1.14314353> : tensor<1x1x1x1xf32> + // CHECK: [[OUT_LOW:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK: [[OUT_HIGH:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + // CHECK: [[FAKE_QUANT:%.+]] = IE.FakeQuantize([[INPUT]], [[IN_LOW]], [[IN_HIGH]], [[OUT_LOW]], [[OUT_HIGH]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, 
tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + // CHECK: [[DEPTH_TO_SPACE:%.+]] = IE.DepthToSpace([[FAKE_QUANT]]) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xf32> -> tensor<1x4x1600x2558xf32> + // CHECK: [[CONVERT:%.+]] = IE.Convert([[DEPTH_TO_SPACE]]) {dstElemType = ui8} : tensor<1x4x1600x2558xf32> -> tensor<1x4x1600x2558xui8> + // CHECK: return [[CONVERT]] : tensor<1x4x1600x2558xui8> + +} + +// ----- + +#NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> + +// CHECK-LABEL: func @SwapConvert2LayersOutput +// CHECK-SAME: [[INPUT:%arg0]]: tensor<1x16x800x1279xf32> +func.func @SwapConvert2LayersOutput(%arg0: tensor<1x16x800x1279xf32>) -> tensor<1x2558x1600x4xui8> { + %cst_0 = const.Declare tensor<1x1x1x1xf32> = dense<-0.34410953521> : tensor<1x1x1x1xf32> + %cst_1 = const.Declare tensor<1x1x1x1xf32> = dense<1.1431435> : tensor<1x1x1x1xf32> + %cst_2 = const.Declare tensor<1x1x1x1xf32> = dense<0.0> : tensor<1x1x1x1xf32> + %cst_3 = const.Declare tensor<1x1x1x1xf32> = dense<255.0> : tensor<1x1x1x1xf32> + %0 = IE.FakeQuantize(%arg0, %cst_0, %cst_1, %cst_2, %cst_3) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + %1 = IE.Convert(%0) {dstElemType = ui8} : tensor<1x16x800x1279xf32> -> tensor<1x16x800x1279xui8> + %2 = IE.DepthToSpace(%1) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + %3 = IE.Transpose(%2) {order_value = #NWHC} : tensor<1x4x1600x2558xui8> -> tensor<1x2558x1600x4xui8> + return %3 : tensor<1x2558x1600x4xui8> + + // CHECK: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-0.344109535> : tensor<1x1x1x1xf32> + // CHECK: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<1.14314353> : tensor<1x1x1x1xf32> + // CHECK: [[CST_2:%.+]] = const.Declare 
tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + // CHECK: [[FAKE_QUANT:%.+]] = IE.FakeQuantize([[INPUT]], [[CST_0]], [[CST_1]], [[CST_2]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + // CHECK: [[DEPTH_TO_SPACE:%.+]] = IE.DepthToSpace([[FAKE_QUANT]]) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xf32> -> tensor<1x4x1600x2558xf32> + // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[DEPTH_TO_SPACE]]) {order_value = #NWHC} : tensor<1x4x1600x2558xf32> -> tensor<1x2558x1600x4xf32> + // CHECK: [[CONVERT:%.+]] = IE.Convert([[TRANSPOSE]]) {dstElemType = ui8} : tensor<1x2558x1600x4xf32> -> tensor<1x2558x1600x4xui8> + // CHECK: return [[CONVERT]] : tensor<1x2558x1600x4xui8> +} + +// ----- + +#NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> + +// CHECK-LABEL: func @SwapConvert3LayersOutput +// CHECK-SAME: [[INPUT:%arg0]]: tensor<1x16x800x1279xf32> +func.func @SwapConvert3LayersOutput(%arg0: tensor<1x16x800x1279xf32>) -> tensor<2558x1600x4xui8> { + %cst_0 = const.Declare tensor<1x1x1x1xf32> = dense<-0.34410953521> : tensor<1x1x1x1xf32> + %cst_1 = const.Declare tensor<1x1x1x1xf32> = dense<1.1431435> : tensor<1x1x1x1xf32> + %cst_2 = const.Declare tensor<1x1x1x1xf32> = dense<0.0> : tensor<1x1x1x1xf32> + %cst_3 = const.Declare tensor<1x1x1x1xf32> = dense<255.0> : tensor<1x1x1x1xf32> + %0 = IE.FakeQuantize(%arg0, %cst_0, %cst_1, %cst_2, %cst_3) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + %1 = IE.Convert(%0) {dstElemType = ui8} : tensor<1x16x800x1279xf32> -> tensor<1x16x800x1279xui8> + %2 = 
IE.DepthToSpace(%1) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + %3 = IE.Transpose(%2) {order_value = #NWHC} : tensor<1x4x1600x2558xui8> -> tensor<1x2558x1600x4xui8> + %4 = IE.Squeeze(%3) {axes_value = [0]} : tensor<1x2558x1600x4xui8> -> tensor<2558x1600x4xui8> + return %4 : tensor<2558x1600x4xui8> + + // CHECK: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-0.344109535> : tensor<1x1x1x1xf32> + // CHECK: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<1.14314353> : tensor<1x1x1x1xf32> + // CHECK: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + // CHECK: [[FAKE_QUANT:%.+]] = IE.FakeQuantize([[INPUT]], [[CST_0]], [[CST_1]], [[CST_2]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + // CHECK: [[DEPTH_TO_SPACE:%.+]] = IE.DepthToSpace([[FAKE_QUANT]]) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xf32> -> tensor<1x4x1600x2558xf32> + // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[DEPTH_TO_SPACE]]) {order_value = #NWHC} : tensor<1x4x1600x2558xf32> -> tensor<1x2558x1600x4xf32> + // CHECK: [[SQUEEZE:%.+]] = IE.Squeeze([[TRANSPOSE]]) {axes_value = [0]} : tensor<1x2558x1600x4xf32> -> tensor<2558x1600x4xf32> + // CHECK: [[CONVERT:%.+]] = IE.Convert([[SQUEEZE]]) {dstElemType = ui8} : tensor<2558x1600x4xf32> -> tensor<2558x1600x4xui8> + // CHECK: return [[CONVERT]] : tensor<2558x1600x4xui8> +} + +// ----- + +#NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> + + +// CHECK-LABEL: func @NotSwapConvertWithNonAgnosticOp +// CHECK-SAME: [[INPUT:%arg0]]: tensor<1x16x800x1279xf32> +func.func @NotSwapConvertWithNonAgnosticOp(%arg0: 
tensor<1x16x800x1279xf32>) -> tensor<2558x1600x4xf16> { + %cst_0 = const.Declare tensor<1x1x1x1xf32> = dense<-0.34410953521> : tensor<1x1x1x1xf32> + %cst_1 = const.Declare tensor<1x1x1x1xf32> = dense<1.1431435> : tensor<1x1x1x1xf32> + %cst_2 = const.Declare tensor<1x1x1x1xf32> = dense<0.0> : tensor<1x1x1x1xf32> + %cst_3 = const.Declare tensor<1x1x1x1xf32> = dense<255.0> : tensor<1x1x1x1xf32> + %0 = IE.FakeQuantize(%arg0, %cst_0, %cst_1, %cst_2, %cst_3) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + %1 = IE.Convert(%0) {dstElemType = ui8} : tensor<1x16x800x1279xf32> -> tensor<1x16x800x1279xui8> + %2 = IE.DepthToSpace(%1) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + %3 = IE.Transpose(%2) {order_value = #NWHC} : tensor<1x4x1600x2558xui8> -> tensor<1x2558x1600x4xui8> + %4 = IE.Squeeze(%3) {axes_value = [0]} : tensor<1x2558x1600x4xui8> -> tensor<2558x1600x4xui8> + %5 = IE.Convert(%4) {dstElemType = f16} : tensor<2558x1600x4xui8> -> tensor<2558x1600x4xf16> + return %5 : tensor<2558x1600x4xf16> + + // CHECK: [[CST_0:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<-0.344109535> : tensor<1x1x1x1xf32> + // CHECK: [[CST_1:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<1.14314353> : tensor<1x1x1x1xf32> + // CHECK: [[CST_2:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<0.000000e+00> : tensor<1x1x1x1xf32> + // CHECK: [[CST_3:%.+]] = const.Declare tensor<1x1x1x1xf32> = dense<2.550000e+02> : tensor<1x1x1x1xf32> + // CHECK: [[FAKE_QUANT:%.+]] = IE.FakeQuantize([[INPUT]], [[CST_0]], [[CST_1]], [[CST_2]], [[CST_3]]) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> -> tensor<1x16x800x1279xf32> + // CHECK: [[CONVERT:%.+]] = 
IE.Convert([[FAKE_QUANT]]) {dstElemType = ui8} : tensor<1x16x800x1279xf32> -> tensor<1x16x800x1279xui8> + // CHECK: [[DEPTH_TO_SPACE:%.+]] = IE.DepthToSpace([[CONVERT]]) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[DEPTH_TO_SPACE]]) {order_value = #NWHC} : tensor<1x4x1600x2558xui8> -> tensor<1x2558x1600x4xui8> + // CHECK: [[SQUEEZE:%.+]] = IE.Squeeze([[TRANSPOSE]]) {axes_value = [0]} : tensor<1x2558x1600x4xui8> -> tensor<2558x1600x4xui8> + // CHECK: [[CONVERT_1:%.+]] = IE.Convert([[SQUEEZE]]) {dstElemType = f16} : tensor<2558x1600x4xui8> -> tensor<2558x1600x4xf16> + // CHECK: return [[CONVERT_1]] : tensor<2558x1600x4xf16> +} diff --git a/tests/lit/NPU/dialect/IE/passes/swap_convert_with_sw_op_40XX+.mlir b/tests/lit/NPU/dialect/IE/passes/swap_convert_with_sw_op_40XX+.mlir index 4bc2651eef..f3a5c166c6 100644 --- a/tests/lit/NPU/dialect/IE/passes/swap_convert_with_sw_op_40XX+.mlir +++ b/tests/lit/NPU/dialect/IE/passes/swap_convert_with_sw_op_40XX+.mlir @@ -123,3 +123,21 @@ func.func @NoSwapInterpolate(%arg0: tensor<1x8x128x128xf16, {order = #NHWC}>) -> // CHECK-SAME: -> tensor<1x2048x1x256xf32> // CHECK: return [[RET]] } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @SwapAddWithConvert +// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x16x270x54xsi32, {order = #NHWC}>) +func.func @SwapAddWithConvert(%arg0: tensor<1x16x270x54xsi32, {order = #NHWC}>) -> tensor<1x16x270x54xf16, {order = #NHWC}> { + %cst_0 = const.Declare tensor<1x1x1x1xsi32, {order = #NHWC}> = dense<1> : tensor<1x1x1x1xsi32> isSplat, [#const.Reshape<[1, 1, 1, 1]>, #const.CastElemType, #const.Reorder<#NHWC>] + %0 = IE.Add(%arg0, %cst_0) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x16x270x54xsi32, {order = #NHWC}>, tensor<1x1x1x1xsi32, {order = #NHWC}> -> tensor<1x16x270x54xsi32, {order = #NHWC}> + %1 = IE.Convert(%0) {dstElemType = f16} : 
tensor<1x16x270x54xsi32, {order = #NHWC}> -> tensor<1x16x270x54xf16, {order = #NHWC}> + return %1 : tensor<1x16x270x54xf16, {order = #NHWC}> + + // CHECK-DAG: [[CST_WEIGHTS:%.+]] = const.Declare tensor<1x1x1x1xf16, {order = #NHWC}> = dense<1> : tensor<1x1x1x1xsi32>, [#const.Reshape<[1, 1, 1, 1]>, #const.CastElemType, #const.Reorder<#NHWC>, #const.ChangeShapeAndElemType<[1, 1, 1, 1], f16>] + // CHECK: [[CONVERT:%.+]] = IE.Convert([[INPUT]]) {dstElemType = f16} : tensor<1x16x270x54xsi32, {order = #NHWC}> -> tensor<1x16x270x54xf16, {order = #NHWC}> + // CHECK: [[ADD:%.+]] = IE.Add([[CONVERT]], [[CST_WEIGHTS]]) {auto_broadcast = #IE.auto_broadcast_type} : tensor<1x16x270x54xf16, {order = #NHWC}>, tensor<1x1x1x1xf16, {order = #NHWC}> -> tensor<1x16x270x54xf16, {order = #NHWC}> + + // CHECK: return [[ADD]] +} diff --git a/tests/lit/NPU/dialect/IE/passes/swap_convert_with_transpose_reshape.mlir b/tests/lit/NPU/dialect/IE/passes/swap_convert_with_transpose_reshape.mlir deleted file mode 100644 index 023158f7a9..0000000000 --- a/tests/lit/NPU/dialect/IE/passes/swap_convert_with_transpose_reshape.mlir +++ /dev/null @@ -1,138 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --swap-convert-with-transpose-reshape --canonicalize %s | FileCheck %s -// REQUIRES: arch-NPU37XX || arch-NPU40XX - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -func.func @SwapTransposeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { - %0 = IE.Convert(%arg0) {dstElemType = f16} - : tensor<1x70x1x28xui8> -> tensor<1x70x1x28xf16> - - %1 = IE.Transpose(%0) {order_value = #NHWC} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> - return %1 : tensor<1x1x28x70xf16> - - // CHECK: %[[TRANSPOSE:.*]] = IE.Transpose(%arg0) {order_value = #NHWC} - // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[TRANSPOSE]]) - // CHECK-SAME: {dstElemType = f16} - // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> - - // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -func.func @SwapReshapeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { - %0 = IE.Convert(%arg0) {dstElemType = f16} - : tensor<1x70x1x28xui8> -> tensor<1x70x1x28xf16> - - %1 = IE.Reshape(%0) {shape_value = [1, 1, 28, 70]} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> - return %1 : tensor<1x1x28x70xf16> - - // CHECK: %[[RESHAPE:.*]] = IE.Reshape(%arg0) {shape_value = [1, 1, 28, 70]} - // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[RESHAPE]]) - // CHECK-SAME: {dstElemType = f16} - // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> - - // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -func.func @SwapAffineReshapeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { - %0 = IE.Convert(%arg0) {dstElemType = f16} - : tensor<1x70x1x28xui8> -> 
tensor<1x70x1x28xf16> - - %1 = IE.AffineReshape(%0) {dim_mapping = [[0], [1], [2], [2], [3], [3]], shape_value = [1, 1, 28, 70]} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> - return %1 : tensor<1x1x28x70xf16> - - // CHECK: %[[AFFINERESHAPE:.*]] = IE.AffineReshape(%arg0) - // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2], [2], [3], [3]], shape_value = [1, 1, 28, 70]} - // CHECK-SAME: : tensor<1x70x1x28xui8> -> tensor<1x1x28x70xui8> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[AFFINERESHAPE]]) - // CHECK-SAME: {dstElemType = f16} - // CHECK-SAME: : tensor<1x1x28x70xui8> -> tensor<1x1x28x70xf16> - - // CHECK: return %[[CONVERT]] : tensor<1x1x28x70xf16> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -func.func @SwapSqueezeWithConvert(%arg0: tensor<1x1x70x28xui8>) -> tensor<70x28xf16> { - %0 = IE.Convert(%arg0) {dstElemType = f16} - : tensor<1x1x70x28xui8> -> tensor<1x1x70x28xf16> - - %1 = IE.Squeeze(%0) {axes_value = [0, 1]} : tensor<1x1x70x28xf16> -> tensor<70x28xf16> - return %1 : tensor<70x28xf16> - - // CHECK: %[[SQUEEZE:.*]] = IE.Squeeze(%arg0) {axes_value = [0, 1]} - // CHECK-SAME: : tensor<1x1x70x28xui8> -> tensor<70x28xui8> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[SQUEEZE]]) - // CHECK-SAME: {dstElemType = f16} - // CHECK-SAME: : tensor<70x28xui8> -> tensor<70x28xf16> - - // CHECK: return %[[CONVERT]] : tensor<70x28xf16> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -func.func @SwapUnsqueezeWithConvert(%arg0: tensor<70x28xui8>) -> tensor<1x1x70x28xf16> { - %0 = IE.Convert(%arg0) {dstElemType = f16} - : tensor<70x28xui8> -> tensor<70x28xf16> - - %1 = IE.Unsqueeze(%0) {axes_value = [0, 1]} : tensor<70x28xf16> -> tensor<1x1x70x28xf16> - return %1 : tensor<1x1x70x28xf16> - - // CHECK: %[[UNSQUEEZE:.*]] = IE.Unsqueeze(%arg0) {axes_value = [0, 1]} - // CHECK-SAME: : tensor<70x28xui8> -> tensor<1x1x70x28xui8> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[UNSQUEEZE]]) - // CHECK-SAME: 
{dstElemType = f16} - // CHECK-SAME: : tensor<1x1x70x28xui8> -> tensor<1x1x70x28xf16> - - // CHECK: return %[[CONVERT]] : tensor<1x1x70x28xf16> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -!qElemType = !quant.uniform - -func.func @DoNotSwapTransposeWithConvert(%arg0: tensor<1x70x1x28xui8>) -> tensor<1x1x28x70xf16> { - %0 = IE.QuantizeCast(%arg0) {dstElemType = !qElemType} : tensor<1x70x1x28xui8> -> tensor<1x70x1x28x!qElemType> - %1 = IE.Add(%0, %0) { auto_broadcast = #IE.auto_broadcast_type } : - tensor<1x70x1x28x!qElemType>, tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28x!qElemType> - %2 = IE.Convert(%1) {dstElemType = f16} : tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28xf16> - - %3 = IE.Transpose(%2) {order_value = #NHWC} : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> - return %3 : tensor<1x1x28x70xf16> - - // CHECK: %[[VAR0:.*]] = IE.QuantizeCast(%arg0) {dstElemType = !qElemType} : - // CHECK-SAME: tensor<1x70x1x28xui8> -> tensor<1x70x1x28x!qElemType> - - // CHECK: %[[ADD:.*]] = IE.Add(%[[VAR0]], %[[VAR0]]) {auto_broadcast = #IE.auto_broadcast_type} - // CHECK-SAME: : tensor<1x70x1x28x!qElemType>, tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28x!qElemType> - - // CHECK: %[[CONVERT:.*]] = IE.Convert(%[[ADD]]) {dstElemType = f16} : tensor<1x70x1x28x!qElemType> -> tensor<1x70x1x28xf16> - // CHECK: %[[TRANSPOSE:.*]] = IE.Transpose(%[[CONVERT]]) {order_value = #NHWC} - // CHECK-SAME: : tensor<1x70x1x28xf16> -> tensor<1x1x28x70xf16> - - // CHECK: return %[[TRANSPOSE]] : tensor<1x1x28x70xf16> -} diff --git a/tests/lit/NPU/dialect/IE/passes/swap_viewop_and_clamp.mlir b/tests/lit/NPU/dialect/IE/passes/swap_viewop_and_clamp.mlir index 55495a1a41..0f63eadb28 100644 --- a/tests/lit/NPU/dialect/IE/passes/swap_viewop_and_clamp.mlir +++ b/tests/lit/NPU/dialect/IE/passes/swap_viewop_and_clamp.mlir @@ -124,7 +124,6 @@ func.func @swapSliceWithClamp(%arg0: tensor<1x16x80x80x!qElemType>) -> tensor<1x // CHECK: return [[SLICE]] } - // ----- 
!qElemType = !quant.uniform @@ -143,3 +142,24 @@ func.func @notSwapWithNCEAlreadyHasPostOp(%arg0: tensor<1x16x80x80x!qElemType>) // CHECK: [[CLAMP:%.*]] = IE.Clamp([[SLICE]]) // CHECK: return [[CLAMP]] } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @swapPermuteCastWithClamp +// CHECK-SAME: [[INPUT:%.*]]: tensor<1x499x768x1xf16, {order = #NHWC}> +func.func @swapPermuteCastWithClamp(%arg0: tensor<1x499x768x1xf16, {order = #NHWC}>) -> tensor<1x768x1x1xf16> { + %weights = const.Declare tensor<1x499x1x1xf16> = dense<1.0> : tensor<1x499x1x1xf32>, [#const.CastElemType] + %0 = IE.Convolution(%arg0, %weights) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x499x768x1xf16, {order = #NHWC}>, tensor<1x499x1x1xf16> -> tensor<1x1x768x1xf16, {order = #NHWC}> + %1 = IE.PermuteCast(%0) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<1x1x768x1xf16, {order = #NHWC}> -> tensor<1x768x1x1xf16> + %2 = IE.Clamp(%1) {max = 6.000000e+00 : f64, min = 0.000000e+00 : f64} : tensor<1x768x1x1xf16> -> tensor<1x768x1x1xf16> + return %2 : tensor<1x768x1x1xf16> + + // CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<1x499x1x1xf16> = dense<1.000000e+00> : tensor<1x499x1x1xf32>, [#const.CastElemType] + // CHECK: [[CONVOLUTION:%.*]] = IE.Convolution([[INPUT]], [[WEIGHTS]]) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x499x768x1xf16, {order = #NHWC}>, tensor<1x499x1x1xf16> -> tensor<1x1x768x1xf16, {order = #NHWC}> + // CHECK: [[CLAMP:%.*]] = IE.Clamp([[CONVOLUTION]]) {max = 6.000000e+00 : f64, min = 0.000000e+00 : f64} : tensor<1x1x768x1xf16, {order = #NHWC}> -> tensor<1x1x768x1xf16, {order = #NHWC}> + // CHECK: [[PERMUTECAST:%.*]] = IE.PermuteCast([[CLAMP]]) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<1x1x768x1xf16, {order = #NHWC}> -> tensor<1x768x1x1xf16> + // CHECK: return [[PERMUTECAST]] +} diff --git 
a/tests/lit/NPU/dialect/IE/pipelines/convert_mem_permute_to_op.mlir b/tests/lit/NPU/dialect/IE/pipelines/convert_mem_permute_to_op.mlir index dbe3234d73..78c68cbe61 100644 --- a/tests/lit/NPU/dialect/IE/pipelines/convert_mem_permute_to_op.mlir +++ b/tests/lit/NPU/dialect/IE/pipelines/convert_mem_permute_to_op.mlir @@ -783,6 +783,8 @@ func.func @AdjustMemPermuteShape(%arg0: tensor<1x1024x16x128xf16, {order = #NCHW #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #WCHN = affine_map<(d0, d1, d2, d3) -> (d3, d1, d2, d0)> +#map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3, d2)> // CHECK-LABEL: @AdjustMemPermuteShapeWithDimsOne // CHECK-SAME: [[INPUT:%.+]]: tensor<1024x1x1x128xf16, {order = #NCHW}> @@ -791,18 +793,18 @@ func.func @AdjustMemPermuteShapeWithDimsOne(%arg0: tensor<1024x1x1x128xf16, {ord return %MEM_PERMUTE : tensor<128x1024x1x1xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST_IN:%.+]] = IE.ShapeCast {shape = [1, 1024, 1, 128]} inputs([[INPUT]] : tensor<1024x1x1x128xf16, {order = #NCHW}>) -> tensor<1x1024x1x128xf16> - // CHECK: [[PERMUTE_CAST:%.+]] = IE.PermuteCast([[SHAPECAST_IN]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x1024x1x128xf16> -> tensor<1x128x1024x1xf16, {order = #NHWC}> + // CHECK: [[PERMUTE_CAST:%.+]] = IE.PermuteCast([[INPUT]]) {dst_order = #NHWC, mem_perm = #map} + // CHECK-SAME: : tensor<1024x1x1x128xf16, {order = #NCHW}> -> tensor<1x128x1024x1xf16, {order = #NHWC}> // CHECK: [[MAX_POOL:%.+]] = IE.MaxPool([[PERMUTE_CAST]]) { // CHECK-SAME: kernel_size = [1, 1], // CHECK-SAME: pads_begin = [0, 0], // CHECK-SAME: pads_end = [0, 0], // CHECK-SAME: rounding_type = #IE.rounding_type, - // CHECK-SAME: strides = [1, 1]} : tensor<1x128x1024x1xf16, {order = #NHWC}> -> tensor<1x128x1024x1xf16, {order = #NCWH}> - // CHECK: [[PERMUTE_CAST_1:%.+]] = IE.PermuteCast([[MAX_POOL]]) {dst_order = #NHWC, mem_perm = #NCHW} : 
tensor<1x128x1024x1xf16, {order = #NCWH}> -> tensor<1x1024x128x1xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST_OUT:%.+]] = IE.ShapeCast {shape = [128, 1024, 1, 1]} inputs([[PERMUTE_CAST_1]] : tensor<1x1024x128x1xf16, {order = #NHWC}>) -> tensor<128x1024x1x1xf16, {order = #NHWC}> + // CHECK-SAME: strides = [1, 1]} : tensor<1x128x1024x1xf16, {order = #NHWC}> -> tensor<1x128x1024x1xf16> + // CHECK: [[PERMUTE_CAST_1:%.+]] = IE.PermuteCast([[MAX_POOL]]) {dst_order = #NHWC, mem_perm = #map1} + // CHECK-SAME: : tensor<1x128x1024x1xf16> -> tensor<128x1024x1x1xf16, {order = #NHWC}> - // CHECK: return [[SHAPECAST_OUT]] : tensor<128x1024x1x1xf16, {order = #NHWC}> + // CHECK: return [[PERMUTE_CAST_1]] : tensor<128x1024x1x1xf16, {order = #NHWC}> } // ----- @@ -883,9 +885,9 @@ func.func @AdjustMemPermuteForPerAxisQuantize(%arg0: tensor<1x12800x1x32x!qElemT // CHECK-LABEL: @BigMemPermute // CHECK-SAME: [[INPUT:%.+]]: tensor<1x4x16x19320xf16> func.func @BigMemPermute(%arg0: tensor<1x4x16x19320xf16>) -> tensor<1x16x4x19320xf16, {order = #NHWC}> { - %MEM_PERMUTE = IE.MemPermute(%arg0) {dst_order = #NHWC, mem_perm = #NCWH } : + %MEM_PERMUTE = IE.MemPermute(%arg0) {dst_order = #NHWC, mem_perm = #NCWH } : tensor<1x4x16x19320xf16> -> tensor<1x16x4x19320xf16, {order = #NHWC}> - + return %MEM_PERMUTE : tensor<1x16x4x19320xf16, {order = #NHWC}> @@ -1142,37 +1144,6 @@ func.func @ConvertMemPermuteToPermuteQuantize(%arg0: tensor<1x9x86016x1xf16>) #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -// CHECK-LABEL: @NotConvertMemPermuteToPermuteQuantizeDueToChannel -// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x300x4xf16> -func.func @NotConvertMemPermuteToPermuteQuantizeDueToChannel(%arg0: tensor<1x16x300x4xf16>) - -> tensor<1x16x300x4xf16, {order = #NHWC}> { - %MEM_PERMUTE = IE.MemPermute(%arg0) { - dst_order = #NHWC, mem_perm = #NHWC - } : tensor<1x16x300x4xf16> -> tensor<1x16x300x4xf16, {order = #NHWC}> - - return %MEM_PERMUTE : tensor<1x16x300x4xf16, {order = #NHWC}> - - // CHECK-NOT: 
IE.PermuteQuantize - // CHECK: [[IN_PERMUTE:%.+]] = IE.PermuteCast([[INPUT]]) - // CHECK: {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x16x300x4xf16> -> tensor<1x4x16x300xf16, {order = #NHWC}> - // CHECK: [[IN_SHAPECAST:%.+]] = IE.ShapeCast {shape = [1, 16, 16, 75]} - // CHECK: inputs([[IN_PERMUTE]] : tensor<1x4x16x300xf16, {order = #NHWC}>) -> tensor<1x16x16x75xf16, {order = #NHWC}> - - // CHECK: [[MAXPOOL:%.+]] = IE.MaxPool([[IN_SHAPECAST]]) - // CHECK: {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} - // CHECK: tensor<1x16x16x75xf16, {order = #NHWC}> -> tensor<1x16x16x75xf16, {order = #NWCH}> - - // CHECK: [[OUT_PERMUTE:%.+]] = IE.PermuteCast([[MAXPOOL]]) - // CHECK: {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x16x16x75xf16, {order = #NWCH}> -> tensor<1x16x75x16xf16, {order = #NHWC}> - // CHECK: [[OUT_SHAPECAST:%.+]] = IE.ShapeCast {shape = [1, 16, 300, 4]} - // CHECK: inputs([[OUT_PERMUTE]] : tensor<1x16x75x16xf16, {order = #NHWC}>) -> tensor<1x16x300x4xf16, {order = #NHWC}> - // CHECK: return [[OUT_SHAPECAST]] : tensor<1x16x300x4xf16, {order = #NHWC}> -} - -// ----- - -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - // CHECK-LABEL: @NotConvertMemPermuteToPermuteQuantizeDueToHW // CHECK-SAME: [[INPUT:%.+]]: tensor<1x13x13x3xf16> func.func @NotConvertMemPermuteToPermuteQuantizeDueToHW(%arg0: tensor<1x13x13x3xf16>) @@ -1195,15 +1166,17 @@ func.func @NotConvertMemPermuteToPermuteQuantizeDueToHW(%arg0: tensor<1x13x13x3x #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #CHWN = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)> -#map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> +#CNHW = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> + +// CHECK: #map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> // CHECK-LABEL: @ConvertMemPermuteWithCNHWInput // CHECK-SAME: [[INPUT:%.+]]: tensor<1x13x16x80xf16, {order = 
#map}> -func.func @ConvertMemPermuteWithCNHWInput(%arg0: tensor<1x13x16x80xf16, {order = #map}>) +func.func @ConvertMemPermuteWithCNHWInput(%arg0: tensor<1x13x16x80xf16, {order = #CNHW}>) -> tensor<1x13x16x80xf16, {order = #NHWC}> { %MEM_PERMUTE = IE.MemPermute(%arg0) { dst_order = #NHWC, mem_perm = #CHWN - } : tensor<1x13x16x80xf16, {order = #map}> -> tensor<1x13x16x80xf16, {order = #NHWC}> + } : tensor<1x13x16x80xf16, {order = #CNHW}> -> tensor<1x13x16x80xf16, {order = #NHWC}> return %MEM_PERMUTE : tensor<1x13x16x80xf16, {order = #NHWC}> @@ -1224,3 +1197,153 @@ func.func @ConvertMemPermuteWithCNHWInput(%arg0: tensor<1x13x16x80xf16, {order = // CHECK: return [[PERMUTE_QUANTIZE]] : tensor<1x13x16x80xf16, {order = #NHWC}> } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#CHWN = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)> +#CNHW = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> + +// CHECK: #map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> + +// CHECK-LABEL: @ConvertMemPermuteWithCNHWInputNCHWOutput +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x13x16x80xf16, {order = #map}> +func.func @ConvertMemPermuteWithCNHWInputNCHWOutput(%arg0: tensor<1x13x16x80xf16, {order = #CNHW}>) + -> tensor<1x16x80x13xf16> { + %MEM_PERMUTE = IE.MemPermute(%arg0) { + dst_order = #NCHW, mem_perm = #CHWN + } : tensor<1x13x16x80xf16, {order = #CNHW}> -> tensor<1x16x80x13xf16> + + return %MEM_PERMUTE : tensor<1x16x80x13xf16> + + // CHECK-NOT: IE.MemPermute + + // CHECK: [[IN_PERMUTE_CAST:%.+]] = IE.PermuteCast([[INPUT]]) { + // CHECK-SAME: dst_order = #NCHW, + // CHECK-SAME: mem_perm = #map + // CHECK-SAME: } : tensor<1x13x16x80xf16, {order = #map}> -> tensor<1x13x16x80xf16> + + // CHECK: [[PERMUTE_QUANTIZE:%.+]] = IE.PermuteQuantize([[IN_PERMUTE_CAST]]) { + // CHECK-SAME: dstElemType = f16, + // CHECK-SAME: dst_order = #NHWC, + // CHECK-SAME: mem_perm = #NHWC, + // CHECK-SAME: pads_begin = [0, 0, 0, 0], + 
// CHECK-SAME: pads_end = [0, 0, 0, 0] + // CHECK-SAME: } : tensor<1x13x16x80xf16> -> tensor<1x13x16x80xf16, {order = #NHWC}> + + // CHECK: [[OUT_PERMUTE_CAST:%.+]] = IE.PermuteCast([[PERMUTE_QUANTIZE]]) { + // CHECK-SAME: dst_order = #NCHW, + // CHECK-SAME: mem_perm = #NCHW + // CHECK-SAME: } : tensor<1x13x16x80xf16, {order = #NHWC}> -> tensor<1x16x80x13xf16> + + // CHECK: return [[OUT_PERMUTE_CAST]] : tensor<1x16x80x13xf16> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)> +#WCHN = affine_map<(d0, d1, d2, d3) -> (d3, d1, d2, d0)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3, d2)> + +!intype = tensor<32x3072x1x1x!quant.uniform, {order = #map}> +!outtype = tensor<32x3072x1x1x!quant.uniform, {order = #NHWC}> + +// CHECK-LABEL: @ConvertToDPUPermuteWith2AxisAdaptation +// CHECK-SAME: [[ARG0:%.+]]: tensor<32x3072x1x1x!qElemType, {order = #map}> +func.func @ConvertToDPUPermuteWith2AxisAdaptation(%arg0: !intype) -> !outtype { + %0 = IE.MemPermute(%arg0) {dst_order = #NHWC, mem_perm = #WCHN} : !intype -> !outtype + + return %0 : !outtype +} + +// CHECK: [[IN_PERM:%.+]] = IE.PermuteCast([[ARG0]]) {dst_order = #NHWC, mem_perm = #map1} +// CHECK-SAME: : tensor<32x3072x1x1x!qElemType, {order = #map}> -> tensor<1x32x3072x1x!qElemType, {order = #NHWC}> +// CHECK: [[MAX_POOL:%.+]] = IE.MaxPool([[IN_PERM]]) +// CHECK-SAME: {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} +// CHECK-SAME: : tensor<1x32x3072x1x!qElemType, {order = #NHWC}> -> tensor<1x32x3072x1x!qElemType> +// CHECK: [[OUT_PERM:%.+]] = IE.PermuteCast([[MAX_POOL]]) {dst_order = #NHWC, mem_perm = #map2} +// CHECK-SAME: : tensor<1x32x3072x1x!qElemType> -> tensor<32x3072x1x1x!qElemType, {order = #NHWC}> + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0, d1, d2, d3) 
-> (d1, d2, d3, d0)> +#WCHN = affine_map<(d0, d1, d2, d3) -> (d3, d1, d2, d0)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)> + +!intype = tensor<32x1x3072x1x!quant.uniform, {order = #map}> +!outtype = tensor<32x1x3072x1x!quant.uniform, {order = #NHWC}> + +// CHECK-LABEL: @ConvertToDPUPermuteWith2AxisAdaptationScenario2 +// CHECK-SAME: [[ARG0:%.+]]: tensor<32x1x3072x1x!qElemType, {order = #map}> +func.func @ConvertToDPUPermuteWith2AxisAdaptationScenario2(%arg0: !intype) -> !outtype { + %0 = IE.MemPermute(%arg0) {dst_order = #NHWC, mem_perm = #WCHN} : !intype -> !outtype + + return %0 : !outtype +} + +// CHECK: [[IN_PERM:%.+]] = IE.PermuteCast([[ARG0]]) {dst_order = #NHWC, mem_perm = #NCHW} +// CHECK-SAME: : tensor<32x1x3072x1x!qElemType, {order = #map}> -> tensor<1x32x3072x1x!qElemType, {order = #NHWC}> +// CHECK: [[MAX_POOL:%.+]] = IE.MaxPool([[IN_PERM]]) +// CHECK-SAME: {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} +// CHECK-SAME: : tensor<1x32x3072x1x!qElemType, {order = #NHWC}> -> tensor<1x32x3072x1x!qElemType> +// CHECK: [[OUT_PERM:%.+]] = IE.PermuteCast([[MAX_POOL]]) {dst_order = #NHWC, mem_perm = #map1} +// CHECK-SAME: : tensor<1x32x3072x1x!qElemType> -> tensor<32x1x3072x1x!qElemType, {order = #NHWC}> + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#HWCN = affine_map<(d0, d1, d2, d3) -> (d2, d3, d1, d0)> +#map = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +!intype = tensor<64x64x1x1xf16> +!outtype = tensor<1x1x64x64xf16> + +// CHECK-LABEL: @ConvertToDPUPermuteSameValOn2Axis +// CHECK-SAME: [[ARG0:%.+]]: tensor<64x64x1x1xf16> +func.func @ConvertToDPUPermuteSameValOn2Axis(%arg0: !intype) -> !outtype { + %0 = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #HWCN} : !intype -> !outtype + + return %0 : !outtype +} + +// CHECK: 
[[IN_PERM:%.+]] = IE.PermuteCast([[ARG0]]) {dst_order = #NHWC, mem_perm = #map} +// CHECK-SAME: : tensor<64x64x1x1xf16> -> tensor<1x64x64x1xf16, {order = #NHWC}> +// CHECK: [[MAX_POOL:%.+]] = IE.MaxPool([[IN_PERM]]) +// CHECK-SAME: {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} +// CHECK-SAME: : tensor<1x64x64x1xf16, {order = #NHWC}> -> tensor<1x64x64x1xf16> +// CHECK: [[OUT_PERM:%.+]] = IE.PermuteCast([[MAX_POOL]]) {dst_order = #NCHW, mem_perm = #NWCH} +// CHECK-SAME: : tensor<1x64x64x1xf16> -> tensor<1x1x64x64xf16> + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)> +#WCHN = affine_map<(d0, d1, d2, d3) -> (d3, d1, d2, d0)> + +!qElemType = !quant.uniform + +!intype = tensor<16x1x3072x1x!qElemType, {order = #map}> +!outtype = tensor<16x1x3072x1x!qElemType, {order = #NHWC}> + +// CHECK-DAG: [[QELEM_TYPE:!.+]] = !quant.uniform +func.func @DontConvertToDPUPermuteWith2AxisAdaptationPerAxisQuant(%arg0: !intype) -> !outtype { + %0 = IE.MemPermute(%arg0) {dst_order = #NHWC, mem_perm = #WCHN} : !intype -> !outtype + + return %0 : !outtype + +// CHECK-NOT: IE.MaxPool +// CHECK: IE.MemPermute([[ARG0]]) + +} diff --git a/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode.mlir b/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode.mlir index e5799b3bf2..83e614e1e7 100644 --- a/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode.mlir +++ b/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode.mlir @@ -116,13 +116,10 @@ module @RepeatingBlocks { return %call2 : tensor<1x48x60x60xf32> // CHECK: [[CONVERT1:%.+]] = IE.Convert([[ARG0]]) {dstElemType = f16} : tensor<1x48x60x60xf32> -> tensor<1x48x60x60xf16> - // CHECK: [[PERMUTECAST1:%.+]] = IE.PermuteCast([[CONVERT1]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x48x60x60xf16> -> tensor<1x60x48x60xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST1:%.+]] = IE.ShapeCast {shape = [1, 16, 48, 225]} 
inputs([[PERMUTECAST1]] : tensor<1x60x48x60xf16, {order = #NHWC}>) -> tensor<1x16x48x225xf16, {order = #NHWC}> - // CHECK: [[MAXPOOL1:%.+]] = IE.MaxPool([[SHAPECAST1]]) { - // CHECK-SAME: kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1] - // CHECK-SAME: } : tensor<1x16x48x225xf16, {order = #NHWC}> -> tensor<1x16x48x225xf16, {order = #NWCH}> - // CHECK: [[PERMUTECAST2:%.+]] = IE.PermuteCast([[MAXPOOL1]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x16x48x225xf16, {order = #NWCH}> -> tensor<1x48x225x16xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST2:%.+]] = IE.ShapeCast {shape = [1, 48, 60, 60]} inputs([[PERMUTECAST2]] : tensor<1x48x225x16xf16, {order = #NHWC}>) -> tensor<1x48x60x60xf16, {order = #NHWC}> + // CHECK: [[SHAPECAST1:%.+]] = IE.ShapeCast {shape = [1, 48, 225, 16]} inputs([[CONVERT1]] : tensor<1x48x60x60xf16>) -> tensor<1x48x225x16xf16> + // CHECK: [[PERMQUANT:%.+]] = IE.PermuteQuantize([[SHAPECAST1]]) {dstElemType = f16, dst_order = #NHWC, mem_perm = #NHWC, pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0]} + // CHECK-SAME: : tensor<1x48x225x16xf16> -> tensor<1x48x225x16xf16, {order = #NHWC}> + // CHECK: [[SHAPECAST2:%.+]] = IE.ShapeCast {shape = [1, 48, 60, 60]} inputs([[PERMQUANT]] : tensor<1x48x225x16xf16, {order = #NHWC}>) -> tensor<1x48x60x60xf16, {order = #NHWC}> // CHECK: [[SOFTMAX:%.+]] = IE.SoftMax([[SHAPECAST2]]) {axisInd = 1 : i64} : tensor<1x48x60x60xf16, {order = #NHWC}> -> tensor<1x48x60x60xf16, {order = #NHWC}> // CHECK: [[MAXPOOL2:%.+]] = IE.MaxPool([[SOFTMAX]]) { // CHECK-SAME: kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1] diff --git a/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode_40XX+.mlir b/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode_40XX+.mlir index 31663477d5..ca1abf5fc6 100644 --- a/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode_40XX+.mlir +++ 
b/tests/lit/NPU/dialect/IE/pipelines/default_hw_mode_40XX+.mlir @@ -165,13 +165,10 @@ module @RepeatingBlocks { return %call2 : tensor<1x48x60x60xf32> // CHECK: [[CONVERT1:%.+]] = IE.Convert([[ARG0]]) {dstElemType = f16} : tensor<1x48x60x60xf32> -> tensor<1x48x60x60xf16> - // CHECK: [[PERMUTECAST1:%.+]] = IE.PermuteCast([[CONVERT1]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x48x60x60xf16> -> tensor<1x60x48x60xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST1:%.+]] = IE.ShapeCast {shape = [1, 16, 48, 225]} inputs([[PERMUTECAST1]] : tensor<1x60x48x60xf16, {order = #NHWC}>) -> tensor<1x16x48x225xf16, {order = #NHWC}> - // CHECK: [[MAXPOOL1:%.+]] = IE.MaxPool([[SHAPECAST1]]) { - // CHECK-SAME: kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1] - // CHECK-SAME: } : tensor<1x16x48x225xf16, {order = #NHWC}> -> tensor<1x16x48x225xf16, {order = #NWCH}> - // CHECK: [[PERMUTECAST2:%.+]] = IE.PermuteCast([[MAXPOOL1]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x16x48x225xf16, {order = #NWCH}> -> tensor<1x48x225x16xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST2:%.+]] = IE.ShapeCast {shape = [1, 48, 60, 60]} inputs([[PERMUTECAST2]] : tensor<1x48x225x16xf16, {order = #NHWC}>) -> tensor<1x48x60x60xf16, {order = #NHWC}> + // CHECK: [[SHAPECAST1:%.+]] = IE.ShapeCast {shape = [1, 48, 225, 16]} inputs([[CONVERT1]] : tensor<1x48x60x60xf16>) -> tensor<1x48x225x16xf16> + // CHECK: [[PERMQUANT:%.+]] = IE.PermuteQuantize([[SHAPECAST1]]) {dstElemType = f16, dst_order = #NHWC, mem_perm = #NHWC, pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 0]} + // CHECK-SAME: : tensor<1x48x225x16xf16> -> tensor<1x48x225x16xf16, {order = #NHWC}> + // CHECK: [[SHAPECAST2:%.+]] = IE.ShapeCast {shape = [1, 48, 60, 60]} inputs([[PERMQUANT]] : tensor<1x48x225x16xf16, {order = #NHWC}>) -> tensor<1x48x60x60xf16, {order = #NHWC}> // CHECK: [[SOFTMAX:%.+]] = IE.SoftMax([[SHAPECAST2]]) {axisInd = 1 : i64} : tensor<1x48x60x60xf16, {order = 
#NHWC}> -> tensor<1x48x60x60xf16, {order = #NHWC}> // CHECK: [[MAXPOOL2:%.+]] = IE.MaxPool([[SOFTMAX]]) { // CHECK-SAME: kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1] diff --git a/tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX.mlir b/tests/lit/NPU/dialect/IE/pipelines/low_precision.mlir similarity index 66% rename from tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX.mlir rename to tests/lit/NPU/dialect/IE/pipelines/low_precision.mlir index b81cc76a9d..e64577674b 100644 --- a/tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX.mlir +++ b/tests/lit/NPU/dialect/IE/pipelines/low_precision.mlir @@ -4,7 +4,7 @@ // // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --low-precision %s | FileCheck %s -// REQUIRES: arch-NPU37XX +// REQUIRES: arch-NPU37XX || arch-NPU40XX !qElemType = !quant.uniform:f32:0, {0.0078740157480314959:127,0.0086614175105658095:127,0.0094488192731001247:127,0.010236220096978615:127}> !qElemType1 = !quant.uniform @@ -89,3 +89,31 @@ func.func @MixedPrecisionI8Convolution(%arg0: tensor<1x2x1x1xf32>) -> tensor<1x2 // CHECK: [[CAST:%.*]] = IE.Convert([[CONV]]) {dstElemType = f32} : tensor<1x2x1x1xf16> -> tensor<1x2x1x1xf32> // CHECK: return [[CAST]] } + +// ----- + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform +// CHECK-LABEL: @ConvertQuantizeCastAgnosticOp +func.func @ConvertQuantizeCastAgnosticOp(%arg0: tensor<1x96x800x1279xf16>, %arg1: tensor<16x96x1x1xf16>) -> tensor<1x4x1600x2558xui8> { + %cst = const.Declare tensor<1x1x1x1xf16> = dense<-0.34410953521> : tensor<1x1x1x1xf32>, [#const.CastElemType] + %cst_0 = const.Declare tensor<1x1x1x1xf16> = dense<1.1431435> : tensor<1x1x1x1xf32>, [#const.CastElemType] + %cst_1 = const.Declare tensor<1x1x1x1xf16> = dense<0.0> : tensor<1x1x1x1xf32>, [#const.CastElemType] + %cst_2 = const.Declare tensor<1x1x1x1xf16> = dense<255.0> : tensor<1x1x1x1xf32>, [#const.CastElemType] + %cst_3 = const.Declare 
tensor<1x16x1x1xf16> = dense<1.0> : tensor<1x16x1x1xf16>, [#const.CastElemType] + %3 = IE.Convolution(%arg0, %arg1, %cst_3) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x96x800x1279xf16>, tensor<16x96x1x1xf16>, tensor<1x16x1x1xf16> -> tensor<1x16x800x1279xf16> + %0 = IE.FakeQuantize(%3, %cst, %cst_0, %cst_1, %cst_2) {auto_broadcast = #IE.auto_broadcast_type, levels = 256 : i64} : tensor<1x16x800x1279xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16> -> tensor<1x16x800x1279xf16> + %1 = IE.Convert(%0) {dstElemType = ui8} : tensor<1x16x800x1279xf16> -> tensor<1x16x800x1279xui8> + %2 = IE.DepthToSpace(%1) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279xui8> -> tensor<1x4x1600x2558xui8> + return %2 : tensor<1x4x1600x2558xui8> + + // CHECK: [[CST:%.*]] = const.Declare tensor<1x16x1x1xf16> = dense<1.000000e+00> : tensor<1x16x1x1xf16>, [#const.CastElemType] + // CHECK: [[CONV:%.*]] = IE.Convolution([[ARG0:%.*]], [[ARG1:%.*]], [[CST]]) + // CHECK-SAME: {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : tensor<1x96x800x1279xf16>, tensor<16x96x1x1xf16>, tensor<1x16x1x1xf16> -> tensor<1x16x800x1279x!qElemType> + // CHECK: [[QUANTIZE_CAST:%.*]] = IE.QuantizeCast([[CONV]]) {dstElemType = !qElemType1} : + // CHECK-SAME: tensor<1x16x800x1279x!qElemType> -> tensor<1x16x800x1279x!qElemType1> + // CHECK: [[DEPTH_TO_SPACE:%.*]] = IE.DepthToSpace([[QUANTIZE_CAST]]) {block_size = 2 : i64, mode = #IE.depth_to_space_mode} : tensor<1x16x800x1279x!qElemType1> -> tensor<1x4x1600x2558x!qElemType1> + // CHECK: [[QUANTIZE_CAST_1:%.*]] = IE.QuantizeCast([[DEPTH_TO_SPACE]]) {dstElemType = ui8} : + // CHECK-SAME: tensor<1x4x1600x2558x!qElemType1> -> tensor<1x4x1600x2558xui8> + // CHECK: return [[QUANTIZE_CAST_1]] : tensor<1x4x1600x2558xui8> +} diff --git a/tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX+.mlir 
b/tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX+.mlir new file mode 100644 index 0000000000..fc5a4064f8 --- /dev/null +++ b/tests/lit/NPU/dialect/IE/pipelines/low_precision_37XX+.mlir @@ -0,0 +1,80 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --low-precision %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform +!qElemType2 = !quant.uniform +!qElemType3 = !quant.uniform + +// CHECK-LABEL: @PropagateDequantizeTwiceToFuseMul +// CHECK-SAME: ([[INPUT:%.+]]: tensor<4096x320x1x1xf16>, +// CHECK-SAME: [[WEIGHTS1:%.+]]: tensor<320x320x1x1xf16>) +func.func @PropagateDequantizeTwiceToFuseMul( + %input: tensor<4096x320x1x1xf16>, + %weights1: tensor<320x320x1x1xf16>) -> tensor<4096x4096x1x1xf16> { + %conv1 = IE.Convolution(%input, %weights1) + {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : + tensor<4096x320x1x1xf16>, tensor<320x320x1x1xf16> -> tensor<4096x320x1x1xf16> + + %input_low = const.Declare tensor = dense<0.0> : tensor + %input_high = const.Declare tensor = dense<255.0> : tensor + %input_fq = IE.FakeQuantize(%conv1, %input_low, %input_high, %input_low, %input_high) + { auto_broadcast = #IE.auto_broadcast_type, levels = 256 } : + tensor<4096x320x1x1xf16>, tensor, tensor, tensor, tensor -> tensor<4096x320x1x1xf16> + + %reshape = IE.Reshape(%input_fq) {shape_value = [1, 4096, 8, 40]} : tensor<4096x320x1x1xf16> -> tensor<1x4096x8x40xf16> + + %transpose = IE.Transpose(%reshape) {order_value = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>} : + tensor<1x4096x8x40xf16> -> tensor<1x8x4096x40xf16> + + %groupconv_weights = const.Declare tensor<8x1x1x1xf16> = dense<2.0> : tensor<8x1x1x1xf16> + + %mul = IE.GroupConvolution(%transpose, %groupconv_weights) + {dilations = [1, 1], groups = 8 : i64, pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} 
: + tensor<1x8x4096x40xf16>, tensor<8x1x1x1xf16> -> tensor<1x8x4096x40xf16> + + %slice = IE.Slice %mul [0, 0, 0, 0] [1, 1, 4096, 40] : tensor<1x8x4096x40xf16> to tensor<1x1x4096x40xf16> + + %affine_reshape = IE.AffineReshape(%slice) {dim_mapping = [[0], [0], [0], [1, 2, 3]], shape_value = [4096, 40, 1, 1]} : + tensor<1x1x4096x40xf16> -> tensor<4096x40x1x1xf16> + + %weights2 = const.Declare tensor<4096x40x1x1xf16> = dense<10.0> : tensor<4096x40x1x1xf16> + %weights_low = const.Declare tensor<4096x1x1x1xf16> = dense<-127.0> : tensor<4096x1x1x1xf16> + %weights_high = const.Declare tensor<4096x1x1x1xf16> = dense<127.0> : tensor<4096x1x1x1xf16> + %weights_fq = IE.FakeQuantize(%weights2, %weights_low, %weights_high, %weights_low, %weights_high) + { auto_broadcast = #IE.auto_broadcast_type, levels = 256 } : + tensor<4096x40x1x1xf16>, tensor<4096x1x1x1xf16>, tensor<4096x1x1x1xf16>, tensor<4096x1x1x1xf16>, tensor<4096x1x1x1xf16> -> tensor<4096x40x1x1xf16> + + %conv2 = IE.Convolution(%affine_reshape, %weights_fq) + {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : + tensor<4096x40x1x1xf16>, tensor<4096x40x1x1xf16> -> tensor<4096x4096x1x1xf16> + + return %conv2 : tensor<4096x4096x1x1xf16> + + // CHECK-DAG: [[WEIGHTS2:%.+]] = const.Declare tensor<4096x40x1x1x!qElemType> = dense<1.000000e+01> : + // CHECK-SAME: tensor<4096x40x1x1xf16>, [#const.Quantize, #const.CastElemType, #const.ConvertElemType] + + // CHECK: [[CONV1:%.+]] = IE.Convolution([[INPUT]], [[WEIGHTS1]]) + // CHECK-SAME: tensor<4096x320x1x1xf16>, tensor<320x320x1x1xf16> -> tensor<4096x320x1x1x!qElemType2> + + // CHECK: [[RESHAPE:%.+]] = IE.Reshape([[CONV1]]) + + // CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[RESHAPE]]) + + // CHECK: [[QUANTIZECAST:%.+]] = IE.QuantizeCast([[TRANSPOSE]]) {dstElemType = !qElemType3} : + // CHECK-SAME: tensor<1x8x4096x40x!qElemType2> -> tensor<1x8x4096x40x!qElemType3> + + // CHECK: [[SLICE:%.+]] = IE.Slice [[QUANTIZECAST]] + + // CHECK: [[AFFINE_RESHAPE:%.+]] 
= IE.AffineReshape([[SLICE]]) + + // CHECK: [[CONV2:%.+]] = IE.Convolution([[AFFINE_RESHAPE]], [[WEIGHTS2]]) + // CHECK-SAME: tensor<4096x40x1x1x!qElemType3>, tensor<4096x40x1x1x!qElemType> -> tensor<4096x4096x1x1xf16> + + // CHECK: return [[CONV2]] +} diff --git a/tests/lit/NPU/dialect/IE/pipelines/optimize_mem_permute_and_activation_channels.mlir b/tests/lit/NPU/dialect/IE/pipelines/optimize_mem_permute_and_activation_channels.mlir index dcf91c89ef..29c8330d54 100644 --- a/tests/lit/NPU/dialect/IE/pipelines/optimize_mem_permute_and_activation_channels.mlir +++ b/tests/lit/NPU/dialect/IE/pipelines/optimize_mem_permute_and_activation_channels.mlir @@ -114,13 +114,30 @@ func.func @MemPermuteProcessingForNHCWMemPermute(%arg0: tensor<1x380x720x1xf16>) return %0 : tensor<1x720x380x1xf16> - // CHECK: [[PERMUTECAST0:%.+]] = IE.PermuteCast([[INPUT]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x380x720x1xf16> -> tensor<1x1x380x720xf16, {order = #NHWC}> - // CHECK: [[SHAPECAST1:%.+]] = IE.ShapeCast {shape = [1, 16, 380, 45]} inputs([[PERMUTECAST0]] : tensor<1x1x380x720xf16, {order = #NHWC}>) -> tensor<1x16x380x45xf16, {order = #NHWC}> - // CHECK: [[MAXPOOL:%.+]] = IE.MaxPool([[SHAPECAST1]]) {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x16x380x45xf16, {order = #NHWC}> -> tensor<1x16x380x45xf16, {order = #NWCH}> - // CHECK: [[PERMUTECAST3:%.+]] = IE.PermuteCast([[MAXPOOL]]) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<1x16x380x45xf16, {order = #NWCH}> -> tensor<1x45x16x380xf16> - // CHECK: [[SHAPECAST4:%.+]] = IE.ShapeCast {shape = [1, 720, 1, 380]} inputs([[PERMUTECAST3]] : tensor<1x45x16x380xf16>) -> tensor<1x720x1x380xf16> - // CHECK: [[PERMUTECAST5:%.+]] = IE.PermuteCast([[SHAPECAST4]]) {dst_order = #NCHW, mem_perm = #NHCW} : tensor<1x720x1x380xf16> -> tensor<1x1x720x380xf16> - // CHECK: [[SHAPECAST6:%.+]] = IE.ShapeCast {shape = [1, 720, 380, 1]} inputs([[PERMUTECAST5]] : 
tensor<1x1x720x380xf16>) -> tensor<1x720x380x1xf16> - - // CHECK: return [[SHAPECAST6]] : tensor<1x720x380x1xf16> + // CHECK: [[SHAPECAST0:%.+]] = IE.ShapeCast {shape = [1, 1, 380, 720]} inputs([[INPUT]] : tensor<1x380x720x1xf16>) -> tensor<1x1x380x720xf16> + // CHECK: [[PERMUTECAST1:%.+]] = IE.PermuteCast([[SHAPECAST0]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x1x380x720xf16> -> tensor<1x720x1x380xf16, {order = #NHWC}> + // CHECK: [[MAXPOOL:%.+]] = IE.MaxPool([[PERMUTECAST1]]) {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x720x1x380xf16, {order = #NHWC}> -> tensor<1x720x1x380xf16, {order = #NHCW}> + // CHECK: [[PERMUTECAST3:%.+]] = IE.PermuteCast([[MAXPOOL]]) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<1x720x1x380xf16, {order = #NHCW}> -> tensor<1x1x720x380xf16> + // CHECK: [[SHAPECAST4:%.+]] = IE.ShapeCast {shape = [1, 720, 380, 1]} inputs(%3 : tensor<1x1x720x380xf16>) -> tensor<1x720x380x1xf16> + + // CHECK: return [[SHAPECAST4]] : tensor<1x720x380x1xf16> +} + +// ----- + +// CHECK-LABEL: @MemPermuteProcessingForNHWCMemPermute +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<4x75x16x64xf16> +func.func @MemPermuteProcessingForNHWCMemPermute(%arg0: tensor<4x75x16x64xf16>) -> tensor<4x16x64x75xf16> { + %0 = IE.MemPermute(%arg0) {dst_order = #NCHW, mem_perm = #NHWC} : + tensor<4x75x16x64xf16> -> tensor<4x16x64x75xf16> + + return %0 : tensor<4x16x64x75xf16> + + // CHECK: [[SHAPECAST0:%.+]] = IE.ShapeCast {shape = [1, 4, 75, 1024]} inputs([[INPUT]] : tensor<4x75x16x64xf16>) -> tensor<1x4x75x1024xf16> + // CHECK: [[PERMUTECAST1:%.+]] = IE.PermuteCast([[SHAPECAST0]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x4x75x1024xf16> -> tensor<1x1024x4x75xf16, {order = #NHWC}> + // CHECK: [[MAXPOOL:%.+]] = IE.MaxPool([[PERMUTECAST1]]) {kernel_size = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], rounding_type = #IE.rounding_type, strides = [1, 1]} : tensor<1x1024x4x75xf16, {order = 
#NHWC}> -> tensor<1x1024x4x75xf16, {order = #NHCW}> + // CHECK: [[PERMUTECAST3:%.+]] = IE.PermuteCast([[MAXPOOL]]) {dst_order = #NCHW, mem_perm = #NCHW} : tensor<1x1024x4x75xf16, {order = #NHCW}> -> tensor<1x4x1024x75xf16> + // CHECK: [[SHAPECAST4:%.+]] = IE.ShapeCast {shape = [4, 16, 64, 75]} inputs([[PERMUTECAST3]] : tensor<1x4x1024x75xf16>) -> tensor<4x16x64x75xf16> + + // CHECK: return [[SHAPECAST4]] : tensor<4x16x64x75xf16> } diff --git a/tests/lit/NPU/dialect/IE/pipelines/reference_sw_mode.mlir b/tests/lit/NPU/dialect/IE/pipelines/reference_sw_mode.mlir new file mode 100644 index 0000000000..92cf146872 --- /dev/null +++ b/tests/lit/NPU/dialect/IE/pipelines/reference_sw_mode.mlir @@ -0,0 +1,66 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW" --mlir-elide-elementsattrs-if-larger 8 --reference-sw-mode-ie %s | FileCheck %s --strict-whitespace +// REQUIRES: arch-NPU37XX || arch-NPU40XX + + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @Convolution +module @Convolution { + + net.NetworkInfo entryPoint : @main + inputsInfo : { + DataInfo "input" : tensor<1x3x62x62xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x48x60x60xf16> + } + + // CHECK: func.func @main([[ARG0:%.+]]: tensor<1x3x62x62xf16>) -> tensor<1x48x60x60xf16> + func.func @main(%arg: tensor<1x3x62x62xf32>) -> tensor<1x48x60x60xf32> { + %cst = const.Declare tensor<48x3x3x3xf32> = dense<1.0> : tensor<48x3x3x3xf32> + %1 = IE.Convolution(%arg, %cst) { + dilations = [1, 1], + pads_begin = [0, 0], + pads_end = [0, 0], + strides = [1, 1] + } : tensor<1x3x62x62xf32>, tensor<48x3x3x3xf32> -> tensor<1x48x60x60xf32> + return %1 : tensor<1x48x60x60xf32> + } + + // CHECK: [[CST:%.+]] = const.Declare tensor<48x3x3x3xf16> = dense<1.000000e+00> : + // CHECK-SAME: tensor<48x3x3x3xf32>, [#const.CastElemType] + // CHECK: [[OUT:%.+]] = 
IE.Convolution(%arg0, %cst) {dilations = [1, 1], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1]} : + // CHECK-SAME: tensor<1x3x62x62xf16>, tensor<48x3x3x3xf16> -> tensor<1x48x60x60xf16> + // CHECK: return [[OUT]] : tensor<1x48x60x60xf16> + +} + + +// ----- + +module @SoftMax { + +net.NetworkInfo + entryPoint : @main + inputsInfo : { + DataInfo "input" : tensor<1x8x24x64xf16> + } + outputsInfo : { + DataInfo "grn" : tensor<1x8x24x64xf16> + } + + func.func @main(%arg0: tensor<1x8x24x64xf16>) -> tensor<1x8x24x64xf16> { + %0 = IE.GRN(%arg0) {bias = 0.33000001311302185 : f64} : tensor<1x8x24x64xf16> -> tensor<1x8x24x64xf16> + return %0 : tensor<1x8x24x64xf16> + } + + // CHECK: [[NORMALIZEL2:%.*]] = IE.NormalizeL2(%arg0) + // CHECK-SAME: {axes_value = [1 : si64], eps = 0.33000001311302185 : f64, eps_mode = #IE.eps_mode} + // CHECK-SAME: : tensor<1x8x24x64xf16> -> tensor<1x8x24x64xf16> + + // CHECK: return [[NORMALIZEL2]] +} diff --git a/tests/lit/NPU/dialect/NPUReg40XX/act_shave_roundtrip.mlir b/tests/lit/NPU/dialect/NPUReg40XX/act_shave_roundtrip.mlir index cfd92956d7..acb33eed16 100755 --- a/tests/lit/NPU/dialect/NPUReg40XX/act_shave_roundtrip.mlir +++ b/tests/lit/NPU/dialect/NPUReg40XX/act_shave_roundtrip.mlir @@ -7,7 +7,7 @@ // RUN: vpux-opt --emit-bytecode --vpu-arch=%arch% %s | vpux-opt --vpu-arch=%arch% | FileCheck %s // REQUIRES: arch-NPU40XX -module @SingleHswishFP16 attributes {VPU.arch = #VPU.arch_kind} { +module @SingleHswishFP16 attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @single_hswish inputsInfo : { diff --git a/tests/lit/NPU/dialect/NPUReg40XX/one_dma_elf_export_roundtrip.mlir b/tests/lit/NPU/dialect/NPUReg40XX/one_dma_elf_export_roundtrip.mlir index 35fa1f95a5..63f67605a6 100644 --- a/tests/lit/NPU/dialect/NPUReg40XX/one_dma_elf_export_roundtrip.mlir +++ b/tests/lit/NPU/dialect/NPUReg40XX/one_dma_elf_export_roundtrip.mlir @@ -8,16 
+8,16 @@ !quantileFloatType = !QuantileFloat.quantileFloat -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Input" : tensor<1x1024x!quantileFloatType> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPU/ops/transposed_convolution.mlir b/tests/lit/NPU/dialect/VPU/ops/transposed_convolution.mlir new file mode 100644 index 0000000000..51d2637d15 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/ops/transposed_convolution.mlir @@ -0,0 +1,81 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --canonicalize %s | FileCheck %s + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!dynInputType = tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> +!dynOutputType = tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> + +// CHECK: func.func @TransposedConvolutionDynamicInputConstFilter([[INPUT_DATA:%.+]]: tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> { +func.func @TransposedConvolutionDynamicInputConstFilter(%input: !dynInputType) -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> { + %weights = const.Declare tensor<16x32x2x2xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x32x2x2xf16, {order = #NHWC}> + %output = VPU.TransposedConvolution(%input, %weights) { + dilations = [1, 1], operandSegmentSizes = array, spatial_output_padding = [0, 0], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1] + } : !dynInputType, tensor<16x32x2x2xf16, {order = #NHWC}> -> !dynOutputType + return %output : !dynOutputType + + // CHECK-DAG: [[FILTER:%.+]] = const.Declare tensor<16x32x2x2xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x32x2x2xf16, {order = #NHWC}> + // CHECK: [[TRANSPOSED_CONV:%.+]] = VPU.TransposedConvolution([[INPUT_DATA]], [[FILTER]]) { + // CHECK-SAME: dilations = [1, 1], + // CHECK-SAME: operandSegmentSizes = array, + // CHECK-SAME: pads_begin = [0, 0], + // CHECK-SAME: pads_end = [0, 0], + // CHECK-SAME: spatial_output_padding = [0, 0], + // CHECK-SAME: strides = [1, 1]} + // CHECK-SAME: : 
tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = #NHWC}>, tensor<16x32x2x2xf16, {order = #NHWC}> -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> + // CHECK: return [[TRANSPOSED_CONV]] : tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!dynInputType = tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> +!dynOutputType = tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> + +// CHECK: func.func @TransposedConvolutionDynamicInput([[INPUT_DATA:%.+]]: tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = #NHWC}>, [[FILTER:%.+]]: tensor<16x32x2x2xf16, {order = #NHWC}>) -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> { +func.func @TransposedConvolutionDynamicInput(%input: !dynInputType, %weights: tensor<16x32x2x2xf16, {order = #NHWC}>) -> !dynOutputType { + %output = VPU.TransposedConvolution(%input, %weights) { + dilations = [1, 1], operandSegmentSizes = array, spatial_output_padding = [0, 0], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1] + } : !dynInputType, tensor<16x32x2x2xf16, {order = #NHWC}> -> !dynOutputType + return %output : !dynOutputType + + // CHECK: [[TRANSPOSED_CONV:%.+]] = VPU.TransposedConvolution([[INPUT_DATA]], [[FILTER]]) { + // CHECK-SAME: dilations = [1, 1], + // CHECK-SAME: operandSegmentSizes = array, + // CHECK-SAME: pads_begin = [0, 0], + // CHECK-SAME: pads_end = [0, 0], + // CHECK-SAME: spatial_output_padding = [0, 0], + // CHECK-SAME: strides = [1, 1]} + // CHECK-SAME: : 
tensor<1x32x23x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 23, 30]> : tensor<4xsi64>, order = #NHWC}>, tensor<16x32x2x2xf16, {order = #NHWC}> -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> + // CHECK: return [[TRANSPOSED_CONV]] : tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!dynFilterType = tensor<16x32x2x?xf16, {bounds = #const.OpaqueI64Elements<[16, 32, 2, 2]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> +!dynOutputType = tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> + +// CHECK: func.func @TransposedConvolutionDynamicFilter([[INPUT_DATA:%.+]]: tensor<1x32x23x30xf16, {order = #NHWC}>, [[FILTER:%.+]]: tensor<16x32x2x?xf16, {bounds = #const.OpaqueI64Elements<[16, 32, 2, 2]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> { +func.func @TransposedConvolutionDynamicFilter(%input: tensor<1x32x23x30xf16, {order = #NHWC}>, %weights: !dynFilterType) -> !dynOutputType { + %output = VPU.TransposedConvolution(%input, %weights) { + dilations = [1, 1], operandSegmentSizes = array, spatial_output_padding = [0, 0], pads_begin = [0, 0], pads_end = [0, 0], strides = [1, 1] + } : tensor<1x32x23x30xf16, {order = #NHWC}>, !dynFilterType -> !dynOutputType + return %output : !dynOutputType + + // CHECK: [[TRANSPOSED_CONV:%.+]] = VPU.TransposedConvolution([[INPUT_DATA]], [[FILTER]]) { + // CHECK-SAME: dilations = [1, 1], + // CHECK-SAME: operandSegmentSizes = array, + // CHECK-SAME: pads_begin = [0, 0], + // CHECK-SAME: pads_end = [0, 0], + // CHECK-SAME: spatial_output_padding = [0, 0], + // CHECK-SAME: strides = [1, 1]} + // CHECK-SAME: : 
tensor<1x32x23x30xf16, {order = #NHWC}>, tensor<16x32x2x?xf16, {bounds = #const.OpaqueI64Elements<[16, 32, 2, 2]> : tensor<4xsi64>, order = #NHWC}> -> tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> + // CHECK: return [[TRANSPOSED_CONV]] : tensor<1x16x24x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 24, 31]> : tensor<4xsi64>, order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/add_sw_op_auxiliary_buffer.mlir b/tests/lit/NPU/dialect/VPU/passes/add_sw_op_auxiliary_buffer.mlir index 5459643cf5..0ba05a0724 100644 --- a/tests/lit/NPU/dialect/VPU/passes/add_sw_op_auxiliary_buffer.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/add_sw_op_auxiliary_buffer.mlir @@ -91,8 +91,8 @@ func.func @NonMaxSuppressionSoftNMSSigma0(%arg0: tensor<1x76725x4xf16>, %arg1: t return %out_selected_indices, %out_selected_scores, %out_valid_outputs : tensor<100x3xsi32>, tensor<100x3xf16>, tensor<1xsi32> // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x1x1074152xui8> = dense<0> : tensor<1x1x1x1074152xui8> - // CHECK: [[OUT_INDICES:%.+]], [[OUT_SCORES:%.+]], [[OUT_VALID:%.+]] = VPU.NonMaxSuppression([[ARG0]], [[ARG1]], [[CST]]) - // CHECK: {box_encoding = #IE.box_encoding_type, iou_threshold_value = 5.000000e-01 : f64, max_output_boxes_per_class_value = 100 : i64, score_threshold_value = 0.39990234375 : f64, soft_nms_sigma_value = 0.000000e+00 : f64} + // CHECK: [[OUT_INDICES:%.+]], [[OUT_SCORES:%.+]], [[OUT_VALID:%.+]] = VPU.NonMaxSuppression([[ARG0]], [[ARG1]], [[CST]]) + // CHECK: {box_encoding = #IE.box_encoding_type, iou_threshold_value = 5.000000e-01 : f64, max_output_boxes_per_class_value = 100 : i64, score_threshold_value = 0.39990234375 : f64, soft_nms_sigma_value = 0.000000e+00 : f64} // CHECK: : tensor<1x76725x4xf16>, tensor<1x1x76725xf16>, tensor<1x1x1x1074152xui8> -> tensor<100x3xsi32>, tensor<100x3xf16>, tensor<1xsi32> // CHECK: return [[OUT_INDICES]], [[OUT_SCORES]], [[OUT_VALID]] : tensor<100x3xsi32>, 
tensor<100x3xf16>, tensor<1xsi32> } @@ -106,8 +106,8 @@ func.func @NonMaxSuppressionSoftNMSSigma05(%arg0: tensor<1x76725x4xf16>, %arg1: return %out_selected_indices, %out_selected_scores, %out_valid_outputs : tensor<100x3xsi32>, tensor<100x3xf16>, tensor<1xsi32> // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x1x460352xui8> = dense<0> : tensor<1x1x1x460352xui8> - // CHECK: [[OUT_INDICES:%.+]], [[OUT_SCORES:%.+]], [[OUT_VALID:%.+]] = VPU.NonMaxSuppression([[ARG0]], [[ARG1]], [[CST]]) - // CHECK: {box_encoding = #IE.box_encoding_type, iou_threshold_value = 5.000000e-01 : f64, max_output_boxes_per_class_value = 100 : i64, score_threshold_value = 0.39990234375 : f64, soft_nms_sigma_value = 5.000000e-01 : f64} + // CHECK: [[OUT_INDICES:%.+]], [[OUT_SCORES:%.+]], [[OUT_VALID:%.+]] = VPU.NonMaxSuppression([[ARG0]], [[ARG1]], [[CST]]) + // CHECK: {box_encoding = #IE.box_encoding_type, iou_threshold_value = 5.000000e-01 : f64, max_output_boxes_per_class_value = 100 : i64, score_threshold_value = 0.39990234375 : f64, soft_nms_sigma_value = 5.000000e-01 : f64} // CHECK: : tensor<1x76725x4xf16>, tensor<1x1x76725xf16>, tensor<1x1x1x460352xui8> -> tensor<100x3xsi32>, tensor<100x3xf16>, tensor<1xsi32> // CHECK: return [[OUT_INDICES]], [[OUT_SCORES]], [[OUT_VALID]] : tensor<100x3xsi32>, tensor<100x3xf16>, tensor<1xsi32> } diff --git a/tests/lit/NPU/dialect/VPU/passes/apply_tiling.mlir b/tests/lit/NPU/dialect/VPU/passes/apply_tiling.mlir index 6609f3db6e..ea255193a8 100644 --- a/tests/lit/NPU/dialect/VPU/passes/apply_tiling.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/apply_tiling.mlir @@ -1326,6 +1326,52 @@ func.func @ApplyTilingNCEMatMulTileOverGroup(%arg0: tensor<64x8x64x32xf16>, %arg // ----- +#NCDHW = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> + +// CHECK-LABEL: func.func @ApplyTilingAvgPool5D +// CHECK-SAME: [[INPUT0:%arg[0-7]]]: tensor<1x24x8x56x56xf32> +func.func @ApplyTilingAvgPool5D(%arg0: tensor<1x24x8x56x56xf32>) -> tensor<1x24x4x56x56xf32> { + %0 = 
VPU.AffineReshape(%arg0) {dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 24, 448, 56]} : tensor<1x24x8x56x56xf32> -> tensor<1x24x448x56xf32> + %1 = VPU.Convert(%0) {dstElemType = f16, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x24x448x56xf32> -> tensor<1x24x448x56xf16> + %2 = VPU.AffineReshape(%1) {dim_mapping = [[0], [1], [2, 3], [4]], shape_value = [1, 24, 8, 56, 56]} : tensor<1x24x448x56xf16> -> tensor<1x24x8x56x56xf16> + %3 = VPU.AvgPool(%2) {exclude_pads, kernel_size = [2, 1, 1], pads_begin = [0, 0, 0], pads_end = [0, 0, 0], rounding_type = #IE.rounding_type, strides = [2, 1, 1], tilingStrategy = [1, 1, 2, 1, 1]} : tensor<1x24x8x56x56xf16> -> tensor<1x24x4x56x56xf16> + %4 = VPU.AffineReshape(%3) {dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 24, 224, 56]} : tensor<1x24x4x56x56xf16> -> tensor<1x24x224x56xf16> + %5 = VPU.Convert(%4) {dstElemType = f32, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x24x224x56xf16> -> tensor<1x24x224x56xf32> + %6 = VPU.AffineReshape(%5) {dim_mapping = [[0], [1], [2, 3], [4]], shape_value = [1, 24, 4, 56, 56]} : tensor<1x24x224x56xf32> -> tensor<1x24x4x56x56xf32> + return %6 : tensor<1x24x4x56x56xf32> + + // CHECK: [[INPUT_AFFINE_RESHAPE:%.+]] = VPU.AffineReshape([[INPUT0]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 24, 448, 56]} + // CHECK-SAME: tensor<1x24x8x56x56xf32> -> tensor<1x24x448x56xf32> + // CHECK: [[INPUT_CONVERT:%.+]] = VPU.Convert([[INPUT_AFFINE_RESHAPE]]) {dstElemType = f16, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x24x448x56xf32> -> tensor<1x24x448x56xf16> + // CHECK: [[INPUT_AFFINE_RESHAPE2:%.+]] = VPU.AffineReshape([[INPUT_CONVERT]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2, 3], [4]], shape_value = [1, 24, 8, 56, 56]} + // CHECK-SAME: tensor<1x24x448x56xf16> -> tensor<1x24x8x56x56xf16> + // CHECK: [[SLICE_0:%.+]] = VPU.Slice [[INPUT_AFFINE_RESHAPE2]] [0, 0, 0, 0, 0] [1, 24, 4, 
56, 56] + // CHECK-SAME: tensor<1x24x8x56x56xf16> to tensor<1x24x4x56x56xf16> + // CHECK: [[AVG_POOL_0:%.+]] = VPU.AvgPool([[SLICE_0]]) + // CHECK-SAME{LITERAL}: exclude_pads, kernel_size = [2, 1, 1], pads_begin = [0, 0, 0], pads_end = [0, 0, 0], rounding_type = #IE.rounding_type, strides = [2, 1, 1] + // CHECK-SAME: tensor<1x24x4x56x56xf16> -> tensor<1x24x2x56x56xf16> + // CHECK: [[SLICE_1:%.+]] = VPU.Slice [[INPUT_AFFINE_RESHAPE2]] [0, 0, 4, 0, 0] [1, 24, 4, 56, 56] + // CHECK-SAME: tensor<1x24x8x56x56xf16> to tensor<1x24x4x56x56xf16> + // CHECK: [[AVG_POOL_1:%.+]] = VPU.AvgPool([[SLICE_1]]) + // CHECK-SAME{LITERAL}: exclude_pads, kernel_size = [2, 1, 1], pads_begin = [0, 0, 0], pads_end = [0, 0, 0], rounding_type = #IE.rounding_type, strides = [2, 1, 1] + // CHECK-SAME: tensor<1x24x4x56x56xf16> -> tensor<1x24x2x56x56xf16> + // CHECK: [[CONCAT:%.+]] = VPU.Concat([[AVG_POOL_0]], [[AVG_POOL_1]]) + // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0, 0], [0, 0, 2, 0, 0]]} + // CHECK-SAME: tensor<1x24x2x56x56xf16>, tensor<1x24x2x56x56xf16> -> tensor<1x24x4x56x56xf16> + // CHECK: [[AVG_POOL_AFFINE_RESHAPE:%.+]] = VPU.AffineReshape([[CONCAT]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2], [2], [3]], shape_value = [1, 24, 224, 56]} + // CHECK-SAME: tensor<1x24x4x56x56xf16> -> tensor<1x24x224x56xf16> + // CHECK: [[AVG_POOL_CONVERT:%.+]] = VPU.Convert([[AVG_POOL_AFFINE_RESHAPE]]) {dstElemType = f32, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x24x224x56xf16> -> tensor<1x24x224x56xf32> + // CHECK: [[AVG_POOL_AFFINE_RESHAPE2:%.+]] = VPU.AffineReshape([[AVG_POOL_CONVERT]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [1], [2, 3], [4]], shape_value = [1, 24, 4, 56, 56]} + // CHECK-SAME: tensor<1x24x224x56xf32> -> tensor<1x24x4x56x56xf32> + // CHECK: return [[AVG_POOL_AFFINE_RESHAPE2]] : tensor<1x24x4x56x56xf32> +} + +// ----- + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> !qElemType = !quant.uniform diff --git 
a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch.mlir b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch_37XX_40XX.mlir similarity index 96% rename from tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch.mlir rename to tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch_37XX_40XX.mlir index a2554e9d98..2938320ed5 100644 --- a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_mvn1sum_prefetch_37XX_40XX.mlir @@ -10,7 +10,7 @@ module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SOHTilingMVN1SumNotCorrectHForMS // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<3x1x1000001x1xf16, {order = #NHWC}> @@ -51,7 +51,7 @@ module @executors { module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SOHTilingMVN1SumCorrectHForMS // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x3x1000001x1xf16, {order = #NHWC}> @@ -92,7 +92,7 @@ module @executors { module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: 
func.func @ClusteringTilingMVN1SumCorrectHForMS // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x3x1000001x1xf16, {order = #NHWC}> @@ -145,7 +145,7 @@ module @executors { module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SOKTilingMVN1Sum // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x3x1000001x1xf16> @@ -192,7 +192,7 @@ module { IE.TileResource 2 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir index a8ea74b68d..593d19c3b5 100644 --- a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir @@ -8,7 +8,11 @@ #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + //CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 64, 0)> // CHECK-LABEL: @ApplyTilingNCEConv // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x32x64x64xf16, {order = #NHWC}> @@ -37,20 +41,23 @@ func.func @ApplyTilingNCEConv(%arg0: tensor<1x32x64x64xf16, {order = #NHWC}>) -> //CHECK-SAME: 
[[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x256x64x64xf16, {order = #NHWC}>) { - //CHECK: [[OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) - //CHECK: [[ZERO:%.+]] = arith.constant 0 : index - //CHECK: [[CONDITION:%.+]] = arith.cmpi eq, [[LOOP_ITER]], [[ZERO]] : index - //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 32, 33, 64] [1, 1, 1, 1] : tensor<1x32x64x64xf16, {order = #NHWC}> to tensor<1x32x33x64xf16, {order = #NHWC}> - //CHECK: [[IF:%.+]] = scf.if [[CONDITION]] -> (tensor<1x256x32x64xf16, {order = #NHWC}>) { - //CHECK: [[CONV0:%.+]] = VPU.NCE.Convolution([[SLICE]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[CONV0]] : tensor<1x256x32x64xf16, {order = #NHWC}> - //CHECK: else - //CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[SLICE]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[CONV1]] : tensor<1x256x32x64xf16, {order = #NHWC}> - - //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[IF]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 256, 32, 64] [1, 1, 1, 1] : tensor<1x256x32x64xf16, {order = #NHWC}> into tensor<1x256x64x64xf16, {order = #NHWC}> + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) + //CHECK: [[DIFF1:%.+]] = affine.min #[[$MAP1]](%arg1) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP2]]()[[[DIFF1]]] + //CHECK: [[DIFF2:%.+]] = affine.min #[[$MAP3]](%arg1, [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP2]]()[[[DIFF2]]] + + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET]], 0] [1, 32, 33, 64] [1, 1, 1, 1] : tensor<1x32x64x64xf16, {order = #NHWC}> to tensor<1x32x33x64xf16, {order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD_LOW]], 1] high[0, 0, [[PAD_HIGH]], 1] { + 
//CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x33x64xf16, {order = #NHWC}> to tensor<1x32x?x66xf16, {order = #NHWC}> + //CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[PAD]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) + //CHECK-SAME: {pad = #VPU.Padding + //CHECK-SAME: tensor<1x32x?x66xf16, {order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32, {order = #NCHW}> -> tensor<1x256x?x64xf16, {order = #NHWC}> + //CHECK: [[RESULT_SIZE:%.+]] = arith.constant 32 : index + + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[CONV]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 256, [[RESULT_SIZE]], 64] [1, 1, 1, 1] : tensor<1x256x?x64xf16, {order = #NHWC}> into tensor<1x256x64x64xf16, {order = #NHWC}> //CHECK: scf.yield [[INSERT]] : tensor<1x256x64x64xf16, {order = #NHWC}> //CHECK: return [[LOOP]] : tensor<1x256x64x64xf16, {order = #NHWC}> } @@ -59,7 +66,10 @@ func.func @ApplyTilingNCEConv(%arg0: tensor<1x32x64x64xf16, {order = #NHWC}>) -> #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -//CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 200, 0)> // CHECK-LABEL: @ApplyTilingMaxPool // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x200x200xf16, {order = #NHWC}>) @@ -86,21 +96,23 @@ func.func @ApplyTilingMaxPool(%arg0: tensor<1x16x200x200xf16, {order = #NHWC}>) //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x200x200xf16, {order = #NHWC}>) { - //CHECK: [[OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) - //CHECK: [[ZERO:%.+]] = arith.constant 0 : index - //CHECK: [[CONDITION:%.+]] = arith.cmpi eq, [[LOOP_ITER]], [[ZERO]] : 
index - //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 16, 101, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x101x200xf16, {order = #NHWC}> - //CHECK: [[IF:%.+]] = scf.if [[CONDITION]] -> (tensor<1x16x100x200xf16, {order = #NHWC}>) { - //CHECK: [[MAXPOOL0:%.+]] = VPU.NCE.MaxPool([[SLICE]], [[WEIGHTS_TABLE]] ) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[MAXPOOL0]] : tensor<1x16x100x200xf16, {order = #NHWC}> - //CHECK: else - //CHECK: [[MAXPOOL1:%.+]] = VPU.NCE.MaxPool([[SLICE]], [[WEIGHTS_TABLE]] ) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[MAXPOOL1]] : tensor<1x16x100x200xf16, {order = #NHWC}> - - //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[IF]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 16, 100, 200] [1, 1, 1, 1] : tensor<1x16x100x200xf16, {order = #NHWC}> into tensor<1x16x200x200xf16, {order = #NHWC}> - //CHECK: scf.yield [[INSERT]] : tensor<1x16x200x200xf16, {order = #NHWC}> + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) + //CHECK: [[DIFF1:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP2]]()[[[DIFF1]]] + //CHECK: [[DIFF2:%.+]] = affine.min #[[$MAP3]]([[LOOP_ITER]], [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP2]]()[[[DIFF2]]] + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET]], 0] [1, 16, 101, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x101x200xf16, {order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD_LOW]], 1] high[0, 0, [[PAD_HIGH]], 1] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x16x101x200xf16, {order = #NHWC}> to tensor<1x16x?x202xf16, {order = #NHWC}> + //CHECK: [[MAXPOOL:%.+]] = VPU.NCE.MaxPool([[PAD]], [[WEIGHTS_TABLE]] ) + //CHECK-SAME: pad = #VPU.Padding + //CHECK-SAME: tensor<1x16x?x200xf16, 
{order = #NHWC}> + //CHECK: [[RESULT_SIZE:%.+]] = arith.constant 100 : index + + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[MAXPOOL]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 16, [[RESULT_SIZE]], 200] [1, 1, 1, 1] : tensor<1x16x?x200xf16, {order = #NHWC}> into tensor<1x16x200x200xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x16x200x200xf16, {order = #NHWC}> //CHECK: return [[LOOP]] : tensor<1x16x200x200xf16, {order = #NHWC}> } @@ -110,6 +122,9 @@ func.func @ApplyTilingMaxPool(%arg0: tensor<1x16x200x200xf16, {order = #NHWC}>) #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> //CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 200, 0)> // CHECK-LABEL: @ApplyTilingMaxPool4Tiles // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x200x200xf16, {order = #NHWC}>) @@ -135,31 +150,25 @@ func.func @ApplyTilingMaxPool4Tiles(%arg0: tensor<1x16x200x200xf16, {order = #NH //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x200x200xf16, {order = #NHWC}>) { - //CHECK: [[OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) - //CHECK: [[ZERO:%.+]] = arith.constant 0 : index - //CHECK: [[CONDITION0:%.+]] = arith.cmpi eq, [[LOOP_ITER]], [[ZERO]] : index - //CHECK: [[SLICE0:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 16, 51, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x51x200xf16, {order = #NHWC}> - //CHECK: [[OUTER_IF:%.+]] = scf.if [[CONDITION0]] -> (tensor<1x16x50x200xf16, {order = #NHWC}>) - //CHECK: [[MAXPOOL0:%.+]] = VPU.NCE.MaxPool([[SLICE0]], [[WEIGHTS_TABLE]] ) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[MAXPOOL0]] : 
tensor<1x16x50x200xf16, {order = #NHWC}> - //CHECK: else - //CHECK: [[SIZE:%.+]] = arith.constant 200 : index - //CHECK: [[SUB:%.+]] = arith.subi [[SIZE]], [[LOOP_ITER]] : index - //CHECK: [[CONDITION1:%.+]] = arith.cmpi eq, [[LOOP_ITER]], [[SUB]] : index - //CHECK: [[INNER_IF:%.+]] = scf.if [[CONDITION1]] -> (tensor<1x16x50x200xf16, {order = #NHWC}>) { - //CHECK: [[MAXPOOL1:%.+]] = VPU.NCE.MaxPool([[SLICE0]], [[WEIGHTS_TABLE]] ) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[MAXPOOL1]] : tensor<1x16x50x200xf16, {order = #NHWC}> - //CHECK: else - //CHECK: [[SLICE1:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 16, 52, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x52x200xf16, {order = #NHWC}> - //CHECK: [[MAXPOOL2:%.+]] = VPU.NCE.MaxPool([[SLICE1]], [[WEIGHTS_TABLE]] ) - //CHECK-SAME: pad = #VPU.Padding - //CHECK: scf.yield [[MAXPOOL2]] : tensor<1x16x50x200xf16, {order = #NHWC}> - //CHECK: scf.yield [[INNER_IF]] : tensor<1x16x50x200xf16, {order = #NHWC}> - - //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[OUTER_IF]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 16, 50, 200] [1, 1, 1, 1] : tensor<1x16x50x200xf16, {order = #NHWC}> into tensor<1x16x200x200xf16, {order = #NHWC}> - //CHECK: scf.yield [[INSERT]] : tensor<1x16x200x200xf16, {order = #NHWC}> + //CHECK: [[OFFSET:%.+]] = affine.max #[[$MAP]]([[LOOP_ITER]]) + //CHECK: [[DIFF1:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP2]]()[[[DIFF1]]] + //CHECK: [[DIFF2:%.+]] = affine.min #[[$MAP3]]([[LOOP_ITER]], [[OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP2]]()[[[DIFF2]]] + + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 16, 51, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x51x200xf16, {order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD_LOW]], 1] 
high[0, 0, [[PAD_HIGH]], 1] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x16x51x200xf16, {order = #NHWC}> to tensor<1x16x?x202xf16, {order = #NHWC}> + + //CHECK: [[MAXPOOL:%.+]] = VPU.NCE.MaxPool([[PAD]], [[WEIGHTS_TABLE]] ) + //CHECK-SAME: pad = #VPU.Padding + //CHECK-SAME: tensor<1x16x?x200xf16, {order = #NHWC}> + //CHECK: [[RESULT_SIZE:%.+]] = arith.constant 50 : index + + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[MAXPOOL]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 16, [[RESULT_SIZE]], 200] [1, 1, 1, 1] : tensor<1x16x?x200xf16, {order = #NHWC}> into tensor<1x16x200x200xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x16x200x200xf16, {order = #NHWC}> //CHECK: return [[LOOP]] : tensor<1x16x200x200xf16, {order = #NHWC}> } @@ -496,3 +505,351 @@ func.func @DynamicEltwiseTiling( //CHECK: scf.yield [[INSERT]] : tensor<1x16x127x480xf16, {order = #NHWC}> //CHECK: return [[LOOP]] : tensor<1x16x127x480xf16, {order = #NHWC}> } + +// ----- + + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + + func.func @NotTileLayoutCast( + %arg0: tensor<1x16x127x480xf16> + ) -> tensor<1x16x127x480xf16, {order = #NHWC}> { + %0 = VPU.LayoutCast(%arg0) {dst_order = #NHWC, tilingStrategy = [1, 1, 1, 2]} : tensor<1x16x127x480xf16> -> tensor<1x16x127x480xf16, {order = #NHWC}> + + return %0 : tensor<1x16x127x480xf16, {order = #NHWC}> + + //CHECK-NOT: scf.for + } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 - 1, 0)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-(d0 - 1), 0)> +// CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0, 1)> +// CHECK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 512, 0)> +// CHECK: #[[$MAP4:.+]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 480, 0)> + +// CHECK-LABEL: @Tiling2DNotPaddedMaxPool +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x512x480xf16, {order = #NHWC}> + func.func @Tiling2DNotPaddedMaxPool( + %arg0: 
tensor<1x16x512x480xf16, {order = #NHWC}> + ) -> tensor<1x16x512x480xf16, {order = #NHWC}> { + %1 = VPU.NCE.MaxPool(%arg0) { + kernel_size = [3, 3], + multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding< + left = 1 : i64, + right = 1 : i64, + top = 1 : i64, + bottom = 1 : i64>, + ppe = #VPU.PPEInt, + clamp_low = -2147483648 : i64, + clamp_high = 2147483647 : i64, + lrelu_mult = 1 : i64, + lrelu_shift = 0 : i64, + fp_prelu_alpha = 1.000000e+00 : f64 + >, + strides = [1, 1], + tilingStrategy = [1, 1, 2, 4] + } -> tensor<1x16x512x480xf16, {order = #NHWC}> + + return %1 : tensor<1x16x512x480xf16, {order = #NHWC}> + + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty() : tensor<1x16x512x480xf16, {order = #NHWC}> + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[LOOP_END_H:%.+]] = arith.constant 512 : index + //CHECK: [[LOOP_STEP_H:%.+]] = arith.constant 256 : index + + //CHECK: [[LOOP_H:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_H:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_H]] step [[LOOP_STEP_H]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x512x480xf16, {order = #NHWC}>) + + //CHECK: [[LOOP_END_W:%.+]] = arith.constant 480 : index + //CHECK: [[LOOP_STEP_W:%.+]] = arith.constant 120 : index + //CHECK: [[LOOP_W:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_W:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_W]] step [[LOOP_STEP_W]] + //CHECK-SAME: iter_args([[LOOP_OUT_W:%arg[0-9]]] = [[LOOP_OUT]]) -> (tensor<1x16x512x480xf16, {order = #NHWC}>) + + //CHECK: [[SLICE_OFFSET_H:%.+]] = affine.max #[[$MAP0]]([[LOOP_ITER_H]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_H]]) + //CHECK: [[PAD_LOW_H:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE0]]] + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP3]]([[LOOP_ITER_H]], [[SLICE_OFFSET_H]]) + //CHECK: [[PAD_LOW_W:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE1]]] + //CHECK: [[SLICE_OFFSET_W:%.+]] = affine.max #[[$MAP0]]([[LOOP_ITER_W]]) + //CHECK: 
[[TEMP_VALUE2:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_W]]) + //CHECK: [[PAD_HIGH_H:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE2]]] + //CHECK: [[TEMP_VALUE3:%.+]] = affine.min #[[$MAP4]]([[LOOP_ITER_W]], [[SLICE_OFFSET_W]]) + //CHECK: [[PAD_HIGH_W:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE3]]] + + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET_H]], [[SLICE_OFFSET_W]]] [1, 16, 257, 121] [1, 1, 1, 1] : tensor<1x16x512x480xf16, {order = #NHWC}> to tensor<1x16x257x121xf16, {order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD_LOW_H]], [[PAD_HIGH_H]]] high[0, 0, [[PAD_LOW_W]], [[PAD_HIGH_W]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x16x257x121xf16, {order = #NHWC}> to tensor<1x16x?x?xf16, {order = #NHWC}> + //CHECK: [[POOL:%.+]] = VPU.NCE.MaxPool([[PAD]]) + //CHECK: [[PAD_DIM_H:%.+]] = arith.constant 256 : index + //CHECK: [[PAD_DIM_W:%.+]] = arith.constant 120 : index + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[POOL]] into [[LOOP_OUT_W]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 16, [[PAD_DIM_H]], [[PAD_DIM_W]]] [1, 1, 1, 1] + //CHECK-SAME: tensor<1x16x?x?xf16, {order = #NHWC}> into tensor<1x16x512x480xf16, {order = #NHWC}> + + //CHECK: scf.yield [[INSERT]] : tensor<1x16x512x480xf16, {order = #NHWC}> + //CHECK: scf.yield [[LOOP_W]] : tensor<1x16x512x480xf16, {order = #NHWC}> + //CHECK: return [[LOOP_H]] : tensor<1x16x512x480xf16, {order = #NHWC}> + } + + // ----- + + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +//CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - 64, 0)> + +// CHECK-LABEL: @ConvChannel2DTiling +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x512x64x64xf16, {order = #NHWC}> +func.func 
@ConvChannel2DTiling(%arg0: tensor<1x512x64x64xf16, {order = #NHWC}>) -> tensor<1x256x64x64xf16, {order = #NHWC}> { + %weights = const.Declare tensor<256x512x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x512x3x3xf16>, [#const.Reorder<#NHWC>] + %weights_table = const.Declare tensor<256x1x1x4xsi32> = dense<1> : tensor<256x1x1x4xsi32> + + %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + ppe = #VPU.PPEStub<>, + pad = #VPU.Padding, + rawFilterShape = [256, 512, 3, 3], + strides = [1, 1], + tilingStrategy = [1, 2, 8, 1] + } : tensor<1x512x64x64xf16, {order = #NHWC}>, tensor<256x512x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32> -> tensor<1x256x64x64xf16, {order = #NHWC}> + + return %0 : tensor<1x256x64x64xf16, {order = #NHWC}> + + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<256x512x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x512x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<1> : tensor<256x1x1x4xsi32> + + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty() : tensor<1x256x64x64xf16, {order = #NHWC}> + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[LOOP_END_H:%.+]] = arith.constant 64 : index + //CHECK: [[LOOP_STEP_H:%.+]] = arith.constant 8 : index + + //CHECK: [[LOOP_H:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_H:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_H]] step [[LOOP_STEP_H]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x256x64x64xf16, {order = #NHWC}>) + + //CHECK: [[LOOP_END_C:%.+]] = arith.constant 256 : index + //CHECK: [[LOOP_STEP_C:%.+]] = arith.constant 128 : index + //CHECK: [[LOOP_C:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_C:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_C]] step [[LOOP_STEP_C]] + //CHECK-SAME: iter_args([[LOOP_OUT_C:%arg[0-9]]] = [[LOOP_OUT]]) -> (tensor<1x256x64x64xf16, {order = #NHWC}>) + + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP0]]([[LOOP_ITER_H]]) + //CHECK: 
[[TEMP_VALUE0:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_H]]) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE0]]] + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP3]]([[LOOP_ITER_H]], [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP2]]()[[[TEMP_VALUE1]]] + + //CHECK: [[SLICE_INPUT:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET]], 0] [1, 512, 9, 64] [1, 1, 1, 1] : tensor<1x512x64x64xf16, {order = #NHWC}> to tensor<1x512x9x64xf16, {order = #NHWC}> + //CHECK: [[SLICE_WEIGHTS:%.+]] = tensor.extract_slice [[WEIGHTS]][[[LOOP_ITER_C]], 0, 0, 0] [128, 512, 3, 3] [1, 1, 1, 1] : tensor<256x512x3x3xf16, {order = #NHWC}> to tensor<128x512x3x3xf16, {order = #NHWC}> + //CHECK: [[SLICE_WEIGHTS_TABLE:%.+]] = tensor.extract_slice [[WEIGHTS_TABLE]][[[LOOP_ITER_C]], 0, 0, 0] [128, 1, 1, 4] [1, 1, 1, 1] : tensor<256x1x1x4xsi32> to tensor<128x1x1x4xsi32> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE_INPUT]] low[0, 0, [[PAD_LOW]], 1] high[0, 0, [[PAD_HIGH]], 1] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x512x9x64xf16, {order = #NHWC}> to tensor<1x512x?x66xf16, {order = #NHWC}> + + //CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[PAD]], [[SLICE_WEIGHTS]], [[SLICE_WEIGHTS_TABLE]]) + //CHECK: [[PADDED_DIM:%.+]] = arith.constant 8 : index + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[CONV]] into [[LOOP_OUT_C]][0, [[LOOP_ITER_C]], [[LOOP_ITER_H]], 0] [1, 128, [[PADDED_DIM]], 64] [1, 1, 1, 1] : tensor<1x128x?x64xf16, {order = #NHWC}> into tensor<1x256x64x64xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x256x64x64xf16, {order = #NHWC}> + //CHECK: scf.yield [[LOOP_C]] + //CHECK: return [[LOOP_H]] : tensor<1x256x64x64xf16, {order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK: #[[$MAP_MIN_H:.+]] = affine_map<(d0)[s0] -> (128, -d0 + s0)> +// CHECK: #[[$MAP_MIN_W:.+]] = affine_map<(d0)[s0] -> (240, 
-d0 + s0)> + +// CHECK-LABEL: @Dynamic2DEltwiseTiling +// CHECK-SAME: [[INPUT0:%arg[0-9]]]: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%arg[0-9]]]: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}>) +func.func @Dynamic2DEltwiseTiling( + %arg0: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}>, + %arg1: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> +) -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> { + %0 = VPU.NCE.Eltwise(%arg0, %arg1) { + is_inplace = true, + multiClusterStrategy = #VPU.multi_cluster_strategy, + op_type = #VPU.eltwise_type, + ppe = #VPU.PPEInt< + mode = , + clamp_low = -2147483648 : i64, + clamp_high = 2147483647 : i64, + lrelu_mult = 1 : i64, + lrelu_shift = 0 : i64, + quant_scale = [1.000000e+00], + fp_prelu_alpha = 1.000000e+00 : f64 + >, + tilingStrategy = [1, 1, 2, 2] + } -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + return %0 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[DIM_VALUE_H_0:%.+]] = arith.constant 2 : index + //CHECK: [[DIM_H_0:%.+]] = tensor.dim [[INPUT0]], [[DIM_VALUE_H_0]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[DIM_VALUE_W_0:%.+]] = arith.constant 3 : index + //CHECK: [[DIM_W_0:%.+]] = tensor.dim [[INPUT0]], [[DIM_VALUE_W_0]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty([[DIM_H_0]], [[DIM_W_0]]) : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 
16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[DIM_VALUE_H_1:%.+]] = arith.constant 2 : index + //CHECK: [[LOOP_END_H:%.+]] = tensor.dim [[INPUT0]], [[DIM_VALUE_H_1]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[DIM_VALUE_W_1:%.+]] = arith.constant 3 : index + //CHECK: [[LOOP_END_W:%.+]] = tensor.dim [[INPUT0]], [[DIM_VALUE_W_1]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_STEP_H:%.+]] = arith.constant 128 : index + //CHECK: [[LOOP_H:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_H:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_H]] step [[LOOP_STEP_H]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}>) + //CHECK: [[LOOP_STEP_W:%.+]] = arith.constant 240 : index + + //CHECK: [[LOOP_W:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER_W:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_W]] step [[LOOP_STEP_W]] + //CHECK-SAME: iter_args([[LOOP_OUT_W:%arg[0-9]]] = [[LOOP_OUT]]) + + //CHECK: [[SLICE_SIZE_H:%.+]] = affine.min #[[$MAP_MIN_H]]([[LOOP_ITER_H]])[[[LOOP_END_H]]] + //CHECK: [[SLICE_SIZE_W:%.+]] = affine.min #[[$MAP_MIN_W]]([[LOOP_ITER_W]])[[[LOOP_END_W]]] + //CHECK: [[SLICE_INPUT0:%.+]] = tensor.extract_slice [[INPUT0]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 16, [[SLICE_SIZE_H]], [[SLICE_SIZE_W]]] [1, 1, 1, 1] + //CHECK: [[SLICE_INPUT1:%.+]] = tensor.extract_slice [[INPUT1]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 16, [[SLICE_SIZE_H]], [[SLICE_SIZE_W]]] [1, 1, 1, 1] + //CHECK: [[ELTWISE:%.+]] = VPU.NCE.Eltwise([[SLICE_INPUT0]], [[SLICE_INPUT1]]) + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[ELTWISE]] into [[LOOP_OUT_W]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 16, [[SLICE_SIZE_H]], [[SLICE_SIZE_W]]] [1, 
1, 1, 1] + //CHECK-SAME: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 128, 240]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: scf.yield [[LOOP_W]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: return [[LOOP_H]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +//CHECK: #[[$MAP:.*]] = affine_map<(d0)[s0] -> (512, -d0 + s0)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP2:.*]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP3:.*]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP4:.*]] = affine_map<(d0) -> (d0 + 1)> +//CHECK: #[[$MAP5:.*]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - -9223372036854775808, 0)> + +// CHECK-LABEL: @ApplyTilingNCEConvDyn +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> +func.func @ApplyTilingNCEConvDyn(%arg0: tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> { + %weights = const.Declare tensor<256x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x32x3x3xf16>, [#const.Reorder<#NHWC>] + %weights_table = const.Declare tensor<256x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<256x1x1x4xsi32> + + %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + pad = #VPU.Padding, + ppe = 
#VPU.PPEStub<>, + rawFilterShape = [256, 32, 3, 3], + strides = [1, 1], + tilingStrategy = [1, 1, 2, 1] + } : tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32, {order = #NCHW}> -> tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + + return %0 : tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<256x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x32x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<256x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<256x1x1x4xsi32> + //CHECK: [[C2:%.+]] = arith.constant 2 : index + //CHECK: [[DIM:%.+]] = tensor.dim [[INPUT]], [[C2]] : tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[C0:%.+]] = arith.constant 0 : index + //CHECK: [[C1:%.+]] = arith.constant 1 : index + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty([[DIM]]) : tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[DIM_INDEX:%.+]] = arith.constant 2 : index + //CHECK: [[LOOP_END:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX]] : tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[C0_1:%.+]] = arith.constant 0 : index + //CHECK: [[C1_1:%.+]] = arith.constant 1 : index + //CHECK: [[LOOP_STEP:%.+]] = arith.constant 512 : index + //CHECK: [[LOOP:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x256x?x64xf16, {bounds = 
#const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}>) { + + //CHECK: [[RESULT_SIZE:%.+]] = affine.min #[[$MAP]]([[LOOP_ITER]])[[[LOOP_END]]] + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE0]]] + //CHECK: [[STRIDE_OFFSET:%.+]] = affine.apply #[[$MAP4]]([[RESULT_SIZE]]) + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP5]]([[LOOP_ITER]], [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE1]]] + + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET]], 0] [1, 32, [[STRIDE_OFFSET]], 64] [1, 1, 1, 1] + //CHECK-SAME: : tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 512, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD_LOW]], 1] high[0, 0, [[PAD_HIGH]], 1] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 512, 64]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x32x?x66xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 512, 66]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[PAD]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) + //CHECK-SAME: {pad = #VPU.Padding + //CHECK-SAME: tensor<1x32x?x66xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 512, 66]> : tensor<4xsi64>, order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32, {order = #NCHW}> -> tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 512, 64]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[CONV]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 256, [[RESULT_SIZE]], 
64] [1, 1, 1, 1] : tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 512, 64]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: return [[LOOP]] : tensor<1x256x?x64xf16, {bounds = #const.OpaqueI64Elements<[1, 256, 1024, 64]> : tensor<4xsi64>, order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +//CHECK: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (100, -d0 + s0)> +//CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 - 1, 0)> +//CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (-(d0 - 1), 0)> +//CHECK: #[[$MAP3:.+]] = affine_map<()[s0] -> (s0, 1)> +//CHECK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 + 1)> +//CHECK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 + d0 + 1 - -9223372036854775808, 0)> +// CHECK-LABEL: @ApplyTilingMaxPool4Tiles +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}>) +func.func @ApplyTilingMaxPool4Tiles(%arg0: tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> { + %weights_table = const.Declare tensor<16x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<16x1x1x4xsi32> + + %0 = VPU.NCE.MaxPool(%arg0, %weights_table) { + kernel_size = [3, 3], + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + strides = [1, 1], + tilingStrategy = [1, 1, 4, 1] + } -> tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + + return %0 : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 
400, 200]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<16x1x1x4xsi32> + //CHECK: [[C2:%.+]] = arith.constant 2 : index + //CHECK: [[DIM:%.+]] = tensor.dim [[INPUT]], [[C2]] : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[C0:%.+]] = arith.constant 0 : index + //CHECK: [[C1:%.+]] = arith.constant 1 : index + //CHECK: [[OUTPUT:%.+]] = tensor.empty([[DIM]]) : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_START:%.+]] = arith.constant 0 : index + //CHECK: [[INDEX:%.+]] = arith.constant 2 : index + //CHECK: [[LOOP_END:%.+]] = tensor.dim [[INPUT]], [[INDEX]] : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[C0_3:%.+]] = arith.constant 0 : index + //CHECK: [[C1_4:%.+]] = arith.constant 1 : index + //CHECK: [[STEP:%.+]] = arith.constant 100 : index + //CHECK: [[RESULT:%.+]] = scf.for [[LOOP_ITER:%.+]] = [[LOOP_START]] to [[LOOP_END]] step [[STEP]] iter_args([[LOOP_OUT:%.+]] = [[OUTPUT]]) -> (tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}>) { + //CHECK: [[MIN_OFFSET:%.+]] = affine.min #[[$MAP]]([[LOOP_ITER]])[[[LOOP_END]]] + //CHECK: [[OFFSET:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE0]]] + //CHECK: [[SIZE:%.+]] = affine.apply #[[$MAP4]]([[MIN_OFFSET]]) + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP5]]([[LOOP_ITER]], [[OFFSET]]) + //CHECK: [[PAD_HIGH:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE1]]] + //CHECK: [[SLICE0:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[OFFSET]], 0] [1, 16, [[SIZE]], 200] [1, 1, 1, 1] : tensor<1x16x?x200xf16, 
{bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[PAD:%.+]] = tensor.pad [[SLICE0]] low[0, 0, [[PAD_LOW]], 1] high[0, 0, [[PAD_HIGH]], 1] { + //CHECK: ^bb0([[ARG3:%.+]]: index, [[ARG4:%.+]]: index, [[ARG5:%.+]]: index, [[ARG6:%.+]]: index): + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: } : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 200]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x202xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 202]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[POOL_RESULT:%.+]] = VPU.NCE.MaxPool([[PAD]], [[WEIGHTS_TABLE]] ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[SLICE1:%.+]] = tensor.insert_slice [[POOL_RESULT]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER]], 0] [1, 16, [[MIN_OFFSET]], 200] [1, 1, 1, 1] : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 200]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: scf.yield [[SLICE1]] : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: } + //CHECK: return [[RESULT]] : tensor<1x16x?x200xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 400, 200]> : tensor<4xsi64>, order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/bounded_tensors_to_dynamic_dims_mask_skip_main_func.mlir b/tests/lit/NPU/dialect/VPU/passes/bounded_tensors_to_dynamic_dims_mask_skip_main_func.mlir new file mode 100644 index 0000000000..c846b23de3 --- /dev/null +++ 
b/tests/lit/NPU/dialect/VPU/passes/bounded_tensors_to_dynamic_dims_mask_skip_main_func.mlir @@ -0,0 +1,75 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=HostCompile" --bounded-tensors-to-dynamic-dims-mask %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +// CHECK-LABEL: @EmptyFunction +module @EmptyFunction{ + net.NetworkInfo entryPoint : @EmptyFunction + inputsInfo : { + DataInfo "input" : tensor + } outputsInfo : { + DataInfo "output" : tensor + } + + func.func @EmptyFunction(%arg0: tensor : tensor<3xsi64>, order = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}>) -> tensor : tensor<3xsi64>, order = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> { + return %arg0 : tensor : tensor<3xsi64>, order = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> + } + + // CHECK: func.func [[EMPTY_FUNC:@.+]]([[_:%.+]]: tensor : tensor<3xsi64>, order = #CHW}>) -> tensor : tensor<3xsi64>, order = #CHW}> +} + +// ----- + +// CHECK-LABEL: @ScheduleEltwiseNHWC +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0)[s0] -> (-d0 + s0, 90)> +module @ScheduleEltwiseNHWC { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x1000xf16> + DataInfo "input2" : tensor<1x16x?x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x1000xf16> + } + + func.func @main_func0_static(%arg0: tensor<1x16x90x1000xf16, {order = #NHWC}>, %arg1: tensor<1x16x90x1000xf16, {order = #NHWC}>) -> tensor<1x16x90x1000xf16, {order = #NHWC}> { + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x90x1000xf16, {order = #NHWC}> + return %0 : 
tensor<1x16x90x1000xf16, {order = #NHWC}> + } + + func.func @main(%arg0: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> { + %c90 = arith.constant 90 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %dim = tensor.dim %arg0, %c2 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %0 = tensor.empty(%dim) : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %dim_0 = tensor.dim %arg0, %c2 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %1 = scf.for %arg2 = %c0 to %dim_0 step %c90 iter_args(%arg3 = %0) -> (tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) { + %2 = affine.min #map(%arg2)[%dim_0] + %3 = arith.cmpi ne, %2, %c90 : index + %4 = scf.if %3 -> (index) { + %6 = arith.subi %c90, %2 : index %7 = arith.cmpi slt, %arg2, %6 : index + cf.assert %7, "Not enough elements to backtrack in scf.for loop" %8 = arith.subi %arg2, %6 : index + scf.yield %8 : index + } else { + scf.yield %arg2 : index + } + %extracted_slice = tensor.extract_slice %arg0[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %cast = tensor.cast %extracted_slice : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> to 
tensor<1x16x90x1000xf16, {order = #NHWC}> + %extracted_slice_1 = tensor.extract_slice %arg1[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %cast_2 = tensor.cast %extracted_slice_1 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x90x1000xf16, {order = #NHWC}> + %5 = func.call @main_func0_static(%cast, %cast_2) : (tensor<1x16x90x1000xf16, {order = #NHWC}>, tensor<1x16x90x1000xf16, {order = #NHWC}>) -> tensor<1x16x90x1000xf16, {order = #NHWC}> + %cast_3 = tensor.cast %5 : tensor<1x16x90x1000xf16, {order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %inserted_slice = tensor.insert_slice %cast_3 into %arg3[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + scf.yield %inserted_slice : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + return %1 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + + // CHECK: func.func [[STATIC_FUNC:@.+]]([[_:%.+]]: tensor<1x16x90x1000xf16, {order = #NHWC}>, [[_:%.+]]: tensor<1x16x90x1000xf16, {order = #NHWC}>) -> tensor<1x16x90x1000xf16, {order = #NHWC}> + + // CHECK: func.func [[MAIN:@.+]]([[_:%.+]]: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, [[_:%.+]]: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 
1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/compress_dma_reserve_mem_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/compress_dma_reserve_mem_40XX+.mlir similarity index 100% rename from tests/lit/NPU/dialect/VPUIP/passes/compress_dma_reserve_mem_40XX+.mlir rename to tests/lit/NPU/dialect/VPU/passes/compress_dma_reserve_mem_40XX+.mlir diff --git a/tests/lit/NPU/dialect/VPU/passes/compute_interpolate_coordinates.mlir b/tests/lit/NPU/dialect/VPU/passes/compute_interpolate_coordinates.mlir index b8711edebb..ce7db74e5e 100644 --- a/tests/lit/NPU/dialect/VPU/passes/compute_interpolate_coordinates.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/compute_interpolate_coordinates.mlir @@ -104,7 +104,7 @@ func.func @interpolateNCHWAxes12(%arg0: tensor<1x14x14x21xf16>) -> tensor<1x16x1 //CHECK: [[UNROLLED_LAMBDAS:%.+]] = VPU.UnrolledType([[LAMBDAS]] : tensor<1x1x1x20xf16>) -> !VPU.DistributedTensor<1x1x1x20xf16 // CHECK: [[INTERPOLATE:%.+]] = VPU.Interpolate([[UNROLLED_INPUT]], [[UNROLLED_COORDINATES]], [[UNROLLED_LAMBDAS]]) -// CHECK-SAME: : !VPU.DistributedTensor<1x14x14x21xf16, +// CHECK-SAME: : !VPU.DistributedTensor<1x14x14x21xf16, // CHECK: [[UNROLLED_OUTPUT:%.+]] = VPU.UnrolledType([[INTERPOLATE]] diff --git a/tests/lit/NPU/dialect/VPU/passes/concat_init_inputs.mlir b/tests/lit/NPU/dialect/VPU/passes/concat_init_inputs.mlir new file mode 100644 index 0000000000..f6012ab703 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/concat_init_inputs.mlir @@ -0,0 +1,77 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler=vpu-arch=%arch% --concat-init-inputs %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +// CHECK: module @SingleInput +module @SingleInput { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x2x5x3xf16> + } outputsInfo : { + DataInfo "vpux_tw_1_hash_123456789" : tensor<2x2x5x3xf16> + } + + // CHECK: inputsInfo + // CHECK-NEXT: DataInfo "vpux_ow_1" : tensor<2x2x5x3xf16> + // CHECK: outputsInfo + // CHECK-NEXT: DataInfo "vpux_tw_1_hash_123456789" : tensor<2x2x5x3xf16> + + func.func @init(%ov1: tensor<2x2x5x3xf16>) -> tensor<2x2x5x3xf16> { + %one = const.Declare tensor<1xf16> = dense<1.0> : tensor<1xf16> + %res = IE.Add(%ov1, %one) {auto_broadcast = #IE.auto_broadcast_type} + : tensor<2x2x5x3xf16>, tensor<1xf16> -> tensor<2x2x5x3xf16> + return %res : tensor<2x2x5x3xf16> + } + + // CHECK: func.func @init([[OV1:%.+]]: tensor<2x2x5x3xf16>) -> tensor<2x2x5x3xf16> + // CHECK: [[RES:%.+]] = IE.Add([[OV1]], {{%.+}}) + // CHECK: return [[RES]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK: module @TwoInputs +module @TwoInputs { + net.NetworkInfo entryPoint : @init inputsInfo : { + DataInfo "vpux_ow_1" : tensor<2x2x5x3xf16> + DataInfo "vpux_ow_2" : tensor<42x100x1x1xui8> + } outputsInfo : { + DataInfo "vpux_tw_1_hash_123456789" : tensor<2x2x5x3xf16> + DataInfo "vpux_tw_2_hash_987654321" : tensor<42x100x1x1xsi8> + } + + // CHECK: inputsInfo + // CHECK-NEXT: DataInfo "vpux_ow_hash_2211455849133395826_concat" : tensor<4320xi8> + // CHECK: outputsInfo + // CHECK-NEXT: DataInfo "vpux_tw_1_hash_123456789" : tensor<2x2x5x3xf16> + // CHECK-NEXT: DataInfo "vpux_tw_2_hash_987654321" : tensor<42x100x1x1xsi8> + + func.func @init(%ov1: tensor<2x2x5x3xf16>, %ov2: tensor<42x100x1x1xui8, {order = #NHWC}>) + -> (tensor<2x2x5x3xf16>, tensor<42x100x1x1xsi8, {order = #NHWC}>) { + %one = const.Declare 
tensor<1xf16> = dense<1.0> : tensor<1xf16> + %res1 = IE.Add(%ov1, %one) {auto_broadcast = #IE.auto_broadcast_type} + : tensor<2x2x5x3xf16>, tensor<1xf16> -> tensor<2x2x5x3xf16> + + %res2 = IE.Convert(%ov2) {dstElemType = si8} + : tensor<42x100x1x1xui8, {order = #NHWC}> -> tensor<42x100x1x1xsi8, {order = #NHWC}> + + return %res1, %res2 : tensor<2x2x5x3xf16>, tensor<42x100x1x1xsi8, {order = #NHWC}> + } + + // CHECK: func.func @init([[BLOB:%.+]]: tensor<4320xi8>) + // CHECK-SAME: -> (tensor<2x2x5x3xf16>, tensor<42x100x1x1xsi8, {order = #NHWC}>) + // CHECK: [[SLICE_OV1:%.+]] = IE.Slice [[BLOB]] [0] [120] + // CHECK: [[RESTORED_OV1:%.+]] = Core.ReinterpretCast([[SLICE_OV1]]) {{.*}} -> tensor<2x2x5x3xf16> + // CHECK: [[SLICE_OV2:%.+]] = IE.Slice [[BLOB]] [120] [4200] + // CHECK: [[RESTORED_OV2:%.+]] = Core.ReinterpretCast([[SLICE_OV2]]) + // CHECK-SAME: -> tensor<42x100x1x1xui8, {order = #NHWC}> + + // CHECK: [[RES1:%.+]] = IE.Add([[RESTORED_OV1]], {{%.+}}) + // CHECK: [[RES2:%.+]] = IE.Convert([[RESTORED_OV2]]) + // CHECK: return [[RES1]], [[RES2]] +} diff --git a/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline.mlir b/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline.mlir index 747d16c6f4..f675c743f7 100644 --- a/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline.mlir @@ -8,13 +8,14 @@ // RUN: vpux-opt --split-input-file --init-compiler=vpu-arch=%arch% --introduce-init-function="ws-extraction-mode=gen-init memory-limit=0 init-part=0" --concat-init-results="ws-extraction-mode=gen-init memory-limit=0 init-part=0" %s | FileCheck --check-prefix=CHECK-INIT-PART0 %s // RUN: vpux-opt --split-input-file --init-compiler=vpu-arch=%arch% --introduce-init-function="ws-extraction-mode=gen-init memory-limit=0 init-part=1" --concat-init-results="ws-extraction-mode=gen-init memory-limit=0 init-part=1" %s | FileCheck --check-prefix=CHECK-INIT-PART1 %s // RUN: vpux-opt 
--split-input-file --init-compiler=vpu-arch=%arch% --introduce-init-function="ws-extraction-mode=gen-main memory-limit=0" --concat-init-results="ws-extraction-mode=gen-main memory-limit=0" %s | FileCheck --check-prefix=CHECK-MAIN-PARTS %s +// RUN: vpux-opt --split-input-file --init-compiler=vpu-arch=%arch% --introduce-init-function="ws-extraction-mode=gen-all" --concat-init-results="ws-extraction-mode=gen-all" %s | FileCheck --check-prefix=CHECK-GEN-ALL %s // REQUIRES: arch-NPU37XX || arch-NPU40XX {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDDEE", - ov_2: "0x10000000AABBCCDDAABBCCDD" + vpux_ow_1: "0x10000000AABBCCDDEE", + vpux_ow_2: "0x10000000AABBCCDDAABBCCDD" } } #-} @@ -24,6 +25,7 @@ // CHECK-INIT-PART0: module @TwoConstants // CHECK-INIT-PART1: module @TwoConstants // CHECK-MAIN-PARTS: module @TwoConstants +// CHECK-GEN-ALL: module @TwoConstants module @TwoConstants { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -32,46 +34,46 @@ module @TwoConstants { } // CHECK-INIT-FULL: net.NetworkInfo entryPoint : @init inputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_1" : tensor<1x1x5x1xui8> - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_2" : tensor<2x1x1x2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_1" : tensor<1x1x5x1xui8> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_2" : tensor<2x1x1x2xf16> // CHECK-INIT-FULL-NEXT: } outputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "out_ov_0_hash_10575773572454930408_concat" : tensor<25xi8> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_tw_0_hash_10575773572454930408_concat" : tensor<25xi8> // CHECK-INIT-FULL-NEXT: } // CHECK-MAIN-FULL: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-FULL-NEXT: DataInfo "out_ov_0_hash_10575773572454930408_concat" : tensor<25xi8> + // CHECK-MAIN-FULL-NEXT: DataInfo "vpux_tw_0_hash_10575773572454930408_concat" : tensor<25xi8> // CHECK-MAIN-FULL-NEXT: } 
outputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-FULL-NEXT: } // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_1" : tensor<1x1x5x1xui8> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_1" : tensor<1x1x5x1xui8> // CHECK-INIT-PART0-NEXT: } outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_1_hash_16529380580407486960" : tensor<1x1x5x1xui8> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_1_hash_16529380580407486960" : tensor<1x1x5x1xui8> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_2" : tensor<2x1x1x2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_2" : tensor<2x1x1x2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_1_hash_11405229062126076964_concat" : tensor<20xi8> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_1_hash_11405229062126076964_concat" : tensor<20xi8> // CHECK-INIT-PART1-NEXT: } // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_1_hash_16529380580407486960" : tensor<1x1x5x1xui8> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_1_hash_11405229062126076964_concat" : tensor<20xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_1_hash_16529380580407486960" : tensor<1x1x5x1xui8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_1_hash_11405229062126076964_concat" : tensor<20xi8> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1 = const.Declare tensor<1x1x5x1xui8> = dense_resource : tensor<1x1x5x1xui8>, [#const.Add<1.0>] + %ov1 = const.Declare tensor<1x1x5x1xui8> = dense_resource : tensor<1x1x5x1xui8>, [#const.Add<1.0>] - 
%ov2_0 = const.Declare tensor<2x1x1x2xf16> = dense_resource : tensor<2x1x1x2xf16>, + %ov2_0 = const.Declare tensor<2x1x1x2xf16> = dense_resource : tensor<2x1x1x2xf16>, [#const.Add<2.0>] - %ov2_1 = const.Declare tensor<2x1x1x3xf16> = dense_resource : tensor<2x1x1x2xf16>, + %ov2_1 = const.Declare tensor<2x1x1x3xf16> = dense_resource : tensor<2x1x1x2xf16>, [#const.Add<2.0>, #const.Rescale<0.5>, #const.PadWithZero<[0, 0, 0, 0], [0, 0, 0, 1]>] return %arg : tensor<4x16xf16> @@ -111,6 +113,14 @@ module @TwoConstants { // CHECK-MAIN-PARTS: [[SLICE11:%.+]] = VPU.Slice [[BLOB1]] [8] [12] // CHECK-MAIN-PARTS: [[CAST11:%.+]] = Core.ReinterpretCast([[SLICE11]]) {{.*}} -> tensor<2x1x1x3xf16> // CHECK-MAIN-PARTS: return [[IN]] + + // CHECK-GEN-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> tensor<4x16xf16> + // CHECK-GEN-ALL: [[OV1:%.+]] = const.Declare tensor<1x1x5x1xui8> = dense_resource + // CHECK-GEN-ALL: [[OV2:%.+]] = const.Declare tensor<2x1x1x2xf16> = dense_resource + // CHECK-GEN-ALL: [[CALL_INIT:%.+]] = call @init([[OV1]], [[OV2]]) + // CHECK-GEN-ALL-SAME: -> tensor<25xi8> + // CHECK-GEN-ALL: [[CALL_MAIN:%.+]] = call @main([[IN]], [[CALL_INIT]]) + // CHECK-GEN-ALL: return [[CALL_MAIN]] } // ----- @@ -121,10 +131,10 @@ module @TwoConstants { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", + vpux_ow_1: "0x10000000AABBCCDD", // Note: required to successfully compile "init-part=1" - ov_dummy: "0x10000000AABBCCDD" + vpux_ow_dummy: "0x10000000AABBCCDD" } } #-} @@ -139,6 +149,7 @@ module @TwoConstants { // CHECK-INIT-PART0: module @QuantizedType // CHECK-INIT-PART1: module @QuantizedType // CHECK-MAIN-PARTS: module @QuantizedType +// CHECK-GEN-ALL: module @QuantizedType module @QuantizedType { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -147,46 +158,46 @@ module @QuantizedType { } // CHECK-INIT-FULL: net.NetworkInfo entryPoint : @init inputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_dummy" 
: tensor<2xf16> - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_1" : tensor<2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_dummy" : tensor<2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-FULL-NEXT: } outputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "out_ov_0_hash_2864067019402973834_concat" : tensor<11xi8> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_tw_0_hash_2864067019402973834_concat" : tensor<11xi8> // CHECK-INIT-FULL-NEXT: } // CHECK-MAIN-FULL: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-FULL-NEXT: DataInfo "out_ov_0_hash_2864067019402973834_concat" : tensor<11xi8> + // CHECK-MAIN-FULL-NEXT: DataInfo "vpux_tw_0_hash_2864067019402973834_concat" : tensor<11xi8> // CHECK-MAIN-FULL-NEXT: } outputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-FULL-NEXT: } // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_dummy" : tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_dummy" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_dummy_hash_16529380580407486960" : tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_dummy_hash_16529380580407486960" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_1" : tensor<2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_1_hash_8290054905247884848_concat" : tensor<7xi8> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_1_hash_8290054905247884848_concat" : tensor<7xi8> // CHECK-INIT-PART1-NEXT: } // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : 
tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_dummy_hash_16529380580407486960" : tensor<2xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_1_hash_8290054905247884848_concat" : tensor<7xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_dummy_hash_16529380580407486960" : tensor<2xf16> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_1_hash_8290054905247884848_concat" : tensor<7xi8> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1_0 = const.Declare tensor<5x!qElemType> = dense_resource : tensor<2xf16>, + %ov1_0 = const.Declare tensor<5x!qElemType> = dense_resource : tensor<2xf16>, [#const.Add<1.0>, #const.CastElemType, #const.PadWithZero<[0], [3]>] - %ov1_1 = const.Declare tensor<2x!qElemType> = dense_resource : tensor<2xf16>, + %ov1_1 = const.Declare tensor<2x!qElemType> = dense_resource : tensor<2xf16>, [#const.Add<1.0>, #const.CastElemType] - %dummy = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %dummy = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] return %arg : tensor<4x16xf16> } @@ -226,6 +237,14 @@ module @QuantizedType { // CHECK-MAIN-PARTS: [[BOUNDARY_CAST0:%.+]] = VPU.QuantizeCast([[CAST_OV1_1]]) {dstElemType = [[QTYPE]]} // CHECK-MAIN-PARTS: [[BOUNDARY_CAST1:%.+]] = VPU.QuantizeCast([[CAST_OV1_0]]) {dstElemType = [[QTYPE]]} // CHECK-MAIN-PARTS: return [[IN]] + + // CHECK-GEN-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> tensor<4x16xf16> + // CHECK-GEN-ALL: [[OV1:%.+]] = const.Declare tensor<2xf16> = dense_resource + // CHECK-GEN-ALL: [[OVDUMMY:%.+]] = const.Declare tensor<2xf16> = dense_resource + // CHECK-GEN-ALL: [[CALL_INIT:%.+]] = call @init([[OV1]], [[OVDUMMY]]) + // CHECK-GEN-ALL-SAME: -> tensor<11xi8> + // CHECK-GEN-ALL: [[CALL_MAIN:%.+]] = call @main([[IN]], [[CALL_INIT]]) + // CHECK-GEN-ALL: 
return [[CALL_MAIN]] } @@ -234,10 +253,10 @@ module @QuantizedType { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", + vpux_ow_1: "0x10000000AABBCCDD", // Note: required to successfully compile "init-part=1" - ov_dummy: "0x10000000AABBCCDD" + vpux_ow_dummy: "0x10000000AABBCCDD" } } #-} @@ -247,6 +266,7 @@ module @QuantizedType { // CHECK-INIT-PART0: module @SimpleOutlining // CHECK-INIT-PART1: module @SimpleOutlining // CHECK-MAIN-PARTS: module @SimpleOutlining +// CHECK-GEN-ALL: module @SimpleOutlining module @SimpleOutlining { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -255,33 +275,33 @@ module @SimpleOutlining { } // CHECK-INIT-FULL: net.NetworkInfo entryPoint : @init inputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_dummy" : tensor<1x1x1x2xf16> - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_1" : tensor<1x1x1x2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_dummy" : tensor<1x1x1x2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_1" : tensor<1x1x1x2xf16> // CHECK-INIT-FULL-NEXT: } outputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "out_ov_0_hash_14095452562947179690_concat" : tensor<24xi8> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_tw_0_hash_14095452562947179690_concat" : tensor<24xi8> // CHECK-INIT-FULL-NEXT: } // CHECK-MAIN-FULL: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-FULL-NEXT: DataInfo "out_ov_0_hash_14095452562947179690_concat" : tensor<24xi8> + // CHECK-MAIN-FULL-NEXT: DataInfo "vpux_tw_0_hash_14095452562947179690_concat" : tensor<24xi8> // CHECK-MAIN-FULL-NEXT: } outputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-FULL-NEXT: } func.func private @main_part1() -> tensor<1x1x1x3xf16> { - %ov_internal = const.Declare tensor<1x1x1x3xf16> = dense_resource : tensor<1x1x1x2xf16>, + %ov_internal = const.Declare tensor<1x1x1x3xf16> = dense_resource : 
tensor<1x1x1x2xf16>, [#const.Add<42.0>, #const.PadWithZero<[0, 0, 0, 0], [0, 0, 0, 1]>] return %ov_internal : tensor<1x1x1x3xf16> } func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1_0 = const.Declare tensor<1x1x1x5xf16> = dense_resource : tensor<1x1x1x2xf16>, + %ov1_0 = const.Declare tensor<1x1x1x5xf16> = dense_resource : tensor<1x1x1x2xf16>, [#const.Add<1.0>, #const.PadWithZero<[0, 0, 0, 0], [0, 0, 0, 3]>] - %ov1_1 = const.Declare tensor<1x1x1x2xf16> = dense_resource : tensor<1x1x1x2xf16>, + %ov1_1 = const.Declare tensor<1x1x1x2xf16> = dense_resource : tensor<1x1x1x2xf16>, [#const.Add<1.0>] %ov1_2 = func.call @main_part1() : () -> tensor<1x1x1x3xf16> - %dummy = const.Declare tensor<1x1x1x2xf16> = dense_resource : tensor<1x1x1x2xf16>, [#const.Add<1.0>] + %dummy = const.Declare tensor<1x1x1x2xf16> = dense_resource : tensor<1x1x1x2xf16>, [#const.Add<1.0>] return %arg : tensor<4x16xf16> } @@ -309,6 +329,14 @@ module @SimpleOutlining { // CHECK-MAIN-FULL: [[CAST_OV1_2:%.+]] = Core.ReinterpretCast([[SLICE_OV1_2]]) {{.*}} -> tensor<1x1x1x3xf16> // CHECK-MAIN-FULL: {{%.+}} = call @main_part1([[CAST_OV1_2]]) // CHECK-MAIN-FULL: return [[IN]] + + // CHECK-GEN-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> tensor<4x16xf16> + // CHECK-GEN-ALL: [[OV1:%.+]] = const.Declare tensor<1x1x1x2xf16> = dense_resource + // CHECK-GEN-ALL: [[OVDUMMY:%.+]] = const.Declare tensor<1x1x1x2xf16> = dense_resource + // CHECK-GEN-ALL: [[CALL_INIT:%.+]] = call @init([[OV1]], [[OVDUMMY]]) + // CHECK-GEN-ALL-SAME: -> tensor<24xi8> + // CHECK-GEN-ALL: [[CALL_MAIN:%.+]] = call @main([[IN]], [[CALL_INIT]]) + // CHECK-GEN-ALL: return [[CALL_MAIN]] } // ----- @@ -316,8 +344,8 @@ module @SimpleOutlining { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD" } } #-} @@ -327,6 +355,7 @@ module @SimpleOutlining { // CHECK-INIT-PART0: module 
@SingleConstantInTheBeginning // CHECK-INIT-PART1: module @SingleConstantInTheBeginning // CHECK-MAIN-PARTS: module @SingleConstantInTheBeginning +// CHECK-GEN-ALL: module @SingleConstantInTheBeginning module @SingleConstantInTheBeginning { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -335,9 +364,9 @@ module @SingleConstantInTheBeginning { } // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_2" : tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_2" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : tensor<12xf16> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART0: func.func @init_part0([[OV_2:%.+]]: tensor<2xf16>) -> tensor<12xf16> @@ -345,17 +374,17 @@ module @SingleConstantInTheBeginning { // CHECK-INIT-PART0: return [[OUT]] // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_1" : tensor<2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_1_hash_8692743050400081167_concat" : tensor<208xi8> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_1_hash_8692743050400081167_concat" : tensor<208xi8> // CHECK-INIT-PART1-NEXT: } // CHECK-INIT-PART1: func.func @init_part1([[OV_1:%.+]]: tensor<2xf16>) -> tensor<208xi8> // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_1_hash_8692743050400081167_concat" : tensor<208xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : 
tensor<12xf16> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_1_hash_8692743050400081167_concat" : tensor<208xi8> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } @@ -363,12 +392,12 @@ module @SingleConstantInTheBeginning { // CHECK-MAIN-PARTS: func.func @main([[IN:%.+]]: tensor<4x16xf16>, [[OV_2:%.+]]: tensor<12xf16>, [[BLOB0:%.+]]: tensor<208xi8>) func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, + %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, [#const.PadWithZero<[0], [10]>] - %ov1_0 = const.Declare tensor<102xf16> = dense_resource : tensor<2xf16>, + %ov1_0 = const.Declare tensor<102xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>, #const.PadWithZero<[0], [100]>] - %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] + %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] return %arg : tensor<4x16xf16> } @@ -379,9 +408,9 @@ module @SingleConstantInTheBeginning { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD", - ov_3: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD", + vpux_ow_3: "0x10000000AABBCCDD" } } #-} @@ -391,6 +420,7 @@ module @SingleConstantInTheBeginning { // CHECK-INIT-PART0: module @SingleConstantInTheMiddle // CHECK-INIT-PART1: module @SingleConstantInTheMiddle // CHECK-MAIN-PARTS: module @SingleConstantInTheMiddle +// CHECK-GEN-ALL: module @SingleConstantInTheMiddle module @SingleConstantInTheMiddle { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -399,17 +429,17 @@ module @SingleConstantInTheMiddle { } // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_1" : 
tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART0: func.func @init_part0([[OV_1:%.+]]: tensor<2xf16>) -> tensor<8xi8> // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_2" : tensor<2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_2" : tensor<2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : tensor<12xf16> // CHECK-INIT-PART1-NEXT: } // CHECK-INIT-PART1: func.func @init_part1([[OV_2:%.+]]: tensor<2xf16>) -> tensor<12xf16> @@ -418,9 +448,9 @@ module @SingleConstantInTheMiddle { // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_2_hash_2332981286748766850_concat" : tensor<108xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : tensor<12xf16> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_2332981286748766850_concat" : tensor<108xi8> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } @@ -428,14 +458,14 @@ module @SingleConstantInTheMiddle { // CHECK-MAIN-PARTS: func.func @main([[IN:%.+]]: tensor<4x16xf16>, 
[[BLOB0:%.+]]: tensor<8xi8>, [[OV_2:%.+]]: tensor<12xf16>, [[BLOB2:%.+]]: tensor<108xi8>) func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] - %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] + %ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] - %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, + %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, [#const.PadWithZero<[0], [10]>] - %ov3_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] - %ov3_1 = const.Declare tensor<52xf16> = dense_resource : tensor<2xf16>, + %ov3_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %ov3_1 = const.Declare tensor<52xf16> = dense_resource : tensor<2xf16>, [#const.PadWithZero<[0], [50]>] return %arg : tensor<4x16xf16> @@ -447,8 +477,8 @@ module @SingleConstantInTheMiddle { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD" } } #-} @@ -458,6 +488,7 @@ module @SingleConstantInTheMiddle { // CHECK-INIT-PART0: module @SingleConstantInTheEnd // CHECK-INIT-PART1: module @SingleConstantInTheEnd // CHECK-MAIN-PARTS: module @SingleConstantInTheEnd +// CHECK-GEN-ALL: module @SingleConstantInTheEnd module @SingleConstantInTheEnd { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -466,17 +497,17 @@ module @SingleConstantInTheEnd { } // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_1" : tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } 
outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART0: func.func @init_part0([[OV_1:%.+]]: tensor<2xf16>) -> tensor<8xi8> // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_2" : tensor<2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_2" : tensor<2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : tensor<12xf16> // CHECK-INIT-PART1-NEXT: } // CHECK-INIT-PART1: func.func @init_part1([[OV_2:%.+]]: tensor<2xf16>) -> tensor<12xf16> @@ -485,8 +516,8 @@ module @SingleConstantInTheEnd { // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_2_hash_8938469330746701159" : tensor<12xf16> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_8938469330746701159" : tensor<12xf16> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } @@ -494,10 +525,10 @@ module @SingleConstantInTheEnd { // CHECK-MAIN-PARTS: func.func @main([[IN:%.+]]: tensor<4x16xf16>, [[BLOB0:%.+]]: tensor<8xi8>, [[OV_2:%.+]]: tensor<12xf16>) func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] - %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] + 
%ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] - %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, + %ov2_single = const.Declare tensor<12xf16> = dense_resource : tensor<2xf16>, [#const.PadWithZero<[0], [10]>] return %arg : tensor<4x16xf16> @@ -509,9 +540,9 @@ module @SingleConstantInTheEnd { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD", - ov_3: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD", + vpux_ow_3: "0x10000000AABBCCDD" } } #-} @@ -525,6 +556,7 @@ module @SingleConstantInTheEnd { // CHECK-INIT-PART0: module @SingleConstantWithLayout // CHECK-INIT-PART1: module @SingleConstantWithLayout // CHECK-MAIN-PARTS: module @SingleConstantWithLayout +// CHECK-GEN-ALL: module @SingleConstantWithLayout module @SingleConstantWithLayout { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input1" : tensor<4x16xf16> @@ -533,11 +565,11 @@ module @SingleConstantWithLayout { } // CHECK-INIT-FULL: net.NetworkInfo entryPoint : @init inputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_1" : tensor<2xf16> - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_2" : tensor<1x1x1x2xf16> - // CHECK-INIT-FULL-NEXT: DataInfo "in_ov_3" : tensor<2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_2" : tensor<1x1x1x2xf16> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_ow_3" : tensor<2xf16> // CHECK-INIT-FULL-NEXT: } outputsInfo : { - // CHECK-INIT-FULL-NEXT: DataInfo "out_ov_0_hash_14224487553552936504_concat" : tensor<140xi8> + // CHECK-INIT-FULL-NEXT: DataInfo "vpux_tw_0_hash_14224487553552936504_concat" : tensor<140xi8> // CHECK-INIT-FULL-NEXT: } // CHECK-INIT-FULL: func.func @init({{%.+}}: tensor<2xf16>, {{%.+}}: tensor<1x1x1x2xf16>, {{%.+}}: tensor<2xf16>) @@ -545,7 +577,7 @@ 
module @SingleConstantWithLayout { // CHECK-MAIN-FULL: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-FULL-NEXT: DataInfo "out_ov_0_hash_14224487553552936504_concat" : tensor<140xi8> + // CHECK-MAIN-FULL-NEXT: DataInfo "vpux_tw_0_hash_14224487553552936504_concat" : tensor<140xi8> // CHECK-MAIN-FULL-NEXT: } outputsInfo : { // CHECK-MAIN-FULL-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-FULL-NEXT: } @@ -554,17 +586,17 @@ module @SingleConstantWithLayout { // CHECK-INIT-PART0: net.NetworkInfo entryPoint : @init_part0 inputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "in_ov_1" : tensor<2xf16> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_ow_1" : tensor<2xf16> // CHECK-INIT-PART0-NEXT: } outputsInfo : { - // CHECK-INIT-PART0-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-INIT-PART0-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> // CHECK-INIT-PART0-NEXT: } // CHECK-INIT-PART0: func.func @init_part0([[OV_1:%.+]]: tensor<2xf16>) -> tensor<8xi8> // CHECK-INIT-PART1: net.NetworkInfo entryPoint : @init_part1 inputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "in_ov_2" : tensor<1x1x1x2xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_ow_2" : tensor<1x1x1x2xf16> // CHECK-INIT-PART1-NEXT: } outputsInfo : { - // CHECK-INIT-PART1-NEXT: DataInfo "out_ov_2_hash_5073444534634115717" : tensor<1x1x1x12xf16> + // CHECK-INIT-PART1-NEXT: DataInfo "vpux_tw_2_hash_5073444534634115717" : tensor<1x1x1x12xf16> // CHECK-INIT-PART1-NEXT: } // CHECK-INIT-PART1: func.func @init_part1([[OV_2:%.+]]: tensor<1x1x1x2xf16>) @@ -575,9 +607,9 @@ module @SingleConstantWithLayout { // CHECK-MAIN-PARTS: net.NetworkInfo entryPoint : @main inputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "input1" : tensor<4x16xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_0_hash_13109616749475806820_concat" : tensor<8xi8> - // CHECK-MAIN-PARTS-NEXT: DataInfo 
"out_ov_2_hash_5073444534634115717" : tensor<1x1x1x12xf16> - // CHECK-MAIN-PARTS-NEXT: DataInfo "out_ov_2_hash_2332981286748766850_concat" : tensor<108xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_0_hash_13109616749475806820_concat" : tensor<8xi8> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_5073444534634115717" : tensor<1x1x1x12xf16> + // CHECK-MAIN-PARTS-NEXT: DataInfo "vpux_tw_2_hash_2332981286748766850_concat" : tensor<108xi8> // CHECK-MAIN-PARTS-NEXT: } outputsInfo : { // CHECK-MAIN-PARTS-NEXT: DataInfo "output1" : tensor<4x16xf16> // CHECK-MAIN-PARTS-NEXT: } @@ -585,14 +617,14 @@ module @SingleConstantWithLayout { // CHECK-MAIN-PARTS: func.func @main([[IN:%.+]]: tensor<4x16xf16>, [[BLOB0:%.+]]: tensor<8xi8>, [[OV_2:%.+]]: tensor<1x1x1x12xf16, {order = [[NHWC]]}>, [[BLOB2:%.+]]: tensor<108xi8>) func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] - %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] + %ov1_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %ov1_1 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<2.0>] - %ov2_single_reordered = const.Declare tensor<1x1x1x12xf16, {order = #NHWC}> = dense_resource + %ov2_single_reordered = const.Declare tensor<1x1x1x12xf16, {order = #NHWC}> = dense_resource : tensor<1x1x1x2xf16>, [#const.PadWithZero<[0, 0, 0, 0], [0, 0, 0, 10]>, #const.Reorder<#NHWC>] - %ov3_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] - %ov3_1 = const.Declare tensor<52xf16> = dense_resource : tensor<2xf16>, + %ov3_0 = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, [#const.Add<1.0>] + %ov3_1 = const.Declare tensor<52xf16> = dense_resource : tensor<2xf16>, [#const.PadWithZero<[0], [50]>] return %arg : tensor<4x16xf16> diff --git 
a/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline_empty_init.mlir b/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline_empty_init.mlir index 2c9756db15..01a3609142 100644 --- a/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline_empty_init.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/concat_init_results_pipeline_empty_init.mlir @@ -4,6 +4,7 @@ // // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --introduce-init-function="ws-extraction-mode=gen-main" --concat-init-results="ws-extraction-mode=gen-main" %s | FileCheck --check-prefix=CHECK-MAIN %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --introduce-init-function="ws-extraction-mode=gen-all" --concat-init-results="ws-extraction-mode=gen-all" %s | FileCheck --check-prefix=CHECK-ALL %s // REQUIRES: arch-NPU37XX || arch-NPU40XX {-# @@ -29,4 +30,12 @@ module @NoConstants { // CHECK-MAIN: func.func @main({{%.+}}: tensor<4x16xf16>) -> tensor<4x16xf16> // CHECK-MAIN-NEXT: {{%.+}} = {{.*}} dense_resource {{.*}} [#const.Add<1.000000e+00 : f64>] + + + // CHECK-ALL: func.func private @main({{%.+}}: tensor<4x16xf16>) -> tensor<4x16xf16> + // CHECK-ALL-NEXT: {{%.+}} = {{.*}} dense_resource {{.*}} [#const.Add<1.000000e+00 : f64>] + + // CHECK-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> tensor<4x16xf16> + // CHECK-ALL-NEXT: [[OUT:%.+]] = call @main([[IN]]) + // CHECK-ALL-NEXT: return [[OUT]] } diff --git a/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels.mlir b/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels.mlir new file mode 100644 index 0000000000..af4bfbbfa5 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels.mlir @@ -0,0 +1,103 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --convert-dynamic-to-static-kernels %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0)[s0] -> (-d0 + s0, 100)> +// CHECK-LABEL: @StaticEltwiseNHWC +module @StaticEltwiseNHWC { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x720x?xf16> + DataInfo "input2" : tensor<1x16x720x?xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x720x?xf16> + } + func.func private @main_func0(%arg0: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> { + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + return %0 : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + } + func.func @main(%arg0: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> { + %c100 = arith.constant 100 : index + %c0 = 
arith.constant 0 : index + %c3 = arith.constant 3 : index + %dim = tensor.dim %arg0, %c3 : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %0 = tensor.empty(%dim) : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %dim_0 = tensor.dim %arg0, %c3 : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %1 = scf.for %arg2 = %c0 to %dim_0 step %c100 iter_args(%arg3 = %0) -> (tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) { + %2 = affine.min #map(%arg2)[%dim_0] + %extracted_slice = tensor.extract_slice %arg0[0, 0, 0, %arg2] [1, 16, 720, %2] [1, 1, 1, 1] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %extracted_slice_1 = tensor.extract_slice %arg1[0, 0, 0, %arg2] [1, 16, 720, %2] [1, 1, 1, 1] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %3 = func.call @main_func0(%extracted_slice, %extracted_slice_1) : (tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>, tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %inserted_slice = tensor.insert_slice %3 into %arg3[0, 0, 0, %arg2] [1, 16, 720, %2] [1, 1, 1, 1] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> into 
tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + scf.yield %inserted_slice : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + return %1 : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + + // CHECK: func.func @main_func0_static([[ARGS0:%.*]]: tensor<1x16x720x100xf16, {order = #NHWC}>, [[ARGS1:%.*]]: tensor<1x16x720x100xf16, {order = #NHWC}>) -> tensor<1x16x720x100xf16, {order = #NHWC}> { + // CHECK: [[ADDRESULT:%.*]] = VPU.NCE.Eltwise([[ARGS0]], [[ARGS1]]) + // CHECK: return [[ADDRESULT]] : tensor<1x16x720x100xf16, {order = #NHWC}> + // CHECK: } + // CHECK: func.func @main([[ARG0:%.*]]: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, [[ARG1:%.*]]: tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> { + + // CHECK: [[MIN:%.*]] = affine.min #map(%arg2)[%dim_0] + // CHECK: [[COND:%.*]] = arith.cmpi ne, [[MIN]], %c100 : index + // CHECK: [[INDEX:%.*]] = scf.if [[COND]] -> (index) { + // CHECK: [[ELEMENTS:%.*]] = arith.subi %c100, [[MIN]] : index + // CHECK: [[CHECK:%.*]] = arith.cmpi slt, %arg2, [[ELEMENTS]] : index + // CHECK: cf.assert [[CHECK]], "Not enough elements to backtrack in scf.for loop" + // CHECK: [[FINAL_INDEX:%.*]] = arith.subi %arg2, [[ELEMENTS]] : index + // CHECK: scf.yield [[FINAL_INDEX]] : index + // CHECK: } else { + // CHECK: scf.yield %arg2 : index + // CHECK: } + // CHECK: [[SLICE0:%.*]] = tensor.extract_slice [[ARG0]][0, 0, 0, [[INDEX]]] [1, 16, 720, %c100] [1, 1, 1, 1] : + // CHECK: [[IN0:%.*]] = tensor.cast [[SLICE0]] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : {{.*}}}> 
to tensor<1x16x720x100xf16, {order = #NHWC}> + // CHECK: [[SLICE1:%.*]] = tensor.extract_slice [[ARG1]][0, 0, 0, [[INDEX]]] [1, 16, 720, %c100] [1, 1, 1, 1] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : {{.*}}}> to tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : {{.*}}}> + // CHECK: [[IN1:%.*]] = tensor.cast [[SLICE1]] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : {{.*}}}> to tensor<1x16x720x100xf16, {order = #NHWC}> + // CHECK: [[CALL_OUTPUT:%.*]] = func.call @main_func0_static([[IN0]], [[IN1]]) : (tensor<1x16x720x100xf16, {order = #NHWC}>, tensor<1x16x720x100xf16, {order = #NHWC}>) -> tensor<1x16x720x100xf16, {order = #NHWC}> + // CHECK: [[OUT:%.*]] = tensor.cast [[CALL_OUTPUT]] : tensor<1x16x720x100xf16, {order = #NHWC}> to tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : {{.*}}}> + // CHECK: [[INSERT_SLICE:%.*]] = tensor.insert_slice [[OUT]] into %arg3[0, 0, 0, [[INDEX]]] [1, 16, 720, %c100] [1, 1, 1, 1] : tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : {{.*}}}> into tensor<1x16x720x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : {{.*}}}> + // CHECK: scf.yield [[INSERT_SLICE]] +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @CopyInputOutput +module @CopyInputOutput { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x720x1000x16xf16> + DataInfo "input2" : tensor<1x720x1000x16xf16> } outputsInfo : { + DataInfo "output" : tensor<1x720x1000x16xf16> + } + func.func private @main_func0(%arg0: memref<1x90x1000x16xf16>, %arg1: memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> { + %0 = VPUIP.Copy inputs(%arg0 : memref<1x90x1000x16xf16>) outputs(%arg1 : memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + return %0 : memref<1x90x1000x16xf16> + } + 
func.func @main(%arg0: memref<1x720x1000x16xf16>, %arg1: memref<1x720x1000x16xf16>) -> memref<1x720x1000x16xf16> { + %c90 = arith.constant 90 : index + %c720 = arith.constant 720 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x720x1000x16xf16> + %0 = scf.for %arg2 = %c0 to %c720 step %c90 iter_args(%arg3 = %alloc) -> (memref<1x720x1000x16xf16>) { + %subview = memref.subview %arg0[0, %arg2, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %subview_0 = memref.subview %arg1[0, %arg2, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + %1 = builtin.unrealized_conversion_cast %subview : memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %2 = builtin.unrealized_conversion_cast %subview_0 : memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + %3 = func.call @main_func0(%1, %2) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + %subview_1 = memref.subview %arg3[0, %arg2, 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x720x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + memref.copy %3, %subview_1 : memref<1x90x1000x16xf16> to memref<1x90x1000x16xf16, strided<[11520000, 16000, 16, 1], offset: ?>> + scf.yield %arg3 : memref<1x720x1000x16xf16> + } + return %0 : memref<1x720x1000x16xf16> + } + + // CHECK-NOT: func.func @main_func0_static + // CHECK-NOT: scf.if +} diff --git a/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels_invalid.mlir b/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels_invalid.mlir new file mode 100644 index 0000000000..f3a9a3850a --- /dev/null +++ 
b/tests/lit/NPU/dialect/VPU/passes/convert_dynamic_to_static_kernels_invalid.mlir @@ -0,0 +1,42 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --convert-dynamic-to-static-kernels --verify-diagnostics %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0)[s0] -> (-d0 + s0, 100)> +module @StaticEltwiseNHWCInvalid { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x?xf16> + DataInfo "input2" : tensor<1x16x?x?xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x?xf16> + } + func.func private @main_func0(%arg0: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> { + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + return %0 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + } + func.func @main(%arg0: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 
1000]> : tensor<4xsi64>, order = #NHWC}> { + %c100 = arith.constant 100 : index + %c0 = arith.constant 0 : index + %c3 = arith.constant 3 : index + %c2 = arith.constant 2 : index + %dim = tensor.dim %arg0, %c3 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %dim_1 = tensor.dim %arg0, %c2 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %0 = tensor.empty(%dim, %dim_1) : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %dim_0 = tensor.dim %arg0, %c3 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %1 = scf.for %arg2 = %c0 to %dim_0 step %c100 iter_args(%arg3 = %0) -> (tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) { + %2 = affine.min #map(%arg2)[%dim_0] + // expected-error@+1 {{Expected a ranked tensor type with exactly one dynamic dimension}} + %extracted_slice = tensor.extract_slice %arg0[0, 0, %arg2, %arg2] [1, 16, %2, %2] [1, 1, 1, 1] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %extracted_slice_1 = tensor.extract_slice %arg1[0, 0, %arg2, %arg2] [1, 16, %2, %2] [1, 1, 1, 1] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %3 = func.call @main_func0(%extracted_slice, %extracted_slice_1) : (tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}>, tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = 
#NHWC}>) -> tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> + %inserted_slice = tensor.insert_slice %3 into %arg3[0, 0, %arg2, %arg2] [1, 16, %2, %2] [1, 1, 1, 1] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 100]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + scf.yield %inserted_slice : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + return %1 : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_37XX.mlir b/tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_37XX.mlir similarity index 100% rename from tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_37XX.mlir rename to tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_37XX.mlir diff --git a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_40XX.mlir similarity index 100% rename from tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_40XX+.mlir rename to tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_40XX.mlir diff --git a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_disabled_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_disabled_40XX.mlir similarity index 100% rename from tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_disabled_40XX.mlir rename to tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_disabled_40XX.mlir diff --git a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_static_40XX+.mlir 
b/tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_static_40XX.mlir similarity index 99% rename from tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_static_40XX+.mlir rename to tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_static_40XX.mlir index 2133e2c054..11881f04ac 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_reserve_mem_static_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/dma_task_profiling_reserve_mem_static_40XX.mlir @@ -19,4 +19,5 @@ module @SimpleGraph { // CHECK: ReservedMemory // CHECK-NEXT: DmaProfilingReservedMemory // CHECK-NEXT: IE.MemoryResource 512 bytes of @CMX_NN + } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid.mlir index 182207e926..b8bb0000e6 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid.mlir @@ -7,7 +7,7 @@ // REQUIRES: arch-NPU37XX || arch-NPU40XX // expected-error@+1 {{Architecture is already defined, probably you run '--init-compiler' twice}} -module @test attributes {VPU.arch = #VPU.arch_kind} { +module @test attributes {config.arch = #config.arch_kind} { } // ----- @@ -20,5 +20,5 @@ module @error { // ----- // expected-error@+1 {{RevisionID is already defined, probably you run '--init-compiler' twice}} -module @revtest attributes {VPU.revisionID = #VPU.revision_id} { +module @revtest attributes {config.revisionID = #config.revision_id} { } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid_custom.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid_custom.mlir index e1ef1e98b0..f3d0d4197e 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid_custom.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_compiler/init_compiler_invalid_custom.mlir @@ -13,7 +13,7 @@ 
module @mode attributes {config.compilationMode = #config.compilation_mode} { +module @arch attributes {config.arch = #config.arch_kind} { } // ----- diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler_37XX.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler_37XX.mlir index 6991df098c..0cbc575c61 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler_37XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_compiler_37XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU37XX -// CHECK: module @test attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @test { // CHECK-DAG: {{ }}config.PipelineOptions @Options { @@ -25,7 +25,7 @@ module @test { // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @SHAVE_NN // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX.mlir index b846a11aa2..18d6770701 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX.mlir +++ 
b/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU40XX -// CHECK: module @test attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @test { // CHECK-DAG: {{ }}config.PipelineOptions @Options { @@ -24,7 +24,7 @@ module @test { // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX_wlm.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX_wlm.mlir index facd3ce2af..c2a9d6d3c4 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX_wlm.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_compiler_40XX_wlm.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW workload-management-enable=true" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU40XX -// CHECK: module @test attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// 
CHECK: module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @test { // CHECK-DAG: {{ }}config.PipelineOptions @Options { @@ -17,6 +17,7 @@ module @test { // CHECK-DAG: {{ }}config.Option @VPU.AutoPaddingIDU : false // CHECK-DAG: {{ }}config.Option @VPU.MaxKernelSize : 11 // CHECK-DAG: {{ }}config.Option @VPU.FragmentationAvoidRatioPipeliningLargeWeights : 4.500000e-01 : f32 +// CHECK-DAG: {{ }}config.Option @VPU.WorkloadManagementStatus : "ENABLED" // CHECK-DAG: {{ }}} // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @DMA_NN @@ -25,7 +26,7 @@ module @test { // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_compiler_hwrevision_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/init_compiler_hwrevision_40XX.mlir index 15ba08a2e7..78d81e8480 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_compiler_hwrevision_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_compiler_hwrevision_40XX.mlir @@ -6,6 +6,6 @@ // RUN: vpux-opt --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW revision-id=3" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU40XX -// CHECK: module @test attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, 
config.compilationMode = #config.compilation_mode} +// CHECK: module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @test { } diff --git a/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_37XX.mlir b/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_37XX.mlir index 1edfe3ba9e..b5791c557f 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_37XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_37XX.mlir @@ -6,19 +6,19 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --init-resources="vpu-arch=%arch% compilation-mode=DefaultHW allow-custom-values=true" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU37XX -// CHECK: module @mode attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @mode attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @mode attributes {config.compilationMode = #config.compilation_mode} { } // ----- -// CHECK: module @arch attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} -module @arch attributes {VPU.arch = #VPU.arch_kind} { +// CHECK: module @arch attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} +module @arch attributes {config.arch = #config.arch_kind} { } // ----- -// CHECK: module @executors attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @executors attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @executors { IE.ExecutorResource 5 of @DMA_NN IE.TileResource 
5 of @NCE at 6.000000e+02 MHz @@ -30,17 +30,17 @@ module @executors { // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @SHAVE_NN // CHECK-DAG: {{ }}IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} // ----- -// CHECK: module @memory attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @memory attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @memory { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 5 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 10000 bytes of @CMX_NN {VPU.bandwidth = 10 : i64, VPU.derateFactor = 2.0 : f64} + IE.MemoryResource 10000 bytes of @CMX_NN {config.bandwidth = 10 : i64, config.derateFactor = 2.0 : f64} } IE.MemoryResource 500000 bytes of @DDR } @@ -51,5 +51,5 @@ module @memory { // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @SHAVE_NN // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.MemoryResource 5 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 10000 bytes of @CMX_NN {VPU.bandwidth = 10 : i64, VPU.derateFactor = 2.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 500000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 10000 bytes of 
@CMX_NN {config.bandwidth = 10 : i64, config.derateFactor = 2.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 500000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} diff --git a/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_40XX.mlir index a4debb56b9..821645dfb3 100644 --- a/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/init_resources_custom_40XX.mlir @@ -6,19 +6,19 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --init-resources="vpu-arch=%arch% compilation-mode=DefaultHW allow-custom-values=true" %s | FileCheck %s --strict-whitespace // REQUIRES: arch-NPU40XX -// CHECK: module @mode attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @mode attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @mode attributes {config.compilationMode = #config.compilation_mode} { } // ----- -// CHECK: module @arch attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} -module @arch attributes {VPU.arch = #VPU.arch_kind} { +// CHECK: module @arch attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} +module @arch attributes {config.arch = #config.arch_kind} { } // ----- -// CHECK: module @executors attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @executors attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @executors { IE.ExecutorResource 5 of @DMA_NN IE.TileResource 5 of @NCE at 
6.000000e+02 MHz @@ -30,16 +30,16 @@ module @executors { // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} // ----- -// CHECK: module @memory attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} +// CHECK: module @memory attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} module @memory { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 5 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 10000 bytes of @CMX_NN {VPU.bandwidth = 10 : i64, VPU.derateFactor = 2.0 : f64} + IE.MemoryResource 10000 bytes of @CMX_NN {config.bandwidth = 10 : i64, config.derateFactor = 2.0 : f64} } IE.MemoryResource 500000 bytes of @DDR } @@ -50,5 +50,5 @@ module @memory { // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.MemoryResource 5 bytes of @CMX_NN_FragmentationAware -// CHECK-DAG: {{ }}IE.MemoryResource 10000 bytes of @CMX_NN {VPU.bandwidth = 10 : i64, VPU.derateFactor = 2.000000e+00 : f64} -// CHECK-DAG: {{ }}IE.MemoryResource 500000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 10000 bytes of @CMX_NN 
{config.bandwidth = 10 : i64, config.derateFactor = 2.000000e+00 : f64} +// CHECK-DAG: {{ }}IE.MemoryResource 500000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_all_modes.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_all_modes.mlir index 6343e02d32..1f195793e0 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_all_modes.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_all_modes.mlir @@ -15,7 +15,7 @@ {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" + vpux_ow_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" } } #-} @@ -29,7 +29,7 @@ module @TestAllOptions { } func.func @main(%input: tensor<4x16xf16>) -> (tensor<2x2xf32>, tensor<4x16xf32>) { - %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, + %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.SubView<[2, 2], [2, 2]>] %out = IE.Convert(%input) {dstElemType = f32} : tensor<4x16xf16> -> tensor<4x16xf32> return %cst, %out : tensor<2x2xf32>, tensor<4x16xf32> @@ -56,7 +56,7 @@ module @TestAllOptions { // CHECK-ALL-NEXT: return [[SLICE]], [[CVT]] // CHECK-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> (tensor<2x2xf32>, tensor<4x16xf32>) -// CHECK-ALL-NEXT: [[CST:%.+]] = const.Declare tensor<4x4xf32> = dense_resource +// CHECK-ALL-NEXT: [[CST:%.+]] = const.Declare tensor<4x4xf32> = dense_resource // CHECK-ALL-NEXT: [[INIT_CST:%.+]] = call @init([[CST]]) // CHECK-ALL-NEXT: [[MAIN_RES:%.+]]:2 = call @main([[IN]], [[INIT_CST]]) // CHECK-ALL-NEXT: return [[MAIN_RES]]#0, [[MAIN_RES]]#1 @@ -65,9 +65,9 @@ module @TestAllOptions { // 
CHECK-INIT-LABEL: @TestAllOptions // CHECK-INIT: net.NetworkInfo entryPoint : @init // CHECK-INIT: inputsInfo : { -// CHECK-INIT-NEXT: DataInfo "in_ov_1" : tensor<4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_ow_1" : tensor<4x4xf32> // CHECK-INIT: outputsInfo : { -// CHECK-INIT-NEXT: DataInfo "out_ov_1_hash_11258667776708180655" : tensor<4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_1_hash_11258667776708180655" : tensor<4x4xf32> // CHECK-INIT: func.func @init([[ORIG_CST:%.+]]: tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK-INIT-NEXT: [[ADDEND:%.+]] = const.Declare tensor<1xf32> @@ -82,7 +82,7 @@ module @TestAllOptions { // CHECK-MAIN: net.NetworkInfo entryPoint : @main // CHECK-MAIN: inputsInfo : { // CHECK-MAIN-NEXT: DataInfo "input1" : tensor<4x16xf16> -// CHECK-MAIN-NEXT: DataInfo "out_ov_1_hash_11258667776708180655" : tensor<4x4xf32> +// CHECK-MAIN-NEXT: DataInfo "vpux_tw_1_hash_11258667776708180655" : tensor<4x4xf32> // CHECK-MAIN: outputsInfo : { // CHECK-MAIN-NEXT: DataInfo "output1" : tensor<2x2xf32> // CHECK-MAIN-NEXT: DataInfo "output2" : tensor<4x16xf32> @@ -105,7 +105,7 @@ module @TestAllOptions { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" + vpux_ow_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" } } #-} @@ -120,12 +120,12 @@ module @OutlinedConstants { } func.func private @main_part1() -> tensor<4x4xf32> { - %cst = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<5.0 : f32>] + %cst = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<5.0 : f32>] return %cst : tensor<4x4xf32> } func.func @main(%input: tensor<4x16xf16>) -> (tensor<2x2xf32>, tensor<4x16xf16>, tensor<4x4xf32>) { - %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, + %cst = 
const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.SubView<[2, 2], [2, 2]>] // Note: called twice to catch additional bugs %out = call @main_part1() : () -> tensor<4x4xf32> @@ -151,10 +151,10 @@ module @OutlinedConstants { // CHECK-INIT-LABEL: @OutlinedConstants // CHECK-INIT: net.NetworkInfo entryPoint : @init // CHECK-INIT: inputsInfo : { -// CHECK-INIT-NEXT: DataInfo "in_ov_1" : tensor<4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_ow_1" : tensor<4x4xf32> // CHECK-INIT: outputsInfo : { -// CHECK-INIT-NEXT: DataInfo "out_ov_1_hash_11258667776708180655" : tensor<4x4xf32> -// CHECK-INIT-NEXT: DataInfo "out_ov_1_hash_4063002564071487318" : tensor<4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_1_hash_11258667776708180655" : tensor<4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_1_hash_4063002564071487318" : tensor<4x4xf32> // CHECK-INIT-NOT: func.func private @main_part1 // CHECK-INIT: func.func @init @@ -166,8 +166,8 @@ module @OutlinedConstants { // CHECK-MAIN: net.NetworkInfo entryPoint : @main // CHECK-MAIN: inputsInfo : { // CHECK-MAIN-NEXT: DataInfo "input1" : tensor<4x16xf16> -// CHECK-MAIN-NEXT: DataInfo "out_ov_1_hash_11258667776708180655" : tensor<4x4xf32> -// CHECK-MAIN-NEXT: DataInfo "out_ov_1_hash_4063002564071487318" : tensor<4x4xf32> +// CHECK-MAIN-NEXT: DataInfo "vpux_tw_1_hash_11258667776708180655" : tensor<4x4xf32> +// CHECK-MAIN-NEXT: DataInfo "vpux_tw_1_hash_4063002564071487318" : tensor<4x4xf32> // CHECK-MAIN: outputsInfo : { // CHECK-MAIN-NEXT: DataInfo "output1" : tensor<2x2xf32> // CHECK-MAIN-NEXT: DataInfo "output2" : tensor<4x16xf16> @@ -186,8 +186,8 @@ module @OutlinedConstants { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", - ov_42: 
"0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" + vpux_ow_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", + vpux_ow_42: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" } } #-} @@ -202,12 +202,12 @@ module @HashConsistency { func.func @main(%input: tensor<1x1x4x4xf16>) -> tensor<1x1x4x4xf16> { // Note: {ov1, ov42}_common_hash share the exact same "in init" // transformations -> this means their hashes would match - %ov1_common_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x4x4xf32>, + %ov1_common_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x4x4xf32>, [#const.CastElemType, #const.Rescale<3.0>, #const.SubView<[0, 0, 0, 0], [1, 1, 1, 1]>] - %ov42_common_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x2x8xf32>, + %ov42_common_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x2x8xf32>, [#const.CastElemType, #const.Rescale<3.0>, #const.SubView<[0, 0, 2, 2], [1, 1, 1, 1]>] - %ov1_unique_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x4x4xf32>, + %ov1_unique_hash = const.Declare tensor<1x1x1x1xf16> = dense_resource : tensor<1x1x4x4xf32>, [#const.CastElemType, #const.Add<42.0>, #const.SubView<[0, 0, 0, 0], [1, 1, 1, 1]>] %0 = VPU.Add(%input, %ov1_common_hash) {auto_broadcast = #IE.auto_broadcast_type} @@ -252,8 +252,8 @@ module @HashConsistency { // CHECK-ALL: func.func @wrapper_main([[INPUT:%.+]]: tensor<1x1x4x4xf16>) -> tensor<1x1x4x4xf16> -// CHECK-ALL: [[OV_1:%.+]] = const.Declare tensor<1x1x4x4xf32> = dense_resource -// CHECK-ALL: [[OV_42:%.+]] = const.Declare tensor<1x1x2x8xf32> = dense_resource +// CHECK-ALL: [[OV_1:%.+]] = const.Declare tensor<1x1x4x4xf32> = dense_resource 
+// CHECK-ALL: [[OV_42:%.+]] = const.Declare tensor<1x1x2x8xf32> = dense_resource // CHECK-ALL: [[INIT:%.+]]:3 = call @init([[OV_1]], [[OV_42]]) // CHECK-ALL: [[MAIN:%.+]] = call @main([[INPUT]], [[INIT]]#0, [[INIT]]#1, [[INIT]]#2) // CHECK-ALL: return [[MAIN]] @@ -262,12 +262,12 @@ module @HashConsistency { // CHECK-INIT-LABEL: @HashConsistency // CHECK-INIT: net.NetworkInfo entryPoint : @init // CHECK-INIT: inputsInfo : { -// CHECK-INIT-NEXT: DataInfo "in_ov_42" : tensor<1x1x2x8xf32> -// CHECK-INIT-NEXT: DataInfo "in_ov_1" : tensor<1x1x4x4xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_ow_42" : tensor<1x1x2x8xf32> +// CHECK-INIT-NEXT: DataInfo "vpux_ow_1" : tensor<1x1x4x4xf32> // CHECK-INIT: outputsInfo : { -// CHECK-INIT-NEXT: DataInfo "out_ov_42_hash_6705143075530545067" : tensor<1x1x2x8xf16> -// CHECK-INIT-NEXT: DataInfo "out_ov_1_hash_7071254137056153727" : tensor<1x1x4x4xf16> -// CHECK-INIT-NEXT: DataInfo "out_ov_1_hash_6705143075530545067" : tensor<1x1x4x4xf16> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_42_hash_6705143075530545067" : tensor<1x1x2x8xf16> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_1_hash_7071254137056153727" : tensor<1x1x4x4xf16> +// CHECK-INIT-NEXT: DataInfo "vpux_tw_1_hash_6705143075530545067" : tensor<1x1x4x4xf16> // CHECK-INIT: func.func @init([[OV_42:%.+]]: tensor<1x1x2x8xf32>, [[OV_1:%.+]]: tensor<1x1x4x4xf32>) // CHECK-INIT-SAME: -> (tensor<1x1x2x8xf16>, tensor<1x1x4x4xf16>, tensor<1x1x4x4xf16>) @@ -277,9 +277,9 @@ module @HashConsistency { // CHECK-MAIN: net.NetworkInfo entryPoint : @main // CHECK-MAIN: inputsInfo : { // CHECK-MAIN-NEXT: DataInfo "input1" : tensor<1x1x4x4xf16> -// CHECK-MAIN-NEXT: DataInfo "out_ov_1_hash_7071254137056153727" : tensor<1x1x4x4xf16> -// CHECK-MAIN-NEXT: DataInfo "out_ov_1_hash_6705143075530545067" : tensor<1x1x4x4xf16> -// CHECK-MAIN-NEXT: DataInfo "out_ov_42_hash_6705143075530545067" : tensor<1x1x2x8xf16> +// CHECK-MAIN-NEXT: DataInfo "vpux_tw_1_hash_7071254137056153727" : tensor<1x1x4x4xf16> +// CHECK-MAIN-NEXT: 
DataInfo "vpux_tw_1_hash_6705143075530545067" : tensor<1x1x4x4xf16> +// CHECK-MAIN-NEXT: DataInfo "vpux_tw_42_hash_6705143075530545067" : tensor<1x1x2x8xf16> // CHECK-MAIN: outputsInfo : { // CHECK-MAIN-NEXT: DataInfo "output1" : tensor<1x1x4x4xf16> diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_dump_statistics.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_dump_statistics.mlir new file mode 100644 index 0000000000..54c52bb4c5 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_dump_statistics.mlir @@ -0,0 +1,161 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: env OV_NPU_LOG_LEVEL=LOG_INFO vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --introduce-init-function="ws-extraction-mode=gen-init" -o /dev/null %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + + +{-# + dialect_resources: { + builtin: { + vpux_ow_11bytes: "0x000000040011223300aabbcc00aabb", + vpux_ow_10bytes: "0x00000004aabbccddee1122334455" + } + } +#-} + +module @SizePreserved { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1xf32> + } outputsInfo : { + DataInfo "output1" : tensor<1xf32> + } + + func.func @main_part0(%arg: tensor<1xf32>) -> tensor<1xf32> { + %cst_ov_11bytes = const.Declare tensor<11xui8> = dense_resource : tensor<11xui8>, + [#const.CastElemType, #const.Add<42.0>, #const.CastElemType] + return %arg : tensor<1xf32> + } + + func.func @main(%arg: tensor<1xf32>) -> tensor<1xf32> { + %cst_ov_10bytes = const.Declare tensor<5xf16> = dense_resource : tensor<5xf16>, + [#const.Add<42.0>] + + %call = func.call @main_part0(%arg) : (tensor<1xf32>) -> tensor<1xf32> + return %call : tensor<1xf32> + } + + // Note: total bytes = 10 + 11 = 21 ~ 0.02 KB (21 / 1024) + + // CHECK: Summary about constants: + // CHECK: All imported unique weights: 2 (0.02 KB) + // CHECK: Available unique weights[1]: 2 (0.02 KB which is 100.00%) + // 
CHECK: Unique weights used by schedule (from available): 2 (0.02 KB which is 100.00%) + // CHECK: OV-originated constants[2] in IR: 2 (0.02 KB) + // CHECK: Unused constants[3]: 0 (0.00 KB which is 0.00%) + // CHECK: Unsupported constants[4]: 0 (0.00 KB which is 0.00%) + // CHECK: Size percentage of *used* constants: 100.00% + // CHECK: Generated schedule's total I/O size: 0.04 KB +} + +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_2bytes: "0x00000004aabb" + } + } +#-} + +module @LargeOutput { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1xf32> + } outputsInfo : { + DataInfo "output1" : tensor<1xf32> + } + + func.func @main(%arg: tensor<1xf32>) -> tensor<1xf32> { + %cst_ov_2bytes = const.Declare tensor<1024xsi8> = dense_resource : tensor<2xsi8>, + [#const.PadWithZero<[0], [1022]>] + + return %arg: tensor<1xf32> + } + + // CHECK: Summary about constants: + // CHECK: All imported unique weights: 1 (0.00 KB) + // CHECK: Available unique weights[1]: 1 (0.00 KB which is 100.00%) + // CHECK: Unique weights used by schedule (from available): 1 (0.00 KB which is 100.00%) + // CHECK: OV-originated constants[2] in IR: 1 (1.00 KB) + // CHECK: Unused constants[3]: 0 (0.00 KB which is 0.00%) + // CHECK: Unsupported constants[4]: 0 (0.00 KB which is 0.00%) + // CHECK: Size percentage of *used* constants: 100.00% + // CHECK: Generated schedule's total I/O size: 1.00 KB +} + +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc", + vpux_ow_2: "0x0000000400112233", + vpux_ow_outlined: "0x00000004aabbccddee", + vpux_ow_splat: "0x0000000412341234", + vpux_ow_noop: "0x000000040011223300aabbcc00aabbcc00aabbcc", + + vpux_ow_unused: "0x0000000400112233" + } + } +#-} + +!qElemType1 = !quant.uniform +!qElemType2 = !quant.uniform + +module @ManyConstants { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1xf32> + } outputsInfo : { + DataInfo "output1" : 
tensor<1xf32> + } + + func.func @main_part0(%arg: tensor<1xf32>) -> tensor<1xf32> { + %cst_ov_outlined = const.Declare tensor<5xf16> = dense_resource : tensor<5xui8>, + [#const.CastElemType, #const.Add<42.0>] + + // Not suitable for weights separation below: + %cst_ov_splat = const.Declare tensor<2xf16> = dense_resource : tensor<2xf16>, + [#const.Add<1.0>] + %cst_ov_noop = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xf32> + %cst_ov1_non_supported = const.Declare tensor<1x1x3x3xf32> = dense_resource : tensor<1x1x2x2xf32>, + [#const.ExpandDilated<[2, 2]>] + + return %arg : tensor<1xf32> + } + + func.func @main(%arg: tensor<1xf32>) -> tensor<1xf32> { + %cst_ov1_0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf32>, + [#const.CastElemType] + %cst_ov1_1 = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xf32>, + [#const.Add<1.0>] + + %cst_ov2 = const.Declare tensor<2x!qElemType1> = dense_resource : tensor<2xf16>, + [#const.Rescale<0.5>, #const.CastElemType] + + + %not_ov_weight = const.Declare tensor<2x2xf16> = dense<[[4.0, 2.0], [12.0, 18.0]]> : tensor<2x2xf16>, + [#const.Add<42.0>] + + + %call = func.call @main_part0(%arg) : (tensor<1xf32>) -> tensor<1xf32> + return %call : tensor<1xf32> + } + + // CHECK: Summary about constants: + // CHECK: All imported unique weights: 6 (0.05 KB) + // CHECK: Available unique weights[1]: 5 (0.04 KB which is 91.84%) + // CHECK: Unique weights used by schedule (from available): 3 (0.02 KB which is 55.56%) + // CHECK: OV-originated constants[2] in IR: 7 (0.09 KB) + // CHECK: Unused constants[3]: 2 (0.02 KB which is 21.74%) + // CHECK: Unsupported constants[4]: 1 (0.04 KB which is 39.13%) + // CHECK: Size percentage of *used* constants: 39.13% + // CHECK: Generated schedule's total I/O size: 0.06 KB + + // CHECK: [1]: available unique weights - weights that come from original model and are used in the compiled schedule (via constant operations) + // CHECK: [2]: OV-originated constants - constant 
operations that combine OV weights with transformations (e.g. subview, reorder) + // CHECK: Note: the same unique weight could be used in multiple constants + // CHECK: [3]: unused constants - OV-originated constants that are ignored by weights separation (e.g. splats, only trivial transformations) + // CHECK: [4]: unsupported constants - OV-originated constants that have unsupported transformations +} diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_empty_init.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_empty_init.mlir index dad76fabea..5cf4324b87 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_empty_init.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_empty_init.mlir @@ -33,7 +33,6 @@ module @NoConstants { // CHECK-ALL-NEXT: return [[IN]] // CHECK-ALL: func.func @wrapper_main([[IN:%.+]]: tensor<4x16xf16>) -> tensor<4x16xf16> - // CHECK-ALL-NEXT: call @init() : () -> () // CHECK-ALL-NEXT: [[MAIN:%.+]] = call @main([[IN]]) // CHECK-ALL-NEXT: return [[MAIN]] diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_gen_all.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_gen_all.mlir index e8e9e00ec2..bfad5dd48d 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_gen_all.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_gen_all.mlir @@ -11,8 +11,8 @@ {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", - ov_2: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" + vpux_ow_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", + vpux_ow_2: 
"0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" } } #-} @@ -30,13 +30,13 @@ module @CommonSubexpressionElimination { } func.func @main() -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<8x4xf32>, tensor<8x4xf16>, tensor<8x4xf16>, tensor<4x4xf32>, tensor<4x4xf32>) { - %cst_t1 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>] - %cst_t2 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>] - %cst_t2_t3_t4 = const.Declare tensor<8x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.Rescale<5.0 : f32>] - %cst_t2_t3_t5 = const.Declare tensor<8x4xf16> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.ConvertElemType] - %cst_t2_t3_t5_copy = const.Declare tensor<8x4xf16> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.ConvertElemType] - %cst_empty_1 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> - %cst_empty_2 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [] + %cst_t1 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>] + %cst_t2 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>] + %cst_t2_t3_t4 = const.Declare tensor<8x4xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.Rescale<5.0 : f32>] + %cst_t2_t3_t5 = const.Declare tensor<8x4xf16> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.ConvertElemType] + %cst_t2_t3_t5_copy = const.Declare tensor<8x4xf16> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.PadWithZero<[0, 0], [4, 0]>, #const.ConvertElemType] + %cst_empty_1 = const.Declare 
tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + %cst_empty_2 = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32>, [] return %cst_t1, %cst_t2, %cst_t2_t3_t4, %cst_t2_t3_t5, %cst_t2_t3_t5_copy, %cst_empty_1, %cst_empty_2 : tensor<4x4xf32>, tensor<4x4xf32>, tensor<8x4xf32>, tensor<8x4xf16>, tensor<8x4xf16>, tensor<4x4xf32>, tensor<4x4xf32> } @@ -58,13 +58,13 @@ module @CommonSubexpressionElimination { // CHECK: func.func private @main([[ARG0:%.+]]: tensor<4x4xf32>, [[ARG2:%.+]]: tensor<8x4xf32>, [[ARG3:%.+]]: tensor<8x4xf16>, [[ARG1:%.+]]: tensor<4x4xf32>) // CHECK-SAME: -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<8x4xf32>, tensor<8x4xf16>, tensor<8x4xf16>, tensor<4x4xf32>, tensor<4x4xf32>) - // CHECK: [[CST2:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> - // CHECK: [[CST3:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + // CHECK: [[CST2:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + // CHECK: [[CST3:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> // CHECK: return [[ARG0]], [[ARG1]], [[ARG2]], [[ARG3]], [[ARG3]], [[CST2]], [[CST3]] : tensor<4x4xf32>, tensor<4x4xf32>, tensor<8x4xf32>, tensor<8x4xf16>, tensor<8x4xf16>, tensor<4x4xf32>, tensor<4x4xf32> // CHECK: func.func @wrapper_main() -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<8x4xf32>, tensor<8x4xf16>, tensor<8x4xf16>, tensor<4x4xf32>, tensor<4x4xf32>) - // CHECK: [[CST0:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> - // CHECK: [[CST1:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + // CHECK: [[CST0:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + // CHECK: [[CST1:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> // CHECK: [[CALL:%.+]]:4 = call @init([[CST0]], [[CST1]]) // CHECK: [[RET:%.+]]:7 = call @main([[CALL]]#0, [[CALL]]#1, [[CALL]]#2, [[CALL]]#3) // CHECK: return [[RET]]#0, 
[[RET]]#1, [[RET]]#2, [[RET]]#3, [[RET]]#4, [[RET]]#5, [[RET]]#6 @@ -76,8 +76,8 @@ module @CommonSubexpressionElimination { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", - ov_2: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" + vpux_ow_1: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd", + vpux_ow_2: "0x0000000400aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbcc00aabbdd" } } #-} @@ -89,7 +89,7 @@ module @SubViewOutside { } func.func @main() -> (tensor<2x2xf32>) { - %cst_t1 = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.SubView<[2, 2], [2, 2]>] + %cst_t1 = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x4xf32>, [#const.Add<1.0 : f32>, #const.SubView<[2, 2], [2, 2]>] return %cst_t1 : tensor<2x2xf32> } @@ -107,7 +107,7 @@ module @SubViewOutside { // CHECK: return [[SLICE]] : tensor<2x2xf32> // CHECK: func.func @wrapper_main() -> tensor<2x2xf32> - // CHECK: [[CST0:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> + // CHECK: [[CST0:%.+]] = const.Declare tensor<4x4xf32> = dense_resource : tensor<4x4xf32> // CHECK: [[CALL:%.+]] = call @init([[CST0]]) : (tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: [[RET:%.+]] = call @main([[CALL]]) // CHECK: return [[RET]] : tensor<2x2xf32> @@ -118,7 +118,7 @@ module @SubViewOutside { {-# dialect_resources: { builtin: { - ov: "0x10000000ABABABABCDCDCDCD" + vpux_ow_: "0x10000000ABABABABCDCDCDCD" } } #-} @@ -150,7 +150,7 @@ module @SubViewOutsideAdvanced { // CHECK: DataInfo "Convolution_63" friendlyName = "Result_64" : tensor<48x16x1x1xf16> func.func @main(%arg0: 
tensor<1x192x100x100xf16>) -> tensor<48x16x1x1xf16, {order = #NHWC}> { - %cst = const.Declare tensor<48x16x1x1xf16, {order = #NHWC}> = dense_resource : tensor<2x2x1x1xf16>, [#const.Reorder<#NHWC>, #const.PadWithZero<[0, 0, 0, 0], [46, 14, 0, 0]>, #const.SubView<[0, 0, 0, 0], [48, 16, 1, 1]>] + %cst = const.Declare tensor<48x16x1x1xf16, {order = #NHWC}> = dense_resource : tensor<2x2x1x1xf16>, [#const.Reorder<#NHWC>, #const.PadWithZero<[0, 0, 0, 0], [46, 14, 0, 0]>, #const.SubView<[0, 0, 0, 0], [48, 16, 1, 1]>] %0 = VPU.Copy(%cst) {out_mem_space = @CMX_NN} : tensor<48x16x1x1xf16, {order = #NHWC}> -> !DistributedTensor0 %1 = VPU.Copy(%0) : !DistributedTensor0 -> tensor<48x16x1x1xf16, {order = #NHWC}> @@ -174,7 +174,7 @@ module @SubViewOutsideAdvanced { // CHECK: func.func @wrapper_main([[ARG0:%.+]]: tensor<1x192x100x100xf16>) -> tensor<48x16x1x1xf16, {order = #NHWC}> // -- Ensure that the stripped ngraph constants are outside. - // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<2x2x1x1xf16> = dense_resource : tensor<2x2x1x1xf16> + // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<2x2x1x1xf16> = dense_resource : tensor<2x2x1x1xf16> // CHECK: [[CALL:%.+]] = call @init([[CST0]]) : (tensor<2x2x1x1xf16>) -> tensor<48x16x1x1xf16, {order = #NHWC}> // CHECK: [[RET:%.+]] = call @main([[ARG0]], [[CALL]]) // CHECK: return [[RET:%.+]] : tensor<48x16x1x1xf16, {order = #NHWC}> @@ -185,7 +185,7 @@ module @SubViewOutsideAdvanced { {-# dialect_resources: { builtin: { - ov_0: 
"0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30" + vpux_ow_0: "0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30" } } #-} @@ -211,7 +211,7 @@ module @QuantizedToQuantizedConversion { // CHECK: DataInfo "output_0" : tensor<16x3x3x3xui8> func.func @main() -> (tensor<16x3x3x3xui8>) { - %cst = const.Declare tensor<16x3x3x3x!qElemType2> = 
dense_resource : tensor<16x3x3x3xsi8>, + %cst = const.Declare tensor<16x3x3x3x!qElemType2> = dense_resource : tensor<16x3x3x3xsi8>, [#const.CastElemType, #const.ConvertElemType] // Normally QuantizeCast ops are part of transformations @@ -241,7 +241,7 @@ module @QuantizedToQuantizedConversion { // CHECK: return [[RES]] // CHECK: func.func @wrapper_main() -> tensor<16x3x3x3xui8> - // CHECK: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xsi8> = dense_resource + // CHECK: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xsi8> = dense_resource // CHECK: [[INIT:%.+]] = call @init([[CST0]]) // CHECK: [[RET:%.+]] = call @main([[INIT]]) // CHECK: return [[RET]] @@ -252,8 +252,8 @@ module @QuantizedToQuantizedConversion { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30", - ov_1: 
"0x100000000ABDCE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE30" + vpux_ow_0: "0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30", + vpux_ow_1: "0x100000000ABDCE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE300AB0CE300AB0CE30CE30" } } #-} @@ -302,16 +302,16 @@ module @QuantizedToQuantizedConversion_PerAxis { // CHECK: DataInfo "output_2" : tensor<10x20x1x1xui8> func.func @main() -> (tensor<16x1x3x3xui8, {order = #NHWC}>, tensor<16x2x3x3xui8, {order = #NHWC}>, tensor<10x20x1x1xui8>) { - %cst_0 = 
const.Declare tensor<16x1x3x3x!qElemType3, {order = #NHWC}> = dense_resource : tensor<16x3x3x3xsi8>, + %cst_0 = const.Declare tensor<16x1x3x3x!qElemType3, {order = #NHWC}> = dense_resource : tensor<16x3x3x3xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.Reorder<#NHWC>, #const.SubView<[0, 0, 0, 0], [16, 1, 3, 3]>] - %cst_1 = const.Declare tensor<16x2x3x3x!qElemType4, {order = #NHWC}> = dense_resource : tensor<16x3x3x3xsi8>, + %cst_1 = const.Declare tensor<16x2x3x3x!qElemType4, {order = #NHWC}> = dense_resource : tensor<16x3x3x3xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.Reorder<#NHWC>, #const.SubView<[0, 1, 0, 0], [16, 2, 3, 3]>] - %cst_2 = const.Declare tensor<10x20x1x1x!qElemType7> = dense_resource : tensor<10x20xsi8>, + %cst_2 = const.Declare tensor<10x20x1x1x!qElemType7> = dense_resource : tensor<10x20xsi8>, [#const.Reshape<[1, 10, 1, 20]>, #const.CastElemType, #const.ChangeShapeAndElemType<[10, 20, 1, 1], !qElemType6>, #const.ConvertElemType] @@ -376,8 +376,8 @@ module @QuantizedToQuantizedConversion_PerAxis { // CHECK: return [[QUANTIZECAST13]], [[QUANTIZECAST14]], [[QUANTIZECAST15]] // CHECK: func.func @wrapper_main() -> (tensor<16x1x3x3xui8, {order = #NHWC}>, tensor<16x2x3x3xui8, {order = #NHWC}>, tensor<10x20x1x1xui8>) - // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xsi8> = dense_resource - // CHECK-DAG: [[CST1:%.+]] = const.Declare tensor<10x20xsi8> = dense_resource + // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xsi8> = dense_resource + // CHECK-DAG: [[CST1:%.+]] = const.Declare tensor<10x20xsi8> = dense_resource // CHECK: [[CALL:%[0-9]+]]:2 = call @init([[CST0]], [[CST1]]) // CHECK: [[RET:%.+]]:3 = call @main([[CALL]]#0, [[CALL]]#1) // CHECK: return [[RET]]#0, [[RET]]#1, [[RET]]#2 @@ -388,8 +388,8 @@ module @QuantizedToQuantizedConversion_PerAxis { {-# dialect_resources: { builtin: { - ov_0: 
"0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00
E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30", - ov_1: "0x100000000AB0CE30" + vpux_ow_0: 
"0x10000000AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00
E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30AEB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E300EB00E30", + vpux_ow_1: "0x100000000AB0CE30" } } #-} @@ -416,7 +416,7 @@ module @Convolution { func.func @main(%arg0: tensor<1x3x62x62xf16>) -> (tensor<1x16x60x60xf16>, tensor<2x1x1x1xf16, {order = #NHWC}>, tensor<1x2x1x1xf16>) { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - = dense_resource : tensor<16x3x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>, #const.PadWithZero<[0, 0, 0, 0], [0, 13, 0, 0]>] + = dense_resource : 
tensor<16x3x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>, #const.PadWithZero<[0, 0, 0, 0], [0, 13, 0, 0]>] %0 = VPU.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [0, 0, 0, 2]} : tensor<1x3x62x62xf16> -> tensor<1x3x62x64xf16> %1 = VPU.NCE.Permute(%0) {dstElemType = f16, dstOrder = #NHWC, expandedChannels = 16 : i64, ppe = #VPU.PPEStub<>} -> tensor<1x16x62x64xf16, {order = #NHWC}> @@ -425,8 +425,8 @@ module @Convolution { pad = #VPU.Padding, rawFilterShape = [16, 16, 3, 3], strides = [1, 1], ppe = #VPU.PPEStub<>} : tensor<1x16x62x62xf16, {order = #NHWC}>, tensor<16x16x3x3xf16, {order = #NHWC}>, tensor<16x1x1x4xsi32> -> tensor<1x16x60x60xf16> - %cst_1 = const.Declare tensor<2x1x1x1xf16, {order = #NHWC}> = dense_resource : tensor<1x2x1x1xf16>, [#const.Reshape<[2, 1, 1, 1]>, #const.Reorder<#NHWC>] - %cst_2 = const.Declare tensor<1x2x1x1xf16> = dense_resource : tensor<1x2x1x1xf16>, [#const.Add<1.0>] + %cst_1 = const.Declare tensor<2x1x1x1xf16, {order = #NHWC}> = dense_resource : tensor<1x2x1x1xf16>, [#const.Reshape<[2, 1, 1, 1]>, #const.Reorder<#NHWC>] + %cst_2 = const.Declare tensor<1x2x1x1xf16> = dense_resource : tensor<1x2x1x1xf16>, [#const.Add<1.0>] return %3, %cst_1, %cst_2 : tensor<1x16x60x60xf16>, tensor<2x1x1x1xf16, {order = #NHWC}>, tensor<1x2x1x1xf16> } @@ -448,13 +448,13 @@ module @Convolution { // CHECK: [[PERMUTE0:%.+]] = VPU.NCE.Permute([[EXPAND0]]) // CHECK: [[SLICE0:%.+]] = VPU.Slice [[PERMUTE0]] [0, 0, 0, 0] [1, 16, 62, 62] : tensor<1x16x62x64xf16, {order = #NHWC}> to tensor<1x16x62x62xf16, {order = #NHWC}> // CHECK: [[CONVOLUTION0:%.+]] = VPU.NCE.Convolution([[SLICE0]], [[INIT_OUT0]], [[CST]]) - // CHECK: [[CST2:%.+]] = const.Declare tensor<2x1x1x1xf16, {order = #NHWC}> = dense_resource + // CHECK: [[CST2:%.+]] = const.Declare tensor<2x1x1x1xf16, {order = #NHWC}> = dense_resource // CHECK-SAME: [#const.Reshape<[2, 1, 1, 1]>, #const.Reorder<#NHWC>] // CHECK: return [[CONVOLUTION0]], [[CST2]], [[INIT_OUT2]] // CHECK: func.func 
@wrapper_main([[ARG0:%.+]]: tensor<1x3x62x62xf16>) -> (tensor<1x16x60x60xf16>, tensor<2x1x1x1xf16, {order = #NHWC}>, tensor<1x2x1x1xf16>) - // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xf32> = dense_resource : tensor<16x3x3x3xf32> - // CHECK-DAG: [[CST1:%.+]] = const.Declare tensor<1x2x1x1xf16> = dense_resource : tensor<1x2x1x1xf16> + // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<16x3x3x3xf32> = dense_resource : tensor<16x3x3x3xf32> + // CHECK-DAG: [[CST1:%.+]] = const.Declare tensor<1x2x1x1xf16> = dense_resource : tensor<1x2x1x1xf16> // CHECK: [[CALL:%[0-9]+]]:2 = call @init([[CST0]], [[CST1]]) // CHECK: [[RET:%.+]]:3 = call @main([[ARG0]], [[CALL]]#0, [[CALL]]#1) // CHECK: return [[RET]]#0, [[RET]]#1, [[RET]]#2 @@ -465,7 +465,7 @@ module @Convolution { {-# dialect_resources: { builtin: { - ov_0: "0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -486,7 +486,7 @@ module @QuantizeAttr { // CHECK: DataInfo "output" : tensor<2x2xf16> func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2x!qElemType> = dense_resource : tensor<2x2xf16>, [#const.Quantize] + %cst = const.Declare tensor<2x2x!qElemType> = dense_resource : tensor<2x2xf16>, [#const.Quantize] return %dummy : tensor<2x2xf16> } @@ -500,7 +500,7 @@ module @QuantizeAttr { // CHECK: return [[ARG0]] : tensor<2x2xf16> // CHECK: func.func @wrapper_main([[ARG2:%.+]]: tensor<2x2xf16>) -> tensor<2x2xf16> - // CHECK: [[CST:%.+]] = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16> + // CHECK: [[CST:%.+]] = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16> // CHECK: [[CALL0:%.+]] = call @init([[CST]]) : (tensor<2x2xf16>) -> tensor<2x2xsi8> // CHECK: [[CALL1:%.+]] = call @main([[ARG2]], [[CALL0]]) : (tensor<2x2xf16>, tensor<2x2xsi8>) -> tensor<2x2xf16> // CHECK: return [[CALL1]] : tensor<2x2xf16> @@ -511,7 +511,7 @@ module @QuantizeAttr { {-# dialect_resources: { builtin: { - ov_0: 
"0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -533,8 +533,8 @@ module @UniqueArgumentChains { // CHECK: DataInfo "output" : tensor<2x2xf16> func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>] - %cst1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<2.0>] + %cst0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>] + %cst1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<2.0>] return %dummy : tensor<2x2xf16> } @@ -549,7 +549,7 @@ module @UniqueArgumentChains { // CHECK: return [[ARG0]] // CHECK: func.func @wrapper_main([[ARG2:%.+]]: tensor<2x2xf16>) -> tensor<2x2xf16> - // CHECK: [[CST:%.+]] = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16> + // CHECK: [[CST:%.+]] = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16> // CHECK: [[INIT:%.+]]:2 = call @init([[CST]]) // CHECK: [[MAIN:%.+]] = call @main([[ARG2]], [[INIT]]#0, [[INIT]]#1) // CHECK: return [[MAIN]] @@ -560,7 +560,7 @@ module @UniqueArgumentChains { {-# dialect_resources: { builtin: { - ov_0: "0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -579,8 +579,8 @@ module @OutlinedConstants { // CHECK: DataInfo "output" : tensor<2x2xf16> func.func private @main_foo1(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] - %cst_bar_duplicate = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] + %cst_bar_duplicate = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] %user_cst = VPU.Convert(%cst) {dstElemType = f32} : tensor<2x2xf16> -> tensor<2x2xf32> %user_cst_bar_duplicate = 
VPU.Convert(%cst_bar_duplicate) {dstElemType = f32} @@ -594,8 +594,8 @@ module @OutlinedConstants { // CHECK: return [[DUMMY]] func.func private @main_bar() -> (tensor<4x1xf16>, tensor<2x2xf16>) { - %cst1 = const.Declare tensor<4x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>, #const.Reshape<[4, 1]>] - %cst2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] + %cst1 = const.Declare tensor<4x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>, #const.Reshape<[4, 1]>] + %cst2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] return %cst1, %cst2 : tensor<4x1xf16>, tensor<2x2xf16> } @@ -603,8 +603,8 @@ module @OutlinedConstants { // CHECK: return [[CST1]], [[CST2]] func.func private @main_foo2(%dummy: tensor<2x2xf16>) -> (tensor<2x2xf16>, tensor<4x1xf16>, tensor<2x2xf16>) { - %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] - %cst_bar_duplicate = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] + %cst_bar_duplicate = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] %user_cst = VPU.Convert(%cst) {dstElemType = f32} : tensor<2x2xf16> -> tensor<2x2xf32> @@ -625,23 +625,23 @@ module @OutlinedConstants { // CHECK: func.func private @init([[OV_CONST0:%.+]]: tensor<2x2xf16>) // CHECK-SAME: -> (tensor<2x2xf16>, tensor<2x2xf16>, tensor<2x2xf16>, tensor<4x1xf16>) - // foo2 && main: dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] + // foo2 && main: dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] // CHECK: [[CST3:%.+]] = const.Declare {{.*}} dense<1.000000e+01> // CHECK: [[CST_ADD10:%.+]] = IE.Add([[OV_CONST0]], [[CST3]]) - // foo1: dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] - // foo2: dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] + // foo1: dense_resource : 
tensor<2x2xf16>, [#const.Add<15.0>] + // foo2: dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] // CHECK: [[CST1:%.+]] = const.Declare {{.*}} dense<1.500000e+01> // CHECK: [[CST_ADD15:%.+]] = IE.Add([[OV_CONST0]], [[CST1]]) - // foo2 && bar: dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] + // foo2 && bar: dense_resource : tensor<2x2xf16>, [#const.Rescale<2.0>] // CHECK: [[CST2:%.+]] = const.Declare {{.*}} dense<2.000000e+00> // CHECK: [[CST_RESCALE2:%.+]] = IE.Multiply([[OV_CONST0]], [[CST2]]) - // bar: dense_resource : tensor<2x2xf16>, [#const.Add<15.0>, #const.Reshape<[4, 1]>] + // bar: dense_resource : tensor<2x2xf16>, [#const.Add<15.0>, #const.Reshape<[4, 1]>] // CHECK: [[CST_RESHAPE_4_1:%.+]] = IE.Reshape([[CST_ADD15]]) {shape_value = [4, 1]} @@ -649,7 +649,7 @@ module @OutlinedConstants { func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] + %cst0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<10.0>] %user_cst0 = VPU.Convert(%cst0) {dstElemType = f32} : tensor<2x2xf16> -> tensor<2x2xf32> %call_foo1 = func.call @main_foo1(%dummy) : (tensor<2x2xf16>) -> tensor<2x2xf16> @@ -676,7 +676,7 @@ module @OutlinedConstants { // CHECK: func.func @wrapper_main([[DUMMY:%.+]]: tensor<2x2xf16>) - // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource // CHECK: [[INIT:%.+]]:4 = call @init([[OV_0]]) // CHECK: [[MAIN:%.+]] = call @main([[DUMMY]], [[INIT]]#0, [[INIT]]#1, [[INIT]]#2, [[INIT]]#3) // CHECK: return [[MAIN]] @@ -687,8 +687,8 @@ module @OutlinedConstants { {-# dialect_resources: { builtin: { - ov_0: "0x10000000ABCDABCDABCDABCE", - ov_1: "0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE", + vpux_ow_1: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -707,7 +707,7 @@ module @OutlinedConstants_MultiCall { // CHECK: DataInfo "output" : 
tensor<2x2xf16> func.func private @multi_call(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<42.0>] + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<42.0>] %user_cst = VPU.Convert(%cst) {dstElemType = f32} : tensor<2x2xf16> -> tensor<2x2xf32> return %dummy : tensor<2x2xf16> } @@ -717,7 +717,7 @@ module @OutlinedConstants_MultiCall { // CHECK: return [[DUMMY]] func.func private @single_call(%dummy: tensor<2x2xf16>) -> (tensor<2x2xf16>, tensor<2x2xf16>) { - %cst1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] + %cst1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<15.0>] %call = func.call @multi_call(%dummy) : (tensor<2x2xf16>) -> tensor<2x2xf16> return %cst1, %call : tensor<2x2xf16>, tensor<2x2xf16> } @@ -754,8 +754,8 @@ module @OutlinedConstants_MultiCall { // CHECK: func.func @wrapper_main([[DUMMY:%.+]]: tensor<2x2xf16>) - // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource - // CHECK: [[OV_1:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[OV_1:%.+]] = const.Declare {{.*}} dense_resource // CHECK: [[INIT:%.+]]:2 = call @init([[OV_0]], [[OV_1]]) // CHECK: [[MAIN:%.+]] = call @main([[DUMMY]], [[INIT]]#0, [[INIT]]#1) // CHECK: return [[MAIN]] @@ -772,7 +772,7 @@ module @OutlinedConstants_MultiCall { {-# dialect_resources: { builtin: { - ov_0: "0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -794,7 +794,7 @@ module @OutlinedConstants_Quantized { // CHECK: DataInfo "output" : tensor<2x2xf16> func.func private @quant_cst(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2x!qElemType1> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x2x!qElemType1> = dense_resource : tensor<2x2xf16>, [#const.CastElemType] return 
%dummy : tensor<2x2xf16> } @@ -818,9 +818,9 @@ module @OutlinedConstants_Quantized { func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2x!qElemType1> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x2x!qElemType1> = dense_resource : tensor<2x2xf16>, [#const.CastElemType] - %cst2 = const.Declare tensor<2x2x!qElemType2> = dense_resource : tensor<2x2xf16>, + %cst2 = const.Declare tensor<2x2x!qElemType2> = dense_resource : tensor<2x2xf16>, [#const.CastElemType] %call = func.call @quant_cst(%dummy) : (tensor<2x2xf16>) -> tensor<2x2xf16> return %call : tensor<2x2xf16> @@ -834,7 +834,7 @@ module @OutlinedConstants_Quantized { // CHECK: func.func @wrapper_main([[DUMMY:%.+]]: tensor<2x2xf16>) - // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource // CHECK: [[INIT:%.+]]:2 = call @init([[OV_0]]) // CHECK: [[MAIN:%.+]] = call @main([[DUMMY]], [[INIT]]#0, [[INIT]]#1) // CHECK: return [[MAIN]] @@ -846,7 +846,7 @@ module @OutlinedConstants_Quantized { {-# dialect_resources: { builtin: { - ov_0: "0x10000000ABCDABCDABCDABCE" + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" } } #-} @@ -869,7 +869,7 @@ module @OutlinedConstants_PostInitTransformations { // CHECK: DataInfo "output" : tensor<2x2xf16> func.func private @subview_cst(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<42.0>, #const.SubView<[0, 1], [2, 1]>] return %dummy : tensor<2x2xf16> } @@ -886,9 +886,9 @@ module @OutlinedConstants_PostInitTransformations { func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, + %cst = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<42.0>, #const.SubView<[0, 1], [2, 1]>] - %cst2 = const.Declare 
tensor<1x1xf16> = dense_resource : tensor<2x2xf16>, + %cst2 = const.Declare tensor<1x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<42.0>, #const.SubView<[0, 0], [1, 1]>] %call = func.call @subview_cst(%dummy) : (tensor<2x2xf16>) -> tensor<2x2xf16> return %call : tensor<2x2xf16> @@ -902,7 +902,63 @@ module @OutlinedConstants_PostInitTransformations { // CHECK: func.func @wrapper_main([[DUMMY:%.+]]: tensor<2x2xf16>) - // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[INIT:%.+]] = call @init([[OV_0]]) + // CHECK: [[MAIN:%.+]] = call @main([[DUMMY]], [[INIT]]) + // CHECK: return [[MAIN]] +} + +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_0: "0x10000000ABCDABCDABCDABCE" + } + } +#-} + +// CHECK-LABEL: @DoNotNestFunctions +module @DoNotNestFunctions { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<2x2xf16> + } outputsInfo : { + DataInfo "output" : tensor<2x2xf16> + } + + // CHECK: net.NetworkInfo entryPoint : @wrapper_main inputsInfo : { + // CHECK: DataInfo "input" : tensor<2x2xf16> + // CHECK: } outputsInfo : { + // CHECK: DataInfo "output" : tensor<2x2xf16> + + func.func private @subview_cst(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + [#const.Add<42.0>] + return %dummy : tensor<2x2xf16> + } + + // CHECK: func.func private @subview_cst([[DUMMY:%.+]]: tensor<2x2xf16>, [[CST:%.+]]: tensor<2x2xf16>) -> tensor<2x2xf16> attributes {do_not_nest} + + // CHECK: func.func private @init([[OV_CONST0:%.+]]: tensor<2x2xf16>) -> tensor<2x2xf16 + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense<4.200000e+01> + // CHECK: [[CST_ADD42:%.+]] = IE.Add([[OV_CONST0]], [[CST]]) + // CHECK: return [[CST_ADD42]] + + + func.func @main(%dummy: tensor<2x2xf16>) -> tensor<2x2xf16> { + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + 
[#const.Add<42.0>] + %call = func.call @subview_cst(%dummy) : (tensor<2x2xf16>) -> tensor<2x2xf16> + return %call : tensor<2x2xf16> + } + + // CHECK: func.func private @main([[DUMMY:%.+]]: tensor<2x2xf16>, [[CST_ADD42:%.+]]: tensor<2x2xf16>) -> tensor<2x2xf16> attributes {do_not_nest} + // CHECK: [[CALL:%.+]] = call @subview_cst([[DUMMY]], [[CST_ADD42]]) + // CHECK: return [[CALL]] + + + // CHECK: func.func @wrapper_main([[DUMMY:%.+]]: tensor<2x2xf16>) + // CHECK: [[OV_0:%.+]] = const.Declare {{.*}} dense_resource // CHECK: [[INIT:%.+]] = call @init([[OV_0]]) // CHECK: [[MAIN:%.+]] = call @main([[DUMMY]], [[INIT]]) // CHECK: return [[MAIN]] diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_single_transformations.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_single_transformations.mlir index d0c738bb09..d934e88788 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_single_transformations.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_single_transformations.mlir @@ -9,7 +9,7 @@ {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -22,7 +22,7 @@ module @CastRegular { } func.func @main() -> tensor<2x2xf16> { - %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf32>, + %cst = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf32>, [#const.CastElemType] return %cst : tensor<2x2xf16> } @@ -37,7 +37,7 @@ module @CastRegular { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -53,7 +53,7 @@ module @CastToQuantizedType { } func.func @main() -> tensor<2x2xi8> { - %cst = const.Declare tensor<2x2x!qElemType> = dense_resource : tensor<2x2xf32>, + %cst = const.Declare tensor<2x2x!qElemType> = dense_resource : tensor<2x2xf32>, [#const.CastElemType] // do quant-cast to 
satisfy compiler's requirement - output cannot be @@ -76,7 +76,7 @@ module @CastToQuantizedType { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400112233" + vpux_ow_1: "0x0000000400112233" } } #-} @@ -92,7 +92,7 @@ module @CastFromQuantizedType { } func.func @main() -> tensor<2x2xf32> { - %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xi8>, + %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xi8>, [#const.CastElemType, #const.CastElemType] return %cst : tensor<2x2xf32> } @@ -111,7 +111,7 @@ module @CastFromQuantizedType { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -133,7 +133,7 @@ module @PositiveZeroPointDelta { func.func @main() -> tensor<10x1x1x1xi8> { // Note: #const.CastElemType here is only used to satisfy the constraints of allowed @main function IO types. - %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, + %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<10x1x1x1xi8> } @@ -154,7 +154,7 @@ module @PositiveZeroPointDelta { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -176,7 +176,7 @@ module @NegativeZeroPointDelta { func.func @main() -> tensor<10x1x1x1xi8> { // Note: #const.CastElemType here is only used to satisfy the constraints of allowed @main function IO types. 
- %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, + %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<10x1x1x1xi8> } @@ -198,7 +198,7 @@ module @NegativeZeroPointDelta { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30" + vpux_ow_0: "0x10000000AEB00E30" } } #-} @@ -215,7 +215,7 @@ module @QuantizedPadValuePerAxis { func.func @main() -> (tensor<4x1x2x1xsi8>) { // Note: #const.CastElemType here is only used to satisfy the constraints of allowed @main function IO types. - %cst = const.Declare tensor<4x1x2x1xsi8> = dense_resource : tensor<4x1x1x1xsi8>, + %cst = const.Declare tensor<4x1x2x1xsi8> = dense_resource : tensor<4x1x1x1xsi8>, [#const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 0, 1, 0]>, #const.CastElemType] return %cst : tensor<4x1x2x1xsi8> } @@ -232,7 +232,7 @@ module @QuantizedPadValuePerAxis { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30" + vpux_ow_0: "0x10000000AEB00E30" } } #-} @@ -249,7 +249,7 @@ module @QuantizedPadValue { func.func @main() -> (tensor<1x5x1x1xsi8>) { // Note: #const.CastElemType here is only used to satisfy the constraints of allowed @main function IO types. - %cst = const.Declare tensor<1x5x1x1xsi8> = dense_resource : tensor<1x4x1x1xsi8>, + %cst = const.Declare tensor<1x5x1x1xsi8> = dense_resource : tensor<1x4x1x1xsi8>, [#const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 1, 0, 0]>, #const.CastElemType] return %cst : tensor<1x5x1x1xsi8> } @@ -267,7 +267,7 @@ module @QuantizedPadValue { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -286,7 +286,7 @@ module @QuantizedToQuantizedCast { func.func @main() -> tensor<10x1x1x1xi8> { // Note: #const.CastElemType here is only used to satisfy the constraints of allowed @main function IO types. 
- %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, + %cst = const.Declare tensor<10x1x1x1xi8> = dense_resource : tensor<10x1x1x1xi8>, [#const.CastElemType, #const.CastElemType, #const.CastElemType] return %cst : tensor<10x1x1x1xi8> } @@ -304,7 +304,7 @@ module @QuantizedToQuantizedCast { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -323,7 +323,7 @@ module @QuantizedToQuantizedConversion_1D { func.func @main() -> tensor<10xui8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<10xui8> = dense_resource : tensor<10xsi8>, + %cst = const.Declare tensor<10xui8> = dense_resource : tensor<10xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<10xui8> } @@ -346,7 +346,7 @@ module @QuantizedToQuantizedConversion_1D { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -365,7 +365,7 @@ module @QuantizedToQuantizedConversion_5D { func.func @main() -> tensor<5x1x1x1x2xui8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<5x1x1x1x2xui8> = dense_resource : tensor<5x1x1x1x2xsi8>, + %cst = const.Declare tensor<5x1x1x1x2xui8> = dense_resource : tensor<5x1x1x1x2xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<5x1x1x1x2xui8> } @@ -386,7 +386,7 @@ module @QuantizedToQuantizedConversion_5D { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -407,7 +407,7 @@ module @QuantizedToQuantizedConversionPerAxis_2D { func.func @main() -> tensor<2x5xui8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<2x5xui8> = dense_resource : tensor<2x5xsi8>, + %cst = const.Declare tensor<2x5xui8> = dense_resource : 
tensor<2x5xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<2x5xui8> } @@ -435,7 +435,7 @@ module @QuantizedToQuantizedConversionPerAxis_2D { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -457,7 +457,7 @@ module @QuantizedToQuantizedConversionPerAxisNegativeZp { func.func @main() -> tensor<5x1x1x2xui8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<5x1x1x2xui8> = dense_resource : tensor<5x1x1x2xsi8>, + %cst = const.Declare tensor<5x1x1x2xui8> = dense_resource : tensor<5x1x1x2xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<5x1x1x2xui8> } @@ -483,7 +483,7 @@ module @QuantizedToQuantizedConversionPerAxisNegativeZp { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -504,7 +504,7 @@ module @QuantizedToQuantizedConversionPerTensorNegativeZp { func.func @main() -> tensor<5x1x1x2xui8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<5x1x1x2xui8> = dense_resource : tensor<5x1x1x2xsi8>, + %cst = const.Declare tensor<5x1x1x2xui8> = dense_resource : tensor<5x1x1x2xsi8>, [#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<5x1x1x2xui8> } @@ -530,7 +530,7 @@ module @QuantizedToQuantizedConversionPerTensorNegativeZp { {-# dialect_resources: { builtin: { - ov_1: "0x0000000411223344551122334455" + vpux_ow_1: "0x0000000411223344551122334455" } } #-} @@ -551,7 +551,7 @@ module @QuantizedToQuantizedConversionPerTensorNegativeOutZp { func.func @main() -> tensor<5x1x1x2xsi8> { // Note: surrounding casts is to abide I/O requirements - %cst = const.Declare tensor<5x1x1x2xsi8> = dense_resource : tensor<5x1x1x2xui8>, + %cst = const.Declare tensor<5x1x1x2xsi8> = dense_resource : tensor<5x1x1x2xui8>, 
[#const.CastElemType, #const.ConvertElemType, #const.CastElemType] return %cst : tensor<5x1x1x2xsi8> } @@ -577,7 +577,7 @@ module @QuantizedToQuantizedConversionPerTensorNegativeOutZp { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -590,12 +590,12 @@ module @Reverse { } func.func @main() -> tensor<2x2xf32> { - %NOT_supported = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xf32>, + %NOT_supported = const.Declare tensor<2x2xf32> = dense_resource : tensor<2x2xf32>, [#const.Reverse<0 : i64>] // Note: the "supported" constant is here to show that the exact same // constant, given a different transformation, ends up in init schedule - %supported = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf32>, + %supported = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf32>, [#const.CastElemType] return %NOT_supported : tensor<2x2xf32> @@ -611,7 +611,7 @@ module @Reverse { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -624,12 +624,12 @@ module @ExpandDilated { } func.func @main() -> tensor<1x1x3x3xf32> { - %NOT_supported = const.Declare tensor<1x1x3x3xf32> = dense_resource : tensor<1x1x2x2xf32>, + %NOT_supported = const.Declare tensor<1x1x3x3xf32> = dense_resource : tensor<1x1x2x2xf32>, [#const.ExpandDilated<[2, 2]>] // Note: the "supported" constant is here to show that the exact same // constant, given a different transformation, ends up in init schedule - %supported = const.Declare tensor<1x1x2x2xf16> = dense_resource : tensor<1x1x2x2xf32>, + %supported = const.Declare tensor<1x1x2x2xf16> = dense_resource : tensor<1x1x2x2xf32>, [#const.CastElemType] return %NOT_supported : tensor<1x1x3x3xf32> @@ -645,7 +645,7 @@ module @ExpandDilated { {-# dialect_resources: { builtin: { - ov_1: 
"0x00000004aabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccd6" + vpux_ow_1: "0x00000004aabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccd6" } } #-} @@ -661,7 +661,7 @@ module @AffineReshape { func.func @main() -> tensor<1x1x3x3xf16, {order = #NCWH}> { // Note: CastElemType is only here to avoid this constant from being ignored as #const.AffineReshape is view-like. - %cst = const.Declare tensor<1x1x3x3xf16, {order = #NCWH}> = dense_resource : tensor<1x1x3x3xf32>, + %cst = const.Declare tensor<1x1x3x3xf16, {order = #NCWH}> = dense_resource : tensor<1x1x3x3xf32>, [#const.CastElemType, #const.AffineReshape<[[0], [1], [3], [2]], [1, 1, 3, 3]>] return %cst : tensor<1x1x3x3xf16, {order = #NCWH}> } @@ -673,12 +673,275 @@ module @AffineReshape { // CHECK-NEXT: return [[AFFINE]] } +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x01000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f7071727374757677" + } + } +#-} + +#NCWH = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)> +// CHECK: [[NCWH:#.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK: [[NHWC:#.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK: module @LayoutCastAttr +module @LayoutCastAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<2x3x4x5xui8, {order = #NHWC}> + } + + func.func @main() -> tensor<2x3x4x5xui8, {order = #NHWC}> { + // Note: Reorder is only here to avoid this constant from being ignored as #const.LayoutCast is view-like. 
+ %cst = const.Declare tensor<2x3x4x5xui8, {order = #NHWC}> = dense_resource : tensor<2x3x4x5xui8>, [#const.Reorder<#NCWH>, #const.LayoutCast<#NHWC>] + return %cst : tensor<2x3x4x5xui8, {order = #NHWC}> + } + + // CHECK: func.func @init([[CST:%.+]]: tensor<2x3x4x5xui8>) -> tensor<2x3x4x5xui8, {order = [[NHWC]]}> + // CHECK-NEXT: [[REO:%.+]] = IE.Reorder([[CST]]) {dstOrder = [[NCWH]]} + // CHECK-NEXT: [[L_CST:%.+]] = IE.LayoutCast([[REO]]) {dst_order = [[NHWC]]} + // CHECK-NEXT: return [[L_CST]] + +} + + +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + } + } +#-} + + +// CHECK: module @BroadcastAttr +module @BroadcastAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<16x8xf32> + } + + func.func @main() -> tensor<16x8xf32> { + %cst = const.Declare tensor<16x8xf32> = dense_resource : tensor<4x1xf32>, + [#const.Broadcast<0 : i64, 16 : i64>, #const.Broadcast<1 : i64, 8 : i64>] + return %cst : tensor<16x8xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<4x1xf32>) -> tensor<16x8xf32> + // CHECK-NEXT: [[SH_0:%.+]] = const.Declare tensor<2xi64> = dense<[16, 1]> : tensor<2xi64> + // CHECK-NEXT: [[BR_0:%.+]] = IE.Broadcast([[IN]], [[SH_0]]) : tensor<4x1xf32>, tensor<2xi64> -> tensor<16x1xf32> + // CHECK-NEXT: [[SH_1:%.+]] = const.Declare tensor<2xi64> = dense<[16, 8]> : tensor<2xi64> + // CHECK-NEXT: [[BR_1:%.+]] = IE.Broadcast([[BR_0]], [[SH_1]]) : tensor<16x1xf32>, tensor<2xi64> -> tensor<16x8xf32> + // CHECK-NEXT: return [[BR_1]] +} + + +// ----- + + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + + +// CHECK: module @AddAttr +module @AddAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<2x3x2xf32> + } + + func.func @main() -> tensor<2x3x2xf32> { + %cst = 
const.Declare tensor<2x3x2xf32> = dense_resource : tensor<2x3x2xf32>, [#const.Add<1.27e-03>] + return %cst : tensor<2x3x2xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<2x3x2xf32> { + // CHECK-NEXT: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<1.270000e-03> : tensor<1xf32>, [#const.CastElemType] + // CHECK-NEXT: [[RET:%.+]] = IE.Add([[IN]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} + // CHECK-NEXT: return [[RET]] +} + + +// ----- + + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + + +// CHECK: module @ZeroPadAttr +module @ZeroPadAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<5x10x13xf32> + } + + func.func @main() -> tensor<5x10x13xf32> { + %cst = const.Declare tensor<5x10x13xf32> = dense_resource : tensor<2x3x2xf32>, [#const.PadWithZero<[1, 3, 5], [2, 4, 6]>] + return %cst : tensor<5x10x13xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<5x10x13xf32> + // CHECK-NEXT: [[RET:%.+]] = IE.Pad([[IN]]) {mode = #IE.pad_mode, pad_value_attr = 0.000000e+00 : f64, pads_begin_attr = [1, 3, 5], pads_end_attr = [2, 4, 6]} + // CHECK-NEXT: return [[RET]] +} + +// ----- + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + +// per axis quantization with zero point +!qElemType = !quant.uniform +// CHECK: [[QTYPE:!.+]] = !quant.uniform +// aligned with padded shape +!qElemTypeP = !quant.uniform +// CHECK: [[QTYPEP:!.+]] = !quant.uniform + +// CHECK: module @ZeroPadQuantized +module @ZeroPadQuantized { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<5x10x13xi8> + } + + func.func @main() -> tensor<5x10x13xi8> { + %cst = const.Declare 
tensor<5x10x13x!qElemTypeP> = dense_resource : tensor<2x3x2xf32>, [#const.CastElemType, #const.PadWithZero<[1, 3, 5], [2, 4, 6]>] + + // do quant-cast to satisfy compiler's requirement - output cannot be quantized type. + %workaround = VPU.QuantizeCast(%cst) { dstElemType = i8 } + : tensor<5x10x13x!qElemTypeP> -> tensor<5x10x13xi8> + return %workaround : tensor<5x10x13xi8> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<5x10x13xsi8> + // CHECK-NEXT: [[DATA_I8:%.+]] = IE.Convert([[IN]]) {dstElemType = i8} + // CHECK-NEXT: [[DATA_Q:%.+]] = IE.QuantizeCast([[DATA_I8]]) {dstElemType = [[QTYPE]]} : tensor<2x3x2xi8> -> tensor<2x3x2x[[QTYPE]]> + // CHECK-NEXT: [[PADED:%.+]] = IE.Pad([[DATA_Q]]) {mode = #IE.pad_mode, pad_value_attr = 2.100000e+01 : f64, pads_begin_attr = [1, 3, 5], pads_end_attr = [2, 4, 6]} : tensor<2x3x2x[[QTYPE]]> -> tensor<5x10x13x[[QTYPEP]]> + // CHECK-NEXT: [[RET:%.+]] = IE.QuantizeCast([[PADED]]) {dstElemType = si8} + // CHECK-NEXT: return [[RET]] +} + + +// ----- + + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + +// CHECK: module @RescaleAttr +module @RescaleAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<2x3x2xf32> + } + + func.func @main() -> tensor<2x3x2xf32> { + %cst = const.Declare tensor<2x3x2xf32> = dense_resource : tensor<2x3x2xf32>, [#const.Rescale<0.66666666666666663e-05 : f32>] + return %cst : tensor<2x3x2xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<2x3x2xf32> + // CHECK-NEXT: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<6.66666664E-6> : tensor<1xf32>, [#const.CastElemType] + // CHECK-NEXT: [[RET:%.+]] = IE.Multiply([[IN]], [[CST]]) {auto_broadcast = #IE.auto_broadcast_type} + // CHECK-NEXT: return [[RET]] +} + + +// ----- + + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: 
"0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + +// Inverts all values in the tensor, i.e. 1 / x for each x in the tensor. +// CHECK: module @ScalarMultInverseAttr +module @ScalarMultInverseAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<2x3x2xf32> + } + + func.func @main() -> tensor<2x3x2xf32> { + %cst = const.Declare tensor<2x3x2xf32> = dense_resource : tensor<2x3x2xf32>, [#const.ScalarMultInverse] + return %cst : tensor<2x3x2xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<2x3x2xf32> + // CHECK-NEXT: [[CST:%.+]] = const.Declare tensor<1xf32> = dense<1.000000e+00> : tensor<1xf32> + // CHECK-NEXT: [[RET:%.+]] = IE.Divide([[CST]], [[IN]]) {auto_broadcast = #IE.auto_broadcast_type} + // CHECK-NEXT: return [[RET]] +} + +// ----- + + +{-# + dialect_resources: { + builtin: { + vpux_ow_1: "0x04000000000000000000803f0000004000004040000080400000a0400000c0400000e04000000041000010410000204100003041" + } + } +#-} + +#transpose_map = affine_map<(d0, d1, d2) -> (d1, d0, d2)> +// CHECK: [[HCW:#.+]] = affine_map<(d0, d1, d2) -> (d1, d0, d2)> + +// CHECK: module @TransposeAttr +module @TransposeAttr { + net.NetworkInfo entryPoint : @main inputsInfo : { + } outputsInfo : { + DataInfo "output1" : tensor<3x2x2xf32> + } + + func.func @main() -> tensor<3x2x2xf32> { + %cst = const.Declare tensor<3x2x2xf32> = dense_resource : tensor<2x3x2xf32>, [#const.Transpose<#transpose_map>] + return %cst : tensor<3x2x2xf32> + } + + // CHECK: func.func @init([[IN:%.+]]: tensor<2x3x2xf32>) -> tensor<3x2x2xf32> + // CHECK-NEXT: [[RET:%.+]] = IE.Transpose([[IN]]) {order_value = [[HCW]]} : tensor<2x3x2xf32> -> tensor<3x2x2xf32> + // CHECK-NEXT: return [[RET]] +} + + // ----- {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30" + vpux_ow_0: 
"0x10000000AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30" } } #-} @@ -694,7 +957,7 @@ module @MemPermuteConversion { } func.func @main() -> (tensor<1x4x2x3xsi8>) { - %cst = const.Declare tensor<1x4x2x3xsi8> = dense_resource : tensor<1x2x3x4xsi8>, + %cst = const.Declare tensor<1x4x2x3xsi8> = dense_resource : tensor<1x2x3x4xsi8>, [#const.MemPermute<#NCHW, #NWCH>] return %cst : tensor<1x4x2x3xsi8> } @@ -712,7 +975,7 @@ module @MemPermuteConversion { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30" + vpux_ow_0: "0x10000000AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30AEB00E30" } } #-} @@ -727,7 +990,7 @@ module @MemPermuteConversionNoTranspose { } func.func @main() -> (tensor<1x2x3x4xsi8, {order = #NHWC}>) { - %cst = const.Declare tensor<1x2x3x4xsi8, {order = #NHWC}> = dense_resource : tensor<1x2x3x4xsi8>, + %cst = const.Declare tensor<1x2x3x4xsi8, {order = #NHWC}> = dense_resource : tensor<1x2x3x4xsi8>, [#const.MemPermute<#NHWC, #NHWC>] return %cst : tensor<1x2x3x4xsi8, {order = #NHWC}> } @@ -745,7 +1008,7 @@ module @MemPermuteConversionNoTranspose { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30AEB0" + vpux_ow_0: "0x10000000AEB00E30AEB0" } } #-} @@ -761,7 +1024,7 @@ module @MemPermuteConversion3D { } func.func @main() -> (tensor<1x3x2xsi8>) { - %cst = const.Declare tensor<1x3x2xsi8> = dense_resource : tensor<1x2x3xsi8>, + %cst = const.Declare tensor<1x3x2xsi8> = dense_resource : tensor<1x2x3xsi8>, [#const.MemPermute<#CHW, #map>] return %cst : tensor<1x3x2xsi8> } @@ -779,7 +1042,7 @@ module @MemPermuteConversion3D { {-# dialect_resources: { builtin: { - ov_0: "0x10000000AEB00E30AEB0" + vpux_ow_0: "0x10000000AEB00E30AEB0" } } #-} @@ -795,7 +1058,7 @@ module @MemPermuteConversionNoTranspose3D { } func.func @main() -> (tensor<1x2x3xsi8, {order = #map}>) { - %cst = const.Declare tensor<1x2x3xsi8, {order = #map}> = dense_resource : tensor<1x2x3xsi8>, + %cst = const.Declare tensor<1x2x3xsi8, {order = 
#map}> = dense_resource : tensor<1x2x3xsi8>, [#const.MemPermute<#map, #map>] return %cst : tensor<1x2x3xsi8, {order = #map}> } @@ -813,7 +1076,7 @@ module @MemPermuteConversionNoTranspose3D { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400112233" + vpux_ow_1: "0x0000000400112233" } } #-} @@ -831,11 +1094,11 @@ module @DequantizePerAxis4D { } func.func @main() -> (tensor<2x2x1x1xf32>, tensor<2x2x1x1xi8>) { - %NOT_supported = const.Declare tensor<2x2x1x1xf32> = dense_resource : tensor<2x2x1x1xi8>, + %NOT_supported = const.Declare tensor<2x2x1x1xf32> = dense_resource : tensor<2x2x1x1xi8>, [#const.CastElemType, #const.Dequantize] // Added the last cast to differentiate between the constants - %processed_cst = const.Declare tensor<2x2x1x1xi8> = dense_resource : tensor<2x2x1x1xi8>, + %processed_cst = const.Declare tensor<2x2x1x1xi8> = dense_resource : tensor<2x2x1x1xi8>, [#const.CastElemType, #const.Dequantize, #const.CastElemType] return %NOT_supported, %processed_cst : tensor<2x2x1x1xf32>, tensor<2x2x1x1xi8> @@ -853,7 +1116,7 @@ module @DequantizePerAxis4D { {-# dialect_resources: { builtin: { - ov_1: "0x0000000400112233" + vpux_ow_1: "0x0000000400112233" } } #-} @@ -871,11 +1134,11 @@ module @DequantizePerAxis5D { } func.func @main() -> (tensor<2x2x1x1x1xf32>, tensor<2x2x1x1x1xi8>) { - %NOT_supported = const.Declare tensor<2x2x1x1x1xf32> = dense_resource : tensor<2x2x1x1x1xi8>, + %NOT_supported = const.Declare tensor<2x2x1x1x1xf32> = dense_resource : tensor<2x2x1x1x1xi8>, [#const.CastElemType, #const.Dequantize] // Added the last cast to differentiate between the constants - %processed_cst = const.Declare tensor<2x2x1x1x1xi8> = dense_resource : tensor<2x2x1x1x1xi8>, + %processed_cst = const.Declare tensor<2x2x1x1x1xi8> = dense_resource : tensor<2x2x1x1x1xi8>, [#const.CastElemType, #const.Dequantize, #const.CastElemType] return %NOT_supported, %processed_cst : tensor<2x2x1x1x1xf32>, tensor<2x2x1x1x1xi8> diff --git 
a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_slicing.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_slicing.mlir index 79b7e814c2..d53f5b89df 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_slicing.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_slicing.mlir @@ -13,8 +13,8 @@ {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD" } } #-} @@ -29,8 +29,8 @@ module @MemoryLimitTest { } func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %cst1 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<1.0>] - %cst2 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<2.0>] + %cst1 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<1.0>] + %cst2 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<2.0>] return %arg : tensor<4x16xf16> } diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_verify_locations.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_verify_locations.mlir index 6143815654..c654cf704f 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_verify_locations.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_verify_locations.mlir @@ -10,7 +10,7 @@ {-# dialect_resources: { builtin: { - ov: "0x10000000ABABABABCDCDCDCD" + vpux_ow_: "0x10000000ABABABABCDCDCDCD" } } #-} @@ -28,8 +28,8 @@ module @SimilarConstantsSimilarLocationNames { } func.func @main(%arg0: tensor<2x2x1x1xf16>) -> (tensor<2x2x1x1xi8>, tensor<2x2x1x1xi8>) { - %cst = const.Declare tensor<2x2x1x1x!qElemType> = dense_resource : tensor<2x2x1x1xf16>, [#const.SubView<[1, 0, 0, 0], [2, 2, 1, 1]>, #const.CastElemType] - %cst1 = const.Declare tensor<2x2x1x1x!qElemType> = dense_resource : tensor<2x2x1x1xf16>, [#const.SubView<[0, 0, 0, 0], [2, 2, 1, 1]>, 
#const.CastElemType] + %cst = const.Declare tensor<2x2x1x1x!qElemType> = dense_resource : tensor<2x2x1x1xf16>, [#const.SubView<[1, 0, 0, 0], [2, 2, 1, 1]>, #const.CastElemType] + %cst1 = const.Declare tensor<2x2x1x1x!qElemType> = dense_resource : tensor<2x2x1x1xf16>, [#const.SubView<[0, 0, 0, 0], [2, 2, 1, 1]>, #const.CastElemType] %0 = VPU.QuantizeCast(%cst) { dstElemType = i8 } : tensor<2x2x1x1x!qElemType> -> tensor<2x2x1x1xi8> diff --git a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_view_like_only.mlir b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_view_like_only.mlir index 8d93094d2a..8413469a8e 100644 --- a/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_view_like_only.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/introduce_init_function_view_like_only.mlir @@ -11,7 +11,7 @@ {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -24,12 +24,12 @@ module @NoTransformations { } func.func @main() -> tensor<4x1xf32> { - %cst = const.Declare tensor<4x1xf32> = dense_resource : tensor<4x1xf32> + %cst = const.Declare tensor<4x1xf32> = dense_resource : tensor<4x1xf32> return %cst : tensor<4x1xf32> } // CHECK: @main() -> tensor<4x1xf32> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource // CHECK: return [[CST]] } @@ -38,7 +38,7 @@ module @NoTransformations { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -51,13 +51,13 @@ module @Reshape { } func.func @main() -> tensor<2x2xf32> { - %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x1xf32>, + %cst = const.Declare tensor<2x2xf32> = dense_resource : tensor<4x1xf32>, [#const.Reshape<[2, 2]>] return %cst : tensor<2x2xf32> } // CHECK: @main() -> tensor<2x2xf32> - // CHECK: [[CST:%.+]] = const.Declare 
{{.*}} dense_resource {{.*}} [#const.Reshape<[2, 2]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Reshape<[2, 2]>] // CHECK: return [[CST]] } @@ -66,7 +66,7 @@ module @Reshape { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -82,13 +82,13 @@ module @ReshapeNonIdentityOrder { } func.func @main() -> tensor<2x2xf32, {order = #CN}> { - %cst = const.Declare tensor<2x2xf32, {order = #CN}> = dense_resource : tensor<4x1xf32, {order = #CN}>, + %cst = const.Declare tensor<2x2xf32, {order = #CN}> = dense_resource : tensor<4x1xf32, {order = #CN}>, [#const.Reshape<[2, 2]>] return %cst : tensor<2x2xf32, {order = #CN}> } // CHECK: @main() -> tensor<2x2xf32, {order = [[CN]]}> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Reshape<[2, 2]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Reshape<[2, 2]>] // CHECK: return [[CST]] } @@ -97,7 +97,7 @@ module @ReshapeNonIdentityOrder { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -110,13 +110,13 @@ module @SubView { } func.func @main() -> tensor<2x1xf32> { - %cst = const.Declare tensor<2x1xf32> = dense_resource : tensor<4x1xf32>, + %cst = const.Declare tensor<2x1xf32> = dense_resource : tensor<4x1xf32>, [#const.SubView<[0, 0], [2, 1]>] return %cst : tensor<2x1xf32> } // CHECK: @main() -> tensor<2x1xf32> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.SubView<[0, 0], [2, 1]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.SubView<[0, 0], [2, 1]>] // CHECK: return [[CST]] } @@ -125,7 +125,7 @@ module @SubView { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ 
-141,13 +141,13 @@ module @LayoutCast { } func.func @main() -> tensor<4x1xf32, {order = #CN}> { - %cst = const.Declare tensor<4x1xf32, {order = #CN}> = dense_resource : tensor<4x1xf32>, + %cst = const.Declare tensor<4x1xf32, {order = #CN}> = dense_resource : tensor<4x1xf32>, [#const.LayoutCast<#CN>] return %cst : tensor<4x1xf32, {order = #CN}> } // CHECK: @main() -> tensor<4x1xf32, {order = [[CN]]}> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.LayoutCast<[[CN]]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.LayoutCast<[[CN]]>] // CHECK: return [[CST]] } @@ -156,7 +156,7 @@ module @LayoutCast { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -174,13 +174,13 @@ module @TrivialMemPermute { } func.func @main() -> tensor<4x1xf32> { - %cst = const.Declare tensor<4x1xf32> = dense_resource : tensor<4x1xf32, {order = #CN}>, + %cst = const.Declare tensor<4x1xf32> = dense_resource : tensor<4x1xf32, {order = #CN}>, [#const.MemPermute<#NC, #CN>] return %cst : tensor<4x1xf32> } // CHECK: @main() -> tensor<4x1xf32> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.MemPermute<[[NC]], [[CN]]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.MemPermute<[[NC]], [[CN]]>] // CHECK: return [[CST]] } @@ -189,7 +189,7 @@ module @TrivialMemPermute { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -206,13 +206,13 @@ module @TrivialTranspose { } func.func @main() -> tensor<1x4xf32> { - %cst = const.Declare tensor<1x4xf32> = dense_resource : tensor<4x1xf32>, + %cst = const.Declare tensor<1x4xf32> = dense_resource : tensor<4x1xf32>, [#const.Transpose<#swap>] return %cst : tensor<1x4xf32> } // CHECK: @main() -> tensor<1x4xf32> - // CHECK: [[CST:%.+]] = 
const.Declare {{.*}} dense_resource {{.*}} [#const.Transpose<[[swap]]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Transpose<[[swap]]>] // CHECK: return [[CST]] } @@ -221,7 +221,7 @@ module @TrivialTranspose { {-# dialect_resources: { builtin: { - ov_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" + vpux_ow_1: "0x000000040011223300aabbcc00aabbcc00aabbcc" } } #-} @@ -237,13 +237,13 @@ module @TrivialReorder { } func.func @main() -> tensor<4x1xf32, {order = #CN}> { - %cst = const.Declare tensor<4x1xf32, {order = #CN}> = dense_resource : tensor<4x1xf32>, + %cst = const.Declare tensor<4x1xf32, {order = #CN}> = dense_resource : tensor<4x1xf32>, [#const.Reorder<#CN>] return %cst : tensor<4x1xf32, {order = #CN}> } // CHECK: @main() -> tensor<4x1xf32, {order = [[CN]]}> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Reorder<[[CN]]>] + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource {{.*}} [#const.Reorder<[[CN]]>] // CHECK: return [[CST]] } @@ -252,7 +252,7 @@ module @TrivialReorder { {-# dialect_resources: { builtin: { - ov_1: "0x00000004aabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccd6" + vpux_ow_1: "0x00000004aabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccddaabbccd6" } } #-} @@ -267,13 +267,13 @@ module @AffineReshape { } func.func @main() -> tensor<1x1x3x3xf32, {order = #NCWH}> { - %cst = const.Declare tensor<1x1x3x3xf32, {order = #NCWH}> = dense_resource : tensor<1x1x3x3xf32>, + %cst = const.Declare tensor<1x1x3x3xf32, {order = #NCWH}> = dense_resource : tensor<1x1x3x3xf32>, [#const.AffineReshape<[[0], [1], [3], [2]], [1, 1, 3, 3]>] return %cst : tensor<1x1x3x3xf32, {order = #NCWH}> } // CHECK: func.func @main() -> tensor<1x1x3x3xf32, {order = #NCWH}> - // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource + // CHECK: [[CST:%.+]] = const.Declare {{.*}} dense_resource // CHECK-LITERAL: [#const.AffineReshape<[[0], [1], [3], [2]], [1, 1, 3, 3]>] // CHECK: 
return [[CST]] } diff --git a/tests/lit/NPU/dialect/VPU/passes/lower_experimental_ops_to_se_nce_37XX_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/lower_experimental_ops_to_se_nce_37XX_40XX.mlir index 20f03601ca..c95369dfbe 100644 --- a/tests/lit/NPU/dialect/VPU/passes/lower_experimental_ops_to_se_nce_37XX_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/lower_experimental_ops_to_se_nce_37XX_40XX.mlir @@ -182,3 +182,102 @@ func.func @DilatedGroupConvToSeNCE(%arg0: tensor<1x960x65x65xf16, {order = #NHWC // CHECK: return [[CONCAT]] : tensor<1x960x65x65xf16> } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemTypeIn = !quant.uniform +!qElemTypeWeights = !quant.uniform +!qElemTypeOut = !quant.uniform + +// CHECK-DAG: [[QELEMTYPE_IN:!.+]] = !quant.uniform +// CHECK-DAG: [[QELEMTYPE_W:!.+]] = !quant.uniform +// CHECK-DAG: [[QELEMTYPE_OUT:!.+]] = !quant.uniform + +// CHECK: func.func @QuantizedDilatedGroupConvToSeNCE +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x960x65x65x[[QELEMTYPE_IN]], {order = #NHWC}>) +func.func @QuantizedDilatedGroupConvToSeNCE(%arg0: tensor<1x960x65x65x!qElemTypeIn, {order = #NHWC}>) -> tensor<1x960x65x65x!qElemTypeOut> { + %cst = const.Declare tensor<960x1x3x3x!qElemTypeWeights, {order = #NHWC}> = dense<1.0> : + tensor<960x1x1x3x3xf32>, [#const.Reshape<[960, 1, 3, 3]>, #const.ConvertElemType, + #const.Reorder<#NHWC>] + + %0 = VPU.GroupConvolution(%arg0, %cst) {dilations = [2, 2], groups = 960 : i64, pads_begin = [2, 2], pads_end = [2, 2], + strides = [1, 1]} : tensor<1x960x65x65x!qElemTypeIn, {order = #NHWC}>, + tensor<960x1x3x3x!qElemTypeWeights, {order = #NHWC}> -> tensor<1x960x65x65x!qElemTypeOut> + return %0 : tensor<1x960x65x65x!qElemTypeOut> + + // CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<960x16x1x1x[[QELEMTYPE_W]], {order = #NHWC}> + // CHECK: [[WEIGHTTABLE:%.+]] = const.Declare tensor<960x1x1x4xsi32> + + // CHECK: [[SET1:%.+]] = VPU.StorageElementTable {dataElemType = [[QELEMTYPE_IN]], dataShape = [1, 960, 65, 
65], + // CHECK-SAME: seAttr = #VPU.SEDilatedConv, seDepth = 60 : i64, seSize = [{{(16, ){59}16}}] + // CHECK-SAME: -> tensor<1x60x33x33xi32, {order = #NHWC}> + // CHECK: [[SM1:%.+]] = const.Declare tensor<1x960x33x33xi1, {order = #NHWC}> = dense<1> : tensor<1x960x33x33xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] + // CHECK: [[SPARSETENSOR1:%.+]] = VPU.GroupSparseTensor([[ARG0]], [[SM1]], [[SET1]]) {seAttr = #VPU.SEDilatedConv} -> + // CHECK-SAME: !VPU.SparseTensor, + // CHECK-SAME: sparsity_map=tensor<1x960x33x33xi1, {order = #NHWC}>, + // CHECK-SAME: storage_element_table=tensor<1x60x33x33xi32, {order = #NHWC}>, + // CHECK-SAME: #VPU.SEDilatedConv> + + // CHECK: [[DEPTHCONV1:%.+]] = VPU.NCE.DepthConvolution([[SPARSETENSOR1]], [[WEIGHTS]], [[WEIGHTTABLE:%.+]]) + // CHECK-SAME: {pad = #VPU.Padding, + // CHECK-SAME: rawFilterShape = [960, 1, 3, 3], strides = [1, 1]} -> tensor<1x960x33x33x[[QELEMTYPE_OUT]]> + + // CHECK: [[SET2:%.+]] = VPU.StorageElementTable {dataElemType = [[QELEMTYPE_IN]], dataShape = [1, 960, 65, 65], + // CHECK-SAME: seAttr = #VPU.SEDilatedConv, seDepth = 60 : i64, seSize = [{{(16, ){59}16}}] + // CHECK-SAME: -> tensor<1x60x33x32xi32, {order = #NHWC}> + // CHECK: [[SM2:%.+]] = const.Declare tensor<1x960x33x32xi1, {order = #NHWC}> = dense<1> : tensor<1x960x33x32xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] + // CHECK: [[SPARSETENSOR2:%.+]] = VPU.GroupSparseTensor([[ARG0]], [[SM2]], [[SET2]]) {seAttr = #VPU.SEDilatedConv} -> + // CHECK-SAME: !VPU.SparseTensor, + // CHECK-SAME: sparsity_map=tensor<1x960x33x32xi1, {order = #NHWC}>, + // CHECK-SAME: storage_element_table=tensor<1x60x33x32xi32, {order = #NHWC}>, + // CHECK-SAME: #VPU.SEDilatedConv> + + // CHECK: [[DEPTHCONV2:%.+]] = VPU.NCE.DepthConvolution([[SPARSETENSOR2]], [[WEIGHTS]], [[WEIGHTTABLE:%.+]]) + // CHECK-SAME: {pad = #VPU.Padding, + // CHECK-SAME: rawFilterShape = [960, 1, 3, 3], strides = [1, 1]} -> tensor<1x960x33x32x[[QELEMTYPE_OUT]]> + + // CHECK: [[SET3:%.+]] = 
VPU.StorageElementTable {dataElemType = [[QELEMTYPE_IN]], dataShape = [1, 960, 65, 65], + // CHECK-SAME: seAttr = #VPU.SEDilatedConv, seDepth = 60 : i64, seSize = [{{(16, ){59}16}}] + // CHECK-SAME: -> tensor<1x60x32x33xi32, {order = #NHWC}> + // CHECK: [[SM3:%.+]] = const.Declare tensor<1x960x32x33xi1, {order = #NHWC}> = dense<1> : tensor<1x960x32x33xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] + // CHECK: [[SPARSETENSOR3:%.+]] = VPU.GroupSparseTensor([[ARG0]], [[SM3]], [[SET3]]) {seAttr = #VPU.SEDilatedConv} -> + // CHECK-SAME: !VPU.SparseTensor, + // CHECK-SAME: sparsity_map=tensor<1x960x32x33xi1, {order = #NHWC}>, + // CHECK-SAME: storage_element_table=tensor<1x60x32x33xi32, {order = #NHWC}>, + // CHECK-SAME: #VPU.SEDilatedConv> + + // CHECK: [[DEPTHCONV3:%.+]] = VPU.NCE.DepthConvolution([[SPARSETENSOR3]], [[WEIGHTS]], [[WEIGHTTABLE:%.+]]) + // CHECK-SAME: {pad = #VPU.Padding, + // CHECK-SAME: rawFilterShape = [960, 1, 3, 3], strides = [1, 1]} -> tensor<1x960x32x33x[[QELEMTYPE_OUT]]> + + // CHECK: [[SET4:%.+]] = VPU.StorageElementTable {dataElemType = [[QELEMTYPE_IN]], dataShape = [1, 960, 65, 65], + // CHECK-SAME: seAttr = #VPU.SEDilatedConv, seDepth = 60 : i64, seSize = [{{(16, ){59}16}}] + // CHECK-SAME: -> tensor<1x60x32x32xi32, {order = #NHWC}> + // CHECK: [[SM4:%.+]] = const.Declare tensor<1x960x32x32xi1, {order = #NHWC}> = dense<1> : tensor<1x960x32x32xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] + // CHECK: [[SPARSETENSOR4:%.+]] = VPU.GroupSparseTensor([[ARG0]], [[SM4]], [[SET4]]) {seAttr = #VPU.SEDilatedConv} -> + // CHECK-SAME: !VPU.SparseTensor, + // CHECK-SAME: sparsity_map=tensor<1x960x32x32xi1, {order = #NHWC}>, + // CHECK-SAME: storage_element_table=tensor<1x60x32x32xi32, {order = #NHWC}>, + // CHECK-SAME: #VPU.SEDilatedConv> + + // CHECK: [[DEPTHCONV4:%.+]] = VPU.NCE.DepthConvolution([[SPARSETENSOR4]], [[WEIGHTS]], [[WEIGHTTABLE:%.+]]) + // CHECK-SAME: {pad = #VPU.Padding, + // CHECK-SAME: rawFilterShape = [960, 1, 3, 3], strides = [1, 
1]} -> tensor<1x960x32x32x[[QELEMTYPE_OUT]]> + + // CHECK: [[CONCAT:%.+]] = VPU.Concat([[DEPTHCONV1]], [[DEPTHCONV2]], [[DEPTHCONV3]], [[DEPTHCONV4]]) + // CHECK-SAME{LITERAL}: static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 1]] + // CHECK-SAME{LITERAL}: strides = [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]] + // CHECK-SAME: tensor<1x960x33x33x[[QELEMTYPE_OUT]]>, tensor<1x960x33x32x[[QELEMTYPE_OUT]]>, tensor<1x960x32x33x[[QELEMTYPE_OUT]]>, tensor<1x960x32x32x[[QELEMTYPE_OUT]]> -> tensor<1x960x65x65x[[QELEMTYPE_OUT]]> + + // CHECK: return [[CONCAT]] : tensor<1x960x65x65x[[QELEMTYPE_OUT]]> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies.mlir b/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies.mlir index 76b22e6a94..dbe29d989d 100644 --- a/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies.mlir @@ -14,13 +14,13 @@ // CHECK-SAME: ([[ARG0:%.+]]: tensor<1x3x112x112xf16> func.func @UnrolledTypeSimpleConversion(%arg0: tensor<1x3x112x112xf16>) -> tensor<1x4x112x112x!qElemType, {order = #NHWC}> { %0 = VPU.UnrolledType(%arg0 : tensor<1x3x112x112xf16>) -> !VPU.DistributedTensor<1x3x112x112xf16, #NCHW, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64}> - %1 = VPU.NCE.Permute(%0) {dstElemType = !qElemType, dstOrder = #NHWC, expandedChannels = 4 : i64, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 49.895641326904297 : f64>} -> !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}> + %1 = VPU.NCE.Permute(%0) {dstElemType = !qElemType, dstOrder = #NHWC, expandedChannels = 4 : i64, ppe = #VPU.PPEInt, clamp_low = 0 : 
i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 49.895641326904297 : f64>} -> !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}> %2 = VPU.UnrolledType(%1 : !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}>) -> tensor<1x4x112x112x!qElemType, {order = #NHWC}> - + return %2 : tensor<1x4x112x112x!qElemType, {order = #NHWC}> //CHECK: [[COPY_0:%.+]] = VPU.Copy([[ARG0]]) {out_mem_space = @CMX_NN} : tensor<1x3x112x112xf16> -> !VPU.DistributedTensor<1x3x112x112xf16, #NCHW, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64}> - //CHECK: [[PERMUTE:%.+]] = VPU.NCE.Permute([[COPY_0]]) {dstElemType = !qElemType, dstOrder = #NHWC, expandedChannels = 4 : i64, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 49.895641326904297 : f64>} -> !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}> + //CHECK: [[PERMUTE:%.+]] = VPU.NCE.Permute([[COPY_0]]) {dstElemType = !qElemType, dstOrder = #NHWC, expandedChannels = 4 : i64, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 49.895641326904297 : f64>} -> !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, 
equal_memory_and_compute_view}> //CHECK: [[COPY_1:%.+]] = VPU.Copy([[PERMUTE]]) : !VPU.DistributedTensor<1x4x112x112x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [3, 3], pads = #VPU.Padding, strides = [2, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}> -> tensor<1x4x112x112x!qElemType, {order = #NHWC}> //CHECK: return [[COPY_1]] : tensor<1x4x112x112x!qElemType, {order = #NHWC}> } @@ -36,7 +36,7 @@ func.func @UnrolledTypeSimpleConversion(%arg0: tensor<1x3x112x112xf16>) -> tenso func.func @DeleteUnrolledType(%arg0: tensor<1x3x112x112xf16>) -> tensor<1x4x112x112x!qElemType, {order = #NHWC}> { %0 = VPU.UnrolledType(%arg0 : tensor<1x3x112x112xf16>) -> tensor<1x3x112x112xf16> %1 = VPU.NCE.Permute(%0) {dstElemType = !qElemType, dstOrder = #NHWC, expandedChannels = 4 : i64, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 49.895641326904297 : f64>} -> tensor<1x4x112x112x!qElemType, {order = #NHWC}> - + return %1 : tensor<1x4x112x112x!qElemType, {order = #NHWC}> //CHECK-NOT: VPU.UnrolledType diff --git a/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies_skip_main_func.mlir b/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies_skip_main_func.mlir new file mode 100644 index 0000000000..87907527f2 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/make_distributed_copies_skip_main_func.mlir @@ -0,0 +1,68 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=HostCompile allow-custom-values=true" --make-distributed-copies %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +// CHECK-LABEL: @ReplaceUnrolledTypeWithCopyOps +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#map = affine_map<(d0)[s0] -> (-d0 + s0, 90)> +module @ReplaceUnrolledTypeWithCopyOps { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x1000xf16> + DataInfo "input2" : tensor<1x16x?x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x1000xf16> + } + func.func @main_func0_static(%arg0: tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}>, %arg1: tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}>) -> tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> { + %0 = VPU.UnrolledType(%arg0 : tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}>) -> !VPU.DistributedTensor<1x16x90x1000xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, compute_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]], memory_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], memory_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]]}> + %1 = VPU.UnrolledType(%arg1 : tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}>) -> !VPU.DistributedTensor<1x16x90x1000xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, 
@CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, compute_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]], memory_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], memory_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]]}> + %2 = VPU.NCE.Eltwise(%0, %1) {op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> !VPU.DistributedTensor<1x16x90x1000xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, compute_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]], memory_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], memory_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]]}> + %3 = VPU.UnrolledType(%2 : !VPU.DistributedTensor<1x16x90x1000xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, compute_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 
0, 75, 0]], memory_shapes = [[1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000], [1, 16, 15, 1000]], memory_offsets = [[0, 0, 0, 0], [0, 0, 15, 0], [0, 0, 30, 0], [0, 0, 45, 0], [0, 0, 60, 0], [0, 0, 75, 0]]}>) -> tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> + return %3 : tensor<1x16x90x1000xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> + } + func.func @main(%arg0: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> { + %c90 = arith.constant 90 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %dim = tensor.dim %arg0, %c2 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %0 = tensor.empty(%dim) : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %dim_0 = tensor.dim %arg0, %c2 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + %1 = scf.for %arg2 = %c0 to %dim_0 step %c90 iter_args(%arg3 = %0) -> (tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) { + %2 = affine.min #map(%arg2)[%dim_0] + %3 = arith.cmpi ne, %2, %c90 : index + %4 = scf.if %3 -> (index) { + %6 = arith.subi %c90, %2 : index + %7 = arith.cmpi slt, %arg2, %6 : index + cf.assert %7, "Not enough elements to backtrack in scf.for loop" + %8 = arith.subi %arg2, %6 : index + scf.yield %8 : index + } else { + scf.yield %arg2 : index + } + %extracted_slice = tensor.extract_slice %arg0[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : 
tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %cast = tensor.cast %extracted_slice : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x90x1000xf16, {order = #NHWC}> + %extracted_slice_1 = tensor.extract_slice %arg1[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %cast_2 = tensor.cast %extracted_slice_1 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x90x1000xf16, {order = #NHWC}> + %5 = func.call @main_func0_static(%cast, %cast_2) : (tensor<1x16x90x1000xf16, {order = #NHWC}>, tensor<1x16x90x1000xf16, {order = #NHWC}>) -> tensor<1x16x90x1000xf16, {order = #NHWC}> + %cast_3 = tensor.cast %5 : tensor<1x16x90x1000xf16, {order = #NHWC}> to tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> + %inserted_slice = tensor.insert_slice %cast_3 into %arg3[0, 0, %4, 0] [1, 16, %c90, 1000] [1, 1, 1, 1] : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 90, 1000]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + scf.yield %inserted_slice : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + return %1 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + + // CHECK: func.func [[STATIC_FUNC:@.+]]([[ARG0:%.+]]: 
tensor<1x16x90x1000xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x16x90x1000xf16, {order = #NHWC}>) -> tensor<1x16x90x1000xf16, {order = #NHWC}> { + // CHECK: [[INPUT0:%.+]] = VPU.Copy([[ARG0]]) + // CHECK: [[INPUT1:%.+]] = VPU.Copy([[ARG1]]) + // CHECK: [[ELTWISE_OUTPUT:%.+]] = VPU.NCE.Eltwise([[INPUT0]], [[INPUT1]]) + // CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ELTWISE_OUTPUT]]) + // CHECK: return [[OUTPUT]] + // CHECK: } + + // CHECK: func.func [[MAIN_FUNC:@.+]]([[_:%.+]]: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, [[_:%.+]]: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> +} + + diff --git a/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_37XX.mlir b/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_37XX.mlir index decf376173..61fe253e31 100644 --- a/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_37XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_37XX.mlir @@ -1755,8 +1755,8 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x16x2x2xi1> = dense : tensor<1x16x2x2xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 1, 1], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x16x2x2xi1> = dense : tensor<1x16x2x2xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, 
dataShape = [1, 16, 1, 1], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 16, 2, 2]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [16]} @@ -1829,8 +1829,8 @@ func.func @NCEInterpolateToDistributedOpSOH(%arg0: tensor<1x64x5x10xf16, {order // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x64x1x1xf16>, [#const.Reorder<#NHWC>] // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x64x10x20xi1> = dense : tensor<1x64x10x20xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x64x10x20xi1> = dense : tensor<1x64x10x20xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 64, 10, 20]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [64]} @@ -1904,8 +1904,8 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x64x1x1xf16>, [#const.Reorder<#NHWC>] // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x64x10x20xi1> = dense : tensor<1x64x10x20xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare 
tensor<1x64x10x20xi1> = dense : tensor<1x64x10x20xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 64, 10, 20]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [64]} @@ -1981,7 +1981,7 @@ func.func @TopKSWTilingSOH(%arg0: tensor<1x31x103x513xf16, {order = #NHWC}>) -> //CHECK: [[OUTPUT:%.+]], [[TARGET:%.+]] = VPU.TopK([[INPUT]], [[AUX_BUFFER]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x1x103x513xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, - //CHECK-SMAE: !VPU.DistributedTensor<1x1x103x513xsi32, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + //CHECK-SAME: !VPU.DistributedTensor<1x1x103x513xsi32, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> //CHECK: [[OUTPUT_VALUES:%.+]] = VPU.Copy([[OUTPUT]] //CHECK-SAME: -> tensor<1x1x103x513xf16, {order = #NHWC}> @@ -2014,7 +2014,7 @@ func.func @TopKSWTilingSOK(%arg0: tensor<1x103x513x31xf16, {order = #NHWC}>) -> //CHECK: [[OUTPUT:%.+]], [[TARGET:%.+]] = VPU.TopK([[INPUT]], [[AUX_BUFFER]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x103x513x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>, - //CHECK-SMAE: !VPU.DistributedTensor<1x103x513x1xsi32, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> + //CHECK-SAME: !VPU.DistributedTensor<1x103x513x1xsi32, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> //CHECK: [[OUTPUT_VALUES:%.+]] = VPU.Copy([[OUTPUT]] //CHECK-SAME: -> tensor<1x103x513x1xf16, {order = #NHWC}> @@ -4112,3 +4112,28 @@ func.func @LogicalNotSWWithClustering(%arg0: tensor<1x1x1x44xf16>) -> tensor<1x1 // 
CHECK: [[OUT:%.+]] = VPU.Copy([[LOGICALNOT]] // CHECK: return [[OUT]] : tensor<1x1x1x44xf16> } + +// ----- + +// CHECK-LABEL: @GatherDMA +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1x128256x2048xf16>, +// CHECK-SAME: [[INDICES:%.+]]: tensor<1x1x1024x1xi64> +func.func @GatherDMA(%input: tensor<1x1x128256x2048xf16>, %indices: tensor<1x1x1024x1xi64>) -> tensor<1x1x1024x2048xf16> { + + %gatherDMA = VPU.GatherDMA(%input, %indices) {axis_value = 2 : i64, batch_dims = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : + tensor<1x1x128256x2048xf16>, tensor<1x1x1024x1xi64> -> tensor<1x1x1024x2048xf16> + return %gatherDMA : tensor<1x1x1024x2048xf16> + + // CHECK: [[INDICES_COPY:%.+]] = VPU.Copy([[INDICES]]) {out_mem_space = @CMX_NN} : tensor<1x1x1024x1xi64> + // CHECK-SAME: -> !VPU.DistributedTensor<1x1x1024x1xi64, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + + // CHECK: [[GATHER_DMA:%.+]] = VPU.GatherDMA([[INPUT]], [[INDICES_COPY]]) {axis_value = 2 : i64, batch_dims = 1 : i64} : + // CHECK-SAME: tensor<1x1x128256x2048xf16>, !VPU.DistributedTensor<1x1x1024x1xi64, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-SAME: -> !VPU.DistributedTensor<1x1x1024x2048xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 2], num_clusters = 2 : i64}> + + // CHECK: [[OUT:%.+]] = VPU.Copy([[GATHER_DMA]]) : + // CHECK-SAME: !VPU.DistributedTensor<1x1x1024x2048xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 2], num_clusters = 2 : i64}> + // CHECK-SAME: -> tensor<1x1x1024x2048xf16> + + // CHECK: return [[OUT]] : tensor<1x1x1024x2048xf16> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_40XX+.mlir index b06a00d8ae..49a5d6b28a 100644 --- a/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_40XX+.mlir +++ 
b/tests/lit/NPU/dialect/VPU/passes/make_ops_with_distributed_tensors_and_copies_40XX+.mlir @@ -95,6 +95,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConvToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x64x28x28xf16, {order = #NHWC}> func.func @ConvToDistributedOpSOHOverlapped(%arg0: tensor<1x64x28x28xf16, {order = #NHWC}>) -> tensor<1x80x28x28xf16, {order = #NHWC}> { %cst = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> %cst_0 = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<80x64x3x3xf16>, [#const.Reorder<#NHWC>] @@ -107,10 +108,10 @@ func.func @ConvToDistributedOpSOHOverlapped(%arg0: tensor<1x64x28x28xf16, {order : tensor<1x64x28x28xf16, {order = #NHWC}>, tensor<80x64x3x3xf16, {order = #NHWC}>, tensor<80x1x1x4xsi32> -> tensor<1x80x28x28xf16, {order = #NHWC}> return %0 : tensor<1x80x28x28xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<80x64x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<80x64x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x64x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 4, 28], [1, 64, 4, 28]] @@ -118,21 +119,21 @@ func.func @ConvToDistributedOpSOHOverlapped(%arg0: tensor<1x64x28x28xf16, {order 
//CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 6, 28], [1, 64, 7, 28], [1, 64, 7, 28], [1, 64, 7, 28], [1, 64, 6, 28], [1, 64, 5, 28]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]]} - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<80x64x3x3xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<80x1x1x4xsi32, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4]] //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] //CHECK-SAME{LITERAL}: memory_shapes = [[80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_CMX]], 
//CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -143,7 +144,7 @@ func.func @ConvToDistributedOpSOHOverlapped(%arg0: tensor<1x64x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 80, 5, 28], [1, 80, 5, 28], [1, 80, 5, 28], [1, 80, 5, 28], [1, 80, 4, 28], [1, 80, 4, 28]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 24, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x80x28x28xf16, {order = #NHWC}> } @@ -157,7 +158,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConvToDistributedOpHKSwitch -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x64x28x28xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x64x28x28xf16, {order = #NHWC}>) func.func @ConvToDistributedOpHKSwitch(%arg0: tensor<1x64x28x28xf16, {order = #NHWC}>) -> tensor<1x80x28x28xf16, {order = #NHWC}> { %cst = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> %cst_0 = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<80x64x3x3xf16>, [#const.Reorder<#NHWC>] @@ -170,10 +171,10 @@ func.func @ConvToDistributedOpHKSwitch(%arg0: tensor<1x64x28x28xf16, {order = #N : tensor<1x64x28x28xf16, {order = #NHWC}>, tensor<80x64x3x3xf16, {order = #NHWC}>, tensor<80x1x1x4xsi32> -> tensor<1x80x28x28xf16, {order = #NHWC}> return %0 : tensor<1x80x28x28xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<80x1x1x4xsi32> = dense<10> : tensor<80x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<80x64x3x3xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]]) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]]) //CHECK-SAME: -> 
!VPU.DistributedTensor<1x64x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 5, 28], [1, 64, 4, 28], [1, 64, 4, 28]] @@ -181,21 +182,21 @@ func.func @ConvToDistributedOpHKSwitch(%arg0: tensor<1x64x28x28xf16, {order = #N //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 6, 28], [1, 64, 7, 28], [1, 64, 7, 28], [1, 64, 7, 28], [1, 64, 6, 28], [1, 64, 5, 28]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]]} - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<80x64x3x3xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3], [80, 64, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<80x1x1x4xsi32, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4]] //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] 
//CHECK-SAME{LITERAL}: memory_shapes = [[80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4], [80, 1, 1, 4]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -206,7 +207,7 @@ func.func @ConvToDistributedOpHKSwitch(%arg0: tensor<1x64x28x28xf16, {order = #N //CHECK-SAME{LITERAL}: memory_shapes = [[1, 80, 28, 28], [1, 80, 28, 28], [1, 80, 28, 28], [1, 80, 28, 28], [1, 80, 28, 28], [1, 80, 28, 28]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x80x28x28xf16, {order = #NHWC}> } @@ -221,16 +222,17 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConvToDistributedOpSOK +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ConvToDistributedOpSOK(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> tensor<1x96x28x28xf16, {order = #NHWC}> { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> %cst_0 = const.Declare tensor<96x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 128, 1, 1], strides = [1, 1]} : tensor<1x128x28x28xf16, {order = #NHWC}>, tensor<96x128x1x1xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x28x28xf16, {order = #NHWC}> return %0 : tensor<1x96x28x28xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : 
tensor<96x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<96x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<96x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]] @@ -238,7 +240,7 @@ func.func @ConvToDistributedOpSOK(%arg0: tensor<1x128x28x28xf16, {order = #NHWC} //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x128x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1]], @@ -246,7 +248,7 @@ func.func @ConvToDistributedOpSOK(%arg0: tensor<1x128x28x28xf16, {order = #NHWC} //CHECK-SAME{LITERAL}: memory_shapes = [[16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 
0], [32, 0, 0, 0], [48, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -254,7 +256,7 @@ func.func @ConvToDistributedOpSOK(%arg0: tensor<1x128x28x28xf16, {order = #NHWC} //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -265,7 +267,7 @@ func.func @ConvToDistributedOpSOK(%arg0: tensor<1x128x28x28xf16, {order = #NHWC} //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x96x28x28xf16, {order = #NHWC}> } @@ -279,16 +281,17 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConvToDistributedOpSOK4Clusters +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ConvToDistributedOpSOK4Clusters(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> tensor<1x64x28x28xf16, {order = #NHWC}> { %cst = const.Declare 
tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> %cst_0 = const.Declare tensor<64x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x128x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [64, 128, 1, 1], strides = [1, 1]} : tensor<1x128x28x28xf16, {order = #NHWC}>, tensor<64x128x1x1xf16, {order = #NHWC}>, tensor<64x1x1x4xsi32> -> tensor<1x64x28x28xf16, {order = #NHWC}> return %0 : tensor<1x64x28x28xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<64x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x128x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<64x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x128x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 4 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]], @@ -296,7 +299,7 @@ func.func @ConvToDistributedOpSOK4Clusters(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<64x128x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = 
"SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1]], @@ -304,7 +307,7 @@ func.func @ConvToDistributedOpSOK4Clusters(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1], [16, 128, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<64x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -312,7 +315,7 @@ func.func @ConvToDistributedOpSOK4Clusters(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -323,7 +326,7 @@ func.func @ConvToDistributedOpSOK4Clusters(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 28, 28], [1, 64, 28, 28], [1, 64, 28, 28], [1, 64, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x64x28x28xf16, {order = #NHWC}> } @@ -345,10 +348,10 @@ func.func 
@ConvToDistributedOpSOB3Batches(%arg0: tensor<3x1024x14x14xf16, {order %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [256, 1024, 1, 1], strides = [1, 1]} : tensor<3x1024x14x14xf16, {order = #NHWC}>, tensor<256x1024x1x1xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32> -> tensor<3x256x14x14xf16, {order = #NHWC}> return %0 : tensor<3x256x14x14xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<256x1024x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x1024x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<256x1024x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x1024x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT]]) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<3x1024x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [3, 1, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14]], @@ -356,7 +359,7 @@ func.func @ConvToDistributedOpSOB3Batches(%arg0: tensor<3x1024x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<256x1024x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 3 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[256, 
1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1]], @@ -364,7 +367,7 @@ func.func @ConvToDistributedOpSOB3Batches(%arg0: tensor<3x1024x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<256x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 3 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4]], @@ -372,7 +375,7 @@ func.func @ConvToDistributedOpSOB3Batches(%arg0: tensor<3x1024x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -383,7 +386,7 @@ func.func @ConvToDistributedOpSOB3Batches(%arg0: tensor<3x1024x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 256, 14, 14], [1, 256, 14, 14], [1, 256, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<3x256x14x14xf16, {order = #NHWC}> } @@ -405,10 +408,10 @@ func.func @ConvToDistributedOpSOB(%arg0: tensor<6x1024x14x14xf16, {order = #NHWC %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [256, 1024, 1, 1], strides = [1, 1]} : tensor<6x1024x14x14xf16, {order = #NHWC}>, tensor<256x1024x1x1xf16, {order = 
#NHWC}>, tensor<256x1x1x4xsi32> -> tensor<6x256x14x14xf16, {order = #NHWC}> return %0 : tensor<6x256x14x14xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<256x1024x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x1024x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<256x1024x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x1024x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<6x1024x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14]], @@ -416,7 +419,7 @@ func.func @ConvToDistributedOpSOB(%arg0: tensor<6x1024x14x14xf16, {order = #NHWC //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14], [1, 1024, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0], [4, 0, 0, 0], [5, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<256x1024x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1]], @@ -424,7 +427,7 @@ func.func @ConvToDistributedOpSOB(%arg0: 
tensor<6x1024x14x14xf16, {order = #NHWC //CHECK-SAME{LITERAL}: memory_shapes = [[256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1], [256, 1024, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<256x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4]], @@ -432,7 +435,7 @@ func.func @ConvToDistributedOpSOB(%arg0: tensor<6x1024x14x14xf16, {order = #NHWC //CHECK-SAME{LITERAL}: memory_shapes = [[256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4], [256, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -443,7 +446,7 @@ func.func @ConvToDistributedOpSOB(%arg0: tensor<6x1024x14x14xf16, {order = #NHWC //CHECK-SAME{LITERAL}: memory_shapes = [[1, 256, 14, 14], [1, 256, 14, 14], [1, 256, 14, 14], [1, 256, 14, 14], [1, 256, 14, 14], [1, 256, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0], [4, 0, 0, 0], [5, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<6x256x14x14xf16, {order = #NHWC}> } @@ -458,16 +461,17 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConvToDistributedOpClustering +// CHECK-SAME: 
[[INPUT:%.+]]: tensor<1x64x14x14xf16, {order = #NHWC}> func.func @ConvToDistributedOpClustering(%arg0: tensor<1x64x14x14xf16, {order = #NHWC}>) -> tensor<1x48x14x14xf16, {order = #NHWC}> { %cst = const.Declare tensor<48x1x1x4xsi32> = dense<10> : tensor<48x1x1x4xsi32> %cst_0 = const.Declare tensor<48x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<48x64x3x3xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [48, 64, 3, 3], strides = [1, 1]} : tensor<1x64x14x14xf16, {order = #NHWC}>, tensor<48x64x3x3xf16, {order = #NHWC}>, tensor<48x1x1x4xsi32> -> tensor<1x48x14x14xf16, {order = #NHWC}> return %0 : tensor<1x48x14x14xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<48x1x1x4xsi32> = dense<10> : tensor<48x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<48x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<48x64x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<48x1x1x4xsi32> = dense<10> : tensor<48x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<48x64x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<48x64x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x64x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14]], @@ -475,7 +479,7 @@ func.func @ConvToDistributedOpClustering(%arg0: tensor<1x64x14x14xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14], [1, 64, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 
0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<48x64x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3]], @@ -483,7 +487,7 @@ func.func @ConvToDistributedOpClustering(%arg0: tensor<1x64x14x14xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3], [48, 64, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<48x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4]], @@ -491,7 +495,7 @@ func.func @ConvToDistributedOpClustering(%arg0: tensor<1x64x14x14xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -502,7 +506,7 @@ func.func @ConvToDistributedOpClustering(%arg0: tensor<1x64x14x14xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 14, 14], [1, 48, 14, 14], [1, 48, 
14, 14], [1, 48, 14, 14], [1, 48, 14, 14], [1, 48, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x48x14x14xf16, {order = #NHWC}> } @@ -517,17 +521,18 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DepthConvToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}> func.func @DepthConvToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %cst_0 = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> %cst_1 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.DepthConvolution(%arg0, %cst_1, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [32, 1, 3, 3], strides = [1, 1]} -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> //CHECK-SAME: = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: 
compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -535,7 +540,7 @@ func.func @DepthConvToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 20, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 20, 112], [1, 32, 19, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 37, 0], [0, 0, 56, 0], [0, 0, 75, 0], [0, 0, 93, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<32x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], @@ -543,7 +548,7 @@ func.func @DepthConvToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, //CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], @@ -551,7 +556,7 @@ func.func @DepthConvToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, //CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], 
[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.DepthConvolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.DepthConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -562,7 +567,7 @@ func.func @DepthConvToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -578,7 +583,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DepthConvToDistributedOpHKSwitch -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x112x112xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}>) func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %cst_0 = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> %cst_1 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] @@ -591,11 +596,11 @@ func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {ord -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare 
tensor<32x16x1x1xf16, {order = #NHWC}> //CHECK-SAME: = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -603,7 +608,7 @@ func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 20, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 20, 112], [1, 32, 19, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 37, 0], [0, 0, 56, 0], [0, 0, 75, 0], [0, 0, 93, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<32x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], @@ -611,7 +616,7 @@ func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments 
//CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], @@ -619,7 +624,7 @@ func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.DepthConvolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.DepthConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -630,7 +635,7 @@ func.func @DepthConvToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -646,18 +651,18 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DepthConvToDistributedOpSOHOverlappedNoAlign -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x14x14xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}>) func.func @DepthConvToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, {order = #NHWC}> { %cst_0 = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> %cst_1 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.DepthConvolution(%arg0, %cst_1, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy, 
ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [32, 1, 3, 3], strides = [1, 1]} -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0 : tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> //CHECK-SAME: = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments, @@ -666,7 +671,7 @@ func.func @DepthConvToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 4, 14], [1, 32, 5, 14], [1, 32, 4, 14], [1, 32, 4, 14], [1, 32, 4, 14], [1, 32, 3, 14]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 5, 0], [0, 0, 7, 0], [0, 0, 9, 0], [0, 0, 11, 0]]}> - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<32x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]] @@ -674,7 +679,7 @@ func.func @DepthConvToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14 //CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]] //CHECK-SAME{LITERAL}: memory_offsets = 
[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]] @@ -682,7 +687,7 @@ func.func @DepthConvToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14 //CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.DepthConvolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.DepthConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -693,7 +698,7 @@ func.func @DepthConvToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]]}> - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x14x14xf16, {order = #NHWC}> } @@ -708,17 +713,18 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DepthConvToDistributedOpSOK +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x56x56xf16, {order = #NHWC}> func.func @DepthConvToDistributedOpSOK(%arg0: tensor<1x128x56x56xf16, {order = #NHWC}>) -> tensor<1x128x56x56xf16, {order = #NHWC}> { %cst_0 = const.Declare tensor<128x1x1x4xsi32> = dense<10> : 
tensor<128x1x1x4xsi32> %cst_1 = const.Declare tensor<128x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<128x16x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.DepthConvolution(%arg0, %cst_1, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [128, 1, 3, 3], strides = [1, 1]} -> tensor<1x128x56x56xf16, {order = #NHWC}> return %0 : tensor<1x128x56x56xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<128x1x1x4xsi32> = dense<10> : tensor<128x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<128x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<128x1x1x4xsi32> = dense<10> : tensor<128x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<128x16x1x1xf16, {order = #NHWC}> //CHECK-SAME: = dense<1.000000e+00> : tensor<128x16x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x56x56xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 56, 56], [1, 32, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56]], @@ -726,7 +732,7 @@ func.func @DepthConvToDistributedOpSOK(%arg0: tensor<1x128x56x56xf16, {order = # //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 56, 56], [1, 32, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56], [1, 16, 56, 56]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 32, 0, 0], [0, 64, 0, 0], [0, 80, 0, 0], [0, 96, 0, 0], [0, 112, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<128x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], 
num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], @@ -734,7 +740,7 @@ func.func @DepthConvToDistributedOpSOK(%arg0: tensor<1x128x56x56xf16, {order = # //CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [32, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0], [96, 0, 0, 0], [112, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<128x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -742,7 +748,7 @@ func.func @DepthConvToDistributedOpSOK(%arg0: tensor<1x128x56x56xf16, {order = # //CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [32, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0], [96, 0, 0, 0], [112, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.DepthConvolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.DepthConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -753,7 +759,7 @@ func.func @DepthConvToDistributedOpSOK(%arg0: tensor<1x128x56x56xf16, {order = # //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 56, 56], [1, 128, 56, 56], [1, 128, 56, 56], [1, 128, 56, 56], [1, 128, 56, 56], [1, 128, 56, 56]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x128x56x56xf16, {order = #NHWC}> } @@ -768,17 +774,18 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DepthConvToDistributedOpClustering +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}> func.func @DepthConvToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, {order = #NHWC}> { %cst_0 = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> %cst_1 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] %0 = VPU.NCE.DepthConvolution(%arg0, %cst_1, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [32, 1, 3, 3], strides = [1, 1]} -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0 : tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<10> : tensor<32x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> //CHECK-SAME: = dense<1.000000e+00> : tensor<32x16x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], @@ -786,7 +793,7 @@ func.func @DepthConvToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {ord 
//CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<32x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], @@ -794,7 +801,7 @@ func.func @DepthConvToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1], [32, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], @@ -802,7 +809,7 @@ func.func @DepthConvToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.DepthConvolution( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.DepthConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]] //CHECK-SAME: 
[[WEIGHTSTABLE_CMX]] @@ -813,7 +820,7 @@ func.func @DepthConvToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {ord //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x14x14xf16, {order = #NHWC}> } @@ -827,6 +834,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MaxPoolToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}> func.func @MaxPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %0 = VPU.NCE.MaxPool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -837,7 +845,7 @@ func.func @MaxPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { } -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -845,7 +853,7 @@ func.func @MaxPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 
94, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.MaxPool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.MaxPool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments @@ -854,7 +862,7 @@ func.func @MaxPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -869,7 +877,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MaxPoolToDistributedOpHKSwitch -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x112x112xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}>) func.func @MaxPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %0 = VPU.NCE.MaxPool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -880,7 +888,7 @@ func.func @MaxPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order } -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 
112]], @@ -888,7 +896,7 @@ func.func @MaxPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.MaxPool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.MaxPool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED|MULTICASTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments @@ -897,7 +905,7 @@ func.func @MaxPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -912,7 +920,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MaxPoolToDistributedOpSOHOverlappedNoAlign -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x14x14xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}>) func.func @MaxPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, {order = #NHWC}> { %0 = VPU.NCE.MaxPool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -923,7 +931,7 @@ func.func @MaxPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf } -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0 : tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = 
VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]] @@ -931,7 +939,7 @@ func.func @MaxPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.MaxPool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.MaxPool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -940,7 +948,7 @@ func.func @MaxPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]] //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy( + //CHECK: [[OUT:%.+]] = VPU.Copy( //CHECK-SAME: [[OUT_CMX]] //CHECK-SAME: -> tensor<1x32x14x14xf16, {order = #NHWC}> @@ -957,6 +965,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MaxPoolToDistributedOpClustering +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}> func.func @MaxPoolToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, {order = #NHWC}> { %0 = VPU.NCE.MaxPool(%arg0) { multiClusterStrategy = 
#VPU.multi_cluster_strategy, @@ -967,7 +976,7 @@ func.func @MaxPoolToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order } -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0 : tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], @@ -975,7 +984,7 @@ func.func @MaxPoolToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.MaxPool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.MaxPool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments @@ -984,7 +993,7 @@ func.func @MaxPoolToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x14x14xf16, {order = #NHWC}> } @@ -1177,13 +1186,15 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @EltwiseAddToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT0:%.+]]: 
tensor<1x32x112x112xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}> func.func @EltwiseAddToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>, %arg1: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %0 = VPU.NCE.Eltwise(%arg0, %arg1) { multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEStub<>} : tensor<1x32x112x112xf16, {order = #NHWC}>, tensor<1x32x112x112xf16, {order = #NHWC}> -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0: tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT0_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT0_CMX:%.+]] = VPU.Copy([[INPUT0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1191,7 +1202,7 @@ func.func @EltwiseAddToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[INPUT1_CMX:%.*]] = VPU.Copy(%arg1 + //CHECK: [[INPUT1_CMX:%.+]] = VPU.Copy([[INPUT1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1199,7 +1210,7 @@ func.func @EltwiseAddToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16 
//CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX]], //CHECK-SAME: [[INPUT1_CMX]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, @@ -1209,7 +1220,7 @@ func.func @EltwiseAddToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -1232,7 +1243,7 @@ func.func @EltwiseAddToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {or -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0: tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT0_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT0_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1240,7 +1251,7 @@ func.func @EltwiseAddToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 
0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[INPUT1_CMX:%.*]] = VPU.Copy([[ARG1]] + //CHECK: [[INPUT1_CMX:%.+]] = VPU.Copy([[ARG1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1248,7 +1259,7 @@ func.func @EltwiseAddToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX]], //CHECK-SAME: [[INPUT1_CMX]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, @@ -1258,7 +1269,7 @@ func.func @EltwiseAddToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -1273,13 +1284,15 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @EltwiseAddToDistributedOpClustering +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}> func.func 
@EltwiseAddToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>, %arg1: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, {order = #NHWC}> { %0 = VPU.NCE.Eltwise(%arg0, %arg1) { multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEStub<>} : tensor<1x32x14x14xf16, {order = #NHWC}>, tensor<1x32x14x14xf16, {order = #NHWC}> -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0: tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[INPUT0_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT0_CMX:%.+]] = VPU.Copy([[INPUT0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], @@ -1287,7 +1300,7 @@ func.func @EltwiseAddToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[INPUT1_CMX:%.*]] = VPU.Copy(%arg1 + //CHECK: [[INPUT1_CMX:%.+]] = VPU.Copy([[INPUT1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], @@ -1295,7 +1308,7 @@ func.func @EltwiseAddToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX]], //CHECK-SAME: [[INPUT1_CMX]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, @@ -1305,7 +1318,7 @@ func.func @EltwiseAddToDistributedOpClustering(%arg0: tensor<1x32x14x14xf16, {or //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14], [1, 32, 14, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x14x14xf16, {order = #NHWC}> } @@ -1319,6 +1332,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @AvgPoolToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}> func.func @AvgPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %0 = VPU.NCE.AveragePool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -1329,7 +1343,7 @@ func.func @AvgPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { } -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1337,7 +1351,7 @@ func.func @AvgPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { 
//CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 20, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 20, 112], [1, 32, 19, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 37, 0], [0, 0, 56, 0], [0, 0, 75, 0], [0, 0, 93, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.AveragePool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.AveragePool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments @@ -1346,7 +1360,7 @@ func.func @AvgPoolToDistributedOpSOHOverlapped(%arg0: tensor<1x32x112x112xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 19, 0], [0, 0, 38, 0], [0, 0, 57, 0], [0, 0, 76, 0], [0, 0, 94, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -1361,7 +1375,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @AvgPoolToDistributedOpHKSwitch -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x112x112xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x112x112xf16, {order = #NHWC}>) func.func @AvgPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> tensor<1x32x112x112xf16, {order = #NHWC}> { %0 = VPU.NCE.AveragePool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -1372,7 +1386,7 @@ func.func @AvgPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order } -> tensor<1x32x112x112xf16, {order = #NHWC}> return %0 : tensor<1x32x112x112xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> 
!VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 19, 112], [1, 32, 18, 112], [1, 32, 18, 112]], @@ -1380,7 +1394,7 @@ func.func @AvgPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 20, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 21, 112], [1, 32, 20, 112], [1, 32, 19, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 37, 0], [0, 0, 56, 0], [0, 0, 75, 0], [0, 0, 93, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.AveragePool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.AveragePool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED|MULTICASTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments @@ -1389,7 +1403,7 @@ func.func @AvgPoolToDistributedOpHKSwitch(%arg0: tensor<1x32x112x112xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112], [1, 32, 112, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x112x112xf16, {order = #NHWC}> } @@ -1404,7 +1418,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @AvgPoolToDistributedOpSOHOverlappedNoAlign -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x32x14x14xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x32x14x14xf16, {order = #NHWC}>) func.func @AvgPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf16, {order = #NHWC}>) -> tensor<1x32x14x14xf16, 
{order = #NHWC}> { %0 = VPU.NCE.AveragePool(%arg0) { multiClusterStrategy = #VPU.multi_cluster_strategy, @@ -1415,7 +1429,7 @@ func.func @AvgPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf } -> tensor<1x32x14x14xf16, {order = #NHWC}> return %0 : tensor<1x32x14x14xf16, {order = #NHWC}> - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]], @@ -1423,7 +1437,7 @@ func.func @AvgPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 4, 14], [1, 32, 5, 14], [1, 32, 4, 14], [1, 32, 4, 14], [1, 32, 4, 14], [1, 32, 3, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 5, 0], [0, 0, 7, 0], [0, 0, 9, 0], [0, 0, 11, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.AveragePool( + //CHECK: [[OUT_CMX:%.+]] = VPU.NCE.AveragePool( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x14x14xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -1432,7 +1446,7 @@ func.func @AvgPoolToDistributedOpSOHOverlappedNoAlign(%arg0: tensor<1x32x14x14xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 3, 14], [1, 32, 3, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14], [1, 32, 2, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<1x32x14x14xf16, {order = #NHWC}> } @@ -1751,6 +1765,8 @@ module @executors { IE.TileResource 6 
of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @SparseConvToDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x64x28x28xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x64x28x28xi1, {order = #NHWC}> func.func @SparseConvToDistributedOpSOHOverlapped(%arg0 : tensor<1x64x28x28xf16, {order = #NHWC}>, %arg1 : tensor<1x64x28x28xi1, {order = #NHWC}>) -> !VPU.SparseTensor, sparsity_map=tensor<1x80x28x28xi1, {order = #NHWC}>> { @@ -1781,7 +1797,7 @@ func.func @SparseConvToDistributedOpSOHOverlapped(%arg0 : tensor<1x64x28x28xf16, return %0 : !VPU.SparseTensor, sparsity_map=tensor<1x80x28x28xi1, {order = #NHWC}>> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, %arg1) + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT0]], [[INPUT1]]) // CHECK-SAME: -> !VPU.SparseTensor, // CHECK-SAME: sparsity_map=tensor<1x64x28x28xi1, {order = #NHWC}>> @@ -1869,8 +1885,8 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @SparseConvToDistributedOpHKSwitch -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x64x28x28xf16, {order = #NHWC}> -// CHECK-SAME: [[ARG1:%.*]]: tensor<1x64x28x28xi1, {order = #NHWC}> +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x64x28x28xf16, {order = #NHWC}> +// CHECK-SAME: [[ARG1:%.+]]: tensor<1x64x28x28xi1, {order = #NHWC}> func.func @SparseConvToDistributedOpHKSwitch(%arg0 : tensor<1x64x28x28xf16, {order = #NHWC}>, %arg1 : tensor<1x64x28x28xi1, {order = #NHWC}>) -> !VPU.SparseTensor, sparsity_map=tensor<1x80x28x28xi1, {order = #NHWC}>> { @@ -1989,6 +2005,8 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DontSetAlignmentForConvEltwiseChainCase1 +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x16x22x22xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x16x22x22xf16, {order = #NHWC}> func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16, {order = #NHWC}>, %arg1: tensor<1x16x22x22xf16, {order = #NHWC}>) -> tensor<1x16x22x22xf16, 
{order = #NHWC}> { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] @@ -1997,10 +2015,10 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 %2 = VPU.NCE.Eltwise(%0, %1) {op_type = #VPU.eltwise_type, ppe = #VPU.PPEStub<>} -> tensor<1x16x22x22xf16, {order = #NHWC}> return %2 : tensor<1x16x22x22xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX_0:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_0:%.+]] = VPU.Copy([[INPUT0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2008,7 +2026,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 5, 22], [1, 16, 4, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 7, 0], [0, 0, 11, 0], [0, 0, 15, 0], [0, 0, 18, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, 
@CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], @@ -2016,7 +2034,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -2024,7 +2042,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX_0:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX_0:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_0]], //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -2035,9 +2053,9 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_CMX_0]] 
+ //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_CMX_0]] - //CHECK: [[INPUT0_CMX_1:%.*]] = VPU.Copy(%arg0) + //CHECK: [[INPUT0_CMX_1:%.+]] = VPU.Copy([[INPUT0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2045,7 +2063,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 5, 22], [1, 16, 4, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 7, 0], [0, 0, 11, 0], [0, 0, 15, 0], [0, 0, 18, 0]] - //CHECK: [[INPUT1_CMX_1:%.*]] = VPU.Copy(%arg1 + //CHECK: [[INPUT1_CMX_1:%.+]] = VPU.Copy([[INPUT1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2053,7 +2071,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 5, 22], [1, 16, 4, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 7, 0], [0, 0, 11, 0], [0, 0, 15, 0], [0, 0, 18, 0]] - //CHECK: [[OUT_CMX_1:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX_1:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX_1]], //CHECK-SAME: [[INPUT1_CMX_1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, @@ -2063,9 +2081,9 @@ func.func @DontSetAlignmentForConvEltwiseChainCase1(%arg0: tensor<1x16x22x22xf16 
//CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_CMX_1]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_CMX_1]] - //CHECK: [[OUT_2:%.*]] = VPU.NCE.Eltwise([[OUT_0]], [[OUT_1]]) {op_type = #VPU.eltwise_type, ppe = #VPU.PPEStub<>} -> tensor<1x16x22x22xf16, {order = #NHWC}> + //CHECK: [[OUT_2:%.+]] = VPU.NCE.Eltwise([[OUT_0]], [[OUT_1]]) {op_type = #VPU.eltwise_type, ppe = #VPU.PPEStub<>} -> tensor<1x16x22x22xf16, {order = #NHWC}> //CHECK: return [[OUT_2]] : tensor<1x16x22x22xf16, {order = #NHWC}> } @@ -2079,6 +2097,8 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @DontSetAlignmentForConvEltwiseChainCase2 +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x16x22x22xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x16x22x22xf16, {order = #NHWC}> func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16, {order = #NHWC}>, %arg1: tensor<1x16x22x22xf16, {order = #NHWC}>) -> tensor<1x16x22x22xf16, {order = #NHWC}> { %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, op_type = #VPU.eltwise_type} -> tensor<1x16x22x22xf16, {order = #NHWC}> %1 = VPU.NCE.Eltwise(%0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, op_type = #VPU.eltwise_type} -> tensor<1x16x22x22xf16, {order = #NHWC}> @@ -2087,7 +2107,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 %2 = VPU.NCE.Convolution(%1, %cst_0, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [16, 16, 3, 3], strides = [1, 1]} : tensor<1x16x22x22xf16, {order = #NHWC}>, tensor<16x16x3x3xf16, {order = #NHWC}>, tensor<16x1x1x4xsi32> -> 
tensor<1x16x22x22xf16, {order = #NHWC}> return %2 : tensor<1x16x22x22xf16, {order = #NHWC}> - //CHECK: [[INPUT0_CMX_0:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT0_CMX_0:%.+]] = VPU.Copy([[INPUT0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2095,7 +2115,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[INPUT1_CMX_0:%.*]] = VPU.Copy(%arg1 + //CHECK: [[INPUT1_CMX_0:%.+]] = VPU.Copy([[INPUT1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2103,7 +2123,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[OUT_CMX_0:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX_0:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX_0]], //CHECK-SAME: [[INPUT1_CMX_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, @@ -2113,9 +2133,9 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: 
tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_CMX_0]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_CMX_0]] - //CHECK: [[INPUT0_CMX_1:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[INPUT0_CMX_1:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2123,7 +2143,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[INPUT1_CMX_1:%.*]] = VPU.Copy(%arg1 + //CHECK: [[INPUT1_CMX_1:%.+]] = VPU.Copy([[INPUT1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2131,7 +2151,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: 
[[OUT_CMX_1:%.*]] = VPU.NCE.Eltwise( + //CHECK: [[OUT_CMX_1:%.+]] = VPU.NCE.Eltwise( //CHECK-SAME: [[INPUT0_CMX_1]], //CHECK-SAME: [[INPUT1_CMX_1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, @@ -2141,12 +2161,12 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 5, 22], [1, 16, 4, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 7, 0], [0, 0, 11, 0], [0, 0, 15, 0], [0, 0, 18, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_CMX_1]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_CMX_1]] - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX_2:%.*]] = VPU.Copy([[OUT_1]]) + //CHECK: [[INPUT_CMX_2:%.+]] = VPU.Copy([[OUT_1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x22x22xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 22]], @@ -2154,7 +2174,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 6, 22], [1, 16, 5, 22], [1, 16, 4, 22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 7, 0], [0, 0, 11, 0], [0, 0, 15, 0], [0, 0, 
18, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]]) + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]]) //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], @@ -2162,7 +2182,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -2170,7 +2190,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX_2:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_CMX_2:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_2]], //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -2181,7 +2201,7 @@ func.func @DontSetAlignmentForConvEltwiseChainCase2(%arg0: tensor<1x16x22x22xf16 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 4, 22], [1, 16, 3, 22], [1, 16, 3, 
22]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 12, 0], [0, 0, 16, 0], [0, 0, 19, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_CMX_2]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_CMX_2]] //CHECK: return [[OUT_2]] : tensor<1x16x22x22xf16, {order = #NHWC}> } @@ -2195,13 +2215,14 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MVNToDistributedOpDuplicateBuffer +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x4x512x1xf16, {order = #NCWH}> func.func @MVNToDistributedOpDuplicateBuffer(%arg0: tensor<1x4x512x1xf16, {order = #NCWH}>) -> tensor<1x4x512x1xf16, {order = #NCWH}> { %0 = VPU.MVN(%arg0) {across_channels = false, eps = 1.0013580322265625E-5 : f64, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true} : tensor<1x4x512x1xf16, {order = #NCWH}> -> tensor<1x4x512x1xf16, {order = #NCWH}> return %0: tensor<1x4x512x1xf16, {order = #NCWH}> - //CHECK: [[ClusterCopy:%.*]] = VPU.Copy(%arg0 + //CHECK: [[ClusterCopy:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x4x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1]], @@ -2209,7 +2230,7 @@ func.func @MVNToDistributedOpDuplicateBuffer(%arg0: tensor<1x4x512x1xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[RClusterMVN:%.*]] = VPU.MVN([[ClusterCopy]] + //CHECK: [[RClusterMVN:%.+]] = VPU.MVN([[ClusterCopy]] //CHECK-SAME: -> !VPU.DistributedTensor<1x4x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments 
//CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1]], @@ -2217,7 +2238,7 @@ func.func @MVNToDistributedOpDuplicateBuffer(%arg0: tensor<1x4x512x1xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1], [1, 4, 512, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[RClusterMVN]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[RClusterMVN]] //CHECK: return [[OUT]] : tensor<1x4x512x1xf16, {order = #NCWH}> } @@ -2231,13 +2252,14 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MVNToDistributedOpSegmentedBuffer +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x12x512x1xf16, {order = #NCWH}> func.func @MVNToDistributedOpSegmentedBuffer(%arg0: tensor<1x12x512x1xf16, {order = #NCWH}>) -> tensor<1x12x512x1xf16, {order = #NCWH}> { %0 = VPU.MVN(%arg0) {across_channels = false, eps = 1.0013580322265625E-5 : f64, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true} : tensor<1x12x512x1xf16, {order = #NCWH}> -> tensor<1x12x512x1xf16, {order = #NCWH}> return %0: tensor<1x12x512x1xf16, {order = #NCWH}> - //CHECK: [[ClusterCopy:%.*]] = VPU.Copy(%arg0 + //CHECK: [[ClusterCopy:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x12x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1]], @@ -2245,7 +2267,7 @@ func.func @MVNToDistributedOpSegmentedBuffer(%arg0: tensor<1x12x512x1xf16, {orde //CHECK-SAME{LITERAL}: memory_shapes = [[1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1]], 
//CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 2, 0, 0], [0, 4, 0, 0], [0, 6, 0, 0], [0, 8, 0, 0], [0, 10, 0, 0]] - //CHECK: [[RClusterMVN:%.*]] = VPU.MVN([[ClusterCopy]] + //CHECK: [[RClusterMVN:%.+]] = VPU.MVN([[ClusterCopy]] //CHECK-SAME: -> !VPU.DistributedTensor<1x12x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1]], @@ -2253,7 +2275,7 @@ func.func @MVNToDistributedOpSegmentedBuffer(%arg0: tensor<1x12x512x1xf16, {orde //CHECK-SAME{LITERAL}: memory_shapes = [[1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1], [1, 2, 512, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 2, 0, 0], [0, 4, 0, 0], [0, 6, 0, 0], [0, 8, 0, 0], [0, 10, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[RClusterMVN]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[RClusterMVN]] //CHECK: return [[OUT]] : tensor<1x12x512x1xf16, {order = #NCWH}> } @@ -2267,13 +2289,14 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @MVNToDistributedOpSegmentedBufferReducedClusters +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x4x512x1xf16, {order = #NCWH}> func.func @MVNToDistributedOpSegmentedBufferReducedClusters(%arg0: tensor<1x4x512x1xf16, {order = #NCWH}>) -> tensor<1x4x512x1xf16, {order = #NCWH}> { %0 = VPU.MVN(%arg0) {across_channels = false, eps = 1.0013580322265625E-5 : f64, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true} : tensor<1x4x512x1xf16, {order = #NCWH}> -> tensor<1x4x512x1xf16, {order = #NCWH}> return %0: tensor<1x4x512x1xf16, {order = #NCWH}> - //CHECK: [[ClusterCopy:%.*]] = VPU.Copy(%arg0 + //CHECK: [[ClusterCopy:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x4x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], 
num_clusters = 4 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1]], @@ -2281,7 +2304,7 @@ func.func @MVNToDistributedOpSegmentedBufferReducedClusters(%arg0: tensor<1x4x51 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0], [0, 2, 0, 0], [0, 3, 0, 0]] - //CHECK: [[RClusterMVN:%.*]] = VPU.MVN([[ClusterCopy]] + //CHECK: [[RClusterMVN:%.+]] = VPU.MVN([[ClusterCopy]] //CHECK-SAME: -> !VPU.DistributedTensor<1x4x512x1xf16, #NCWH, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1]], @@ -2289,7 +2312,7 @@ func.func @MVNToDistributedOpSegmentedBufferReducedClusters(%arg0: tensor<1x4x51 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1], [1, 1, 512, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0], [0, 2, 0, 0], [0, 3, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[RClusterMVN]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[RClusterMVN]] //CHECK: return [[OUT]] : tensor<1x4x512x1xf16, {order = #NCWH}> } @@ -2311,13 +2334,13 @@ func.func @MVN6SOK(%arg0: tensor<1x32x15x64xf16>) -> tensor<1x32x15x64xf16> { %0 = VPU.MVN6(%arg0) {axes = [2], eps = 1.000000e-02 : f64, eps_mode = #IE.mvn_eps_mode, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true, operandSegmentSizes = array} : tensor<1x32x15x64xf16> -> tensor<1x32x15x64xf16> return %0 : tensor<1x32x15x64xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x15x64xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, 
uniform_distributed_segments, - //CHECK: [[MVN:%.*]] = VPU.MVN6([[INPUT]]) + //CHECK: [[MVN:%.+]] = VPU.MVN6([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x15x64xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[MVN]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[MVN]] //CHECK: return [[OUTPUT]] : tensor<1x32x15x64xf16> } @@ -2339,13 +2362,13 @@ func.func @MVN6SOH(%arg0: tensor<1x32x15x64xf16>) -> tensor<1x32x15x64xf16> { %0 = VPU.MVN6(%arg0) {axes = [1, 3], eps = 1.000000e-02 : f64, eps_mode = #IE.mvn_eps_mode, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true, operandSegmentSizes = array} : tensor<1x32x15x64xf16> -> tensor<1x32x15x64xf16> return %0 : tensor<1x32x15x64xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x15x64xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, - //CHECK: [[MVN:%.*]] = VPU.MVN6([[INPUT]] + //CHECK: [[MVN:%.+]] = VPU.MVN6([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x15x64xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[MVN]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[MVN]] //CHECK: return [[OUTPUT]] : tensor<1x32x15x64xf16> } @@ -2366,21 +2389,21 @@ func.func @PadSwSOH(%arg0: tensor<1x16x32x50xf16>) -> tensor<1x17x32x60xf16> { %0 = VPU.Pad(%arg0) {mode = #IE.pad_mode, multiClusterStrategy = #VPU.multi_cluster_strategy, pad_value_attr = 0.000000e+00 : f64, pads_begin_attr = [0, 0, 0, 0], pads_end_attr = [0, 1, 0, 10]} : tensor<1x16x32x50xf16> -> tensor<1x17x32x60xf16> return %0 : tensor<1x17x32x60xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] 
//CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x50xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 8, 50], [1, 16, 8, 50], [1, 16, 8, 50], [1, 16, 8, 50]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 24, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 50], [1, 16, 8, 50], [1, 16, 8, 50], [1, 16, 8, 50]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 24, 0]]}> - //CHECK: [[PAD:%.*]] = VPU.Pad([[INPUT]]) + //CHECK: [[PAD:%.+]] = VPU.Pad([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x17x32x60xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 17, 8, 60], [1, 17, 8, 60], [1, 17, 8, 60], [1, 17, 8, 60]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 24, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[1, 17, 8, 60], [1, 17, 8, 60], [1, 17, 8, 60], [1, 17, 8, 60]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 24, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[PAD]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[PAD]] } } @@ -2399,21 +2422,21 @@ func.func @PadSwSOK(%arg0: tensor<1x16x30x50xf16>) -> tensor<1x16x33x53xf16> { %0 = VPU.Pad(%arg0) {mode = #IE.pad_mode, multiClusterStrategy = #VPU.multi_cluster_strategy, pad_value_attr = 0.000000e+00 : f64, pads_begin_attr = [0, 0, 0, 0], pads_end_attr = [0, 0, 3, 3]} : tensor<1x16x30x50xf16> -> tensor<1x16x33x53xf16> return %0 : tensor<1x16x33x53xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x30x50xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, 
uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 30, 50], [1, 4, 30, 50], [1, 4, 30, 50], [1, 4, 30, 50]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 4, 0, 0], [0, 8, 0, 0], [0, 12, 0, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 30, 50], [1, 4, 30, 50], [1, 4, 30, 50], [1, 4, 30, 50]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 4, 0, 0], [0, 8, 0, 0], [0, 12, 0, 0]]}> - //CHECK: [[PAD:%.*]] = VPU.Pad([[INPUT]]) + //CHECK: [[PAD:%.+]] = VPU.Pad([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x33x53xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 33, 53], [1, 4, 33, 53], [1, 4, 33, 53], [1, 4, 33, 53]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 4, 0, 0], [0, 8, 0, 0], [0, 12, 0, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 33, 53], [1, 4, 33, 53], [1, 4, 33, 53], [1, 4, 33, 53]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 4, 0, 0], [0, 8, 0, 0], [0, 12, 0, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[PAD]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[PAD]] } } @@ -2445,7 +2468,7 @@ func.func @UnrollSOKConvOutputSegmented(%input: tensor<1x64x64x64xf16, {order = // (DUP 4 CL) CONV (SEG 4 CL) -> (SEG 6 CL) MVN (SEG 6 CL) - //CHECK: [[CONV_IN:%.*]] = VPU.Copy(%arg0 + //CHECK: [[CONV_IN:%.+]] = VPU.Copy(%arg0 //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 4 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 64, 64], [1, 64, 64, 64], [1, 64, 64, 64], [1, 64, 64, 64]], @@ -2453,7 +2476,7 @@ func.func @UnrollSOKConvOutputSegmented(%input: tensor<1x64x64x64xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 64, 64], [1, 64, 64, 64], [1, 64, 64, 64], [1, 64, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 
0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[SOK_CONV:%.*]] = VPU.NCE.Convolution([[CONV_IN]] + //CHECK: [[SOK_CONV:%.+]] = VPU.NCE.Convolution([[CONV_IN]] //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64]], @@ -2461,7 +2484,7 @@ func.func @UnrollSOKConvOutputSegmented(%input: tensor<1x64x64x64xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 16, 0, 0], [0, 32, 0, 0], [0, 48, 0, 0]] - //CHECK: [[MVN_IN:%.*]] = VPU.Copy(%4 + //CHECK: [[MVN_IN:%.+]] = VPU.Copy(%4 //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], @@ -2469,7 +2492,7 @@ func.func @UnrollSOKConvOutputSegmented(%input: tensor<1x64x64x64xf16, {order = //CHECK-SAME{LITERAL}: memory_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 11, 0, 0], [0, 22, 0, 0], [0, 33, 0, 0], [0, 44, 0, 0], [0, 54, 0, 0]] - //CHECK: [[SOK_MVN:%.*]] = VPU.MVN(%5 + //CHECK: [[SOK_MVN:%.+]] = VPU.MVN(%5 //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], @@ -2489,6 
+2512,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @UnrollSOKDWConvInputOutputDuplicated +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1x320x1xf16> func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) -> tensor<1x320x1x1xf16, {order = #NHWC}> { %weights = const.Declare tensor<320x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<1x1x320xf32>, [#const.CastElemType, #const.Reshape<[1, 1, 1, 320]>, #const.Reshape<[1, 320, 1, 1]>, #const.Reshape<[320, 1, 1, 1]>, #const.Reorder<#NHWC>, #const.Reorder<#NCHW>, #const.Reshape<[320, 1, 1, 1]>, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] %weights_table = const.Declare tensor<320x1x1x4xsi32> = dense<10> : tensor<320x1x1x4xsi32> @@ -2516,7 +2540,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - // (DUP) MVN (DUP) -> (DUP) DWCONV (SEG) -> (SEG) Sigmoid (SEG) - //CHECK: [[MVN_COPY_IN:%.*]] = VPU.Copy(%arg0 + //CHECK: [[MVN_COPY_IN:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x320x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], @@ -2524,7 +2548,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[MVN:%.*]] = VPU.MVN([[MVN_COPY_IN]]) + //CHECK: [[MVN:%.+]] = VPU.MVN([[MVN_COPY_IN]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x1x320x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: 
compute_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], @@ -2532,14 +2556,14 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[MVN_COPY_OUT:%.*]] = VPU.Copy([[MVN]]) + //CHECK: [[MVN_COPY_OUT:%.+]] = VPU.Copy([[MVN]]) //CHECK-SAME: -> tensor<1x1x320x1xf16> - //CHECK: [[RESHAPE:%.*]] = VPU.AffineReshape([[MVN_COPY_OUT]]) + //CHECK: [[RESHAPE:%.+]] = VPU.AffineReshape([[MVN_COPY_OUT]]) //CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1], [2, 3]], shape_value = [1, 320, 1, 1]} : tensor<1x1x320x1xf16> -> tensor<1x320x1x1xf16> - //CHECK: [[CAST:%.*]] = VPU.PermuteCast([[RESHAPE]]) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x320x1x1xf16> -> tensor<1x320x1x1xf16, {order = #NHWC}> + //CHECK: [[CAST:%.+]] = VPU.PermuteCast([[RESHAPE]]) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x320x1x1xf16> -> tensor<1x320x1x1xf16, {order = #NHWC}> - //CHECK: [[DWCONV_INPUT_COPY_IN:%.*]] = VPU.Copy([[CAST]]) + //CHECK: [[DWCONV_INPUT_COPY_IN:%.+]] = VPU.Copy([[CAST]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1]], @@ -2547,7 +2571,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0]]}> - //CHECK: [[DWCONV_WEIGHTS_COPY_IN:%.*]] = VPU.Copy(%cst) + //CHECK: [[DWCONV_WEIGHTS_COPY_IN:%.+]] = VPU.Copy(%cst) //CHECK-SAME: -> !VPU.DistributedTensor<320x16x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[64, 16, 1, 1], [64, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1]], @@ -2555,7 +2579,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[64, 16, 1, 1], [64, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1], [48, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [64, 0, 0, 0], [128, 0, 0, 0], [176, 0, 0, 0], [224, 0, 0, 0], [272, 0, 0, 0]]}> - //CHECK: [[DWCONV_WEIGHTS_TABLE_COPY_IN:%.*]] = VPU.Copy(%cst_0) + //CHECK: [[DWCONV_WEIGHTS_TABLE_COPY_IN:%.+]] = VPU.Copy(%cst_0) //CHECK-SAME: -> !VPU.DistributedTensor<320x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[64, 1, 1, 4], [64, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4]], @@ -2563,7 +2587,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[64, 1, 1, 4], [64, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [64, 0, 0, 0], [128, 0, 0, 0], [176, 0, 0, 0], [224, 0, 0, 0], [272, 0, 0, 0]]}> - //CHECK: [[DWCONV:%.*]] = VPU.NCE.DepthConvolution([[DWCONV_INPUT_COPY_IN]], + //CHECK: [[DWCONV:%.+]] = VPU.NCE.DepthConvolution([[DWCONV_INPUT_COPY_IN]], //CHECK-SAME: [[DWCONV_WEIGHTS_COPY_IN]], //CHECK-SAME: [[DWCONV_WEIGHTS_TABLE_COPY_IN]]) //CHECK-SAME: -> 
!VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, @@ -2573,10 +2597,10 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[DWCONV_COPY_OUT:%.*]] = VPU.Copy([[DWCONV]]) + //CHECK: [[DWCONV_COPY_OUT:%.+]] = VPU.Copy([[DWCONV]]) //CHECK-SAME: -> tensor<1x320x1x1xf16, {order = #NHWC}> - //CHECK: [[SIGMOID_COPY_IN:%.*]] = VPU.Copy([[DWCONV_COPY_OUT]]) + //CHECK: [[SIGMOID_COPY_IN:%.+]] = VPU.Copy([[DWCONV_COPY_OUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], @@ -2584,7 +2608,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[SIGMOID:%.*]] = VPU.Sigmoid([[SIGMOID_COPY_IN]]) + //CHECK: [[SIGMOID:%.+]] = VPU.Sigmoid([[SIGMOID_COPY_IN]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], @@ -2592,7 +2616,7 @@ func.func @UnrollSOKDWConvInputOutputDuplicated(%input: 
tensor<1x1x320x1xf16>) - //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[SIGMOID_COPY_OUT:%.*]] = VPU.Copy([[SIGMOID]]) + //CHECK: [[SIGMOID_COPY_OUT:%.+]] = VPU.Copy([[SIGMOID]]) //CHECK-SAME: -> tensor<1x320x1x1xf16, {order = #NHWC}> } @@ -2606,6 +2630,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @UnrollSOKConvOutputSegmentedWithSlice +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x80x1x3000xf16, {order = #NHWC}> func.func @UnrollSOKConvOutputSegmentedWithSlice(%input: tensor<1x80x1x3000xf16, {order = #NHWC}>) -> tensor<1x384x1x1500xf16, {order = #NHWC}> { %weights = const.Declare tensor<384x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<384x80x1x3xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<384x1x1x4xsi32> = dense<10> : tensor<384x1x1x4xsi32> @@ -2624,7 +2649,7 @@ func.func @UnrollSOKConvOutputSegmentedWithSlice(%input: tensor<1x80x1x3000xf16, // (DUP) CONV (SEG) -> SLICE -> (SEG) GELU (SEG) - // CHECK: [[CONV_IN:%.*]] = VPU.Copy(%arg0) + // CHECK: [[CONV_IN:%.+]] = VPU.Copy([[INPUT]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x80x1x3000xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], @@ -2632,7 +2657,7 @@ func.func @UnrollSOKConvOutputSegmentedWithSlice(%input: tensor<1x80x1x3000xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0]] - // CHECK: [[SOK_CONV:%.*]] = VPU.NCE.Convolution([[CONV_IN]] + // CHECK: [[SOK_CONV:%.+]] = VPU.NCE.Convolution([[CONV_IN]] // CHECK-SAME: -> !VPU.DistributedTensor<1x384x1x3000xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000]], @@ -2640,12 +2665,12 @@ func.func @UnrollSOKConvOutputSegmentedWithSlice(%input: tensor<1x80x1x3000xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000], [1, 64, 1, 3000]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 192, 0, 0], [0, 256, 0, 0], [0, 320, 0, 0]] - // CHECK: [[CONV_OUT:%.*]] = VPU.Copy([[SOK_CONV]]) + // CHECK: [[CONV_OUT:%.+]] = VPU.Copy([[SOK_CONV]]) // CHECK-SAME: -> tensor<1x384x1x3000xf16, {order = #NHWC}> - // CHECK: [[SLICE:%.*]] = VPU.Slice [[CONV_OUT]] [0, 0, 0, 0] [1, 384, 1, 1500] : tensor<1x384x1x3000xf16, {order = #NHWC}> to tensor<1x384x1x1500xf16, {order = #NHWC}> + // CHECK: [[SLICE:%.+]] = VPU.Slice [[CONV_OUT]] [0, 0, 0, 0] [1, 384, 1, 1500] : tensor<1x384x1x3000xf16, {order = #NHWC}> to tensor<1x384x1x1500xf16, {order = #NHWC}> - // CHECK: [[GELU_IN:%.*]] = VPU.Copy([[SLICE]]) + // CHECK: [[GELU_IN:%.+]] = VPU.Copy([[SLICE]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x384x1x1500xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500]], @@ -2653,7 +2678,7 @@ func.func @UnrollSOKConvOutputSegmentedWithSlice(%input: tensor<1x80x1x3000xf16, // CHECK-SAME{LITERAL}: 
memory_shapes = [[1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 192, 0, 0], [0, 256, 0, 0], [0, 320, 0, 0]] - // CHECK: [[SOK_GELU:%.*]] = VPU.Gelu([[GELU_IN]]) + // CHECK: [[SOK_GELU:%.+]] = VPU.Gelu([[GELU_IN]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x384x1x1500xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500], [1, 64, 1, 1500]], @@ -2671,6 +2696,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @UnrollSOKDWConvInputOutputSegmented +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x64x64x64xf16, {order = #NHWC}> func.func @UnrollSOKDWConvInputOutputSegmented(%input: tensor<1x64x64x64xf16, {order = #NHWC}>) -> tensor<1x64x64x64xf16, {order = #NHWC}> { %weights = const.Declare tensor<64x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x16x1x1xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> @@ -2689,7 +2715,7 @@ func.func @UnrollSOKDWConvInputOutputSegmented(%input: tensor<1x64x64x64xf16, {o // (SEG 6 CL) MVN (SEG 6 CL) -> (SEG 4 CL) DWCONV (SEG|DUP 4 CL) // DW is SEG|DUP since only consequent SW layer is compatible with SEG, in all other cases it is SEG|DUP - //CHECK: [[MVN_IN:%.*]] = VPU.Copy(%arg0 + //CHECK: [[MVN_IN:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 
64]], @@ -2697,7 +2723,7 @@ func.func @UnrollSOKDWConvInputOutputSegmented(%input: tensor<1x64x64x64xf16, {o //CHECK-SAME{LITERAL}: memory_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 11, 0, 0], [0, 22, 0, 0], [0, 33, 0, 0], [0, 44, 0, 0], [0, 54, 0, 0]] - //CHECK: [[SOK_MVN:%.*]] = VPU.MVN([[MVN_IN]]) + //CHECK: [[SOK_MVN:%.+]] = VPU.MVN([[MVN_IN]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], @@ -2705,7 +2731,7 @@ func.func @UnrollSOKDWConvInputOutputSegmented(%input: tensor<1x64x64x64xf16, {o //CHECK-SAME{LITERAL}: memory_shapes = [[1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 11, 64, 64], [1, 10, 64, 64], [1, 10, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 11, 0, 0], [0, 22, 0, 0], [0, 33, 0, 0], [0, 44, 0, 0], [0, 54, 0, 0]] - //CHECK: [[DWCONV_IN:%.*]] = VPU.Copy(%2 + //CHECK: [[DWCONV_IN:%.+]] = VPU.Copy(%2 //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64]], @@ -2713,7 +2739,7 @@ func.func @UnrollSOKDWConvInputOutputSegmented(%input: tensor<1x64x64x64xf16, {o //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 16, 0, 0], [0, 32, 0, 0], [0, 48, 0, 0]] - //CHECK: [[SOK_DWCONV:%.*]] = VPU.NCE.DepthConvolution + //CHECK: [[SOK_DWCONV:%.+]] = 
VPU.NCE.DepthConvolution //CHECK-SAME: -> !VPU.DistributedTensor<1x64x64x64xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64], [1, 16, 64, 64]], @@ -2741,11 +2767,11 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order %1 = VPU.NCE.Convolution(%0, %cst_1, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 5, 5], strides = [2, 2]} : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<96x96x5x5xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x14x14xf16, {order = #NHWC}> return %1 : tensor<1x96x14x14xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 
128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]], @@ -2753,15 +2779,15 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28], [1, 128, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -2772,10 +2798,10 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]]) + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]]) //CHECK-SAME: -> tensor<1x96x28x28xf16, {order = #NHWC}> - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[OUT_0_COPYBACK:%.+]] = 
VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28]], @@ -2783,7 +2809,7 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28], [1, 96, 28, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]]) + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x96x5x5xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], @@ -2791,7 +2817,7 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -2799,7 +2825,7 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, 
{order //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -2810,7 +2836,7 @@ func.func @ChainOpsToNCEClusteringKHSwitch(%arg0: tensor<1x128x28x28xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 3, 14], [1, 96, 3, 14], [1, 96, 2, 14], [1, 96, 2, 14], [1, 96, 2, 14], [1, 96, 2, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] //CHECK: return [[OUT_1]] : tensor<1x96x14x14xf16, {order = #NHWC}> } @@ -2825,6 +2851,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsToNCEClusteringSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> tensor<1x96x14x14xf16, {order = #NHWC}> { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> %cst_0 = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] @@ -2833,11 +2860,11 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { %1 = VPU.NCE.Convolution(%0, %cst_1, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 5, 5], strides = [2, 2]} : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<96x96x5x5xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> 
tensor<1x96x14x14xf16, {order = #NHWC}> return %1 : tensor<1x96x14x14xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 4, 28], [1, 128, 4, 28]], @@ -2845,7 +2872,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 6, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 6, 28], [1, 128, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : 
i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], @@ -2853,7 +2880,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -2861,7 +2888,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -2872,10 +2899,10 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]]) + //CHECK: [[OUT_0:%.+]] = 
VPU.Copy([[OUT_0_CMX]]) //CHECK-SAME: -> tensor<1x96x28x28xf16, {order = #NHWC}> - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -2883,7 +2910,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x5x5xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], @@ -2891,7 +2918,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = 
[[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -2899,7 +2926,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -2910,7 +2937,7 @@ func.func @ChainOpsToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 3, 14], [1, 96, 3, 14], [1, 96, 2, 14], [1, 96, 2, 14], [1, 96, 2, 14], [1, 96, 2, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]]) + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]]) //CHECK-SAME: -> tensor<1x96x14x14xf16, {order = #NHWC}> //CHECK: return [[OUT_1]] : tensor<1x96x14x14xf16, {order = #NHWC}> @@ -2925,6 +2952,8 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainSparseOpsoDistributedOpSOHOverlapped +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x64x28x28xf16, {order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x64x28x28xi1, {order = #NHWC}> func.func @ChainSparseOpsoDistributedOpSOHOverlapped(%arg0 : tensor<1x64x28x28xf16, {order = #NHWC}>, %arg1 : tensor<1x64x28x28xi1, {order = #NHWC}>) -> !VPU.SparseTensor, sparsity_map=tensor<1x64x28x28xi1, {order = #NHWC}>> { @@ -2966,7 +2995,7 @@ func.func @ChainSparseOpsoDistributedOpSOHOverlapped(%arg0 : tensor<1x64x28x28xf return %1 : !VPU.SparseTensor, sparsity_map=tensor<1x64x28x28xi1, {order = #NHWC}>> - // CHECK: [[INPUT_SPARSE:%.+]] = 
VPU.GroupSparseTensor(%arg0, %arg1) + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT0]], [[INPUT1]]) // CHECK-SAME: -> !VPU.SparseTensor, // CHECK-SAME: sparsity_map=tensor<1x64x28x28xi1, {order = #NHWC}>> @@ -3117,6 +3146,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> (tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x28x28xf16, {order = #NHWC}>) { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> @@ -3128,14 +3158,14 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 %2 = VPU.NCE.Convolution(%0, %cst_2, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 5, 5], strides = [1, 1]} : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<96x96x5x5xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x28x28xf16, {order = #NHWC}> return %1, %2 : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x28x28xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_2:%.*]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: 
[[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_2:%.+]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] // Conv producer - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 4, 28], [1, 128, 4, 28]], @@ -3143,7 +3173,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 6, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 6, 28], [1, 128, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], @@ -3151,7 +3181,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 
0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3159,7 +3189,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3170,12 +3200,12 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 18, 0], [0, 0, 22, 0] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]]) + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]]) //CHECK-SAME: -> tensor<1x96x28x28xf16, {order = #NHWC}> // First conv comsumer - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 
96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3183,7 +3213,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], @@ -3191,7 +3221,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3199,7 +3229,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0] - //CHECK: 
[[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3210,11 +3240,11 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // Second conv comsumer - //CHECK: [[OUT_0_COPYBACK_1:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[OUT_0_COPYBACK_1:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3222,7 +3252,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 18, 0], [0, 0, 22, 0] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]]) + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x96x5x5xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], @@ -3230,7 +3260,7 @@ func.func 
@ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_2_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_2_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3238,7 +3268,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK_1]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_2_CMX]]) @@ -3249,7 +3279,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlapped(%arg0: tensor<1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] //CHECK: return [[OUT_1]], [[OUT_2]] : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x28x28xf16, {order = #NHWC}> } @@ -3264,6 +3294,7 @@ module @executors { IE.TileResource 6 of @NCE at 
1.700000e+03 MHz // CHECK-LABEL: @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewUnion0 +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewUnion0(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> (tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x14x14xf16, {order = #NHWC}>) { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> @@ -3275,14 +3306,14 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU %2 = VPU.NCE.Convolution(%0, %cst_2, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 5, 5], strides = [2, 2]} : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<96x96x5x5xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x14x14xf16, {order = #NHWC}> return %1, %2 : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x14x14xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_2:%.*]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : 
tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_2:%.+]] = const.Declare tensor<96x96x5x5xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x5x5xf16>, [#const.Reorder<#NHWC>] // Conv producer - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0) + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 4, 28], [1, 128, 4, 28]], @@ -3290,7 +3321,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 6, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 6, 28], [1, 128, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]]) + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], @@ -3298,7 +3329,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> 
!VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3306,7 +3337,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3317,7 +3348,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]]) + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]]) //CHECK-SAME: -> tensor<1x96x28x28xf16, {order = #NHWC}> // First conv consumer @@ -3326,7 +3357,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU // memory_shapes = [[1, 96, 6, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28], [1, 96, 5, 28]] // memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, 
uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3334,7 +3365,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]]) + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x96x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], @@ -3342,7 +3373,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3350,7 +3381,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: 
memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3361,7 +3392,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]]) + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]]) //CHECK-SAME: -> tensor<1x96x28x28xf16, {order = #NHWC}> // Second conv comsumer @@ -3370,7 +3401,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU // memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]] // memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[OUT_0_COPYBACK_1:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[OUT_0_COPYBACK_1:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3378,7 +3409,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] 
- //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]]) + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x96x5x5xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], @@ -3386,7 +3417,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5], [96, 96, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_2_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_2_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3394,7 +3425,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK_1]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_2_CMX]]) @@ -3405,7 +3436,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 3, 14], [1, 96, 3, 14], [1, 96, 2, 14], [1, 96, 2, 14], [1, 96, 2, 14], 
[1, 96, 2, 14]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 8, 0], [0, 0, 10, 0], [0, 0, 12, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] //CHECK: return [[OUT_1]], [[OUT_2]] : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<1x96x14x14xf16, {order = #NHWC}> } @@ -3420,6 +3451,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewUnion1 +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x28x28xf16, {order = #NHWC}> func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewUnion1(%arg0: tensor<1x128x28x28xf16, {order = #NHWC}>) -> (tensor<1x96x26x26xf16, {order = #NHWC}>, tensor<1x96x27x27xf16, {order = #NHWC}>) { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> @@ -3431,14 +3463,14 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU %2 = VPU.NCE.Convolution(%0, %cst_2, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 4, 4], strides = [1, 1]} : tensor<1x96x28x28xf16, {order = #NHWC}>, tensor<96x96x4x4xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x27x27xf16, {order = #NHWC}> return %1, %2 : tensor<1x96x26x26xf16, {order = #NHWC}>, tensor<1x96x27x27xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_2:%.*]] = const.Declare tensor<96x96x4x4xf16, {order = #NHWC}> = dense<1.000000e+00> 
: tensor<96x96x4x4xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_2:%.+]] = const.Declare tensor<96x96x4x4xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x4x4xf16>, [#const.Reorder<#NHWC>] // Conv producer - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 5, 28], [1, 128, 4, 28], [1, 128, 4, 28]], @@ -3446,7 +3478,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 6, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 7, 28], [1, 128, 6, 28], [1, 128, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], @@ -3454,7 +3486,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU 
//CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3462,7 +3494,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3473,7 +3505,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // First conv consumer // @@ -3481,7 +3513,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU // memory_shapes = [[1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 6, 28], [1, 96, 6, 28], [1, 96, 6, 28], [1, 96, 6, 28]] // memory_offsets = [[0, 0, 0, 0], 
[0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3489,7 +3521,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], @@ -3497,7 +3529,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 
4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3505,7 +3537,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3516,7 +3548,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 5, 26], [1, 96, 5, 26], [1, 96, 4, 26], [1, 96, 4, 26], [1, 96, 4, 26], [1, 96, 4, 26]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 14, 0], [0, 0, 18, 0], [0, 0, 22, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // Second conv consumer // @@ -3524,7 +3556,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU // memory_shapes = [[1, 96, 6, 28], [1, 96, 8, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28]] // memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[OUT_0_COPYBACK_1:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[OUT_0_COPYBACK_1:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x96x28x28xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 5, 28], [1, 96, 4, 28], [1, 96, 4, 28]], @@ -3532,7 +3564,7 @@ func.func 
@ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 7, 28], [1, 96, 9, 28], [1, 96, 8, 28], [1, 96, 7, 28], [1, 96, 7, 28], [1, 96, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]]) + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x96x4x4xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4]], @@ -3540,7 +3572,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4], [96, 96, 4, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_2_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]]) + //CHECK: [[WEIGHTSTABLE_2_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]]) //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3548,7 +3580,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = 
VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK_1]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_2_CMX]]) @@ -3559,7 +3591,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedSiblingsMemViewU //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 5, 27], [1, 96, 5, 27], [1, 96, 5, 27], [1, 96, 4, 27], [1, 96, 4, 27], [1, 96, 4, 27]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] //CHECK: return [[OUT_1]], [[OUT_2]] : tensor<1x96x26x26xf16, {order = #NHWC}>, tensor<1x96x27x27xf16, {order = #NHWC}> } @@ -3574,6 +3606,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitForOutputShape +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x8x8xf16, {order = #NHWC}> // Between the two conv siblings, even though one has the bigger kernel, the inferred output shapes per cluster // don't fully satisfy H >= 1 for each tile. 
func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitForOutputShape(%arg0: tensor<1x128x8x8xf16, {order = #NHWC}>) @@ -3587,14 +3620,14 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor %2 = VPU.NCE.Convolution(%0, %cst_2, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 1, 1], strides = [1, 1]} : tensor<1x96x8x8xf16, {order = #NHWC}>, tensor<96x96x1x1xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x8x8xf16, {order = #NHWC}> return %1, %2 : tensor<1x96x1x1xf16, {order = #NHWC}>, tensor<1x96x8x8xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x1x1xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x8x8xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x8x8xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_2:%.*]] = const.Declare tensor<96x96x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x1x1xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x8x8xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x8x8xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_2:%.+]] = const.Declare tensor<96x96x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x1x1xf16>, [#const.Reorder<#NHWC>] // Conv producer - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> 
!VPU.DistributedTensor<1x128x8x8xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 2, 8], [1, 128, 2, 8], [1, 128, 1, 8], [1, 128, 1, 8], [1, 128, 1, 8], [1, 128, 1, 8]], @@ -3602,7 +3635,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 2, 8], [1, 128, 2, 8], [1, 128, 1, 8], [1, 128, 1, 8], [1, 128, 1, 8], [1, 128, 1, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 4, 0], [0, 0, 5, 0], [0, 0, 6, 0], [0, 0, 7, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1]], @@ -3610,7 +3643,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1], [96, 128, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3618,7 +3651,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor 
//CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3629,14 +3662,14 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 2, 8], [1, 96, 2, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 4, 0], [0, 0, 5, 0], [0, 0, 6, 0], [0, 0, 7, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // First conv comsumer // // Op is incompatible with SOH strategy, do not take into account when computin overlapped params for consumer or sibling. 
// - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x8x8xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8]], @@ -3644,7 +3677,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8], [1, 96, 8, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x8x8xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8]], @@ -3652,7 +3685,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8], [16, 96, 8, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], 
[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -3660,7 +3693,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0], [64, 0, 0, 0], [80, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3671,11 +3704,11 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 1, 1], [1, 96, 1, 1], [1, 96, 1, 1], [1, 96, 1, 1], [1, 96, 1, 1], [1, 96, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // Second conv comsumer - //CHECK: [[OUT_0_COPYBACK_1:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK_1:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x8x8xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 2, 8], [1, 96, 2, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8]], @@ -3683,7 +3716,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 2, 8], [1, 96, 2, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 4, 0], [0, 0, 5, 0], [0, 0, 6, 0], [0, 0, 7, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: 
[[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1]], @@ -3691,7 +3724,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1], [96, 96, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_2_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_2_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3699,7 +3732,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK_1]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_2_CMX]]) @@ -3710,7 +3743,7 @@ func.func @ChainOpsMultipleConsumersToNCEClusteringSOHOverlappedImproperSplitFor //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 2, 8], [1, 96, 2, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8], [1, 96, 1, 8]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 
2, 0], [0, 0, 4, 0], [0, 0, 5, 0], [0, 0, 6, 0], [0, 0, 7, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] //CHECK: return [[OUT_1]], [[OUT_2]] : tensor<1x96x1x1xf16, {order = #NHWC}>, tensor<1x96x8x8xf16, {order = #NHWC}> } @@ -3724,6 +3757,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x65x65xf16, {order = #NHWC}> func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: tensor<1x128x65x65xf16, {order = #NHWC}>) -> tensor<1x96x32x32xf16, {order = #NHWC}> { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> %cst_0 = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] @@ -3732,11 +3766,11 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te %1 = VPU.NCE.Convolution(%0, %cst_1, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 3, 3], strides = [2, 2]} : tensor<1x96x65x65xf16, {order = #NHWC}>, tensor<96x96x3x3xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x32x32xf16, {order = #NHWC}> return %1 : tensor<1x96x32x32xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare 
tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x65x65xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 10, 65]], @@ -3744,7 +3778,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 12, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 11, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 10, 0], [0, 0, 21, 0], [0, 0, 32, 0], [0, 0, 43, 0], [0, 0, 54, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], @@ -3752,7 +3786,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = 
VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3760,7 +3794,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3771,13 +3805,13 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 13, 65], [1, 96, 14, 65], [1, 96, 13, 65], [1, 96, 12, 65], [1, 96, 11, 65], [1, 96, 11, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // Requirements for this consumer only, w/o producer compute view: // memory_shapes = [[1, 96, 13, 65], [1, 96, 13, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65]] // memory_offsets = [[0, 0, 0, 0], [0, 0, 12, 0], [0, 0, 24, 0], [0, 0, 34, 0], [0, 0, 44, 0], [0, 0, 54, 0]] - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x65x65xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 
96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 10, 65]], @@ -3785,7 +3819,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 13, 65], [1, 96, 14, 65], [1, 96, 13, 65], [1, 96, 12, 65], [1, 96, 11, 65], [1, 96, 11, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], @@ -3793,7 +3827,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3], [96, 96, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3801,7 +3835,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 
0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3812,7 +3846,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedStart(%arg0: te //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 6, 32], [1, 96, 6, 32], [1, 96, 5, 32], [1, 96, 5, 32], [1, 96, 5, 32], [1, 96, 5, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] //CHECK: return [[OUT_1]] : tensor<1x96x32x32xf16, {order = #NHWC}> } @@ -3826,6 +3860,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x65x65xf16, {order = #NHWC}> func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tensor<1x128x65x65xf16, {order = #NHWC}>) -> tensor<1x96x20x20xf16, {order = #NHWC}> { %cst = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> %cst_0 = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] @@ -3834,11 +3869,11 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens %1 = VPU.NCE.Convolution(%0, %cst_1, %cst) {multiClusterStrategy = #VPU.multi_cluster_strategy, ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [96, 96, 7, 7], strides = [3, 3]} : tensor<1x96x65x65xf16, {order = #NHWC}>, tensor<96x96x7x7xf16, {order = #NHWC}>, tensor<96x1x1x4xsi32> -> tensor<1x96x20x20xf16, {order = #NHWC}> return %1 : tensor<1x96x20x20xf16, {order = #NHWC}> - //CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : 
tensor<96x1x1x4xsi32> - //CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] - //CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<96x96x7x7xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x7x7xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<96x1x1x4xsi32> = dense<10> : tensor<96x1x1x4xsi32> + //CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<96x128x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x128x3x3xf16>, [#const.Reorder<#NHWC>] + //CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<96x96x7x7xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<96x96x7x7xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x128x65x65xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 11, 65], [1, 128, 10, 65]], @@ -3846,7 +3881,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[1, 128, 12, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 13, 65], [1, 128, 11, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 10, 0], [0, 0, 21, 0], [0, 0, 32, 0], [0, 0, 43, 0], [0, 0, 54, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<96x128x3x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 
128, 3, 3]], @@ -3854,7 +3889,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3], [96, 128, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3862,7 +3897,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3873,13 +3908,13 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 15, 65], [1, 96, 16, 65], [1, 96, 14, 65], [1, 96, 13, 65], [1, 96, 14, 65], [1, 96, 15, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 32, 0], [0, 0, 41, 0], [0, 0, 50, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // Requirements for this consumer only, w/o producer compute view: // memory_shapes = [[1, 96, 15, 65], [1, 96, 16, 65], [1, 96, 13, 65], [1, 96, 13, 65], 
[1, 96, 13, 65], [1, 96, 13, 65]] // memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 23, 0], [0, 0, 32, 0], [0, 0, 41, 0], [0, 0, 50, 0]] - //CHECK: [[OUT_0_COPYBACK:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[OUT_0_COPYBACK:%.+]] = VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x96x65x65xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 11, 65], [1, 96, 10, 65]], @@ -3887,7 +3922,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 15, 65], [1, 96, 16, 65], [1, 96, 14, 65], [1, 96, 13, 65], [1, 96, 14, 65], [1, 96, 15, 65]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 32, 0], [0, 0, 41, 0], [0, 0, 50, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<96x96x7x7xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7]], @@ -3895,7 +3930,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7], [96, 96, 7, 7]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_1_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_1_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<96x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : 
i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], @@ -3903,7 +3938,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4], [96, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[OUT_0_COPYBACK]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_1_CMX]]) @@ -3914,7 +3949,7 @@ func.func @ChainOpsToNCEClusteringSOHIncompatibleOutputOverlappedEnd(%arg0: tens //CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 4, 20], [1, 96, 4, 20], [1, 96, 3, 20], [1, 96, 3, 20], [1, 96, 3, 20], [1, 96, 3, 20]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 8, 0], [0, 0, 11, 0], [0, 0, 14, 0], [0, 0, 17, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] //CHECK: return [[OUT_1]] : tensor<1x96x20x20xf16, {order = #NHWC}> } @@ -3933,6 +3968,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConcatWithOverlappedInputsNCEConsumersMemViewUnion +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x28x28xf16, {order = #NHWC}> func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerConvType) -> (!ConcatOutputType, !ConvConsumerOutput0, !ConvConsumerOutput1) { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] @@ -3951,14 +3987,14 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo return %3, %4, %5 : 
!ConcatOutputType, !ConvConsumerOutput0, !ConvConsumerOutput1 - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS_0:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_1:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_2:%.*]] = const.Declare tensor<16x16x4x4xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS_0:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_1:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_2:%.+]] = const.Declare tensor<16x16x4x4xf16, {order = #NHWC}> //CONV 0 - //CHECK: [[INPUT_CMX_0:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_0:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -3967,7 +4003,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 6, 28], [1, 16, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -3976,7 +4012,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 
0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -3985,7 +4021,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_0]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -3998,11 +4034,11 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 7, 28], [1, 16, 9, 28], [1, 16, 8, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // CONV 1 - //CHECK: [[INPUT_CMX_1:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_1:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -4012,7 +4048,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 6, 28], [1, 16, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = 
VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4021,7 +4057,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4031,7 +4067,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_1]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4045,11 +4081,11 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // CONV 2 - //CHECK: [[INPUT_CMX_2:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_2:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -4059,7 +4095,7 @@ func.func 
@ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 6, 28], [1, 16, 5, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 23, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4068,7 +4104,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4078,7 +4114,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_2]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4092,17 +4128,17 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[OUT_2:%.*]] = 
VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] - //CHECK: [[CONCAT:%.*]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) + //CHECK: [[CONCAT:%.+]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) //CHECK-SAME{LITERAL}: static_offsets = [[0, 0, 0, 0], [0, 16, 0, 0], [0, 32, 0, 0]] //CHECK-SAME: tensor<1x16x28x28xf16, {order = #NHWC}>, tensor<1x16x28x28xf16, {order = #NHWC}>, tensor<1x16x28x28xf16, {order = #NHWC}> //CHECK-SAME: -> tensor<1x48x28x28xf16, {order = #NHWC}> //CONV 3 - //CHECK: [[INPUT_CMX_3:%.*]] = VPU.Copy([[OUT_0]]) + //CHECK: [[INPUT_CMX_3:%.+]] = VPU.Copy([[OUT_0]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4111,7 +4147,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 7, 28], [1, 16, 9, 28], [1, 16, 8, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[WEIGHTS_3_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_3_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4120,7 +4156,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN 
//CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4129,7 +4165,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_3_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_3_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_3]], //CHECK-SAME: [[WEIGHTS_3_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4144,7 +4180,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CONV 4 - //CHECK: [[INPUT_CMX_4:%.*]] = VPU.Copy([[OUT_2]] + //CHECK: [[INPUT_CMX_4:%.+]] = VPU.Copy([[OUT_2]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x28x28xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4153,7 +4189,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 7, 28], [1, 16, 9, 28], [1, 16, 8, 28], [1, 16, 7, 28], [1, 16, 7, 28], [1, 16, 7, 28]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 8, 0], [0, 0, 13, 0], [0, 0, 17, 0], [0, 0, 21, 0]] - //CHECK: [[WEIGHTS_4_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: [[WEIGHTS_4_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x4x4xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4162,7 +4198,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 4, 4], [16, 16, 4, 4], [16, 16, 4, 4], [16, 16, 4, 4], [16, 16, 4, 4], [16, 16, 4, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 
0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4171,7 +4207,7 @@ func.func @ConcatWithOverlappedInputsNCEConsumersMemViewUnion(%arg0: !ProducerCo //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_4]], //CHECK-SAME: [[WEIGHTS_4_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4199,6 +4235,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @ConcatWithOverlappedInputsCompatibleNCEConsumers +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x32x32xf16, {order = #NHWC}> func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConvType) -> (!ConcatOutputType, !ConvConsumerOutput0, !ConvConsumerOutput1) { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] @@ -4218,14 +4255,14 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv return %3, %4, %5 : !ConcatOutputType, !ConvConsumerOutput0, !ConvConsumerOutput1 - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS_0:%.*]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_1:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_2:%.*]] = const.Declare 
tensor<16x16x5x5xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS_0:%.+]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_1:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_2:%.+]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> //CONV 0 - //CHECK: [[INPUT_CMX_0:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_0:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -4235,7 +4272,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x1x1xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4244,7 +4281,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4253,7 +4290,7 @@ func.func 
@ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_0]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4266,11 +4303,11 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // CONV 1 - //CHECK: [[INPUT_CMX_1:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_1:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -4280,7 +4317,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4289,7 +4326,7 @@ func.func 
@ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4299,7 +4336,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_1]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4313,11 +4350,11 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // CONV 2 - //CHECK: [[INPUT_CMX_2:%.*]] = VPU.Copy(%arg0 + //CHECK: [[INPUT_CMX_2:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -4327,7 +4364,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 
20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4336,7 +4373,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4346,7 +4383,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_2]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4360,7 +4397,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] //CHECK: VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) @@ -4370,7 +4407,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CONV 3 - //CHECK: [[INPUT_CMX_3:%.*]] = VPU.Copy([[OUT_0]] + //CHECK: [[INPUT_CMX_3:%.+]] = 
VPU.Copy([[OUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4379,7 +4416,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_3_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_3_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4388,7 +4425,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4397,7 +4434,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_3_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_3_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: 
[[INPUT_CMX_3]], //CHECK-SAME: [[WEIGHTS_3_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4412,7 +4449,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CONV 4 - //CHECK: [[INPUT_CMX_4:%.*]] = VPU.Copy([[OUT_2]] + //CHECK: [[INPUT_CMX_4:%.+]] = VPU.Copy([[OUT_2]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4421,7 +4458,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_4_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: [[WEIGHTS_4_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4430,7 +4467,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -4439,7 +4476,7 @@ func.func @ConcatWithOverlappedInputsCompatibleNCEConsumers(%arg0: !ProducerConv //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], 
[16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_4]], //CHECK-SAME: [[WEIGHTS_4_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4462,6 +4499,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @NCEInterpolateToDistributedOpClustering +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x1x1xf16, {order = #NHWC}> func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, {order = #NHWC}>) -> tensor<1x16x2x2xf16, {order = #NHWC}> { %weights = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> @@ -4497,17 +4535,17 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { return %interpolate : tensor<1x16x2x2xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x16x2x2xi1> = dense : tensor<1x16x2x2xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 1, 1], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x16x2x2xi1> = dense : tensor<1x16x2x2xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType 
= f16, dataShape = [1, 16, 1, 1], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 16, 2, 2]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [16]} // CHECK-SAME: -> tensor<1x1x2x2xi32, {order = #NHWC}> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]] + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT]], [[INPUT_SM]], [[INPUT_SE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]] // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x16x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments @@ -4530,7 +4568,7 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { // CHECK-SAME: #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 16, 2, 2]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<16x16x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], @@ -4538,7 +4576,7 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { // CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: 
[[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -4546,7 +4584,7 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Interpolate( // CHECK-SAME: [[INPUT_CMX]], // CHECK-SAME: [[WEIGHTS_CMX]], // CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4557,7 +4595,7 @@ func.func @NCEInterpolateToDistributedOpClustering(%arg0: tensor<1x16x1x1xf16, { // CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 2, 2], [1, 16, 2, 2], [1, 16, 2, 2], [1, 16, 2, 2], [1, 16, 2, 2], [1, 16, 2, 2]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x16x2x2xf16, {order = #NHWC}> } @@ -4571,6 +4609,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @NCEInterpolateToDistributedOpSOK +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x64x5x10xf16, {order = #NHWC}> func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order = #NHWC}>) -> tensor<1x64x10x20xf16, {order = #NHWC}> { %weights = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<64x64x1x1xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare 
tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> @@ -4606,17 +4645,17 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order return %interpolate : tensor<1x64x10x20xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x64x1x1xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x64x1x1xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x64x10x20xi1, {order = #NHWC}> = dense : tensor<1x64x10x20xi1, {order = #NHWC}> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x64x10x20xi1, {order = #NHWC}> = dense : tensor<1x64x10x20xi1, {order = #NHWC}> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 64, 5, 10], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 64, 10, 20]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [64]} // CHECK-SAME: -> tensor<1x1x10x20xi32, {order = #NHWC}> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]] + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT]], [[INPUT_SM]], [[INPUT_SE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]] // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x64x5x10xf16, #NHWC, @CMX_NN, // 
CHECK-SAME: {mode = "DUPLICATED", num_clusters = 4 : i64, uniform_distributed_segments @@ -4639,7 +4678,7 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order // CHECK-SAME: #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 64, 10, 20]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<64x64x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 64, 1, 1], [16, 64, 1, 1], [16, 64, 1, 1], [16, 64, 1, 1]], @@ -4647,7 +4686,7 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order // CHECK-SAME{LITERAL}: memory_shapes = [[16, 64, 1, 1], [16, 64, 1, 1], [16, 64, 1, 1], [16, 64, 1, 1]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]] - // CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<64x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], @@ -4655,7 +4694,7 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[OUT_CMX:%.+]] = 
VPU.NCE.Interpolate( // CHECK-SAME: [[INPUT_CMX]], // CHECK-SAME: [[WEIGHTS_CMX]], // CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4666,7 +4705,7 @@ func.func @NCEInterpolateToDistributedOpSOK(%arg0: tensor<1x64x5x10xf16, {order // CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 10, 20], [1, 64, 10, 20], [1, 64, 10, 20], [1, 64, 10, 20]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x64x10x20xf16, {order = #NHWC}> } @@ -4681,6 +4720,7 @@ module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @BilinearNCEInterpolateToDistributedOpSOH +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x5x5xf16, {order = #NHWC}> func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, {order = #NHWC}>) -> tensor<1x16x10x10xf16, {order = #NHWC}> { %weights = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> @@ -4714,17 +4754,17 @@ func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, return %interpolate : tensor<1x16x10x10xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x3xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x16x12x12xi1> = dense : tensor<1x16x12x12xi1> - // CHECK: [[INPUT_SE:%.*]] 
= VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 5, 5], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x16x12x12xi1> = dense : tensor<1x16x12x12xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 5, 5], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 16, 12, 12]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [16]} // CHECK-SAME: -> tensor<1x1x12x12xi32, {order = #NHWC}> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]]) + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT]], [[INPUT_SM]], [[INPUT_SE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]]) // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x16x5x5xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments @@ -4747,7 +4787,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, // CHECK-SAME: #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 16, 12, 12]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 16, 3, 3], [16, 16, 3, 3]], @@ -4755,7 +4795,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3]], // 
CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], @@ -4763,7 +4803,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Interpolate( // CHECK-SAME: [[INPUT_CMX]], // CHECK-SAME: [[WEIGHTS_CMX]], // CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4774,7 +4814,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOH(%arg0: tensor<1x16x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 5, 10], [1, 16, 5, 10]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 5, 0]] - // CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x16x10x10xf16, {order = #NHWC}> } @@ -4828,18 +4868,18 @@ func.func @BilinearNCEInterpolateToDistributedOpSOHWithTiling(%arg0: tensor<1x16 return %interpolate : tensor<1x16x80x320xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<16x16x3x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x1xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<16x16x3x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x3x1xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> - // 
CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x16x82x320xi1> = dense : tensor<1x16x82x320xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 42, 320], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x16x82x320xi1> = dense : tensor<1x16x82x320xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 16, 42, 320], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 1.000000e+00], nearest_mode = , // CHECK-SAME: offsets = [0, 0, 2, 0], sizes = [1, 16, 82, 320], initial_input_shape = [1, 16, 160, 320], initial_output_shape = [1, 16, 320, 320]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [16]} // CHECK-SAME: -> tensor<1x1x82x320xi32, {order = #NHWC}> // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[ARG0]], [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]]) // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x16x42x320xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments @@ -4864,7 +4904,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOHWithTiling(%arg0: tensor<1x16 // CHECK-SAME: offsets = [0, 0, 2, 0], sizes = [1, 16, 82, 320], // CHECK-SAME: initial_input_shape = [1, 16, 160, 320], initial_output_shape = [1, 16, 320, 320]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 16, 3, 1], [16, 16, 3, 1]], @@ -4872,7 +4912,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOHWithTiling(%arg0: 
tensor<1x16 // CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 1], [16, 16, 3, 1]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], @@ -4880,7 +4920,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOHWithTiling(%arg0: tensor<1x16 // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Interpolate( // CHECK-SAME: [[INPUT_CMX]] // CHECK-SAME: [[WEIGHTS_CMX]] // CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -4891,7 +4931,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOHWithTiling(%arg0: tensor<1x16 // CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 40, 320], [1, 16, 40, 320]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 40, 0]] - // CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x16x80x320xf16, {order = #NHWC}> } @@ -4906,6 +4946,7 @@ module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @BilinearNCEInterpolateToDistributedOpSOK +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x32x5x5xf16, {order = #NHWC}> func.func @BilinearNCEInterpolateToDistributedOpSOK(%arg0: tensor<1x32x5x5xf16, {order = #NHWC}>) -> tensor<1x32x10x10xf16, {order = #NHWC}> { %weights = const.Declare tensor<32x32x4x4xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x4x4xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> @@ -4939,17 +4980,17 @@ func.func 
@BilinearNCEInterpolateToDistributedOpSOK(%arg0: tensor<1x32x5x5xf16, return %interpolate : tensor<1x32x10x10xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<32x32x4x4xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x32x4x4xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<32x32x4x4xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x32x4x4xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x32x22x22xi1> = dense : tensor<1x32x22x22xi1> - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 32, 5, 5], + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x32x22x22xi1> = dense : tensor<1x32x22x22xi1> + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 32, 5, 5], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 32, 22, 22]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [32]} // CHECK-SAME: -> tensor<1x1x22x22xi32, {order = #NHWC}> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]] + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT]], [[INPUT_SM]], [[INPUT_SE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]] // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x32x5x5xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments @@ -4972,7 +5013,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOK(%arg0: 
tensor<1x32x5x5xf16, // CHECK-SAME: #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 32, 22, 22]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<32x32x4x4xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 32, 4, 4], [16, 32, 4, 4]], @@ -4980,7 +5021,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOK(%arg0: tensor<1x32x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[16, 32, 4, 4], [16, 32, 4, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]] - // CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], @@ -4988,7 +5029,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOK(%arg0: tensor<1x32x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Interpolate( // CHECK-SAME: [[INPUT_CMX]], // CHECK-SAME: [[WEIGHTS_CMX]], // CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -4999,7 +5040,7 @@ func.func @BilinearNCEInterpolateToDistributedOpSOK(%arg0: tensor<1x32x5x5xf16, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 10, 10], [1, 32, 10, 10]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - 
// CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x32x10x10xf16, {order = #NHWC}> } @@ -5060,22 +5101,22 @@ func.func @OverlappedConvToOverlappedSEPOp(%input: tensor<1x16x30x30xf16, {order return %interpolate : tensor<1x32x60x60xf16, {order = #NHWC}> - // CHECK-DAG: [[CONV_WEIGHTS:%.*]] = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> - // CHECK-DAG: [[CONV_WEIGHTS_TABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> + // CHECK-DAG: [[CONV_WEIGHTS:%.+]] = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> + // CHECK-DAG: [[CONV_WEIGHTS_TABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> - // CHECK: [[CONV_INPUT_CMX:%.*]] = VPU.Copy([[INPUT]] + // CHECK: [[CONV_INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x16x30x30xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 15, 30], [1, 16, 15, 30]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 16, 30], [1, 16, 16, 30]], memory_offsets = [[0, 0, 0, 0], [0, 0, 14, 0]]} - // CHECK: [[CONV_WEIGHTS_CMX:%.*]] = VPU.Copy([[CONV_WEIGHTS]] + // CHECK: [[CONV_WEIGHTS_CMX:%.+]] = VPU.Copy([[CONV_WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<32x16x3x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]]} - // CHECK: [[CONV_WEIGHTS_TABLE_CMX:%.*]] = VPU.Copy([[CONV_WEIGHTS_TABLE]] + // CHECK: [[CONV_WEIGHTS_TABLE_CMX:%.+]] = VPU.Copy([[CONV_WEIGHTS_TABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, // 
CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]], @@ -5090,11 +5131,11 @@ func.func @OverlappedConvToOverlappedSEPOp(%input: tensor<1x16x30x30xf16, {order // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 15, 30], [1, 32, 15, 30]], compute_offsets = [[0, 0, 0, 0], [0, 0, 15, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 15, 30], [1, 32, 15, 30]], memory_offsets = [[0, 0, 0, 0], [0, 0, 15, 0]]} - // CHECK: [[CONV_DDR:%.*]] = VPU.Copy([[CONV_CMX]] + // CHECK: [[CONV_DDR:%.+]] = VPU.Copy([[CONV_CMX]] // CHECK-SAME: -> tensor<1x32x30x30xf16, {order = #NHWC}> - // CHECK-DAG: [[INTERP_INPUT_SM:%.*]] = const.Declare tensor<1x32x62x62xi1> = dense : tensor<1x32x62x62xi1> - // CHECK: [[INTERP_INPUT_SE:%.*]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 32, 30, 30], + // CHECK-DAG: [[INTERP_INPUT_SM:%.+]] = const.Declare tensor<1x32x62x62xi1> = dense : tensor<1x32x62x62xi1> + // CHECK: [[INTERP_INPUT_SE:%.+]] = VPU.StorageElementTable {dataElemType = f16, dataShape = [1, 32, 30, 30], // CHECK-SAME: seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 32, 62, 62]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [32]} @@ -5104,7 +5145,7 @@ func.func @OverlappedConvToOverlappedSEPOp(%input: tensor<1x16x30x30xf16, {order // CHECK-DAG: [[INTERP_WEIGHTS:%.+]] = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x32x3x3xf16>, [#const.Reorder<#NHWC>] // CHECK-DAG: [[INTERP_WEIGHTS_TABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> - // CHECK: [[INTER_INPUT_CMX:%.*]] = VPU.Copy([[INTERP_INPUT_SPARSE]] + // CHECK: [[INTER_INPUT_CMX:%.+]] = VPU.Copy([[INTERP_INPUT_SPARSE]] // CHECK-SAME: -> 
!VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x32x30x30xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments @@ -5121,19 +5162,19 @@ func.func @OverlappedConvToOverlappedSEPOp(%input: tensor<1x16x30x30xf16, {order // CHECK-SAME: #VPU.SEInterpolate, coordinate_transformation_mode = , // CHECK-SAME: scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00], nearest_mode = , offsets = [0, 0, 0, 0], sizes = [1, 32, 62, 62]>> - // CHECK: [[INTERP_WEIGHTS_CMX:%.*]] = VPU.Copy([[INTERP_WEIGHTS]] + // CHECK: [[INTERP_WEIGHTS_CMX:%.+]] = VPU.Copy([[INTERP_WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<32x32x3x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[32, 32, 3, 3], [32, 32, 3, 3]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[32, 32, 3, 3], [32, 32, 3, 3]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]]} - // CHECK: [[INTERP_WEIGHTS_TABLE_CMX:%.*]] = VPU.Copy([[INTERP_WEIGHTS_TABLE]] + // CHECK: [[INTERP_WEIGHTS_TABLE_CMX:%.+]] = VPU.Copy([[INTERP_WEIGHTS_TABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]]} - // CHECK: [[INTERP_CMX:%.*]] = VPU.NCE.Interpolate( + // CHECK: [[INTERP_CMX:%.+]] = VPU.NCE.Interpolate( // CHECK-SAME: [[INTER_INPUT_CMX]], // CHECK-SAME: [[INTERP_WEIGHTS_CMX]], // CHECK-SAME: [[INTERP_WEIGHTS_TABLE_CMX]] @@ -5142,7 +5183,7 @@ func.func @OverlappedConvToOverlappedSEPOp(%input: tensor<1x16x30x30xf16, {order // CHECK-SAME{LITERAL}: 
compute_shapes = [[1, 32, 30, 60], [1, 32, 30, 60]], compute_offsets = [[0, 0, 0, 0], [0, 0, 30, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 30, 60], [1, 32, 30, 60]], memory_offsets = [[0, 0, 0, 0], [0, 0, 30, 0]]} - // CHECK: [[INTERP_DDR:%.*]] = VPU.Copy([[INTERP_CMX]] + // CHECK: [[INTERP_DDR:%.+]] = VPU.Copy([[INTERP_CMX]] // CHECK: return [[INTERP_DDR]] : tensor<1x32x60x60xf16, {order = #NHWC}> } @@ -5157,6 +5198,7 @@ module @executors { IE.TileResource 2 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @SEPadToDistributedOpSOH +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x40x40xf16, {order = #NHWC}> func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC}>) -> tensor<1x32x20x20xf16, {order = #NHWC}> { %weights = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x16x3x3xf16>, [#const.Reorder<#NHWC>] %weights_table = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> @@ -5182,16 +5224,16 @@ func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC} return %conv : tensor<1x32x20x20xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS:%.*]] = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x3x3xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x3x3xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> - // CHECK-DAG: [[INPUT_SM:%.*]] = const.Declare tensor<1x16x42x42xi1, {order = #NHWC}> = dense<1> : tensor<1x16x42x42xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] - // CHECK: [[INPUT_SE:%.*]] = VPU.StorageElementTable { + // CHECK-DAG: [[INPUT_SM:%.+]] = const.Declare tensor<1x16x42x42xi1, {order = #NHWC}> = dense<1> : 
tensor<1x16x42x42xi8>, [#const.Reorder<#NHWC>, #const.CastElemType] + // CHECK: [[INPUT_SE:%.+]] = VPU.StorageElementTable { // CHECK-SAME: dataElemType = f16, dataShape = [1, 16, 40, 40], // CHECK-SAME: seAttr = #VPU.SEPadding, padding = [1, 1, 1, 1]>, // CHECK-SAME: seDepth = 1 : i64, seSize = [16]} -> tensor<1x1x42x42xi32, {order = #NHWC}> - // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor(%arg0, [[INPUT_SM]], [[INPUT_SE]]) - // CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT_SPARSE]] + // CHECK: [[INPUT_SPARSE:%.+]] = VPU.GroupSparseTensor([[INPUT]], [[INPUT_SM]], [[INPUT_SE]]) + // CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT_SPARSE]] // CHECK-SAME: -> !VPU.SparseTensor< // CHECK-SAME: data=!VPU.DistributedTensor<1x16x40x40xf16, #NHWC, @CMX_NN // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments @@ -5213,7 +5255,7 @@ func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 20, 0]] // CHECK-SAME: #VPU.SEPadding, padding = [1, 1, 1, 1]>> - // CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + // CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] // CHECK-SAME: -> !VPU.DistributedTensor<32x16x3x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]] @@ -5221,7 +5263,7 @@ func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]] // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + // CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] // CHECK-SAME: -> !VPU.DistributedTensor<32x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments // 
CHECK-SAME{LITERAL}: compute_shapes = [[32, 1, 1, 4], [32, 1, 1, 4]] @@ -5229,7 +5271,7 @@ func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[32, 1, 1, 4], [32, 1, 1, 4]] // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Convolution( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Convolution( // CHECK-SAME: [[INPUT_CMX]] // CHECK-SAME: [[WEIGHTS_CMX]] // CHECK-SAME: [[WEIGHTSTABLE_CMX]] @@ -5240,7 +5282,7 @@ func.func @SEPadToDistributedOpSOH(%arg0: tensor<1x16x40x40xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 10, 20], [1, 32, 10, 20]] // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 10, 0]] - // CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + // CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] // CHECK: return [[OUT]] : tensor<1x32x20x20xf16, {order = #NHWC}> } @@ -5255,7 +5297,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @SliceConvConcatGeluSOK - +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x80x1x3008xf16, {order = #NHWC}> func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC}>) -> tensor<1x512x1x3000xf16, {order = #NHWC}> { %weights_0 = const.Declare tensor<256x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x80x1x3xf16>, [#const.Reorder<#NHWC>] %weights_table_0 = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> @@ -5277,13 +5319,13 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} %8 = VPU.Concat(%5, %7) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1500]]} : tensor<1x512x1x1500xf16, {order = #NHWC}>, tensor<1x512x1x1500xf16, {order = #NHWC}> -> tensor<1x512x1x3000xf16, {order = #NHWC}> return %8 : tensor<1x512x1x3000xf16, {order = #NHWC}> - // CHECK-DAG: [[WEIGHTS_0:%.*]] = const.Declare tensor<256x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x80x1x3xf16>, 
[#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE_0:%.*]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> - // CHECK-DAG: [[WEIGHTS_1:%.*]] = const.Declare tensor<256x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x80x1x3xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTSTABLE_1:%.*]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS_0:%.+]] = const.Declare tensor<256x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x80x1x3xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE_0:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> + // CHECK-DAG: [[WEIGHTS_1:%.+]] = const.Declare tensor<256x80x1x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x80x1x3xf16>, [#const.Reorder<#NHWC>] + // CHECK-DAG: [[WEIGHTSTABLE_1:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<10> : tensor<256x1x1x4xsi32> - // CHECK: [[CONV_INPUT:%.*]] = VPU.Slice %arg0 [0, 0, 0, 0] [1, 80, 1, 3000] : tensor<1x80x1x3008xf16, {order = #NHWC}> to tensor<1x80x1x3000xf16, {order = #NHWC}> - // CHECK: [[CONV0_INPUT_CMX:%.*]] = VPU.Copy([[CONV_INPUT]] + // CHECK: [[CONV_INPUT:%.+]] = VPU.Slice [[INPUT]] [0, 0, 0, 0] [1, 80, 1, 3000] : tensor<1x80x1x3008xf16, {order = #NHWC}> to tensor<1x80x1x3000xf16, {order = #NHWC}> + // CHECK: [[CONV0_INPUT_CMX:%.+]] = VPU.Copy([[CONV_INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x80x1x3000xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], @@ -5291,7 +5333,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], 
// CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[CONV0_WEIGHT_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + // CHECK: [[CONV0_WEIGHT_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] // CHECK-SAME: -> !VPU.DistributedTensor<256x80x1x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [32, 80, 1, 3], [32, 80, 1, 3]], @@ -5299,7 +5341,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [32, 80, 1, 3], [32, 80, 1, 3]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [48, 0, 0, 0], [96, 0, 0, 0], [144, 0, 0, 0], [192, 0, 0, 0], [224, 0, 0, 0]] - // CHECK: [[CONV0_WEIGHTTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE_0]] + // CHECK: [[CONV0_WEIGHTTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE_0]] // CHECK-SAME: -> !VPU.DistributedTensor<256x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], @@ -5307,7 +5349,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [48, 0, 0, 0], [96, 0, 0, 0], [144, 0, 0, 0], [192, 0, 0, 0], [224, 0, 0, 0]] - // CHECK: [[CONV0:%.*]] = VPU.NCE.Convolution([[CONV0_INPUT_CMX]], + // CHECK: [[CONV0:%.+]] = VPU.NCE.Convolution([[CONV0_INPUT_CMX]], // CHECK: 
[[CONV0_WEIGHT_CMX]], // CHECK: [[CONV0_WEIGHTTABLE_CMX]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x256x1x3000xf16, #NHWC, @CMX_NN, @@ -5317,10 +5359,10 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 1, 3000], [1, 48, 1, 3000], [1, 48, 1, 3000], [1, 48, 1, 3000], [1, 32, 1, 3000], [1, 32, 1, 3000]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 48, 0, 0], [0, 96, 0, 0], [0, 144, 0, 0], [0, 192, 0, 0], [0, 224, 0, 0]] - // CHECK: [[CONV0_OUTPUT:%.*]] = VPU.Copy([[CONV0]] + // CHECK: [[CONV0_OUTPUT:%.+]] = VPU.Copy([[CONV0]] - // CHECK: [[CONV1_INPUT_CMX:%.*]] = VPU.Copy([[CONV_INPUT]] + // CHECK: [[CONV1_INPUT_CMX:%.+]] = VPU.Copy([[CONV_INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x80x1x3000xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], @@ -5328,7 +5370,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000], [1, 80, 1, 3000]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - // CHECK: [[CONV1_WEIGHT_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + // CHECK: [[CONV1_WEIGHT_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] // CHECK-SAME: -> !VPU.DistributedTensor<256x80x1x3xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [32, 80, 1, 3], [32, 80, 1, 3]], @@ -5336,7 +5378,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: 
tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [48, 80, 1, 3], [32, 80, 1, 3], [32, 80, 1, 3]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [48, 0, 0, 0], [96, 0, 0, 0], [144, 0, 0, 0], [192, 0, 0, 0], [224, 0, 0, 0]] - // CHECK: [[CONV1_WEIGHTTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE_1]] + // CHECK: [[CONV1_WEIGHTTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE_1]] // CHECK-SAME: -> !VPU.DistributedTensor<256x1x1x4xsi32, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], @@ -5344,7 +5386,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [48, 1, 1, 4], [32, 1, 1, 4], [32, 1, 1, 4]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [48, 0, 0, 0], [96, 0, 0, 0], [144, 0, 0, 0], [192, 0, 0, 0], [224, 0, 0, 0]] - // CHECK: [[CONV1:%.*]] = VPU.NCE.Convolution([[CONV1_INPUT_CMX]], + // CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[CONV1_INPUT_CMX]], // CHECK: [[CONV1_WEIGHT_CMX]], // CHECK: [[CONV1_WEIGHTTABLE_CMX]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x256x1x3000xf16, #NHWC, @CMX_NN, @@ -5354,18 +5396,18 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 1, 3000], [1, 48, 1, 3000], [1, 48, 1, 3000], [1, 48, 1, 3000], [1, 32, 1, 3000], [1, 32, 1, 3000]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 48, 0, 0], [0, 96, 0, 0], [0, 144, 0, 0], [0, 192, 0, 0], [0, 224, 0, 0]] - // CHECK: [[CONV1_OUTPUT:%.*]] = VPU.Copy([[CONV1]] + // CHECK: [[CONV1_OUTPUT:%.+]] = VPU.Copy([[CONV1]] - // CHECK: [[CONV_CONCAT:%.*]] = 
VPU.Concat([[CONV0_OUTPUT]], [[CONV1_OUTPUT]]) {static_offsets = [ + // CHECK: [[CONV_CONCAT:%.+]] = VPU.Concat([[CONV0_OUTPUT]], [[CONV1_OUTPUT]]) {static_offsets = [ // CHECK-SAME: [0, 0, 0, 0], [0, 256, 0, 0] // CHECK-SAME: ]} : // CHECK-SAME: tensor<1x256x1x3000xf16, {order = #NHWC}>, // CHECK-SAME: tensor<1x256x1x3000xf16, {order = #NHWC}> -> tensor<1x512x1x3000xf16, {order = #NHWC}> - // CHECK: [[GELU_0_SLICE:%.*]] = VPU.Slice [[CONV_CONCAT]] [0, 0, 0, 0] [1, 512, 1, 1500] : + // CHECK: [[GELU_0_SLICE:%.+]] = VPU.Slice [[CONV_CONCAT]] [0, 0, 0, 0] [1, 512, 1, 1500] : // CHECK-SAME: tensor<1x512x1x3000xf16, {order = #NHWC}> to tensor<1x512x1x1500xf16, {order = #NHWC}> - // CHECK: [[GELU_0_INPUT:%.*]] = VPU.Copy([[GELU_0_SLICE]] + // CHECK: [[GELU_0_INPUT:%.+]] = VPU.Copy([[GELU_0_SLICE]] // CHECK-SAME: -> !VPU.DistributedTensor<1x512x1x1500xf16, #NHWC, @CMX_NN, /// CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], @@ -5373,7 +5415,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 96, 0, 0], [0, 192, 0, 0], [0, 272, 0, 0], [0, 352, 0, 0], [0, 432, 0, 0]] - // CHECK: [[GELU_0:%.*]] = VPU.Gelu([[GELU_0_INPUT]]) + // CHECK: [[GELU_0:%.+]] = VPU.Gelu([[GELU_0_INPUT]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x512x1x1500xf16, #NHWC, @CMX_NN, /// CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 
1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], @@ -5381,13 +5423,13 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 96, 0, 0], [0, 192, 0, 0], [0, 272, 0, 0], [0, 352, 0, 0], [0, 432, 0, 0]] - // CHECK: [[GELU_0_OUTPUT:%.*]] = VPU.Copy([[GELU_0]] + // CHECK: [[GELU_0_OUTPUT:%.+]] = VPU.Copy([[GELU_0]] // CHECK-SAME: -> tensor<1x512x1x1500xf16, {order = #NHWC}> - // CHECK: [[GELU_1_SLICE:%.*]] = VPU.Slice [[CONV_CONCAT]] [0, 0, 0, 1500] [1, 512, 1, 1500] : + // CHECK: [[GELU_1_SLICE:%.+]] = VPU.Slice [[CONV_CONCAT]] [0, 0, 0, 1500] [1, 512, 1, 1500] : // CHECK-SAME: tensor<1x512x1x3000xf16, {order = #NHWC}> to tensor<1x512x1x1500xf16, {order = #NHWC}> - // CHECK: [[GELU_1_INPUT:%.*]] = VPU.Copy([[GELU_1_SLICE]] + // CHECK: [[GELU_1_INPUT:%.+]] = VPU.Copy([[GELU_1_SLICE]] // CHECK-SAME: -> !VPU.DistributedTensor<1x512x1x1500xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], @@ -5395,7 +5437,7 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 96, 0, 0], [0, 192, 0, 0], [0, 272, 0, 0], [0, 352, 0, 0], [0, 432, 0, 0]] - // CHECK: [[GELU_1:%.*]] = VPU.Gelu([[GELU_1_INPUT]]) + // CHECK: [[GELU_1:%.+]] = VPU.Gelu([[GELU_1_INPUT]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x512x1x1500xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = 
"SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], @@ -5403,9 +5445,9 @@ func.func @SliceConvConcatGeluSOK(%arg0: tensor<1x80x1x3008xf16, {order = #NHWC} // CHECK-SAME{LITERAL}: memory_shapes = [[1, 96, 1, 1500], [1, 96, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500], [1, 80, 1, 1500]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 96, 0, 0], [0, 192, 0, 0], [0, 272, 0, 0], [0, 352, 0, 0], [0, 432, 0, 0]] - // CHECK: [[GELU_1_OUTPUT:%.*]] = VPU.Copy([[GELU_1]] + // CHECK: [[GELU_1_OUTPUT:%.+]] = VPU.Copy([[GELU_1]] - // CHECK: [[GELU_CONCAT:%.*]] = VPU.Concat([[GELU_0_OUTPUT]], [[GELU_1_OUTPUT]]) {static_offsets = [ + // CHECK: [[GELU_CONCAT:%.+]] = VPU.Concat([[GELU_0_OUTPUT]], [[GELU_1_OUTPUT]]) {static_offsets = [ // CHECK-SAME: [0, 0, 0, 0], [0, 0, 0, 1500] // CHECK: ]} : tensor<1x512x1x1500xf16, {order = #NHWC}>, tensor<1x512x1x1500xf16, {order = #NHWC}> -> tensor<1x512x1x3000xf16, {order = #NHWC}> @@ -5426,7 +5468,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @OverlappedThroughConcatWithCompatibleNCEConsumers -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x16x32x32xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x16x32x32xf16, {order = #NHWC}>) func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerConvType) -> (!ConvConsumerOutput0, !ConvConsumerOutput1) { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] @@ -5448,16 +5490,16 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon return %4, %5 : !ConvConsumerOutput0, !ConvConsumerOutput1 - //CHECK: 
[[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS_0:%.*]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_1:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_2:%.*]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_3:%.*]] = const.Declare tensor<16x48x7x7xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_4:%.*]] = const.Declare tensor<16x48x5x5xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS_0:%.+]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_1:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_2:%.+]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_3:%.+]] = const.Declare tensor<16x48x7x7xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_4:%.+]] = const.Declare tensor<16x48x5x5xf16, {order = #NHWC}> //CONV 0 - //CHECK: [[INPUT_CMX_0:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_0:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5467,7 +5509,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x1x1xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5476,7 +5518,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon 
//CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5485,7 +5527,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_0]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5498,11 +5540,11 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 9, 32], [1, 16, 12, 32], [1, 16, 11, 32], [1, 16, 11, 32], [1, 16, 11, 32], [1, 16, 8, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // CONV 1 - //CHECK: [[INPUT_CMX_1:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_1:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5512,7 +5554,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = 
[[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5521,7 +5563,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5531,7 +5573,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_1]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5545,11 +5587,11 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // CONV 2 - //CHECK: [[INPUT_CMX_2:%.*]] = 
VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_2:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5559,7 +5601,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5568,7 +5610,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5578,7 +5620,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_2]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5592,17 +5634,17 @@ func.func 
@OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 24, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] - //CHECK: [[CONCAT:%.*]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) + //CHECK: [[CONCAT:%.+]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) //CHECK-SAME{LITERAL}: static_offsets = [[0, 0, 0, 0], [0, 16, 0, 0], [0, 32, 0, 0]] //CHECK-SAME: tensor<1x16x32x32xf16, {order = #NHWC}>, tensor<1x16x32x32xf16, {order = #NHWC}>, tensor<1x16x32x32xf16, {order = #NHWC}> //CHECK-SAME: -> tensor<1x48x32x32xf16, {order = #NHWC}> //CONV 3 - //CHECK: [[INPUT_CMX_3:%.*]] = VPU.Copy([[CONCAT]] + //CHECK: [[INPUT_CMX_3:%.+]] = VPU.Copy([[CONCAT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x48x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5612,7 +5654,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 9, 32], [1, 48, 12, 32], [1, 48, 11, 32], [1, 48, 11, 32], [1, 48, 11, 32], [1, 48, 8, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 24, 0]] - //CHECK: [[WEIGHTS_3_CMX:%.*]] = VPU.Copy([[WEIGHTS_3]] + //CHECK: [[WEIGHTS_3_CMX:%.+]] = VPU.Copy([[WEIGHTS_3]] //CHECK-SAME: -> !VPU.DistributedTensor<16x48x7x7xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5621,7 +5663,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 48, 7, 7], [16, 48, 7, 7], [16, 48, 7, 7], [16, 48, 7, 7], [16, 48, 7, 7], [16, 48, 7, 7]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5630,7 +5672,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_3_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_3_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_3]], //CHECK-SAME: [[WEIGHTS_3_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5643,11 +5685,11 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 32], [1, 16, 6, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_3:%.*]] = VPU.Copy([[OUT_3_CMX]] + //CHECK: [[OUT_3:%.+]] = VPU.Copy([[OUT_3_CMX]] //CONV 4 - //CHECK: [[INPUT_CMX_4:%.*]] = VPU.Copy([[CONCAT]] + //CHECK: [[INPUT_CMX_4:%.+]] = VPU.Copy([[CONCAT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x48x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5657,7 +5699,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 9, 32], [1, 48, 12, 32], [1, 48, 11, 32], [1, 48, 11, 32], [1, 48, 11, 32], [1, 48, 8, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 9, 0], [0, 0, 14, 0], [0, 0, 19, 0], [0, 0, 24, 0]] - //CHECK: 
[[WEIGHTS_4_CMX:%.*]] = VPU.Copy([[WEIGHTS_4]] + //CHECK: [[WEIGHTS_4_CMX:%.+]] = VPU.Copy([[WEIGHTS_4]] //CHECK-SAME: -> !VPU.DistributedTensor<16x48x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5666,7 +5708,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 48, 5, 5], [16, 48, 5, 5], [16, 48, 5, 5], [16, 48, 5, 5], [16, 48, 5, 5], [16, 48, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5675,7 +5717,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_4_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_4_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_4]], //CHECK-SAME: [[WEIGHTS_4_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5688,7 +5730,7 @@ func.func @OverlappedThroughConcatWithCompatibleNCEConsumers(%arg0: !ProducerCon //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 32], [1, 16, 6, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_4:%.*]] = VPU.Copy([[OUT_4_CMX]] + //CHECK: [[OUT_4:%.+]] = VPU.Copy([[OUT_4_CMX]] } } @@ -5706,7 +5748,7 @@ 
module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @IncompatibleConcatOverlappedWithNCEConsumers -// CHECK-SAME: ([[ARG0:%.*]]: tensor<1x16x32x32xf16, {order = #NHWC}>) +// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x16x32x32xf16, {order = #NHWC}>) func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType) -> (!ConvConsumerOutput0, !ConvConsumerOutput1) { %cst = const.Declare tensor<16x1x1x4xsi32> = dense<10> : tensor<16x1x1x4xsi32> %cst_0 = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] @@ -5728,16 +5770,16 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType return %4, %5 : !ConvConsumerOutput0, !ConvConsumerOutput1 - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<16x1x1x4xsi32> - //CHECK: [[WEIGHTS_0:%.*]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_1:%.*]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_2:%.*]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_3:%.*]] = const.Declare tensor<16x16x7x7xf16, {order = #NHWC}> - //CHECK: [[WEIGHTS_4:%.*]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<16x1x1x4xsi32> + //CHECK: [[WEIGHTS_0:%.+]] = const.Declare tensor<16x16x1x1xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_1:%.+]] = const.Declare tensor<16x16x3x3xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_2:%.+]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_3:%.+]] = const.Declare tensor<16x16x7x7xf16, {order = #NHWC}> + //CHECK: [[WEIGHTS_4:%.+]] = const.Declare tensor<16x16x5x5xf16, {order = #NHWC}> //CONV 0 - //CHECK: [[INPUT_CMX_0:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_0:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles 
= [1, 1, 6, 1] @@ -5747,7 +5789,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_0_CMX:%.*]] = VPU.Copy([[WEIGHTS_0]] + //CHECK: [[WEIGHTS_0_CMX:%.+]] = VPU.Copy([[WEIGHTS_0]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x1x1xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5756,7 +5798,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1], [16, 16, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5765,7 +5807,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_0_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_0_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_0]], //CHECK-SAME: [[WEIGHTS_0_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5778,11 +5820,11 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: 
!ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 6, 32], [1, 16, 6, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32], [1, 16, 5, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_0:%.*]] = VPU.Copy([[OUT_0_CMX]] + //CHECK: [[OUT_0:%.+]] = VPU.Copy([[OUT_0_CMX]] // CONV 1 - //CHECK: [[INPUT_CMX_1:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_1:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5792,7 +5834,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_1_CMX:%.*]] = VPU.Copy([[WEIGHTS_1]] + //CHECK: [[WEIGHTS_1_CMX:%.+]] = VPU.Copy([[WEIGHTS_1]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x3x3xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5801,7 +5843,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5811,7 +5853,7 @@ func.func 
@IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_1_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_1_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_1]], //CHECK-SAME: [[WEIGHTS_1_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5825,11 +5867,11 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_1:%.*]] = VPU.Copy([[OUT_1_CMX]] + //CHECK: [[OUT_1:%.+]] = VPU.Copy([[OUT_1_CMX]] // CONV 2 - //CHECK: [[INPUT_CMX_2:%.*]] = VPU.Copy([[ARG0]] + //CHECK: [[INPUT_CMX_2:%.+]] = VPU.Copy([[ARG0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5839,7 +5881,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 8, 32], [1, 16, 10, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 9, 32], [1, 16, 7, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 4, 0], [0, 0, 10, 0], [0, 0, 15, 0], [0, 0, 20, 0], [0, 0, 25, 0]] - //CHECK: [[WEIGHTS_2_CMX:%.*]] = VPU.Copy([[WEIGHTS_2]] + //CHECK: [[WEIGHTS_2_CMX:%.+]] = VPU.Copy([[WEIGHTS_2]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5848,7 +5890,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], 
[0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5858,7 +5900,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_2_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_2_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_2]], //CHECK-SAME: [[WEIGHTS_2_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5872,17 +5914,17 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 6, 0], [0, 0, 12, 0], [0, 0, 17, 0], [0, 0, 22, 0], [0, 0, 27, 0]] - //CHECK: [[OUT_2:%.*]] = VPU.Copy([[OUT_2_CMX]] + //CHECK: [[OUT_2:%.+]] = VPU.Copy([[OUT_2_CMX]] - //CHECK: [[CONCAT:%.*]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) + //CHECK: [[CONCAT:%.+]] = VPU.Concat([[OUT_0]], [[OUT_1]], [[OUT_2]]) //CHECK-SAME{LITERAL}: static_offsets = [[0, 0, 0, 0], [0, 0, 32, 0], [0, 0, 64, 0]] //CHECK-SAME: tensor<1x16x32x32xf16, {order = #NHWC}>, tensor<1x16x32x32xf16, {order = #NHWC}>, tensor<1x16x32x32xf16, {order = #NHWC}> //CHECK-SAME: -> tensor<1x16x96x32xf16, {order = #NHWC}> //CONV 3 - //CHECK: [[INPUT_CMX_3:%.*]] = VPU.Copy([[CONCAT]] + //CHECK: [[INPUT_CMX_3:%.+]] = VPU.Copy([[CONCAT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x96x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5892,7 +5934,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 19, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 19, 
32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 13, 0], [0, 0, 29, 0], [0, 0, 45, 0], [0, 0, 61, 0], [0, 0, 77, 0]] - //CHECK: [[WEIGHTS_3_CMX:%.*]] = VPU.Copy([[WEIGHTS_3]] + //CHECK: [[WEIGHTS_3_CMX:%.+]] = VPU.Copy([[WEIGHTS_3]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x7x7xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5901,7 +5943,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 7, 7], [16, 16, 7, 7], [16, 16, 7, 7], [16, 16, 7, 7], [16, 16, 7, 7], [16, 16, 7, 7]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5910,7 +5952,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_3_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_3_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_3]], //CHECK-SAME: [[WEIGHTS_3_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5923,11 +5965,11 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 16, 0], [0, 0, 32, 0], [0, 0, 48, 0], 
[0, 0, 64, 0], [0, 0, 80, 0]] - //CHECK: [[OUT_3:%.*]] = VPU.Copy([[OUT_3_CMX]] + //CHECK: [[OUT_3:%.+]] = VPU.Copy([[OUT_3_CMX]] //CONV 4 - //CHECK: [[INPUT_CMX_4:%.*]] = VPU.Copy([[CONCAT]]) + //CHECK: [[INPUT_CMX_4:%.+]] = VPU.Copy([[CONCAT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x96x32xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "OVERLAPPED" //CHECK-SAME: num_tiles = [1, 1, 6, 1] @@ -5937,7 +5979,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 19, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 22, 32], [1, 16, 19, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 13, 0], [0, 0, 29, 0], [0, 0, 45, 0], [0, 0, 61, 0], [0, 0, 77, 0]] - //CHECK: [[WEIGHTS_4_CMX:%.*]] = VPU.Copy([[WEIGHTS_4]] + //CHECK: [[WEIGHTS_4_CMX:%.+]] = VPU.Copy([[WEIGHTS_4]] //CHECK-SAME: -> !VPU.DistributedTensor<16x16x5x5xf16, #NHWC, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5946,7 +5988,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5], [16, 16, 5, 5]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<16x1x1x4xsi32, #NCHW, @CMX_NN //CHECK-SAME: mode = "DUPLICATED" //CHECK-SAME: num_clusters = 6 : i64, uniform_distributed_segments @@ -5955,7 +5997,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 
0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_4_CMX:%.*]] = VPU.NCE.Convolution( + //CHECK: [[OUT_4_CMX:%.+]] = VPU.NCE.Convolution( //CHECK-SAME: [[INPUT_CMX_4]], //CHECK-SAME: [[WEIGHTS_4_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -5968,7 +6010,7 @@ func.func @IncompatibleConcatOverlappedWithNCEConsumers(%arg0: !ProducerConvType //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32], [1, 16, 16, 32]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 16, 0], [0, 0, 32, 0], [0, 0, 48, 0], [0, 0, 64, 0], [0, 0, 80, 0]] - //CHECK: [[OUT_4:%.*]] = VPU.Copy([[OUT_4_CMX]] + //CHECK: [[OUT_4:%.+]] = VPU.Copy([[OUT_4_CMX]] //CHECK-SAME: -> tensor<1x16x96x32xf16, {order = #NHWC}> } @@ -5995,10 +6037,10 @@ func.func @CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order return %compressConv : tensor<6x64x112x112xf16, {order = #NHWC}> - //CHECK: [[WEIGHTSTABLE:%.*]] = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> - //CHECK: [[WEIGHTS:%.*]] = const.Declare tensor<64x1x1x160xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x1x1x160xf16>, [#const.Reorder<#NHWC>] + //CHECK: [[WEIGHTSTABLE:%.+]] = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> + //CHECK: [[WEIGHTS:%.+]] = const.Declare tensor<64x1x1x160xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x1x1x160xf16>, [#const.Reorder<#NHWC>] - //CHECK: [[INPUT_CMX:%.*]] = VPU.Copy([[INPUT]] + //CHECK: [[INPUT_CMX:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<6x4x224x224xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [6, 1, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments //CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224]], @@ -6006,7 +6048,7 @@ func.func 
@CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224], [1, 4, 224, 224]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0], [4, 0, 0, 0], [5, 0, 0, 0]] - //CHECK: [[WEIGHTS_CMX:%.*]] = VPU.Copy([[WEIGHTS]] + //CHECK: [[WEIGHTS_CMX:%.+]] = VPU.Copy([[WEIGHTS]] //CHECK-SAME: -> !VPU.DistributedTensor<64x1x1x160xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160]], @@ -6014,7 +6056,7 @@ func.func @CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160], [64, 1, 1, 160]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[WEIGHTSTABLE_CMX:%.*]] = VPU.Copy([[WEIGHTSTABLE]] + //CHECK: [[WEIGHTSTABLE_CMX:%.+]] = VPU.Copy([[WEIGHTSTABLE]] //CHECK-SAME: -> !VPU.DistributedTensor<64x1x1x4xsi32, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4]], @@ -6022,7 +6064,7 @@ func.func @CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4], [64, 1, 1, 4]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUT_CMX:%.*]] = VPU.NCE.CompressConvolution( + //CHECK: [[OUT_CMX:%.+]] = 
VPU.NCE.CompressConvolution( //CHECK-SAME: [[INPUT_CMX]] //CHECK-SAME: [[WEIGHTS_CMX]], //CHECK-SAME: [[WEIGHTSTABLE_CMX]]) @@ -6033,7 +6075,7 @@ func.func @CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 112, 112], [1, 64, 112, 112], [1, 64, 112, 112], [1, 64, 112, 112], [1, 64, 112, 112], [1, 64, 112, 112]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0], [4, 0, 0, 0], [5, 0, 0, 0]] - //CHECK: [[OUT:%.*]] = VPU.Copy([[OUT_CMX]] + //CHECK: [[OUT:%.+]] = VPU.Copy([[OUT_CMX]] //CHECK: return [[OUT]] : tensor<6x64x112x112xf16, {order = #NHWC}> } @@ -6048,6 +6090,8 @@ func.func @CompressConvToDistributedOpSOB(%arg0: tensor<6x4x224x224xf16, {order module @Permute { IE.TileResource 2 of @NCE at 1.300000e+03 MHz +// CHECK-LABEL: @NCEPermuteCompressConv +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x3x224x224xf16> func.func @NCEPermuteCompressConv(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16x112x112xf16, {order = #NHWC}> { %WEIGHTS = const.Declare tensor<16x1x1x48x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x1x1x48xf16>, [ @@ -6077,7 +6121,7 @@ func.func @NCEPermuteCompressConv(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16 return %1 : tensor<1x16x112x112xf16, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy(%arg0 + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x3x224x224xf16, #NCHW, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 3, 112, 224], [1, 3, 112, 224]], @@ -6085,7 +6129,7 @@ func.func @NCEPermuteCompressConv(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16 // CHECK-SAME{LITERAL}: memory_shapes = [[1, 3, 112, 224], [1, 3, 112, 224]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 112, 0]]}> - // CHECK: [[NCE_PERMUTE:%.*]] = 
VPU.NCE.Permute([[COPY_INPUT]] + // CHECK: [[NCE_PERMUTE:%.+]] = VPU.NCE.Permute([[COPY_INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x4x224x224x!qElemType, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 4, 112, 224], [1, 4, 112, 224]], @@ -6112,6 +6156,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-LABEL: @UnrollSOKAveragePoolInputDuplicatedOutputSegmented +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1x320x1xf16> func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1x320x1xf16>) -> tensor<1x320x1x1xf16, {order = #NHWC}> { %mvn = VPU.MVN(%input) {across_channels = false, eps = 9.9999997473787516E-6 : f64, multiClusterStrategy = #VPU.multi_cluster_strategy, normalize_variance = true} : tensor<1x1x320x1xf16> -> tensor<1x1x320x1xf16> @@ -6134,7 +6179,7 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 // (DUP) MVN (DUP) -> (DUP) AveragePool (SEG) -> (SEG) Sigmoid - //CHECK: [[MVN_COPY_IN:%.*]] = VPU.Copy(%arg0 + //CHECK: [[MVN_COPY_IN:%.+]] = VPU.Copy([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x320x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], @@ -6142,7 +6187,7 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[MVN:%.*]] = VPU.MVN([[MVN_COPY_IN]] + //CHECK: [[MVN:%.+]] = VPU.MVN([[MVN_COPY_IN]] //CHECK-SAME: 
!VPU.DistributedTensor<1x1x320x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], @@ -6150,15 +6195,15 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1], [1, 1, 320, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[MVN_COPY_OUT:%.*]] = VPU.Copy([[MVN]] + //CHECK: [[MVN_COPY_OUT:%.+]] = VPU.Copy([[MVN]] //CHECK-SAME: -> tensor<1x1x320x1xf16> - //CHECK: [[RESHAPE:%.*]] = VPU.AffineReshape([[MVN_COPY_OUT]]) + //CHECK: [[RESHAPE:%.+]] = VPU.AffineReshape([[MVN_COPY_OUT]]) //CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [1], [2, 3]], shape_value = [1, 320, 1, 1]} : tensor<1x1x320x1xf16> -> tensor<1x320x1x1xf16> - //CHECK: [[CAST:%.*]] = VPU.PermuteCast([[RESHAPE]]) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x320x1x1xf16> -> tensor<1x320x1x1xf16, {order = #NHWC}> + //CHECK: [[CAST:%.+]] = VPU.PermuteCast([[RESHAPE]]) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x320x1x1xf16> -> tensor<1x320x1x1xf16, {order = #NHWC}> - //CHECK: [[AVERAGEPOOL_INPUT_COPY_IN:%.*]] = VPU.Copy([[CAST]] + //CHECK: [[AVERAGEPOOL_INPUT_COPY_IN:%.+]] = VPU.Copy([[CAST]] //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1]], @@ -6166,17 +6211,17 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 
1], [1, 320, 1, 1], [1, 320, 1, 1], [1, 320, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[AVERAGEPOOL:%.*]] = VPU.NCE.AveragePool([[AVERAGEPOOL_INPUT_COPY_IN]] + //CHECK: [[AVERAGEPOOL:%.+]] = VPU.NCE.AveragePool([[AVERAGEPOOL_INPUT_COPY_IN]] //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]], //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[AVERAGEPOOL_COPY_OUT:%.*]] = VPU.Copy([[AVERAGEPOOL]] + //CHECK: [[AVERAGEPOOL_COPY_OUT:%.+]] = VPU.Copy([[AVERAGEPOOL]] //CHECK-SAME: -> tensor<1x320x1x1xf16, {order = #NHWC}> - //CHECK: [[SIGMOID_COPY_IN:%.*]] = VPU.Copy([[AVERAGEPOOL_COPY_OUT]] + //CHECK: [[SIGMOID_COPY_IN:%.+]] = VPU.Copy([[AVERAGEPOOL_COPY_OUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], @@ -6184,7 +6229,7 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], 
//CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[SIGMOID:%.*]] = VPU.Sigmoid([[SIGMOID_COPY_IN]] + //CHECK: [[SIGMOID:%.+]] = VPU.Sigmoid([[SIGMOID_COPY_IN]] //CHECK-SAME: -> !VPU.DistributedTensor<1x320x1x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, alignment = [1, 16, 1, 1], uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], @@ -6192,7 +6237,7 @@ func.func @UnrollSOKAveragePoolInputDuplicatedOutputSegmented(%input: tensor<1x1 //CHECK-SAME{LITERAL}: memory_shapes = [[1, 64, 1, 1], [1, 64, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1], [1, 48, 1, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 64, 0, 0], [0, 128, 0, 0], [0, 176, 0, 0], [0, 224, 0, 0], [0, 272, 0, 0]]}> - //CHECK: [[SIGMOID_COPY_OUT:%.*]] = VPU.Copy([[SIGMOID]] + //CHECK: [[SIGMOID_COPY_OUT:%.+]] = VPU.Copy([[SIGMOID]] //CHECK-SAME: -> tensor<1x320x1x1xf16, {order = #NHWC}> } @@ -6209,6 +6254,8 @@ module @NCEPermute { IE.TileResource 2 of @NCE at 1.700000e+03 MHz +// CHECK-LABEL: @NCEPermute3x224x224 +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x3x224x224xf16> func.func @NCEPermute3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x4x224x224x!qElemType, {order = #NHWC}> { %0 = VPU.NCE.Permute(%arg0) { dstElemType = !qElemType, @@ -6220,7 +6267,7 @@ func.func @NCEPermute3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x4x224 return %0 : tensor<1x4x224x224x!qElemType, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy(%arg0 + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x3x224x224xf16, #NCHW, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6233,7 +6280,7 @@ func.func @NCEPermute3x224x224(%arg0: 
tensor<1x3x224x224xf16>) -> tensor<1x4x224 // CHECK-SAME: } - // CHECK: [[NCE_PERMUTE:%.*]] = VPU.NCE.Permute + // CHECK: [[NCE_PERMUTE:%.+]] = VPU.NCE.Permute // CHECK-SAME: -> !VPU.DistributedTensor<1x4x224x224x!qElemType, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6246,7 +6293,7 @@ func.func @NCEPermute3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x4x224 // CHECK-SAME: } - // CHECK: [[COPY_OUTPUT:%.*]] = VPU.Copy + // CHECK: [[COPY_OUTPUT:%.+]] = VPU.Copy // CHECK-SAME: -> tensor<1x4x224x224x!qElemType, {order = #NHWC}> } @@ -6261,6 +6308,8 @@ module @NCEPermuteWithSOK { IE.TileResource 4 of @NCE at 1.700000e+03 MHz +// CHECK-LABEL: @NCEPermuteSOK +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x128x32x64xf16> func.func @NCEPermuteSOK(%arg0: tensor<1x128x32x64xf16>) -> tensor<1x128x32x64xf16, {order = #NHWC}> { %0 = VPU.NCE.Permute(%arg0) { dstElemType = f16, @@ -6272,7 +6321,7 @@ func.func @NCEPermuteSOK(%arg0: tensor<1x128x32x64xf16>) -> tensor<1x128x32x64xf return %0 : tensor<1x128x32x64xf16, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy(%arg0 + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x128x32x64xf16, #NCHW, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 4, 1, 1], @@ -6285,7 +6334,7 @@ func.func @NCEPermuteSOK(%arg0: tensor<1x128x32x64xf16>) -> tensor<1x128x32x64xf // CHECK-SAME: } - // CHECK: [[NCE_PERMUTE:%.*]] = VPU.NCE.Permute + // CHECK: [[NCE_PERMUTE:%.+]] = VPU.NCE.Permute // CHECK-SAME: -> !VPU.DistributedTensor<1x128x32x64xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 4, 1, 1], @@ -6299,7 +6348,7 @@ func.func @NCEPermuteSOK(%arg0: tensor<1x128x32x64xf16>) -> tensor<1x128x32x64xf // CHECK-SAME: } - // CHECK: [[COPY_OUTPUT:%.*]] = VPU.Copy + // CHECK: [[COPY_OUTPUT:%.+]] = VPU.Copy // CHECK-SAME: -> tensor<1x128x32x64xf16, {order = #NHWC}> } @@ -6316,6 +6365,8 @@ 
module @NCEPermuteDepthwiseConv { IE.TileResource 2 of @NCE at 1.700000e+03 MHz +// CHECK-LABEL: @NCEPermuteDWCONV3x224x224 +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x3x224x224xf16> func.func @NCEPermuteDWCONV3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16x224x224x!qElemType, {order = #NHWC}> { %WEIGHTS = const.Declare tensor<16x16x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<16x16x1x1xf16>, [ @@ -6343,7 +6394,7 @@ func.func @NCEPermuteDWCONV3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1 return %1 : tensor<1x16x224x224x!qElemType, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy(%arg0 + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x3x224x224xf16, #NCHW, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6356,7 +6407,7 @@ func.func @NCEPermuteDWCONV3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1 // CHECK-SAME: } - // CHECK: [[NCE_PERMUTE:%.*]] = VPU.NCE.Permute + // CHECK: [[NCE_PERMUTE:%.+]] = VPU.NCE.Permute // CHECK-SAME: -> !VPU.DistributedTensor<1x16x224x224x!qElemType, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6369,7 +6420,7 @@ func.func @NCEPermuteDWCONV3x224x224(%arg0: tensor<1x3x224x224xf16>) -> tensor<1 // CHECK-SAME: } - // CHECK: [[COPY_OUTPUT:%.*]] = VPU.Copy + // CHECK: [[COPY_OUTPUT:%.+]] = VPU.Copy // CHECK-SAME: -> tensor<1x16x224x224x!qElemType, {order = #NHWC}> } @@ -6414,7 +6465,7 @@ func.func @NCEPermuteCONV3x3(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16x224x return %1 : tensor<1x16x224x224x!qElemType, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy([[ARG0]] + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[ARG0]] // CHECK-SAME: -> !VPU.DistributedTensor<1x3x224x224xf16, #NCHW, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6426,7 +6477,7 @@ func.func @NCEPermuteCONV3x3(%arg0: tensor<1x3x224x224xf16>) -> 
tensor<1x16x224x // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 112, 0]] // CHECK-SAME: } - // CHECK: [[NCE_PERMUTE:%.*]] = VPU.NCE.Permute + // CHECK: [[NCE_PERMUTE:%.+]] = VPU.NCE.Permute // CHECK-SAME: -> !VPU.DistributedTensor<1x16x224x224x!qElemType, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 2, 1], @@ -6438,7 +6489,7 @@ func.func @NCEPermuteCONV3x3(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x16x224x // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 111, 0]] // CHECK-SAME: } - // CHECK: [[COPY_OUTPUT:%.*]] = VPU.Copy + // CHECK: [[COPY_OUTPUT:%.+]] = VPU.Copy // CHECK-SAME: -> tensor<1x16x224x224x!qElemType, {order = #NHWC}> } @@ -6465,25 +6516,25 @@ func.func @MultiDepthConv(%arg0: tensor<1x32x112x112xf16, {order = #NHWC}>) -> t %concat = VPU.Concat(%dwconv_1, %dwconv_2, %dwconv_3) {static_offsets = [[0, 0, 0, 0], [0, 32, 0, 0], [0, 64, 0, 0]]} : tensor<1x32x112x112xf16, {order = #NHWC}>, tensor<1x32x112x112xf16, {order = #NHWC}>, tensor<1x32x112x112xf16, {order = #NHWC}> -> tensor<1x96x112x112xf16, {order = #NHWC}> return %concat: tensor<1x96x112x112xf16, {order = #NHWC}> - // CHECK: [[TILING_COPY_1:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_2:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_3:%.*]] = VPU.Copy - // CHECK: [[DWCONV_1:%.*]] = VPU.NCE.DepthConvolution - // CHECK: [[TILING_COPY_OUT_1:%.*]] = VPU.Copy - - // CHECK: [[TILING_COPY_4:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_5:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_6:%.*]] = VPU.Copy - // CHECK: [[DWCONV_2:%.*]] = VPU.NCE.DepthConvolution - // CHECK: [[TILING_COPY_OUT_2:%.*]] = VPU.Copy - - // CHECK: [[TILING_COPY_7:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_8:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_9:%.*]] = VPU.Copy - // CHECK: [[DWCONV_3:%.*]] = VPU.NCE.DepthConvolution - // CHECK: [[TILING_COPY_OUT_3:%.*]] = VPU.Copy - - // CHECK: [[CONCAT:%.*]] = VPU.Concat([[TILING_COPY_OUT_1]], [[TILING_COPY_OUT_2]], 
[[TILING_COPY_OUT_3]]) + // CHECK: [[TILING_COPY_1:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_2:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_3:%.+]] = VPU.Copy + // CHECK: [[DWCONV_1:%.+]] = VPU.NCE.DepthConvolution + // CHECK: [[TILING_COPY_OUT_1:%.+]] = VPU.Copy + + // CHECK: [[TILING_COPY_4:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_5:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_6:%.+]] = VPU.Copy + // CHECK: [[DWCONV_2:%.+]] = VPU.NCE.DepthConvolution + // CHECK: [[TILING_COPY_OUT_2:%.+]] = VPU.Copy + + // CHECK: [[TILING_COPY_7:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_8:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_9:%.+]] = VPU.Copy + // CHECK: [[DWCONV_3:%.+]] = VPU.NCE.DepthConvolution + // CHECK: [[TILING_COPY_OUT_3:%.+]] = VPU.Copy + + // CHECK: [[CONCAT:%.+]] = VPU.Concat([[TILING_COPY_OUT_1]], [[TILING_COPY_OUT_2]], [[TILING_COPY_OUT_3]]) // CHECK: {static_offsets = [ // CHECK-SAME: [0, 0, 0, 0], [0, 32, 0, 0], [0, 64, 0, 0] // CHECK-SAME: ]} : tensor<1x32x112x112xf16, {order = #NHWC}>, tensor<1x32x112x112xf16, {order = #NHWC}>, tensor<1x32x112x112xf16, {order = #NHWC}> @@ -6524,30 +6575,30 @@ func.func @InplaceEltwiseInferFromInput(%arg0: tensor<1x32x112x112xf16, {order = return %2 : tensor<1x32x112x112xf16, {order = #NHWC}> - // CHECK: [[TILING_COPY_1:%.*]] = VPU.Copy - // CHECK: [[AVG_POOL_1:%.*]] = VPU.NCE.AveragePool + // CHECK: [[TILING_COPY_1:%.+]] = VPU.Copy + // CHECK: [[AVG_POOL_1:%.+]] = VPU.NCE.AveragePool // CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 56, 112], [1, 32, 56, 112]], // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 56, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 57, 112], [1, 32, 57, 112]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 55, 0]] - // CHECK: [[TILING_COPY_2:%.*]] = VPU.Copy + // CHECK: 
[[TILING_COPY_2:%.+]] = VPU.Copy - // CHECK: [[TILING_COPY_3:%.*]] = VPU.Copy - // CHECK: [[AVG_POOL_2:%.*]] = VPU.NCE.AveragePool + // CHECK: [[TILING_COPY_3:%.+]] = VPU.Copy + // CHECK: [[AVG_POOL_2:%.+]] = VPU.NCE.AveragePool // CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 56, 112], [1, 32, 56, 112]], // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 56, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 57, 112], [1, 32, 57, 112]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 55, 0]] - // CHECK: [[TILING_COPY_4:%.*]] = VPU.Copy + // CHECK: [[TILING_COPY_4:%.+]] = VPU.Copy - // CHECK: [[INPUT0_CMX:%.*]] = VPU.Copy - // CHECK: [[INPUT1_CMX:%.*]] = VPU.Copy + // CHECK: [[INPUT0_CMX:%.+]] = VPU.Copy + // CHECK: [[INPUT1_CMX:%.+]] = VPU.Copy - // CHECK: [[OUT_CMX:%.*]] = VPU.NCE.Eltwise( + // CHECK: [[OUT_CMX:%.+]] = VPU.NCE.Eltwise( // CHECK-SAME: [[INPUT0_CMX]], // CHECK-SAME: [[INPUT1_CMX]]) // CHECK-SAME: -> !VPU.DistributedTensor<1x32x112x112xf16, #NHWC, @CMX_NN, @@ -6559,7 +6610,7 @@ func.func @InplaceEltwiseInferFromInput(%arg0: tensor<1x32x112x112xf16, {order = // CHECK-NOT{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 56, 0]] // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 55, 0]] - // CHECK: [[TILING_COPY_OUT_2:%.*]] = VPU.Copy + // CHECK: [[TILING_COPY_OUT_2:%.+]] = VPU.Copy } // ----- @@ -6579,7 +6630,7 @@ func.func @SubtractSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, return %0 : tensor<1x32x44x44xf16> - //CHECK: [[INPUT0:%.*]] = VPU.Copy([[INPUT_0]] + //CHECK: [[INPUT0:%.+]] = VPU.Copy([[INPUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // 
CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], @@ -6587,7 +6638,7 @@ func.func @SubtractSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[INPUT1:%.*]] = VPU.Copy([[INPUT_1]] + //CHECK: [[INPUT1:%.+]] = VPU.Copy([[INPUT_1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44]], @@ -6595,7 +6646,7 @@ func.func @SubtractSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[SUBTRACT:%.*]] = VPU.Subtract([[INPUT0]], + //CHECK: [[SUBTRACT:%.+]] = VPU.Subtract([[INPUT0]], //CHECK: [[INPUT1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -6604,7 +6655,7 @@ func.func @SubtractSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[OUTPUT:%.*]] = 
VPU.Copy([[SUBTRACT]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[SUBTRACT]] //CHECK: return [[OUTPUT]] : tensor<1x32x44x44xf16> } @@ -6627,7 +6678,7 @@ func.func @AddSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, return %0 : tensor<1x32x44x44xf16> - //CHECK: [[INPUT0:%.*]] = VPU.Copy([[INPUT_0]] + //CHECK: [[INPUT0:%.+]] = VPU.Copy([[INPUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], @@ -6635,7 +6686,7 @@ func.func @AddSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[INPUT1:%.*]] = VPU.Copy([[INPUT_1]] + //CHECK: [[INPUT1:%.+]] = VPU.Copy([[INPUT_1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 8, 44], [1, 1, 8, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44]], @@ -6643,7 +6694,7 @@ func.func @AddSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 8, 44], [1, 1, 8, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[ADD:%.*]] = VPU.Add([[INPUT0]], + //CHECK: [[ADD:%.+]] = VPU.Add([[INPUT0]], //CHECK: [[INPUT1]]) //CHECK-SAME: -> 
!VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -6652,7 +6703,7 @@ func.func @AddSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[ADD]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ADD]] //CHECK: return [[OUTPUT]] : tensor<1x32x44x44xf16> } @@ -6675,7 +6726,7 @@ func.func @AddSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, return %0 : tensor<1x32x44x44xf16> - //CHECK: [[INPUT0:%.*]] = VPU.Copy([[INPUT_0]] + //CHECK: [[INPUT0:%.+]] = VPU.Copy([[INPUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], @@ -6683,7 +6734,7 @@ func.func @AddSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[INPUT1:%.*]] = VPU.Copy([[INPUT_1]] + //CHECK: [[INPUT1:%.+]] = VPU.Copy([[INPUT_1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], 
[1, 1, 1, 44]], @@ -6691,7 +6742,7 @@ func.func @AddSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[ADD:%.*]] = VPU.Add([[INPUT0]], + //CHECK: [[ADD:%.+]] = VPU.Add([[INPUT0]], //CHECK: [[INPUT1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -6700,7 +6751,7 @@ func.func @AddSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[ADD]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ADD]] //CHECK: return [[OUTPUT]] : tensor<1x32x44x44xf16> } @@ -6723,7 +6774,7 @@ func.func @EqualSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, return %0 : tensor<1x32x44x44xi8> - //CHECK: [[INPUT0:%.*]] = VPU.Copy([[INPUT_0]] + //CHECK: [[INPUT0:%.+]] = VPU.Copy([[INPUT_0]] //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], @@ -6731,7 +6782,7 @@ func.func @EqualSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // 
CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[INPUT1:%.*]] = VPU.Copy([[INPUT_1]] + //CHECK: [[INPUT1:%.+]] = VPU.Copy([[INPUT_1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 8, 44], [1, 1, 8, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44]], @@ -6739,7 +6790,7 @@ func.func @EqualSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 8, 44], [1, 1, 8, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44], [1, 1, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[EQUAL:%.*]] = VPU.Equal([[INPUT0]], + //CHECK: [[EQUAL:%.+]] = VPU.Equal([[INPUT0]], //CHECK: [[INPUT1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xi8, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -6748,7 +6799,7 @@ func.func @EqualSWSOHTileNotAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[EQUAL]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[EQUAL]] //CHECK: return [[OUTPUT]] : tensor<1x32x44x44xi8> } @@ -6771,7 +6822,7 @@ func.func @EqualSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, return %0 : tensor<1x32x44x44xi8> - //CHECK: [[INPUT0:%.*]] = VPU.Copy([[INPUT_0]] + //CHECK: [[INPUT0:%.+]] = VPU.Copy([[INPUT_0]] //CHECK-SAME: -> 
!VPU.DistributedTensor<1x32x44x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], @@ -6779,7 +6830,7 @@ func.func @EqualSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[INPUT1:%.*]] = VPU.Copy([[INPUT_1]] + //CHECK: [[INPUT1:%.+]] = VPU.Copy([[INPUT_1]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x44xf16, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44]], @@ -6787,7 +6838,7 @@ func.func @EqualSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44], [1, 1, 1, 44]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[EQUAL:%.*]] = VPU.Equal([[INPUT0]], + //CHECK: [[EQUAL:%.+]] = VPU.Equal([[INPUT0]], //CHECK: [[INPUT1]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x32x44x44xi8, #NCHW, @CMX_NN, // CHECK-SAME{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, @@ -6796,7 +6847,7 @@ func.func @EqualSWSOHTileAtBroadcastAxis(%arg0: tensor<1x32x44x44xf16>, // CHECK-SAME{LITERAL}: memory_shapes = [[1, 32, 8, 44], [1, 32, 8, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44], [1, 32, 7, 44]], // 
CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 8, 0], [0, 0, 16, 0], [0, 0, 23, 0], [0, 0, 30, 0], [0, 0, 37, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[EQUAL]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[EQUAL]] //CHECK: return [[OUTPUT]] : tensor<1x32x44x44xi8> } @@ -6818,7 +6869,7 @@ func.func @FloorSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> return %0 : tensor<1x16x16x512xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x16x512xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], @@ -6826,7 +6877,7 @@ func.func @FloorSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 0, 12, 0], [0, 0, 14, 0]]}> - //CHECK: [[FLOOR:%.*]] = VPU.Floor([[INPUT]]) + //CHECK: [[FLOOR:%.+]] = VPU.Floor([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x16x512xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], @@ -6834,7 +6885,7 @@ func.func @FloorSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 
0, 12, 0], [0, 0, 14, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[FLOOR]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[FLOOR]] //CHECK: return [[OUTPUT]] : tensor<1x16x16x512xf16> } @@ -6855,7 +6906,7 @@ func.func @FloorSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { return %0 : tensor<1x16x1x513xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy({{[^:]+}} + //CHECK: [[INPUT:%.+]] = VPU.Copy({{[^:]+}} //CHECK-SAME: -> !VPU.DistributedTensor<1x16x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], @@ -6863,7 +6914,7 @@ func.func @FloorSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]]}> - //CHECK: [[FLOOR:%.*]] = VPU.Floor([[INPUT]]) + //CHECK: [[FLOOR:%.+]] = VPU.Floor([[INPUT]]) //CHECK-SAME: -> !VPU.DistributedTensor<1x16x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], @@ -6871,7 +6922,7 @@ func.func @FloorSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]]} - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[FLOOR]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[FLOOR]] //CHECK: return 
[[OUTPUT]] : tensor<1x16x1x513xf16> } @@ -6892,7 +6943,7 @@ func.func @FloorSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf return %0 : tensor<1x1x1x513xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy({{[^:]+}} + //CHECK: [[INPUT:%.+]] = VPU.Copy({{[^:]+}} //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], @@ -6900,7 +6951,7 @@ func.func @FloorSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[FLOOR:%.*]] = VPU.Floor([[INPUT]] + //CHECK: [[FLOOR:%.+]] = VPU.Floor([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], @@ -6908,7 +6959,7 @@ func.func @FloorSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[FLOOR]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[FLOOR]] //CHECK: return [[OUTPUT]] : tensor<1x1x1x513xf16> } @@ -6929,7 +6980,7 @@ func.func @RoundSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> return %0 : tensor<1x16x16x512xf16> - //CHECK: 
[[INPUT:%.*]] = VPU.Copy([[INPUT_DATA]] + //CHECK: [[INPUT:%.+]] = VPU.Copy([[INPUT_DATA]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x16x512xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], @@ -6937,7 +6988,7 @@ func.func @RoundSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 0, 12, 0], [0, 0, 14, 0]]}> - //CHECK: [[ROUND:%.*]] = VPU.Round([[INPUT]] + //CHECK: [[ROUND:%.+]] = VPU.Round([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x16x512xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], @@ -6945,7 +6996,7 @@ func.func @RoundSWSOH(%arg0: tensor<1x16x16x512xf16>) -> tensor<1x16x16x512xf16> //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 3, 512], [1, 16, 2, 512], [1, 16, 2, 512]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 0, 12, 0], [0, 0, 14, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[ROUND]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ROUND]] //CHECK: return [[OUTPUT]] : tensor<1x16x16x512xf16> } @@ -6966,7 +7017,7 @@ func.func @RoundSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { return %0 : tensor<1x16x1x513xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy({{[^:]+}} + //CHECK: [[INPUT:%.+]] = VPU.Copy({{[^:]+}} //CHECK-SAME: -> 
!VPU.DistributedTensor<1x16x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], @@ -6974,7 +7025,7 @@ func.func @RoundSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]]}> - //CHECK: [[ROUND:%.*]] = VPU.Round([[INPUT]] + //CHECK: [[ROUND:%.+]] = VPU.Round([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x16x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], @@ -6982,7 +7033,7 @@ func.func @RoundSWSOK(%arg0: tensor<1x16x1x513xf16>) -> tensor<1x16x1x513xf16> { //CHECK-SAME{LITERAL}: memory_shapes = [[1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 3, 1, 513], [1, 2, 1, 513], [1, 2, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[ROUND]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ROUND]] //CHECK: return [[OUTPUT]] : tensor<1x16x1x513xf16> } @@ -7003,7 +7054,7 @@ func.func @RoundSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf return %0 : tensor<1x1x1x513xf16> - //CHECK: [[INPUT:%.*]] = VPU.Copy({{[^:]+}} + //CHECK: [[INPUT:%.+]] = VPU.Copy({{[^:]+}} //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, 
uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], @@ -7011,7 +7062,7 @@ func.func @RoundSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[ROUND:%.*]] = VPU.Round([[INPUT]] + //CHECK: [[ROUND:%.+]] = VPU.Round([[INPUT]] //CHECK-SAME: -> !VPU.DistributedTensor<1x1x1x513xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED", num_clusters = 6 : i64, uniform_distributed_segments, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], @@ -7019,7 +7070,7 @@ func.func @RoundSWClustering(%arg0: tensor<1x1x1x513xf16>) -> tensor<1x1x1x513xf //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513], [1, 1, 1, 513]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> - //CHECK: [[OUTPUT:%.*]] = VPU.Copy([[ROUND]] + //CHECK: [[OUTPUT:%.+]] = VPU.Copy([[ROUND]] //CHECK: return [[OUTPUT]] : tensor<1x1x1x513xf16> } @@ -7047,31 +7098,31 @@ func.func @AccumulateClustering( tensor<1x64x1x1xf16, {order = #NHWC}> -> tensor<1x64x16x1xf16, {order = #NHWC}> - // CHECK: [[COPY_LHS:%.*]] = VPU.Copy([[LHS]] + // CHECK: [[COPY_LHS:%.+]] = VPU.Copy([[LHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS:%.*]] = VPU.Copy([[RHS]] + // CHECK: [[COPY_RHS:%.+]] = VPU.Copy([[RHS]] // CHECK-SAME: -> 
!VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_LHS_SCALES:%.*]] = VPU.Copy([[LHS_SCALES]] + // CHECK: [[COPY_LHS_SCALES:%.+]] = VPU.Copy([[LHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS_SCALES:%.*]] = VPU.Copy([[RHS_SCALES]] + // CHECK: [[COPY_RHS_SCALES:%.+]] = VPU.Copy([[RHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[ACCUMULATE:%.*]] = VPU.Accumulate( + // CHECK: [[ACCUMULATE:%.+]] = VPU.Accumulate( // CHECK-SAME: [[COPY_LHS]] // CHECK-SAME: [[COPY_RHS]] // CHECK-SAME: [[COPY_LHS_SCALES]] @@ -7081,7 +7132,7 @@ func.func @AccumulateClustering( // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_OUT:%.*]] = VPU.Copy([[ACCUMULATE]] + // CHECK: [[COPY_OUT:%.+]] = VPU.Copy([[ACCUMULATE]] // CHECK-SAME: -> tensor<1x64x16x1xf16, {order = #NHWC}> return %ACCUMULATE : tensor<1x64x16x1xf16, {order = #NHWC}> @@ -7112,33 +7163,33 @@ func.func @AccumulateSplitOverHeight( tensor<1x64x1x1xf16, {order = #NHWC}> -> tensor<1x64x16x1xf16, {order = #NHWC}> - // CHECK: [[COPY_LHS:%.*]] = VPU.Copy([[LHS]] + // CHECK: [[COPY_LHS:%.+]] = VPU.Copy([[LHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 1, {{6|3|4}}, 1], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS:%.*]] = VPU.Copy([[RHS]] + // CHECK: [[COPY_RHS:%.+]] = VPU.Copy([[RHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 1, {{6|3|4}}, 1], 
// CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_LHS_SCALES:%.*]] = VPU.Copy([[LHS_SCALES]] + // CHECK: [[COPY_LHS_SCALES:%.+]] = VPU.Copy([[LHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS_SCALES:%.*]] = VPU.Copy([[RHS_SCALES]] + // CHECK: [[COPY_RHS_SCALES:%.+]] = VPU.Copy([[RHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[ACCUMULATE:%.*]] = VPU.Accumulate( + // CHECK: [[ACCUMULATE:%.+]] = VPU.Accumulate( // CHECK-SAME: [[COPY_LHS]] // CHECK-SAME: [[COPY_RHS]] // CHECK-SAME: [[COPY_LHS_SCALES]] @@ -7149,7 +7200,7 @@ func.func @AccumulateSplitOverHeight( // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_OUT:%.*]] = VPU.Copy([[ACCUMULATE]] + // CHECK: [[COPY_OUT:%.+]] = VPU.Copy([[ACCUMULATE]] // CHECK-SAME: -> tensor<1x64x16x1xf16, {order = #NHWC}> return %ACCUMULATE : tensor<1x64x16x1xf16, {order = #NHWC}> @@ -7179,35 +7230,35 @@ func.func @AccumulateSplitOverKernel( tensor<1x64x1x1xf16, {order = #NHWC}> -> tensor<1x64x16x1xf16, {order = #NHWC}> - // CHECK: [[COPY_LHS:%.*]] = VPU.Copy([[LHS]] + // CHECK: [[COPY_LHS:%.+]] = VPU.Copy([[LHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, {{6|3|4}}, 1, 1], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS:%.*]] = VPU.Copy([[RHS]] + // CHECK: [[COPY_RHS:%.+]] = VPU.Copy([[RHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, {{6|3|4}}, 1, 1], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: 
[[COPY_LHS_SCALES:%.*]] = VPU.Copy([[LHS_SCALES]] + // CHECK: [[COPY_LHS_SCALES:%.+]] = VPU.Copy([[LHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, {{6|3|4}}, 1, 1], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS_SCALES:%.*]] = VPU.Copy([[RHS_SCALES]] + // CHECK: [[COPY_RHS_SCALES:%.+]] = VPU.Copy([[RHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, {{6|3|4}}, 1, 1], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[ACCUMULATE:%.*]] = VPU.Accumulate( + // CHECK: [[ACCUMULATE:%.+]] = VPU.Accumulate( // CHECK-SAME: [[COPY_LHS]] // CHECK-SAME: [[COPY_RHS]] // CHECK-SAME: [[COPY_LHS_SCALES]] @@ -7218,7 +7269,7 @@ func.func @AccumulateSplitOverKernel( // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_OUT:%.*]] = VPU.Copy([[ACCUMULATE]] + // CHECK: [[COPY_OUT:%.+]] = VPU.Copy([[ACCUMULATE]] // CHECK-SAME: -> tensor<1x64x16x1xf16, {order = #NHWC}> return %ACCUMULATE : tensor<1x64x16x1xf16, {order = #NHWC}> @@ -7248,33 +7299,33 @@ func.func @AccumulateSplitOverWidth( tensor<1x64x1x1xf16, {order = #NHWC}> -> tensor<1x64x16x32xf16, {order = #NHWC}> - // CHECK: [[COPY_LHS:%.*]] = VPU.Copy([[LHS]] + // CHECK: [[COPY_LHS:%.+]] = VPU.Copy([[LHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x32xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 1, 1, {{6|3|4}}], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS:%.*]] = VPU.Copy([[RHS]] + // CHECK: [[COPY_RHS:%.+]] = VPU.Copy([[RHS]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x16x32xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "SEGMENTED", // CHECK-SAME: num_tiles = [1, 1, 1, {{6|3|4}}], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - 
// CHECK: [[COPY_LHS_SCALES:%.*]] = VPU.Copy([[LHS_SCALES]] + // CHECK: [[COPY_LHS_SCALES:%.+]] = VPU.Copy([[LHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_RHS_SCALES:%.*]] = VPU.Copy([[RHS_SCALES]] + // CHECK: [[COPY_RHS_SCALES:%.+]] = VPU.Copy([[RHS_SCALES]] // CHECK-SAME: -> !VPU.DistributedTensor<1x64x1x1xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "DUPLICATED", // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[ACCUMULATE:%.*]] = VPU.Accumulate( + // CHECK: [[ACCUMULATE:%.+]] = VPU.Accumulate( // CHECK-SAME: [[COPY_LHS]] // CHECK-SAME: [[COPY_RHS]] // CHECK-SAME: [[COPY_LHS_SCALES]] @@ -7285,7 +7336,7 @@ func.func @AccumulateSplitOverWidth( // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_OUT:%.*]] = VPU.Copy([[ACCUMULATE]] + // CHECK: [[COPY_OUT:%.+]] = VPU.Copy([[ACCUMULATE]] // CHECK-SAME: -> tensor<1x64x16x32xf16, {order = #NHWC}> return %ACCUMULATE : tensor<1x64x16x32xf16, {order = #NHWC}> @@ -7310,14 +7361,14 @@ func.func @PoolingSplitOverWidth( } : tensor<1x16x32x64xf16, {order = #NHWC}> -> tensor<1x16x32x64xf16, {order = #NHWC}> - // CHECK: [[COPY_INPUT:%.*]] = VPU.Copy([[DATA]] + // CHECK: [[COPY_INPUT:%.+]] = VPU.Copy([[DATA]] // CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x64xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", // CHECK-SAME: num_tiles = [1, 1, 1, {{6|3|4}}], // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[POOL:%.*]] = VPU.NCE.MaxPool( + // CHECK: [[POOL:%.+]] = VPU.NCE.MaxPool( // CHECK-SAME: [[COPY_INPUT]] // CHECK-SAME: -> !VPU.DistributedTensor<1x16x32x64xf16, #NHWC, @CMX_NN, { // CHECK-SAME: mode = "OVERLAPPED", @@ -7325,7 +7376,7 @@ func.func @PoolingSplitOverWidth( // CHECK-SAME: num_clusters = {{6|3|4}} : i64 // CHECK-SAME: }> - // CHECK: [[COPY_OUT:%.*]] = 
VPU.Copy([[POOL]] + // CHECK: [[COPY_OUT:%.+]] = VPU.Copy([[POOL]] // CHECK-SAME: -> tensor<1x16x32x64xf16, {order = #NHWC}> return %POOL : tensor<1x16x32x64xf16, {order = #NHWC}> @@ -7586,27 +7637,27 @@ func.func @EltwiseInputsSameOffsets(%arg0: tensor<1x128x72x72xf16, {order = #NHW return %3 : tensor<1x128x72x72xf16> - // CHECK: [[TILING_COPY_0:%.*]] = VPU.Copy([[ARG0]] - // CHECK: [[TILING_COPY_1:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_2:%.*]] = VPU.Copy - // CHECK: [[TILING_CONV:%.*]] = VPU.NCE.Convolution([[TILING_COPY_0]] + // CHECK: [[TILING_COPY_0:%.+]] = VPU.Copy([[ARG0]] + // CHECK: [[TILING_COPY_1:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_2:%.+]] = VPU.Copy + // CHECK: [[TILING_CONV:%.+]] = VPU.NCE.Convolution([[TILING_COPY_0]] // CHECK-SAME{LITERAL}: -> !VPU.DistributedTensor<1x64x72x72xf16, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, compute_shapes = [[1, 64, 18, 72], [1, 64, 18, 72], [1, 64, 18, 72], [1, 64, 18, 72]], compute_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]], memory_shapes = [[1, 64, 19, 72], [1, 64, 20, 72], [1, 64, 20, 72], [1, 64, 19, 72]], memory_offsets = [[0, 0, 0, 0], [0, 0, 17, 0], [0, 0, 35, 0], [0, 0, 53, 0]]}> - // CHECK: [[TILING_COPY_3:%.*]] = VPU.Copy([[TILING_CONV]] - // CHECK: [[TILING_COPY_4:%.*]] = VPU.Copy([[TILING_COPY_3]] - // CHECK: [[TILING_COPY_5:%.*]] = VPU.Copy - // CHECK: [[TILING_COPY_6:%.*]] = VPU.Copy - // CHECK: [[TILING_DWCONV:%.*]] = VPU.NCE.DepthConvolution([[TILING_COPY_4]] + // CHECK: [[TILING_COPY_3:%.+]] = VPU.Copy([[TILING_CONV]] + // CHECK: [[TILING_COPY_4:%.+]] = VPU.Copy([[TILING_COPY_3]] + // CHECK: [[TILING_COPY_5:%.+]] = VPU.Copy + // CHECK: [[TILING_COPY_6:%.+]] = VPU.Copy + // CHECK: [[TILING_DWCONV:%.+]] = VPU.NCE.DepthConvolution([[TILING_COPY_4]] // CHECK-SAME{LITERAL}: -> !VPU.DistributedTensor<1x64x72x72xf16, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 4, 1], num_clusters = 
4 : i64, uniform_distributed_segments, compute_shapes = [[1, 64, 18, 72], [1, 64, 18, 72], [1, 64, 18, 72], [1, 64, 18, 72]], compute_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]], memory_shapes = [[1, 64, 19, 72], [1, 64, 20, 72], [1, 64, 20, 72], [1, 64, 19, 72]], memory_offsets = [[0, 0, 0, 0], [0, 0, 17, 0], [0, 0, 35, 0], [0, 0, 53, 0]]}> - // CHECK: [[TILING_COPY_7:%.*]] = VPU.Copy([[TILING_DWCONV]] - // CHECK: [[CONCAT:%.*]] = VPU.Concat([[TILING_COPY_3]], [[TILING_COPY_7]]) + // CHECK: [[TILING_COPY_7:%.+]] = VPU.Copy([[TILING_DWCONV]] + // CHECK: [[CONCAT:%.+]] = VPU.Concat([[TILING_COPY_3]], [[TILING_COPY_7]]) // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 64, 0, 0]]} : tensor<1x64x72x72xf16, {order = #NHWC}>, tensor<1x64x72x72xf16, {order = #NHWC}> -> tensor<1x128x72x72xf16, {order = #NHWC}> - // CHECK: [[TILING_COPY_8:%.*]] = VPU.Copy([[CONCAT]] + // CHECK: [[TILING_COPY_8:%.+]] = VPU.Copy([[CONCAT]] // CHECK-SAME{LITERAL}: -> !VPU.DistributedTensor<1x128x72x72xf16, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, compute_shapes = [[1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72]], compute_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]], memory_shapes = [[1, 128, 19, 72], [1, 128, 20, 72], [1, 128, 20, 72], [1, 128, 19, 72]], memory_offsets = [[0, 0, 0, 0], [0, 0, 17, 0], [0, 0, 35, 0], [0, 0, 53, 0]]}> - // CHECK: [[TILING_COPY_9:%.*]] = VPU.Copy([[ARG1]] + // CHECK: [[TILING_COPY_9:%.+]] = VPU.Copy([[ARG1]] // CHECK-SAME{LITERAL}: -> !VPU.DistributedTensor<1x128x72x72xf16, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, compute_shapes = [[1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72]], compute_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]], memory_shapes = [[1, 128, 19, 72], [1, 128, 20, 
72], [1, 128, 20, 72], [1, 128, 19, 72]], memory_offsets = [[0, 0, 0, 0], [0, 0, 17, 0], [0, 0, 35, 0], [0, 0, 53, 0]]}> - // CHECK: [[ELTWISE:%.*]] = VPU.NCE.Eltwise([[TILING_COPY_8]], [[TILING_COPY_9]]) + // CHECK: [[ELTWISE:%.+]] = VPU.NCE.Eltwise([[TILING_COPY_8]], [[TILING_COPY_9]]) // CHECK-SAME{LITERAL}: -> !VPU.DistributedTensor<1x128x72x72xf16, #NCHW, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64, uniform_distributed_segments, compute_shapes = [[1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72]], compute_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]], memory_shapes = [[1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72], [1, 128, 18, 72]], memory_offsets = [[0, 0, 0, 0], [0, 0, 18, 0], [0, 0, 36, 0], [0, 0, 54, 0]]}> - // CHECK: [[TILING_COPY_9:%.*]] = VPU.Copy([[ELTWISE]] + // CHECK: [[TILING_COPY_9:%.+]] = VPU.Copy([[ELTWISE]] // CHECK: return [[TILING_COPY_9]] : tensor<1x128x72x72xf16> } @@ -8948,27 +8999,27 @@ IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-SAME: ([[ARG0:%.+]]: tensor<1x64x64x64x!qElemType, {order = #NHWC}>, // CHECK-SAME: [[ARG1:%.+]]: tensor<1x64x64x64x!qElemType1, {order = #NHWC}>) func.func @EltwiseAddMulticlusterSOHOverlappedDepthConvolution(%arg0: tensor<1x64x64x64x!qElemType, {order = #NHWC}>, %arg1: tensor<1x64x64x64x!qElemType1, {order = #NHWC}>) -> tensor<1x64x128x128x!qElemType2, {order = #NHWC}> { - %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, - ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, - quant_mult = [27959], quant_shift = [29], quant_post_shift = 0 : i64, in1_quant_mult = [5299], in2_quant_mult = [16913], fp_prelu_alpha = 1.000000e+00 : f64>} + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, + ppe = #VPU.PPEInt, clamp_low = 0 : i64, 
clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, + quant_mult = [27959], quant_shift = [29], quant_post_shift = 0 : i64, in1_quant_mult = [5299], in2_quant_mult = [16913], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x64x64x64x!qElemType3, {order = #NHWC}> - %1 = VPU.StorageElementTable {dataElemType = !qElemType3, dataShape = [1, 64, 64, 64], - seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>, seDepth = 1 : i64, seSize = [64]} + %1 = VPU.StorageElementTable {dataElemType = !qElemType3, dataShape = [1, 64, 64, 64], + seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>, seDepth = 1 : i64, seSize = [64]} -> tensor<1x1x130x130xi32, {order = #NHWC}> %cst_220 = const.Declare tensor<1x64x130x130xi1, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> = dense<1> : tensor<1x64x130x130xi8>, [#const.Reorder (d0, d2, d3, d1)>>, #const.CastElemType] %2 = VPU.GroupSparseTensor(%0, %cst_220, %1) { - seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>} - -> !VPU.SparseTensor, - sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, - storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, + seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>} + -> !VPU.SparseTensor, + sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, + storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>> %cst_15 = const.Declare tensor<64x1x1x4xsi32> = dense<1> : tensor<64x1x1x4xsi32> %cst_16 = const.Declare tensor<64x16x1x1x!quant.uniform, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> = dense<1.000000e+00> : 
tensor<64x1x3x3xf32>, [#const.CastElemType>, #const.Reshape<[64, 9, 1, 1]>, #const.PadWithZero<[0, 0, 0, 0], [0, 7, 0, 0]>, #const.Reorder (d0, d2, d3, d1)>>] - %3 = VPU.NCE.DepthConvolution(%2, %cst_16, %cst_15) {multiClusterStrategy = #VPU.multi_cluster_strategy, - pad = #VPU.Padding, - ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [64, 1, 3, 3], strides = [1, 1]} + %3 = VPU.NCE.DepthConvolution(%2, %cst_16, %cst_15) {multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [64, 1, 3, 3], strides = [1, 1]} -> tensor<1x64x128x128x!qElemType2, {order = #NHWC}> return %3: tensor<1x64x128x128x!qElemType2, {order = #NHWC}> // CHECK: [[IN_CP0:%.+]] = VPU.Copy([[ARG0]]) {out_mem_space = @CMX_NN} : tensor<1x64x64x64x!qElemType, {order = #NHWC}> @@ -9021,23 +9072,23 @@ IE.TileResource 6 of @NCE at 1.700000e+03 MHz // CHECK-SAME: ([[ARG0:%.+]]: tensor<1x64x64x64x!qElemType, {order = #NHWC}>, // CHECK-SAME: [[ARG1:%.+]]: tensor<1x64x64x64x!qElemType1, {order = #NHWC}>) func.func @EltwiseAddMulticlusterSOHOverlappedConvolution(%arg0: tensor<1x64x64x64x!qElemType, {order = #NHWC}>, %arg1: tensor<1x64x64x64x!qElemType1, {order = #NHWC}>) -> tensor<1x64x130x130x!qElemType2, {order = #NHWC}> { - %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, - ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, - quant_mult = [27959], quant_shift = [29], quant_post_shift = 0 : i64, in1_quant_mult = [5299], in2_quant_mult = [16913], fp_prelu_alpha = 1.000000e+00 : f64>} + %0 = VPU.NCE.Eltwise(%arg0, %arg1) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = 
#VPU.eltwise_type, + ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, + quant_mult = [27959], quant_shift = [29], quant_post_shift = 0 : i64, in1_quant_mult = [5299], in2_quant_mult = [16913], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x64x64x64x!qElemType3, {order = #NHWC}> - %1 = VPU.StorageElementTable {dataElemType = !qElemType3, dataShape = [1, 64, 64, 64], - seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>, seDepth = 1 : i64, seSize = [64]} + %1 = VPU.StorageElementTable {dataElemType = !qElemType3, dataShape = [1, 64, 64, 64], + seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>, seDepth = 1 : i64, seSize = [64]} -> tensor<1x1x130x130xi32, {order = #NHWC}> %cst_220 = const.Declare tensor<1x64x130x130xi1, {order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>}> = dense<1> : tensor<1x64x130x130xi8>, [#const.Reorder (d0, d2, d3, d1)>>, #const.CastElemType] %2 = VPU.GroupSparseTensor(%0, %cst_220, %1) { - seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>} - -> !VPU.SparseTensor, - sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, - storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, + seAttr = #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>} + -> !VPU.SparseTensor, + sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, + storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>> - + %cst = const.Declare tensor<64x64x1x1xf16, {order = #NHWC}> = dense<0.200000e+00> : tensor<64x64x1x1xf16, {order = #NHWC}> %cst_0 = const.Declare 
tensor<64x1x1x4xsi32> = dense<8> : tensor<64x1x1x4xsi32> %3 = VPU.NCE.Convolution(%2, %cst, %cst_0) { @@ -9045,9 +9096,9 @@ func.func @EltwiseAddMulticlusterSOHOverlappedConvolution(%arg0: tensor<1x64x64x pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [64, 64, 1, 1], strides = [1, 1] - } : !VPU.SparseTensor, - sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, - storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, + } : !VPU.SparseTensor, + sparsity_map=tensor<1x64x130x130xi1, {order = #NHWC}>, + storage_element_table=tensor<1x1x130x130xi32, {order = #NHWC}>, #VPU.SEInterpolate, coordinate_transformation_mode = , scale = [1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00]>>, tensor<64x64x1x1xf16, {order = #NHWC}>, tensor<64x1x1x4xsi32> -> tensor<1x64x130x130x!qElemType2, {order = #NHWC}> return %3: tensor<1x64x130x130x!qElemType2, {order = #NHWC}> // CHECK: [[IN_CP0:%.+]] = VPU.Copy([[ARG0]]) {out_mem_space = @CMX_NN} : tensor<1x64x64x64x!qElemType, {order = #NHWC}> @@ -9084,3 +9135,52 @@ func.func @EltwiseAddMulticlusterSOHOverlappedConvolution(%arg0: tensor<1x64x64x } +// ----- + +module @executors { +IE.TileResource 3 of @NCE at 1.700000e+03 MHz + +// CHECK-LABEL: @GatherDMA +// CHECK-SAME: [[INPUT:%.+]]: tensor<1x1x128256x2048xf16>, +// CHECK-SAME: [[INDICES:%.+]]: tensor<1x1x1024x1xi64> +func.func @GatherDMA(%input: tensor<1x1x128256x2048xf16>, %indices: tensor<1x1x1024x1xi64>) -> tensor<1x1x1024x2048xf16> { + + %gatherDMA = VPU.GatherDMA(%input, %indices) {axis_value = 2 : i64, batch_dims = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : + tensor<1x1x128256x2048xf16>, tensor<1x1x1024x1xi64> -> tensor<1x1x1024x2048xf16> + return %gatherDMA : tensor<1x1x1024x2048xf16> + + // CHECK: [[INDICES_COPY:%.+]] = VPU.Copy([[INDICES]]) {out_mem_space = @CMX_NN} : tensor<1x1x1024x1xi64> + // CHECK-SAME: -> !VPU.DistributedTensor<1x1x1024x1xi64, #NCHW, @CMX_NN, { + // CHECK-SAME: mode = "DUPLICATED", num_clusters = 3 : i64, 
uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> + + // CHECK: [[GATHER_DMA:%.+]] = VPU.GatherDMA([[INPUT]], [[INDICES_COPY]]) {axis_value = 2 : i64, batch_dims = 1 : i64} : + // CHECK-SAME: tensor<1x1x128256x2048xf16>, + // CHECK-SAME: !VPU.DistributedTensor<1x1x1024x1xi64, #NCHW, @CMX_NN, { + // CHECK-SAME: mode = "DUPLICATED", num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> + // CHECK-SAME: -> !VPU.DistributedTensor<1x1x1024x2048xf16, #NCHW, @CMX_NN, { + // CHECK-SAME: mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]]}> + + // CHECK: [[OUT:%.+]] = VPU.Copy([[GATHER_DMA]]) : + // CHECK-SAME: !VPU.DistributedTensor<1x1x1024x2048xf16, #NCHW, @CMX_NN, { + // CHECK-SAME: mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1024, 683], 
[1, 1, 1024, 683], [1, 1, 1024, 682]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]]}> + // CHECK-SAME: -> tensor<1x1x1024x2048xf16> + + // CHECK: return [[OUT]] : tensor<1x1x1024x2048xf16> +} +} diff --git a/tests/lit/NPU/dialect/VPU/passes/merge_vf_subgraphs_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/merge_vf_subgraphs_40XX+.mlir index 3181fbb535..4ae78f1950 100644 --- a/tests/lit/NPU/dialect/VPU/passes/merge_vf_subgraphs_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/merge_vf_subgraphs_40XX+.mlir @@ -353,7 +353,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @NotBuildOutOfLimitNumTiles(%arg0: tensor<1x512x92x120x!qElemType, {order = #NHWC}>) -> tensor<1x256x92x120x!qElemType, {order = #NHWC}> { @@ -813,3 +813,40 @@ func.func @BuildSubtractMultiplySubgraph(%arg0: tensor<1x4x160x160xf16, {order = //CHECK: return [[VERTICAL_FUSION]] : tensor<1x4x160x160xf16, {order = #NHWC}> } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +//CHECK-LABEL: @BuildSubgraphMemPermuteClustered +//CHECK-SAME: [[INPUT:%.+]]: tensor<1x16x256x256xf16, {order = #NWCH}> +func.func @BuildSubgraphMemPermuteClustered(%arg0: tensor<1x16x256x256xf16, {order = #NWCH}>) -> tensor<1x32x256x256xf16, {order = #NHWC}> { + %cst_0 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<32x16x1x1xf16, {order = 
#NHWC}> + %cst_1 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %0 = VPU.VerticalFusion (%arg0 as %arg1: tensor<1x16x256x256xf16, {order = #NWCH}>) attributes {tilingStrategy = [1, 1, 4, 1]} -> tensor<1x16x256x256xf16, {order = #NHWC}> { + %3 = VPU.MemPermute(%arg1) + {multiClusterStrategy = #VPU.multi_cluster_strategy, + dst_order = #NHWC, mem_perm = #NWCH} : tensor<1x16x256x256xf16, {order = #NWCH}> -> tensor<1x16x256x256xf16, {order = #NHWC}> + VPU.Yield %3 + } + %1 = VPU.VerticalFusion (%0 as %arg1: tensor<1x16x256x256xf16, {order = #NHWC}>, %cst_0 as %arg2: tensor<32x16x1x1xf16, {order = #NHWC}>, %cst_1 as %arg3: tensor<32x1x1x4xsi32>) attributes {tilingStrategy = [1, 1, 4, 1]} -> tensor<1x32x256x256xf16, {order = #NHWC}> { + %3 = VPU.NCE.Convolution(%arg1, %arg2, %arg3) + {multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + rawFilterShape = [32, 16, 1, 1], strides = [1, 1]} : tensor<1x16x256x256xf16, {order = #NHWC}>, tensor<32x16x1x1xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x256x256xf16, {order = #NHWC}> + VPU.Yield %3 + } + return %1 : tensor<1x32x256x256xf16, {order = #NHWC}> + // CHECK: [[CST_0:%.+]] = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x1x1xf16, {order = #NHWC}> + // CHECK: [[CST_1:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + // CHECK: [[VERTICAL_FUSION:%.+]] = VPU.VerticalFusion + // CHECK: [[INPUT]] as %arg1 + // CHECK: [[CST_0]] as %arg2 + // CHECK: [[CST_1]] as %arg3 + // CHECK: [[PERMUTE:%.+]] = VPU.MemPermute + // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[PERMUTE]], %arg2, %arg3) + // CHECK: return [[VERTICAL_FUSION]] : tensor<1x32x256x256xf16, {order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/move_reflect_pad_to_cmx.mlir b/tests/lit/NPU/dialect/VPU/passes/move_reflect_pad_to_cmx.mlir index 1f75154af6..3c55b6675b 100644 --- 
a/tests/lit/NPU/dialect/VPU/passes/move_reflect_pad_to_cmx.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/move_reflect_pad_to_cmx.mlir @@ -13,25 +13,25 @@ // CHECK-LABEL: @moveReflectPadToCmxW // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x32x32x3x!qElemType, {order = #NHWC}> -func.func @moveReflectPadToCmxW(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> { +func.func @moveReflectPadToCmxW(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x5x!qElemType1, {order = #NHWC}> { %copy_0 = VPU.Copy(%arg0) {out_mem_space = @CMX_NN} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> %input_copy_to_ddr = VPU.Copy(%copy_0) : tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {order = #NHWC}> - + %quantize_cast = VPU.QuantizeCast(%input_copy_to_ddr) {dstElemType = !qElemType1} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType1, {order = #NHWC}> %input_pad_0 = VPU.Slice %quantize_cast [0, 0, 0, 1] [1, 32, 32, 1]: tensor<1x32x32x3x!qElemType1, {order = #NHWC}> - to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> + to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> %input_pad_1 = VPU.Slice %quantize_cast [0, 0, 0, 2] [1, 32, 32, 1]: tensor<1x32x32x3x!qElemType1, {order = #NHWC}> to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> - - %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) + + %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 4]]} : - tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, - tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, tensor<1x32x32x1x!qElemType1, {order = #NHWC}> - -> tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> - - 
return %concat_view: tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> + -> tensor<1x32x32x5x!qElemType1, {order = #NHWC}> + + return %concat_view: tensor<1x32x32x5x!qElemType1, {order = #NHWC}> // CHECK: [[ARG_COPY:%.+]] = VPU.Copy([[INPUT]]) {out_mem_space = @CMX_NN} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> // CHECK: [[CMX_TO_DDR_COPY:%.+]] = VPU.Copy([[ARG_COPY]]) : tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {order = #NHWC}> @@ -42,8 +42,8 @@ func.func @moveReflectPadToCmxW(%arg0: tensor<1x32x32x3x!qElemType, {order = #NH // CHECK: [[CONCAT:%.+]] = VPU.Concat([[INPUT_SLICE_0]], [[DDR_TO_CMX_COPY]], [[INPUT_SLICE_1]]) // CHECK-SAME(LITERAL): {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 4]]} // CHECK: tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x3x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x32x5x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> - // CHECK: [[CONCAT_OUT_COPY:%.+]] = VPU.Copy([[CONCAT]]) {out_mem_space = @DDR} : tensor<1x32x32x5x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> - // CHECK: return [[CONCAT_OUT_COPY]] : tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> + // CHECK: [[CONCAT_OUT_COPY:%.+]] = VPU.Copy([[CONCAT]]) : tensor<1x32x32x5x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x32x5x!qElemType1, {order = #NHWC}> + // CHECK: return [[CONCAT_OUT_COPY]] : tensor<1x32x32x5x!qElemType1, {order = #NHWC}> } // ----- @@ -54,12 +54,12 @@ func.func @moveReflectPadToCmxW(%arg0: tensor<1x32x32x3x!qElemType, {order = #NH // CHECK-LABEL: @moveReflectPadToCmxH // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x32x3x5x!qElemType, {order = #NHWC}>) 
-func.func @moveReflectPadToCmxH(%arg0: tensor<1x32x3x5x!qElemType, {order = #NHWC}>) -> tensor<1x32x5x5x!qElemType, {mem_space = @DDR, order = #NHWC}> { +func.func @moveReflectPadToCmxH(%arg0: tensor<1x32x3x5x!qElemType, {order = #NHWC}>) -> tensor<1x32x5x5x!qElemType, {order = #NHWC}> { %copy_0 = VPU.Copy(%arg0) {out_mem_space = @CMX_NN} : tensor<1x32x3x5x!qElemType, {order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> %input_copy_to_ddr = VPU.Copy(%copy_0) : tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {order = #NHWC}> - + %input_pad_0 = VPU.Slice %input_copy_to_ddr [0, 0, 1, 0] [1, 32, 1, 5]: tensor<1x32x3x5x!qElemType, {order = #NHWC}> - to tensor<1x32x1x5x!qElemType, {order = #NHWC}> + to tensor<1x32x1x5x!qElemType, {order = #NHWC}> %input_pad_1 = VPU.Slice %input_copy_to_ddr [0, 0, 2, 0] [1, 32, 1, 5]: tensor<1x32x3x5x!qElemType, {order = #NHWC}> to tensor<1x32x1x5x!qElemType, {order = #NHWC}> %permute_cast_0 = VPU.PermuteCast(%input_pad_0) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x32x1x5x!qElemType, {order = #NHWC}> -> tensor<1x32x1x5x!qElemType, {order = #NHWC}> @@ -68,15 +68,15 @@ func.func @moveReflectPadToCmxH(%arg0: tensor<1x32x3x5x!qElemType, {order = #NHW %copy_1 = VPU.Copy(%input_copy_to_ddr) {out_mem_space = @CMX_NN} : tensor<1x32x3x5x!qElemType, {order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> %max_pool = VPU.NCE.MaxPool (%copy_1) {kernel_size = [1, 1], pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.0 : f64>, strides = [1, 1]} -> !VPU.DistributedTensor<1x32x3x5x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]], 
memory_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]]}> %copy_2 = VPU.Copy(%max_pool) : !VPU.DistributedTensor<1x32x3x5x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]], memory_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]]}> -> tensor<1x32x3x5x!qElemType, {order = #NHWC}> - + %concat_view = VPU.Concat (%permute_cast_0, %copy_2, %permute_cast_1) {static_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 4, 0]]} : tensor<1x32x1x5x!qElemType, {order = #NHWC}>, tensor<1x32x3x5x!qElemType, {order = #NHWC}>, tensor<1x32x1x5x!qElemType, {order = #NHWC}> - -> tensor<1x32x5x5x!qElemType, {mem_space = @DDR, order = #NHWC}> + -> tensor<1x32x5x5x!qElemType, {order = #NHWC}> - return %concat_view: tensor<1x32x5x5x!qElemType, {mem_space = @DDR, order = #NHWC}> + return %concat_view: tensor<1x32x5x5x!qElemType, {order = #NHWC}> // CHECK: [[ARG_COPY:%.+]] = VPU.Copy([[INPUT]]) {out_mem_space = @CMX_NN} : tensor<1x32x3x5x!qElemType, {order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> // CHECK: [[CMX_TO_DDR_COPY:%.+]] = VPU.Copy([[ARG_COPY]]) : tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {order = #NHWC}> @@ -87,15 +87,15 @@ func.func @moveReflectPadToCmxH(%arg0: tensor<1x32x3x5x!qElemType, {order = #NHW // CHECK: [[PERMUTE_CAST_1:%.+]] = VPU.PermuteCast([[INPUT_SLICE_1]]) {dst_order = #NHWC, mem_perm = #NCHW} : tensor<1x32x1x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x1x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> // CHECK: [[COPY_0:%.+]] = VPU.Copy([[CMX_TO_DDR_COPY]]) {out_mem_space = @CMX_NN} : tensor<1x32x3x5x!qElemType, 
{order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> // CHECK: [[MAX_POOL:%.+]] = VPU.NCE.MaxPool([[COPY_0]]) {kernel_size = [1, 1], pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = 0 : i64, clamp_high = 255 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, strides = [1, 1]} -> !VPU.DistributedTensor<1x32x3x5x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]], memory_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]]}> + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]], memory_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]]}> // CHECK: [[COPY_1:%.+]] = VPU.Copy([[MAX_POOL]]) : !VPU.DistributedTensor<1x32x3x5x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments // CHECK-SAME{LITERAL}: compute_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]], memory_shapes = [[1, 32, 1, 5], [1, 32, 1, 5], [1, 32, 1, 5]], memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]]}> -> tensor<1x32x3x5x!qElemType, {order = #NHWC}> // CHECK: [[INPUT_COPY_TO_CMX:%.+]] = VPU.Copy([[COPY_1]]) {out_mem_space = [@CMX_NN, 0]} : tensor<1x32x3x5x!qElemType, {order = #NHWC}> -> tensor<1x32x3x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> // CHECK: [[CONCAT:%.+]] = VPU.Concat([[PERMUTE_CAST_0]], [[INPUT_COPY_TO_CMX]], [[PERMUTE_CAST_1]]) // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 4, 
0]]} // CHECK: tensor<1x32x1x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x3x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x1x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x5x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> - // CHECK: [[CONCAT_OUT_COPY:%.+]] = VPU.Copy([[CONCAT]]) {out_mem_space = @DDR} : tensor<1x32x5x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x5x5x!qElemType, {mem_space = @DDR, order = #NHWC}> - // CHECK: return [[CONCAT_OUT_COPY]] : tensor<1x32x5x5x!qElemType, {mem_space = @DDR, order = #NHWC}> + // CHECK: [[CONCAT_OUT_COPY:%.+]] = VPU.Copy([[CONCAT]]) : tensor<1x32x5x5x!qElemType, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x5x5x!qElemType, {order = #NHWC}> + // CHECK: return [[CONCAT_OUT_COPY]] : tensor<1x32x5x5x!qElemType, {order = #NHWC}> } // ----- @@ -106,24 +106,24 @@ func.func @moveReflectPadToCmxH(%arg0: tensor<1x32x3x5x!qElemType, {order = #NHW // CHECK-LABEL: @dontMoveReflectPadToCmxNoCmxToDdrCopy // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x32x32x3x!qElemType, {order = #NHWC}> -func.func @dontMoveReflectPadToCmxNoCmxToDdrCopy(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> { - %copy_0 = VPU.Copy(%arg0) {out_mem_space = @DDR} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {order = #NHWC}> - +func.func @dontMoveReflectPadToCmxNoCmxToDdrCopy(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x5x!qElemType1, {order = #NHWC}> { + %copy_0 = VPU.Copy(%arg0) : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {order = #NHWC}> + %quantize_cast = VPU.QuantizeCast(%copy_0) {dstElemType = !qElemType1} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType1, {order = #NHWC}> %input_pad_0 = VPU.Slice %quantize_cast [0, 0, 0, 1] [1, 32, 32, 1]: 
tensor<1x32x32x3x!qElemType1, {order = #NHWC}> - to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> + to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> %input_pad_1 = VPU.Slice %quantize_cast [0, 0, 0, 2] [1, 32, 32, 1]: tensor<1x32x32x3x!qElemType1, {order = #NHWC}> to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> - - %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) + + %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 4]]} : - tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, - tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, tensor<1x32x32x1x!qElemType1, {order = #NHWC}> - -> tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> - - return %concat_view: tensor<1x32x32x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> + -> tensor<1x32x32x5x!qElemType1, {order = #NHWC}> + + return %concat_view: tensor<1x32x32x5x!qElemType1, {order = #NHWC}> // CHECK: VPU.Concat // CHECK-NOT: tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x32x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x32x34x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> @@ -137,25 +137,25 @@ func.func @dontMoveReflectPadToCmxNoCmxToDdrCopy(%arg0: tensor<1x32x32x3x!qElemT // CHECK-LABEL: @dontMoveReflectPadToCmxPaddingWithMoreThan1 // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x32x32x3x!qElemType, {order = #NHWC}> -func.func @dontMoveReflectPadToCmxPaddingWithMoreThan1(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x6x!qElemType1, {mem_space = @DDR, order = #NHWC}> { +func.func @dontMoveReflectPadToCmxPaddingWithMoreThan1(%arg0: tensor<1x32x32x3x!qElemType, {order = #NHWC}>) -> tensor<1x32x32x6x!qElemType1, {order = #NHWC}> { %copy_0 = 
VPU.Copy(%arg0) {out_mem_space = @CMX_NN} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> %input_copy_to_ddr = VPU.Copy(%copy_0) : tensor<1x32x32x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x32x32x3x!qElemType, {order = #NHWC}> - + %quantize_cast = VPU.QuantizeCast(%input_copy_to_ddr) {dstElemType = !qElemType1} : tensor<1x32x32x3x!qElemType, {order = #NHWC}> -> tensor<1x32x32x3x!qElemType1, {order = #NHWC}> %input_pad_0 = VPU.Slice %quantize_cast [0, 0, 0, 1] [1, 32, 32, 1]: tensor<1x32x32x3x!qElemType1, {order = #NHWC}> - to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> + to tensor<1x32x32x1x!qElemType1, {order = #NHWC}> %input_pad_1 = VPU.Slice %quantize_cast [0, 0, 0, 2] [1, 32, 32, 2]: tensor<1x32x32x3x!qElemType1, {order = #NHWC}> to tensor<1x32x32x2x!qElemType1, {order = #NHWC}> - - %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) + + %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 4]]} : - tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, - tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x1x!qElemType1, {order = #NHWC}>, + tensor<1x32x32x3x!qElemType1, {order = #NHWC}>, tensor<1x32x32x2x!qElemType1, {order = #NHWC}> - -> tensor<1x32x32x6x!qElemType1, {mem_space = @DDR, order = #NHWC}> - - return %concat_view: tensor<1x32x32x6x!qElemType1, {mem_space = @DDR, order = #NHWC}> + -> tensor<1x32x32x6x!qElemType1, {order = #NHWC}> + + return %concat_view: tensor<1x32x32x6x!qElemType1, {order = #NHWC}> // CHECK: VPU.Concat // CHECK-NOT: tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x32x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x32x32x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x32x32x34x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> @@ -169,25 +169,25 @@ func.func 
@dontMoveReflectPadToCmxPaddingWithMoreThan1(%arg0: tensor<1x32x32x3x! // CHECK-LABEL: @dontMoveReflectPadToCmxDoesntFitInCmx // CHECK-SAME: ([[INPUT:%.+]]: tensor<1x1024x1024x3x!qElemType, {order = #NHWC}> -func.func @dontMoveReflectPadToCmxDoesntFitInCmx(%arg0: tensor<1x1024x1024x3x!qElemType, {order = #NHWC}>) -> tensor<1x1024x1024x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> { +func.func @dontMoveReflectPadToCmxDoesntFitInCmx(%arg0: tensor<1x1024x1024x3x!qElemType, {order = #NHWC}>) -> tensor<1x1024x1024x5x!qElemType1, {order = #NHWC}> { %copy_0 = VPU.Copy(%arg0) {out_mem_space = @CMX_NN} : tensor<1x1024x1024x3x!qElemType, {order = #NHWC}> -> tensor<1x1024x1024x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> %input_copy_to_ddr = VPU.Copy(%copy_0) : tensor<1x1024x1024x3x!qElemType, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x1024x1024x3x!qElemType, {order = #NHWC}> - + %quantize_cast = VPU.QuantizeCast(%input_copy_to_ddr) {dstElemType = !qElemType1} : tensor<1x1024x1024x3x!qElemType, {order = #NHWC}> -> tensor<1x1024x1024x3x!qElemType1, {order = #NHWC}> %input_pad_0 = VPU.Slice %quantize_cast [0, 0, 0, 1] [1, 1024, 1024, 1]: tensor<1x1024x1024x3x!qElemType1, {order = #NHWC}> - to tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}> + to tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}> %input_pad_1 = VPU.Slice %quantize_cast [0, 0, 0, 2] [1, 1024, 1024, 1]: tensor<1x1024x1024x3x!qElemType1, {order = #NHWC}> to tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}> - - %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) + + %concat_view = VPU.Concat (%input_pad_0, %quantize_cast, %input_pad_1) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 4]]} : - tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}>, - tensor<1x1024x1024x3x!qElemType1, {order = #NHWC}>, + tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}>, + tensor<1x1024x1024x3x!qElemType1, {order = #NHWC}>, tensor<1x1024x1024x1x!qElemType1, {order = #NHWC}> - -> 
tensor<1x1024x1024x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> - - return %concat_view: tensor<1x1024x1024x5x!qElemType1, {mem_space = @DDR, order = #NHWC}> + -> tensor<1x1024x1024x5x!qElemType1, {order = #NHWC}> + + return %concat_view: tensor<1x1024x1024x5x!qElemType1, {order = #NHWC}> // CHECK: VPU.Concat // CHECK-NOT: tensor<1x1024x1024x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x1024x1024x3x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}>, tensor<1x1024x1024x1x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> -> tensor<1x1024x1024x5x!qElemType1, {mem_space = [@CMX_NN, 0], order = #NHWC}> @@ -206,12 +206,12 @@ func.func @dontMoveReflectPadToCmxNotTheSameCopySource(%arg0: tensor<1x1x1x24xf1 %copy_cmx_to_ddr_0 = VPU.Copy(%copy_0) : tensor<1x1x1x24xf16, {mem_space = @CMX_NN, order = #NCHW}> -> tensor<1x1x1x24xf16> %copy_cmx_to_ddr_1 = VPU.Copy(%copy_1) : tensor<1x16x1x24xf16, {mem_space = @CMX_NN, order = #NHWC}> -> tensor<1x16x1x24xf16, {order = #NHWC}> - + %slice = VPU.Slice %copy_cmx_to_ddr_1 [0, 0, 0, 0] [1, 1, 1, 24] : tensor<1x16x1x24xf16, {order = #NHWC}> to tensor<1x1x1x24xf16, {order = #NHWC}> %permute_cast = VPU.PermuteCast(%slice) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<1x1x1x24xf16, {order = #NHWC}> -> tensor<1x1x1x24xf16> - + %concat = VPU.Concat(%copy_cmx_to_ddr_0, %permute_cast) {static_offsets = [[0, 0, 0, 0], [0, 0, 1, 0]]} : tensor<1x1x1x24xf16>, tensor<1x1x1x24xf16> -> tensor<1x1x2x24xf16> - + return %concat: tensor<1x1x2x24xf16> // CHECK: VPU.Concat diff --git a/tests/lit/NPU/dialect/VPU/passes/multi_cluster_strategy_assignment.mlir b/tests/lit/NPU/dialect/VPU/passes/multi_cluster_strategy_assignment.mlir index c01d251109..20463f79cc 100644 --- a/tests/lit/NPU/dialect/VPU/passes/multi_cluster_strategy_assignment.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/multi_cluster_strategy_assignment.mlir @@ -1544,5 +1544,3 @@ func.func @LogicalNotAssignedClustering(%arg0: tensor<1x1x1x256xf16>) -> tensor< 
//CHECK: [[LOGICALNOT:%.+]] = VPU.LogicalNot([[INPUT]]) { //CHECK-SAME: multiClusterStrategy = #VPU.multi_cluster_strategy} } - - diff --git a/tests/lit/NPU/dialect/VPU/passes/optimize_concat_skip_for_main_func.mlir b/tests/lit/NPU/dialect/VPU/passes/optimize_concat_skip_for_main_func.mlir new file mode 100644 index 0000000000..7a773f3e81 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/optimize_concat_skip_for_main_func.mlir @@ -0,0 +1,81 @@ +// +// Copyright (C) 2023-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=HostCompile" --optimize-concat="disable-pass-on-entry-function=true" %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +// CHECK-LABEL: SkipMainFunc +module @SkipMainFunc { + net.NetworkInfo entryPoint : @SameSiblingConcat + inputsInfo : { + DataInfo "input0" : tensor<100x1x1x1xf16> + DataInfo "input1" : tensor<112x15x1x1xf16> + } outputsInfo : { + DataInfo "output0" : tensor<112x16x1x1xf16> + DataInfo "output1" : tensor<112x16x1x1xf16> + DataInfo "output2" : tensor<112x16x1x1xf16> + } + + // CHECK: func.func [[FUNC:@.+]]([[INPUT_0:%.+]]: tensor<100x1x1x1xf16>, [[INPUT_1:%.+]]: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) { + func.func @SameSiblingConcat(%arg0: tensor<100x1x1x1xf16>, %arg1: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) { + %cst = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + %cst_0 = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + %0 = VPU.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [12, 0, 0, 0]} : tensor<100x1x1x1xf16> -> tensor<112x1x1x1xf16> + %1 = VPU.Concat(%0, %cst) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + %2 = VPU.Concat(%0, 
%cst_0) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + %3 = VPU.Concat(%0, %arg1) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + + return %1, %2, %3 : tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16> + + // CHECK: [[CST:%.+]] = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + // CHECK: [[CST_0:%.+]] = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + + // CHECK: [[EXPAND:%.+]] = VPU.Expand([[INPUT_0]]) {pads_begin = [0, 0, 0, 0], pads_end = [12, 0, 0, 0]} : tensor<100x1x1x1xf16> -> tensor<112x1x1x1xf16> + // CHECK: [[CONCAT_0:%.+]] = VPU.Concat([[EXPAND]], [[CST]]) + // CHECK: [[CONCAT_1:%.+]] = VPU.Concat([[EXPAND]], [[CST_0]]) + // CHECK: [[CONCAT_2:%.+]] = VPU.Concat([[EXPAND]], [[INPUT_1]]) + // CHECK: return [[CONCAT_0]], [[CONCAT_1]], [[CONCAT_2]] : tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16> + } +} + +// ----- + +// CHECK-LABEL: OptimizeConcatPassOnNpuFunction +module @OptimizeConcatPassOnNpuFunction { + net.NetworkInfo entryPoint : @main + inputsInfo : { + DataInfo "input0" : tensor<100x1x1x1xf16> + DataInfo "input1" : tensor<112x15x1x1xf16> + } outputsInfo : { + DataInfo "output0" : tensor<112x16x1x1xf16> + DataInfo "output1" : tensor<112x16x1x1xf16> + DataInfo "output2" : tensor<112x16x1x1xf16> + } + + // CHECK: func.func [[FUNC:@.+]]([[INPUT_0:%.+]]: tensor<100x1x1x1xf16>, [[INPUT_1:%.+]]: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) + func.func @SameSiblingConcat(%arg0: tensor<100x1x1x1xf16>, %arg1: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) { + %cst = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + %cst_0 = const.Declare 
tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + %0 = VPU.Expand(%arg0) {pads_begin = [0, 0, 0, 0], pads_end = [12, 0, 0, 0]} : tensor<100x1x1x1xf16> -> tensor<112x1x1x1xf16> + %1 = VPU.Concat(%0, %cst) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + %2 = VPU.Concat(%0, %cst_0) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + %3 = VPU.Concat(%0, %arg1) {static_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]} : tensor<112x1x1x1xf16>, tensor<112x15x1x1xf16> -> tensor<112x16x1x1xf16> + + return %1, %2, %3 : tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16> + + // CHECK: [[CST:%.+]] = const.Declare tensor<112x15x1x1xf16> = dense<0.000000e+00> : tensor<112x15x1x1xf16> + + // CHECK: [[EXPAND:%.+]] = VPU.Expand([[INPUT_0]]) {pads_begin = [0, 0, 0, 0], pads_end = [12, 0, 0, 0]} : tensor<100x1x1x1xf16> -> tensor<112x1x1x1xf16> + // CHECK: [[CONCAT_0:%.+]] = VPU.Concat([[EXPAND]], [[CST]]) + // CHECK: [[CONCAT_1:%.+]] = VPU.Concat([[EXPAND]], [[INPUT_1]]) + // CHECK: return [[CONCAT_0]], [[CONCAT_0]], [[CONCAT_1]] : tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16> + } + + // CHECK: func.func [[MAIN:@.+]]([[INPUT_0:%.+]]: tensor<100x1x1x1xf16>, [[INPUT_1:%.+]]: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) + func.func @main(%arg0: tensor<100x1x1x1xf16>, %arg1: tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) { + %0, %1, %2 = func.call @SameSiblingConcat(%arg0, %arg1) : (tensor<100x1x1x1xf16>, tensor<112x15x1x1xf16>) -> (tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>) + return %0, %1, %2 : tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16>, tensor<112x16x1x1xf16> + } +} diff --git 
a/tests/lit/NPU/dialect/VPU/passes/optimize_shared_input_copy_for_concat.mlir b/tests/lit/NPU/dialect/VPU/passes/optimize_shared_input_copy_for_concat.mlir index c29fc1ce92..35e1dc741e 100644 --- a/tests/lit/NPU/dialect/VPU/passes/optimize_shared_input_copy_for_concat.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/optimize_shared_input_copy_for_concat.mlir @@ -302,3 +302,71 @@ func.func @OptimizeSharedInputCopyForConcatWithBlockArgInput(%input0: !Input_CMX // CHECK: return [[CMX_CONCAT0]], [[CMX_CONCAT1]] : tensor<1x256x16x32xf16, {mem_space = @CMX_NN, order = #NHWC}>, tensor<1x256x16x32xf16, {mem_space = @CMX_NN, order = #NHWC}> } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!Input_Distributed = !VPU.DistributedTensor< + 1x128x16x16xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +!Output_Distributed = !VPU.DistributedTensor< + 1x256x24x24xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: @OptimizeSharedClusteredInputCopyForConcatOnTwoAxes +// CHECK-SAME: ([[INPUT0:%.+]]: !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, [[INPUT1:%.+]]: !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, [[INPUT2:%.+]]: !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, [[INPUT3:%.+]]: !VPU.DistributedTensor +func.func @OptimizeSharedClusteredInputCopyForConcatOnTwoAxes(%input0: !Input_Distributed, %input1: !Input_Distributed, %input2: !Input_Distributed, %input3: !Input_Distributed) -> (!Output_Distributed, !Output_Distributed) { + %cst0 = const.Declare tensor<1x128x32x32xf16, {order = #NHWC}> = dense<0.000000e+00> : tensor<1x128x32x32xf16>, 
[#const.Reorder<#NHWC>] + %cst1 = const.Declare tensor<1x128x32x32xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<1x128x32x32xf16>, [#const.Reorder<#NHWC>] + + %conv_out0 = VPU.Copy(%input0) {out_mem_space = @DDR} : !Input_Distributed -> tensor<1x128x16x16xf16, {order = #NHWC}> + %conv_out1 = VPU.Copy(%input1) {out_mem_space = @DDR} : !Input_Distributed -> tensor<1x128x16x16xf16, {order = #NHWC}> + %conv_out2 = VPU.Copy(%input2) {out_mem_space = @DDR} : !Input_Distributed -> tensor<1x128x16x16xf16, {order = #NHWC}> + %conv_out3 = VPU.Copy(%input3) {out_mem_space = @DDR} : !Input_Distributed -> tensor<1x128x16x16xf16, {order = #NHWC}> + + %concat_output = VPU.Concat(%conv_out0, %conv_out1, %conv_out2, %conv_out3) {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 16], [0, 0, 16, 0], [0, 0, 16, 16]]} : tensor<1x128x16x16xf16, {order = #NHWC}>, tensor<1x128x16x16xf16, {order = #NHWC}>, tensor<1x128x16x16xf16, {order = #NHWC}>, tensor<1x128x16x16xf16, {order = #NHWC}> -> tensor<1x128x32x32xf16, {order = #NHWC}> + + %concat0 = VPU.Concat(%concat_output , %cst0) {static_offsets = [[0, 0, 0, 0], [0, 128, 0, 0]]} : tensor<1x128x32x32xf16, {order = #NHWC}>, tensor<1x128x32x32xf16, {order = #NHWC}> -> tensor<1x256x32x32xf16, {order = #NHWC}> + %concat1 = VPU.Concat(%concat_output , %cst1) {static_offsets = [[0, 0, 0, 0], [0, 128, 0, 0]]} : tensor<1x128x32x32xf16, {order = #NHWC}>, tensor<1x128x32x32xf16, {order = #NHWC}> -> tensor<1x256x32x32xf16, {order = #NHWC}> + + %slice0 = VPU.Slice %concat0 [0, 0, 0, 0] [1, 256, 24, 24] :tensor<1x256x32x32xf16, {order = #NHWC}> to tensor<1x256x24x24xf16, {order = #NHWC}> + %slice1 = VPU.Slice %concat1 [0, 0, 0, 0] [1, 256, 24, 24] :tensor<1x256x32x32xf16, {order = #NHWC}> to tensor<1x256x24x24xf16, {order = #NHWC}> + + %output0 = VPU.Copy(%slice0) { out_mem_space = @CMX_NN } : tensor<1x256x24x24xf16, {order = #NHWC}> -> !Output_Distributed + %output1 = VPU.Copy(%slice1) { out_mem_space = @CMX_NN } : tensor<1x256x24x24xf16, {order = 
#NHWC}> -> !Output_Distributed + + return %output0, %output1 : !Output_Distributed, !Output_Distributed + + + // CHECK-DAG: [[CST0:%.+]] = const.Declare tensor<1x128x24x24xf16, {order = #NHWC}> = dense<0.000000e+00> : tensor<1x128x32x32xf16>, [#const.SubView<[0, 0, 0, 0], [1, 128, 24, 24]>, #const.Reorder<#NHWC>] + // CHECK-DAG: [[CST1:%.+]] = const.Declare tensor<1x128x24x24xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<1x128x32x32xf16>, [#const.SubView<[0, 0, 0, 0], [1, 128, 24, 24]>, #const.Reorder<#NHWC>] + + // CHECK: [[INPUT0_COPY:%.+]] = VPU.Copy([[INPUT0]]) {out_mem_space = @DDR} : !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> tensor<1x128x16x16xf16, {order = #NHWC}> + // CHECK: [[INPUT1_COPY:%.+]] = VPU.Copy([[INPUT1]]) {out_mem_space = @DDR} : !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> tensor<1x128x16x16xf16, {order = #NHWC}> + // CHECK: [[INPUT2_COPY:%.+]] = VPU.Copy([[INPUT2]]) {out_mem_space = @DDR} : !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> tensor<1x128x16x16xf16, {order = #NHWC}> + // CHECK: [[INPUT3_COPY:%.+]] = VPU.Copy([[INPUT3]]) {out_mem_space = @DDR} : !VPU.DistributedTensor<1x128x16x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> tensor<1x128x16x16xf16, {order = #NHWC}> + // CHECK: [[DDR_CONCAT:%.+]] = VPU.Concat([[INPUT0_COPY]], [[INPUT1_COPY]], [[INPUT2_COPY]], [[INPUT3_COPY]]) + // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 16], [0, 0, 16, 0], [0, 0, 16, 16]]} + + // CHECK: [[CST0_COPY:%.+]] = VPU.Copy([[CST0]]) {out_mem_space = @CMX_NN} : tensor<1x128x24x24xf16, {order = #NHWC}> -> !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : 
i64}> + // CHECK: [[SLICE0:%.+]] = VPU.Slice [[DDR_CONCAT]] [0, 0, 0, 0] [1, 128, 24, 24] : tensor<1x128x32x32xf16, {order = #NHWC}> to tensor<1x128x24x24xf16, {order = #NHWC}> + // CHECK: [[SLICE0_COPY:%.+]] = VPU.Copy([[SLICE0]]) {out_mem_space = @CMX_NN} : tensor<1x128x24x24xf16, {order = #NHWC}> -> !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[CMX_CONCAT0:%.+]] = VPU.Concat([[SLICE0_COPY]], [[CST0_COPY]]) + // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 128, 0, 0]]} : !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> !VPU.DistributedTensor<1x256x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + + // CHECK: [[CST1_COPY:%.+]] = VPU.Copy([[CST1]]) {out_mem_space = @CMX_NN} : tensor<1x128x24x24xf16, {order = #NHWC}> -> !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[SLICE1:%.+]] = VPU.Slice [[DDR_CONCAT]] [0, 0, 0, 0] [1, 128, 24, 24] : tensor<1x128x32x32xf16, {order = #NHWC}> to tensor<1x128x24x24xf16, {order = #NHWC}> + // CHECK: [[SLICE1_COPY:%.+]] = VPU.Copy([[SLICE1]]) {out_mem_space = @CMX_NN} : tensor<1x128x24x24xf16, {order = #NHWC}> -> !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[CMX_CONCAT1:%.+]] = VPU.Concat([[SLICE1_COPY]], [[CST1_COPY]]) + // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 128, 0, 0]]} : !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, !VPU.DistributedTensor<1x128x24x24xf16, #NHWC, @CMX_NN, {mode 
= "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> -> !VPU.DistributedTensor<1x256x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: return [[CMX_CONCAT0]], [[CMX_CONCAT1]] +} diff --git a/tests/lit/NPU/dialect/VPU/passes/query_ws_info.mlir b/tests/lit/NPU/dialect/VPU/passes/query_ws_info.mlir index f3139bf355..164aad77eb 100644 --- a/tests/lit/NPU/dialect/VPU/passes/query_ws_info.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/query_ws_info.mlir @@ -29,8 +29,8 @@ module @NoConstants { {-# dialect_resources: { builtin: { - ov_1: "0x10000000AABBCCDD", - ov_2: "0x10000000AABBCCDD" + vpux_ow_1: "0x10000000AABBCCDD", + vpux_ow_2: "0x10000000AABBCCDD" } } #-} @@ -46,8 +46,8 @@ module @WsInfo { } func.func @main(%arg: tensor<4x16xf16>) -> tensor<4x16xf16> { - %cst1 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<1.0>] - %cst2 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<2.0>] + %cst1 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<1.0>] + %cst2 = const.Declare tensor<4xui8> = dense_resource : tensor<4xui8>, [#const.Add<2.0>] return %arg : tensor<4x16xf16> } } diff --git a/tests/lit/NPU/dialect/VPU/passes/relocate_weight_table_for_reuse.mlir b/tests/lit/NPU/dialect/VPU/passes/relocate_weight_table_for_reuse.mlir index d7f592a1c3..b39aaaf532 100644 --- a/tests/lit/NPU/dialect/VPU/passes/relocate_weight_table_for_reuse.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/relocate_weight_table_for_reuse.mlir @@ -31,6 +31,43 @@ func.func @RelocateWtTableReuseOneClusterNCEConv(%input: tensor<1x512x28x28xf16, // ----- +#GNHWC = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4, d2)> + +// CHECK-LABEL: @RelocateWtTableReuseOneClusterNCEMatMul +func.func @RelocateWtTableReuseOneClusterNCEMatMul(%input: tensor<2x1x96x1x1xf16, {order = #GNHWC}>) -> tensor<2x1x16x1x1xf32, {order = #GNHWC}> { + %weights = const.Declare 
tensor<2x16x96x1x1xf16, {order = #GNHWC}> = dense<1.0> : tensor<2x16x96x1x1xf16>, [#const.Reorder<#GNHWC>] + %wt = const.Declare tensor<2x16x1x1x4xsi32> = dense<[[[[[128, 0, 1065353216, 0]]], [[[256, 0, 1065353216, 0]]], [[[384, 0, 1065353216, 0]]], [[[512, 0, 1065353216, 0]]], + [[[640, 0, 1065353216, 0]]], [[[768, 0, 1065353216, 0]]], [[[896, 0, 1065353216, 0]]], [[[1024, 0, 1065353216, 0]]], + [[[1152, 0, 1065353216, 0]]], [[[1280, 0, 1065353216, 0]]], [[[1408, 0, 1065353216, 0]]], [[[1536, 0, 1065353216, 0]]], + [[[1664, 0, 1065353216, 0]]], [[[1792, 0, 1065353216, 0]]], [[[1920, 0, 1065353216, 0]]], [[[2048, 0, 1065353216, 0]]]], + [[[[2176, 0, 1065353216, 0]]], [[[2304, 0, 1065353216, 0]]], [[[2432, 0, 1065353216, 0]]], [[[2560, 0, 1065353216, 0]]], + [[[2688, 0, 1065353216, 0]]], [[[2816, 0, 1065353216, 0]]], [[[2944, 0, 1065353216, 0]]], [[[3072, 0, 1065353216, 0]]], + [[[3200, 0, 1065353216, 0]]], [[[3328, 0, 1065353216, 0]]], [[[3456, 0, 1065353216, 0]]], [[[3584, 0, 1065353216, 0]]], + [[[3712, 0, 1065353216, 0]]], [[[3840, 0, 1065353216, 0]]], [[[3968, 0, 1065353216, 0]]], [[[4096, 0, 1065353216, 0]]]]]> : tensor<2x16x1x1x4xsi32> + + %matmul = VPU.NCE.MatMul(%input, %weights, %wt) + {mpe_engine = #VPU.MPEEngine37XX>, pad = #VPU.Padding, + ppe = #VPU.PPEFp, clamp_low = -3.4028234663852886E+38 : f64, clamp_high = 3.4028234663852886E+38 : f64, scale = 1.000000e+00 : f64, + prelu_alpha = [1.000000e+00], bias = 0.000000e+00 : f64, adder = 0.000000e+00 : f64>, rawFilterShape = [2, 16, 96, 1, 1], strides = [1, 1]} + -> tensor<2x1x16x1x1xf32, {order = #GNHWC}> + + return %matmul : tensor<2x1x16x1x1xf32, {order = #GNHWC}> + + // CHECK: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<2x16x1x1x4xsi32> = + // CHECK-SAME{LITERAL}: dense<[[[[[0, 16777215, 1065353216, 0]]], [[[128, 16777215, 1065353216, 0]]], [[[256, 16777215, 1065353216, 0]]], [[[384, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[512, 16777215, 1065353216, 0]]], [[[640, 16777215, 1065353216, 
0]]], [[[768, 16777215, 1065353216, 0]]], [[[896, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[1024, 16777215, 1065353216, 0]]], [[[1152, 16777215, 1065353216, 0]]], [[[1280, 16777215, 1065353216, 0]]], [[[1408, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[1536, 16777215, 1065353216, 0]]], [[[1664, 16777215, 1065353216, 0]]], [[[1792, 16777215, 1065353216, 0]]], [[[1920, 16777215, 1065353216, 0]]]], + // CHECK-SAME{LITERAL}: [[[[0, 16777215, 1065353216, 0]]], [[[128, 16777215, 1065353216, 0]]], [[[256, 16777215, 1065353216, 0]]], [[[384, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[512, 16777215, 1065353216, 0]]], [[[640, 16777215, 1065353216, 0]]], [[[768, 16777215, 1065353216, 0]]], [[[896, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[1024, 16777215, 1065353216, 0]]], [[[1152, 16777215, 1065353216, 0]]], [[[1280, 16777215, 1065353216, 0]]], [[[1408, 16777215, 1065353216, 0]]], + // CHECK-SAME{LITERAL}: [[[1536, 16777215, 1065353216, 0]]], [[[1664, 16777215, 1065353216, 0]]], [[[1792, 16777215, 1065353216, 0]]], [[[1920, 16777215, 1065353216, 0]]]]]> : tensor<2x16x1x1x4xsi32> + + // CHECK: VPU.NCE.MatMul({{%.+}}, {{%.+}}, [[WEIGHTS_TABLE]]) +} + +// ----- + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> !InDistributed = !VPU.DistributedTensor<1x128x1x1xf16, #NHWC, @CMX_NN, diff --git a/tests/lit/NPU/dialect/VPU/passes/resolve_shaped_type_result_dims.mlir b/tests/lit/NPU/dialect/VPU/passes/resolve_shaped_type_result_dims.mlir new file mode 100644 index 0000000000..f6e77c70ac --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/resolve_shaped_type_result_dims.mlir @@ -0,0 +1,76 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --resolve-shaped-type-result-dims %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +// ----- +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @Conv +func.func @Conv(%arg0: tensor<1x16x?x8xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 8]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x4x8xf16, {order = #NHWC}>) -> (tensor<1x32x?x4xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 50, 4]> : tensor<4xsi64>, order = #NHWC}>, index) { + // CHECK: [[IN:%.+]]: tensor<1x16x?x8xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 8]> : tensor<4xsi64>, order = #NHWC}> + %C2 = arith.constant 2 : index + // CHECK: [[C2:%.+]] = arith.constant 2 : index + %cst_0 = const.Declare tensor<32x16x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x16x3x3xf16>, [#const.Reorder<#NHWC>] + // CHECK: [[WEIGHT:%.+]] = const.Declare + %CONV = VPU.NCE.Convolution(%arg0, %cst_0) {ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [32, 16, 3, 3], strides = [2, 2]} : tensor<1x16x?x8xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 100, 8]> : tensor<4xsi64>, order = #NHWC}>, tensor<32x16x3x3xf16, {order = #NHWC}> -> tensor<1x32x?x4xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 50, 4]> : tensor<4xsi64>, order = #NHWC}> + // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[IN]], [[WEIGHT]]) + %DIM = tensor.dim %CONV, %C2 : tensor<1x32x?x4xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 50, 4]> : tensor<4xsi64>, order = #NHWC}> + // CHECK: [[VAL:%.+]] = tensor.dim [[IN]], [[C2]] + // CHECK: [[DIM:%.+]] = arith.divsi [[VAL]], [[C2]] : index + return %CONV, %DIM : tensor<1x32x?x4xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 50, 4]> : tensor<4xsi64>, order = #NHWC}>, index + // CHECK: return [[CONV]], [[DIM]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @Eltwise +// 
CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}> +func.func @Eltwise(%arg0: tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}>) -> (tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}>, index) { + %C2 = arith.constant 2 : index + %ELTWISE = VPU.NCE.Eltwise(%arg0, %arg0) { + input_padding = [0, 0, 0, 0], + op_type = #VPU.eltwise_type, + multiClusterStrategy = #VPU.multi_cluster_strategy, + ppe = #VPU.PPEStub<>, + tilingStrategy = [1, 1, 2, 1] + } -> tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}> + %DIM = tensor.dim %ELTWISE, %C2 : tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}> + return %ELTWISE,%DIM : tensor<1x16x?x256xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 256]> : tensor<4xsi64>, order = #NHWC}>, index + + // CHECK: [[C2:%.+]] = arith.constant 2 : index + // CHECK: [[OUTPUT:%.+]] = VPU.NCE.Eltwise([[INPUT]], [[INPUT]]) + // CHECK: [[DIM:%.+]] = tensor.dim [[INPUT]], [[C2]] + // CHECK: return [[OUTPUT]], [[DIM]] +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @NCEMaxPool +// CHECK-SAME: [[INPUT0:%arg[0-9]]]: tensor<1x16x?x15xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, mem_space = @CMX_NN, order = #NHWC}>, +// CHECK-SAME: [[INPUT1:%arg[0-9]]]: tensor<16x1x1x4xsi32, {mem_space = @CMX_NN, order = #NHWC}> +func.func @NCEMaxPool(%arg0: tensor<1x16x?x15xf16, {mem_space = @CMX_NN, bounds = #const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, order = #NHWC}>, + %arg1: tensor<16x1x1x4xsi32, {mem_space = @CMX_NN, order = #NHWC}> + ) -> (tensor<1x16x?x15xf16, {mem_space = @CMX_NN, bounds = 
#const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, order = #NCHW}>, index) { + %C2 = arith.constant 2 : index + %MAXPOOL = VPU.NCE.MaxPool(%arg0, %arg1) { + kernel_size = [1, 1], + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + strides = [1, 1] + } -> tensor<1x16x?x15xf16, {mem_space = @CMX_NN, bounds = #const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, order = #NCHW}> { + VPU.DPU.Workload outOffsets [0, 0, 0, 0] outSizes [1, 16, 15, 15] #VPU.mpe_mode + } + %DIM = tensor.dim %arg0, %C2 : tensor<1x16x?x15xf16, {mem_space = @CMX_NN, bounds = #const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, order = #NHWC}> + return %MAXPOOL,%DIM : tensor<1x16x?x15xf16, {mem_space = @CMX_NN, bounds = #const.OpaqueI64Elements<[1, 16, 340, 15]> : tensor<4xsi64>, order = #NCHW}>, index + + // CHECK: [[C2:%.+]] = arith.constant 2 : index + // CHECK: [[OUTPUT:%.+]] = VPU.NCE.MaxPool([[INPUT0]], [[INPUT1]] ) + // CHECK: [[DIM:%.+]] = tensor.dim [[INPUT0]], [[C2]] + // CHECK: return [[OUTPUT]], [[DIM]] +} diff --git a/tests/lit/NPU/dialect/VPU/passes/scf_outlining.mlir b/tests/lit/NPU/dialect/VPU/passes/scf_outlining.mlir index 131e94f432..6fe15c9e64 100644 --- a/tests/lit/NPU/dialect/VPU/passes/scf_outlining.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/scf_outlining.mlir @@ -60,7 +60,7 @@ module @ControlFlowOutliningStaticShape { // CHECK: [[OUTPUT:%.+]] = VPU.NCE.Convolution([[INPUT0]], [[CST]], [[CST_0]]) {pad = #VPU.Padding, {{[^:]+}}} : tensor<1x32x33x64xf16, {order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32, {order = #NCHW}> -> tensor<1x256x32x64xf16, {order = #NHWC}> // CHECK: return [[OUTPUT]] : tensor<1x256x32x64xf16, {order = #NHWC}> // CHECK: } -// CHECK: func.func @main(%arg0: tensor<1x32x64x64xf16, {order = #NHWC}>) -> tensor<1x256x64x64xf16, {order = #NHWC}> { +// CHECK: func.func @main(%arg0: tensor<1x32x64x64xf16, {order = #NHWC}>) -> tensor<1x256x64x64xf16, {order = #NHWC}> { // CHECK: [[LOCAL_BUFF:%.+]] = 
tensor.empty() : tensor<1x256x64x64xf16, {order = #NHWC}> // CHECK: [[CST_1:%.+]] = arith.constant 0 : index // CHECK: [[CST_64:%.+]] = arith.constant 64 : index @@ -109,18 +109,18 @@ module @ControlFlowOutliningStaticShape1 { %3 = arith.cmpi eq, %arg1, %c0_0 : index %extracted_slice = tensor.extract_slice %arg0[0, 0, %2, 0] [1, 16, 51, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x51x200xf16, {order = #NHWC}> %4 = scf.if %3 -> (tensor<1x16x50x200xf16, {order = #NHWC}>) { - %5 = VPU.NCE.MaxPool(%extracted_slice, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> + %5 = VPU.NCE.MaxPool(%extracted_slice, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> scf.yield %5 : tensor<1x16x50x200xf16, {order = #NHWC}> } else { %c200_1 = arith.constant 200 : index %5 = arith.subi %c200_1, %arg1 : index %6 = arith.cmpi eq, %arg1, %5 : index %7 = scf.if %6 -> (tensor<1x16x50x200xf16, {order = #NHWC}>) { - %8 = VPU.NCE.MaxPool(%extracted_slice, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> + %8 = VPU.NCE.MaxPool(%extracted_slice, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> scf.yield %8 : tensor<1x16x50x200xf16, {order = #NHWC}> } else { %extracted_slice_2 = tensor.extract_slice %arg0[0, 0, %2, 0] [1, 16, 52, 200] [1, 1, 1, 1] : tensor<1x16x200x200xf16, {order = #NHWC}> to tensor<1x16x52x200xf16, {order = #NHWC}> - %8 = VPU.NCE.MaxPool(%extracted_slice_2, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> + %8 = VPU.NCE.MaxPool(%extracted_slice_2, %cst ) {kernel_size = [3, 3], pad = #VPU.Padding, ppe = #VPU.PPEStub<>, strides = [1, 
1]} -> tensor<1x16x50x200xf16, {order = #NHWC}> scf.yield %8 : tensor<1x16x50x200xf16, {order = #NHWC}> } scf.yield %7 : tensor<1x16x50x200xf16, {order = #NHWC}> @@ -334,3 +334,47 @@ module @ControlFlowOutliningMultipleOutput { // CHECK: } // CHECK: return [[RESULT]] : tensor<1x256x64x64xf16, {order = #NHWC}> +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @Add { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x1x100x100xf32> + DataInfo "input2" : tensor<1x1x100x100xf32> + } outputsInfo : { + DataInfo "Add_3" friendlyName = "output" : tensor<1x1x100x100xf32> + } + func.func @main(%arg0: tensor<1x1x100x100xf32>, %arg1: tensor<1x1x100x100xf32>) -> tensor<1x1x100x100xf32>{ + %0 = VPU.PermuteCast(%arg0) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x1x100x100xf32> -> tensor<1x1x100x100xf32, {order = #NHWC}> + %1 = VPU.PermuteCast(%arg1) {dst_order = #NHWC, mem_perm = #NHWC} : tensor<1x1x100x100xf32> -> tensor<1x1x100x100xf32, {order = #NHWC}> + %2 = VPU.ShapeCast {shape = [1, 16, 25, 25]} inputs(%0 : tensor<1x1x100x100xf32, {order = #NHWC}>) -> tensor<1x16x25x25xf32, {order = #NHWC}> + %3 = VPU.Convert(%2) {dstElemType = f16, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x16x25x25xf32, {order = #NHWC}> -> tensor<1x16x25x25xf16, {order = #NHWC}> + %4 = VPU.ShapeCast {shape = [1, 16, 25, 25]} inputs(%1 : tensor<1x1x100x100xf32, {order = #NHWC}>) -> tensor<1x16x25x25xf32, {order = #NHWC}> + %5 = VPU.Convert(%4) {dstElemType = f16, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x16x25x25xf32, {order = #NHWC}> -> tensor<1x16x25x25xf16, {order = #NHWC}> + %6 = VPU.NCE.Eltwise(%3, %5) {multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, 
lrelu_shift = 0 : i64, quant_scale = [1.000000e+00] +, fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x25x25xf32, {order = #NHWC}> + %7 = VPU.ShapeCast {shape = [1, 1, 100, 100]} inputs(%6 : tensor<1x16x25x25xf32, {order = #NHWC}>) -> tensor<1x1x100x100xf32, {order = #NHWC}> + %8 = VPU.PermuteCast(%7) {dst_order = #NCHW, mem_perm = #NWCH} : tensor<1x1x100x100xf32, {order = #NHWC}> -> tensor<1x1x100x100xf32> + return %8 : tensor<1x1x100x100xf32> + } + + // CHECK: func.func private @main_func0([[INPUT0:%.+]]: tensor<1x1x100x100xf32>, [[INPUT1:%.+]]: tensor<1x1x100x100xf32>) -> tensor<1x1x100x100xf32> { + // CHECK: [[CAST0:%.+]] = VPU.PermuteCast([[INPUT0]]) + // CHECK: [[CAST1:%.+]] = VPU.PermuteCast([[INPUT1]]) + // CHECK: [[SHAPECAST0:%.+]] = VPU.ShapeCast {shape = [1, 16, 25, 25]} inputs([[CAST0]] + // CHECK: [[CONVERT0:%.+]] = VPU.Convert([[SHAPECAST0]]) + // CHECK: [[SHAPECAST1:%.+]] = VPU.ShapeCast {shape = [1, 16, 25, 25]} inputs([[CAST1]] + // CHECK: [[CONVERT1:%.+]] = VPU.Convert([[SHAPECAST1]]) + // CHECK: [[ELTWISE:%.+]] = VPU.NCE.Eltwise([[CONVERT0]], [[CONVERT1]]) + // CHECK: [[SHAPECAST2:%.+]] = VPU.ShapeCast + // CHECK: [[RESULT:%.+]] = VPU.PermuteCast + // CHECK: return [[RESULT]] : tensor<1x1x100x100xf32> + // CHECK: } + // CHECK: func.func @main([[ARGS0:%.+]]: tensor<1x1x100x100xf32>, [[ARGS1:%.+]]: tensor<1x1x100x100xf32>) -> tensor<1x1x100x100xf32> { + // CHECK: [[RESULTS:%.+]] = call @main_func0([[ARGS0]], [[ARGS1]]) : (tensor<1x1x100x100xf32>, tensor<1x1x100x100xf32>) -> tensor<1x1x100x100xf32> + // CHECK: return [[RESULTS]] : tensor<1x1x100x100xf32> + // CHECK: } +} + diff --git a/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir new file mode 100644 index 0000000000..814ecae246 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir @@ -0,0 +1,513 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --scf-vertical-fusion --canonicalize %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +IE.TileResource 3 of @NCE at 1.700000e+03 MHz { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU +} + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +//CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 960, 26)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (0, d0 - 1)> +//CHECK: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 1, 0)> +//CHECK: #[[$MAP3:.*]] = affine_map<()[s0] -> (1, s0)> +//CHECK: #[[$MAP4:.*]] = affine_map<(d0, d1) -> (0, d0 + d1 - 959)> +//CHECK: #[[$MAP5:.*]] = affine_map<(d0) -> (d0 + 6)> + +// CHECK-LABEL: @MergeVFChain3Tiles +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x256x540x120xf16, {order = #NHWC}>) +func.func @MergeVFChain3Tiles(%arg0: tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x128x540x240xf16, {order = #NHWC}> + { + %cst = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_0 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_1 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_2 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_3 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_4 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = 
dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_5 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] + %cst_6 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_7 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_8 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_9 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_10 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] + %cst_11 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_12 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_13 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + + %0 = VPU.ShapeCast {shape = [1, 32, 540, 960]} inputs(%arg0 : tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x32x540x960xf16, {order = #NHWC}> + %1 = VPU.NCE.Convolution(%0, %cst, %cst_13) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %2 = VPU.NCE.Convolution(%1, %cst_0, %cst_12) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, 
clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %3 = VPU.NCE.Convolution(%2, %cst_1, %cst_11) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %4 = VPU.NCE.DepthConvolution(%3, %cst_10, %cst_9) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} -> tensor<1x32x540x960xf16, {order = #NHWC}> + %5 = VPU.NCE.Convolution(%4, %cst_2, %cst_8) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %6 = VPU.NCE.Convolution(%5, %cst_3, %cst_7) {mpe_engine = #VPU.MPEEngine37XX>, 
multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %7 = VPU.NCE.Convolution(%6, %cst_4, %cst_6) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %8 = VPU.NCE.DepthConvolution(%7, %cst_5, %cst_9) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 20]} -> tensor<1x32x540x960xf16, {order = #NHWC}> + %9 = VPU.ShapeCast {shape = [1, 128, 540, 240]} inputs(%8 : tensor<1x32x540x960xf16, {order = #NHWC}>) -> tensor<1x128x540x240xf16, {order = #NHWC}> + + return %9: tensor<1x128x540x240xf16, {order = #NHWC}> + + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[LOOP_STEP:%.+]] = arith.constant 26 : index + //CHECK: [[LOOP_END:%.+]] = arith.constant 960 : index + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + + //CHECK: [[CAST_INPUT:%.+]] = VPU.ShapeCast {shape = [1, 32, 540, 960]} inputs([[INPUT]] : 
tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty() : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x32x540x960xf16, {order = #NHWC}>) { + //CHECK: [[OUTPUT_SIZE:%.+]] = affine.min #[[$MAP0]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW5:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE1]]] + //CHECK: [[TEMP_VALUE2:%.+]] = affine.min #[[$MAP4]]([[LOOP_ITER]], [[TEMP_VALUE0]]) + //CHECK: [[PAD_HIGH5:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE2]]] + //CHECK: [[TEMP_VALUE2:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE0]]) + //CHECK: [[TEMP_VALUE3:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE0]]) + //CHECK: [[PAD_LOW4:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE3]]] + //CHECK: [[TEMP_VALUE4:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE0]], [[TEMP_VALUE2]]) + //CHECK: [[PAD_HIGH4:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE4]]] + //CHECK: [[TEMP_VALUE5:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE2]]) + //CHECK: [[TEMP_VALUE6:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE2]]) + //CHECK: [[PAD_LOW3:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE6]]] + //CHECK: [[TEMP_VALUE7:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE2]], [[TEMP_VALUE5]]) + //CHECK: [[PAD_HIGH3:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE7]]] + //CHECK: [[TEMP_VALUE8:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE5]]) + //CHECK: [[TEMP_VALUE9:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE5]]) + //CHECK: [[PAD_LOW2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE9]]] + //CHECK: [[TEMP_VALUE10:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE5]], [[TEMP_VALUE8]]) + //CHECK: [[PAD_HIGH2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE10]]] + 
//CHECK: [[TEMP_VALUE11:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE8]]) + //CHECK: [[TEMP_VALUE12:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE8]]) + //CHECK: [[PAD_LOW1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE12]]] + //CHECK: [[TEMP_VALUE13:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE8]], [[TEMP_VALUE11]]) + //CHECK: [[PAD_HIGH1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE13]]] + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE11]]) + //CHECK: [[TEMP_VALUE14:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE11]]) + //CHECK: [[PAD_LOW0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE14]]] + //CHECK: [[SLICE_SIZE:%.+]] = affine.apply #[[$MAP5]]([[OUTPUT_SIZE]]) + //CHECK: [[TEMP_VALUE15:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE11]], [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE15]]] + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[CAST_INPUT]][0, 0, 0, [[SLICE_OFFSET]]] [1, 32, 540, [[SLICE_SIZE]]] [1, 1, 1, 1] : tensor<1x32x540x960xf16, {order = #NHWC}> to tensor<1x32x540x?xf16, {order = #NHWC}> + + //CHECK: [[PAD0:%.+]] = tensor.pad [[SLICE]] low[0, 0, 1, [[PAD_LOW0]]] high[0, 0, 1, [[PAD_HIGH0]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + //CHECK: [[CONV0:%.+]] = VPU.NCE.Convolution([[PAD0]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD1:%.+]] = tensor.pad [[CONV0]] low[0, 0, 1, [[PAD_LOW1]]] high[0, 0, 1, [[PAD_HIGH1]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[PAD1]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD2:%.+]] = tensor.pad [[CONV1]] low[0, 0, 1, [[PAD_LOW2]]] high[0, 0, 1, [[PAD_HIGH2]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV2:%.+]] 
= VPU.NCE.Convolution([[PAD2]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[DWCONV0:%.+]] = VPU.NCE.DepthConvolution([[CONV2]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD3:%.+]] = tensor.pad [[DWCONV0]] low[0, 0, 1, [[PAD_LOW3]]] high[0, 0, 1, [[PAD_HIGH3]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV3:%.+]] = VPU.NCE.Convolution([[PAD3]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD4:%.+]] = tensor.pad [[CONV3]] low[0, 0, 1, [[PAD_LOW4]]] high[0, 0, 1, [[PAD_HIGH4]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV4:%.+]] = VPU.NCE.Convolution([[PAD4]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD5:%.+]] = tensor.pad [[CONV4]] low[0, 0, 1, [[PAD_LOW5]]] high[0, 0, 1, [[PAD_HIGH5]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV5:%.+]] = VPU.NCE.Convolution([[PAD5]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[DWCONV1:%.+]] = VPU.NCE.DepthConvolution([[CONV5]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[DWCONV1]] into [[LOOP_OUT]][0, 0, 0, [[LOOP_ITER]]] [1, 32, 540, [[OUTPUT_SIZE]]] [1, 1, 1, 1] : tensor<1x32x540x?xf16, {order = #NHWC}> into tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[CAST:%.+]] = VPU.ShapeCast {shape = [1, 128, 540, 240]} inputs([[LOOP]] + //CHECK: return [[CAST]] : tensor<1x128x540x240xf16, {order = #NHWC}> +} + +// ----- + +IE.TileResource 6 of @NCE at 1.850000e+03 MHz { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + 
IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU +} + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +//CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 960, 44)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (0, d0 - 1)> +//CHECK: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 1, 0)> +//CHECK: #[[$MAP3:.*]] = affine_map<()[s0] -> (1, s0)> +//CHECK: #[[$MAP4:.*]] = affine_map<(d0, d1) -> (0, d0 + d1 - 959)> +//CHECK: #[[$MAP5:.*]] = affine_map<(d0) -> (d0 + 6)> + +// CHECK-LABEL: @MergeVFChain6Tiles +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x256x540x120xf16, {order = #NHWC}>) +func.func @MergeVFChain6Tiles(%arg0: tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x128x540x240xf16, {order = #NHWC}> + { + %cst = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_0 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_1 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_2 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_3 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_4 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_5 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] + %cst_6 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_7 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : 
tensor<32x1x1x4xsi32> + %cst_8 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_9 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_10 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] + %cst_11 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_12 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_13 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + + %0 = VPU.ShapeCast {shape = [1, 32, 540, 960]} inputs(%arg0 : tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x32x540x960xf16, {order = #NHWC}> + %1 = VPU.NCE.Convolution(%0, %cst, %cst_13) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %2 = VPU.NCE.Convolution(%1, %cst_0, %cst_12) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %3 = VPU.NCE.Convolution(%2, %cst_1, %cst_11) {mpe_engine = 
#VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %4 = VPU.NCE.DepthConvolution(%3, %cst_10, %cst_9) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} -> tensor<1x32x540x960xf16, {order = #NHWC}> + %5 = VPU.NCE.Convolution(%4, %cst_2, %cst_8) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %6 = VPU.NCE.Convolution(%5, %cst_3, %cst_7) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, 
{order = #NHWC}> + %7 = VPU.NCE.Convolution(%6, %cst_4, %cst_6) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %8 = VPU.NCE.DepthConvolution(%7, %cst_5, %cst_9) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 20]} -> tensor<1x32x540x960xf16, {order = #NHWC}> + %9 = VPU.ShapeCast {shape = [1, 128, 540, 240]} inputs(%8 : tensor<1x32x540x960xf16, {order = #NHWC}>) -> tensor<1x128x540x240xf16, {order = #NHWC}> + + return %9: tensor<1x128x540x240xf16, {order = #NHWC}> + + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[LOOP_STEP:%.+]] = arith.constant 44 : index + //CHECK: [[LOOP_END:%.+]] = arith.constant 960 : index + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + + //CHECK: [[CAST_INPUT:%.+]] = VPU.ShapeCast {shape = [1, 32, 540, 960]} inputs([[INPUT]] : tensor<1x256x540x120xf16, {order = #NHWC}>) -> tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty() : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x32x540x960xf16, {order = #NHWC}>) { + //CHECK: [[OUTPUT_SIZE:%.+]] = affine.min 
#[[$MAP0]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER]]) + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER]]) + //CHECK: [[PAD_LOW5:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE1]]] + //CHECK: [[TEMP_VALUE2:%.+]] = affine.min #[[$MAP4]]([[LOOP_ITER]], [[TEMP_VALUE0]]) + //CHECK: [[PAD_HIGH5:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE2]]] + //CHECK: [[TEMP_VALUE2:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE0]]) + //CHECK: [[TEMP_VALUE3:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE0]]) + //CHECK: [[PAD_LOW4:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE3]]] + //CHECK: [[TEMP_VALUE4:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE0]], [[TEMP_VALUE2]]) + //CHECK: [[PAD_HIGH4:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE4]]] + //CHECK: [[TEMP_VALUE5:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE2]]) + //CHECK: [[TEMP_VALUE6:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE2]]) + //CHECK: [[PAD_LOW3:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE6]]] + //CHECK: [[TEMP_VALUE7:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE2]], [[TEMP_VALUE5]]) + //CHECK: [[PAD_HIGH3:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE7]]] + //CHECK: [[TEMP_VALUE8:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE5]]) + //CHECK: [[TEMP_VALUE9:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE5]]) + //CHECK: [[PAD_LOW2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE9]]] + //CHECK: [[TEMP_VALUE10:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE5]], [[TEMP_VALUE8]]) + //CHECK: [[PAD_HIGH2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE10]]] + //CHECK: [[TEMP_VALUE11:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE8]]) + //CHECK: [[TEMP_VALUE12:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE8]]) + //CHECK: [[PAD_LOW1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE12]]] + //CHECK: [[TEMP_VALUE13:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE8]], [[TEMP_VALUE11]]) + //CHECK: [[PAD_HIGH1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE13]]] + //CHECK: [[SLICE_OFFSET:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE11]]) + //CHECK: 
[[TEMP_VALUE14:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE11]]) + //CHECK: [[PAD_LOW0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE14]]] + //CHECK: [[SLICE_SIZE:%.+]] = affine.apply #[[$MAP5]]([[OUTPUT_SIZE]]) + //CHECK: [[TEMP_VALUE15:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE11]], [[SLICE_OFFSET]]) + //CHECK: [[PAD_HIGH0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE15]]] + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[CAST_INPUT]][0, 0, 0, [[SLICE_OFFSET]]] [1, 32, 540, [[SLICE_SIZE]]] [1, 1, 1, 1] : tensor<1x32x540x960xf16, {order = #NHWC}> to tensor<1x32x540x?xf16, {order = #NHWC}> + + //CHECK: [[PAD0:%.+]] = tensor.pad [[SLICE]] low[0, 0, 1, [[PAD_LOW0]]] high[0, 0, 1, [[PAD_HIGH0]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + //CHECK: [[CONV0:%.+]] = VPU.NCE.Convolution([[PAD0]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD1:%.+]] = tensor.pad [[CONV0]] low[0, 0, 1, [[PAD_LOW1]]] high[0, 0, 1, [[PAD_HIGH1]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[PAD1]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD2:%.+]] = tensor.pad [[CONV1]] low[0, 0, 1, [[PAD_LOW2]]] high[0, 0, 1, [[PAD_HIGH2]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV2:%.+]] = VPU.NCE.Convolution([[PAD2]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[DWCONV0:%.+]] = VPU.NCE.DepthConvolution([[CONV2]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD3:%.+]] = tensor.pad [[DWCONV0]] low[0, 0, 1, [[PAD_LOW3]]] high[0, 0, 1, [[PAD_HIGH3]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV3:%.+]] = 
VPU.NCE.Convolution([[PAD3]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD4:%.+]] = tensor.pad [[CONV3]] low[0, 0, 1, [[PAD_LOW4]]] high[0, 0, 1, [[PAD_HIGH4]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV4:%.+]] = VPU.NCE.Convolution([[PAD4]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[PAD5:%.+]] = tensor.pad [[CONV4]] low[0, 0, 1, [[PAD_LOW5]]] high[0, 0, 1, [[PAD_HIGH5]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + + //CHECK: [[CONV5:%.+]] = VPU.NCE.Convolution([[PAD5]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[DWCONV1:%.+]] = VPU.NCE.DepthConvolution([[CONV5]] + //CHECK-SAME: pad = #VPU.Padding + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[DWCONV1]] into [[LOOP_OUT]][0, 0, 0, [[LOOP_ITER]]] [1, 32, 540, [[OUTPUT_SIZE]]] [1, 1, 1, 1] : tensor<1x32x540x?xf16, {order = #NHWC}> into tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[CAST:%.+]] = VPU.ShapeCast {shape = [1, 128, 540, 240]} inputs([[LOOP]] + //CHECK: return [[CAST]] : tensor<1x128x540x240xf16, {order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +//CHECK: #[[$MAP:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 240)> + +// CHECK-LABEL: @MergeDynamicEltwise +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> +func.func @MergeDynamicEltwise( + %arg0: tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> +) -> tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> { + %0 = VPU.NCE.Eltwise(%arg0, %arg0) { + is_inplace = true, + multiClusterStrategy = 
#VPU.multi_cluster_strategy, + op_type = #VPU.eltwise_type, + ppe = #VPU.PPEInt< + mode = , + clamp_low = -2147483648 : i64, + clamp_high = 2147483647 : i64, + lrelu_mult = 1 : i64, + lrelu_shift = 0 : i64, + quant_scale = [1.000000e+00], + fp_prelu_alpha = 1.000000e+00 : f64 + >, + tilingStrategy = [1, 1, 1, 2] + } -> tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + %1 = VPU.NCE.Eltwise(%0, %0) { + is_inplace = true, + multiClusterStrategy = #VPU.multi_cluster_strategy, + op_type = #VPU.eltwise_type, + ppe = #VPU.PPEInt< + mode = , + clamp_low = -2147483648 : i64, + clamp_high = 2147483647 : i64, + lrelu_mult = 1 : i64, + lrelu_shift = 0 : i64, + quant_scale = [1.000000e+00], + fp_prelu_alpha = 1.000000e+00 : f64 + >, + tilingStrategy = [1, 1, 1, 2] + } -> tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + return %1 : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[LOOP_STEP:%.+]] = arith.constant 240 : index + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[DIM_INDEX:%.+]] = arith.constant 3 : index + + //CHECK: [[DIM:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX]] : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty([[DIM]]) : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP_END:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX]] : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: [[LOOP:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x256x?xf16, {bounds = 
#const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}>) { + + //CHECK: [[SLICE_SIZE:%.+]] = affine.min #[[$MAP]]([[LOOP_ITER]])[[[LOOP_END]]] + //CHECK: [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, 0, [[LOOP_ITER]]] [1, 16, 256, [[SLICE_SIZE]]] [1, 1, 1, 1] + //CHECK-SAME: tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 240]> : tensor<4xsi64>, order = #NHWC}> + + //CHECK: [[ELTWISE0:%.+]] = VPU.NCE.Eltwise([[SLICE]], [[SLICE]]) + //CHECK: [[ELTWISE1:%.+]] = VPU.NCE.Eltwise([[ELTWISE0]], [[ELTWISE0]]) + + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[ELTWISE1]] into [[LOOP_OUT]][0, 0, 0, [[LOOP_ITER]]] [1, 16, 256, [[SLICE_SIZE]]] [1, 1, 1, 1] + //CHECK-SAME: tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 240]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> + //CHECK: return [[LOOP]] : tensor<1x16x256x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 256, 480]> : tensor<4xsi64>, order = #NHWC}> +} + +// ----- + +IE.TileResource 3 of @NCE at 1.700000e+03 MHz { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU +} + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +//CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 960, 13)> +//CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (0, d0 - 1)> +//CHECK: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 1, 0)> +//CHECK: #[[$MAP3:.*]] = affine_map<()[s0] -> (1, s0)> +//CHECK: #[[$MAP4:.*]] = 
affine_map<(d0, d1) -> (0, d0 + d1 - 959)> +//CHECK: #[[$MAP5:.*]] = affine_map<(d0) -> (d0 + 1)> +//CHECK: #[[$MAP6:.*]] = affine_map<(d0) -> (-d0 + 960, 27)> +//CHECK: #[[$MAP7:.*]] = affine_map<(d0) -> (d0 + 2)> + +// CHECK-LABEL: @MergeVF2Chains +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x32x540x960xf16, {order = #NHWC}>) +func.func @MergeVF2Chains(%arg0: tensor<1x32x540x960xf16, {order = #NHWC}>) -> tensor<1x32x540x960xf16, {order = #NHWC}> + { + %cst = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_0 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_1 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType, #const.Reorder<#NHWC>] + %cst_2 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_3 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_4 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_5 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_6 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>] + + %0 = VPU.NCE.DepthConvolution(%arg0, %cst_6, %cst_5) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} -> tensor<1x32x540x960xf16, {order = #NHWC}> + %1 = VPU.NCE.Convolution(%0, %cst, %cst_4) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = 
#VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %2 = VPU.Sign(%1) : tensor<1x32x540x960xf16, {order = #NHWC}> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %3 = VPU.NCE.Convolution(%2, %cst_0, %cst_3) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + %4 = VPU.NCE.Convolution(%3, %cst_1, %cst_2) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21]} : tensor<1x32x540x960xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x540x960xf16, {order = #NHWC}> + + return %4: tensor<1x32x540x960xf16, {order = #NHWC}> + + //CHECK: [[LOOP_STEP1:%.+]] = arith.constant 27 : index + //CHECK: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16 + //CHECK: [[LOOP_STEP0:%.+]] = arith.constant 13 : index + //CHECK: [[LOOP_END:%.+]] = arith.constant 960 : index + //CHECK: [[LOOP_BEGIN:%.+]] = 
arith.constant 0 : index + //CHECK: [[LOOP_OUTPUT0:%.+]] = tensor.empty() : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP0:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER0:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP0]] + //CHECK-SAME: iter_args([[LOOP_OUT0:%arg[0-9]]] = [[LOOP_OUTPUT0]]) -> (tensor<1x32x540x960xf16, {order = #NHWC}>) + + //CHECK: [[INSERT_SIZE0:%.+]] = affine.min #[[$MAP0]]([[LOOP_ITER0]]) + //CHECK: [[SLICE_OFFSET0:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER0]]) + //CHECK: [[TEMP_VALUE0:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER0]]) + //CHECK: [[PAD_LOW0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE0]]] + //CHECK: [[TEMP_VALUE1:%.+]] = affine.min #[[$MAP4]]([[LOOP_ITER0]], [[SLICE_OFFSET0]]) + //CHECK: [[PAD_HIGH0:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE1]]] + //CHECK: [[SLICE_SIZE0:%.+]] = affine.apply #[[$MAP5]]([[INSERT_SIZE0]]) + + //CHECK: [[SLICE0:%.+]] = tensor.extract_slice [[INPUT]][0, 0, 0, [[SLICE_OFFSET0]]] [1, 32, 540, [[SLICE_SIZE0]]] [1, 1, 1, 1] : tensor<1x32x540x960xf16, {order = #NHWC}> to tensor<1x32x540x?xf16, {order = #NHWC}> + //CHECK: [[DWCONV0:%.+]] = VPU.NCE.DepthConvolution([[SLICE0]] + + //CHECK: [[PAD0:%.+]] = tensor.pad [[DWCONV0]] low[0, 0, 1, [[PAD_LOW0]]] high[0, 0, 1, [[PAD_HIGH0]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + //CHECK: [[CONV0:%.+]] = VPU.NCE.Convolution([[PAD0]] + + //CHECK: [[INSERT0:%.+]] = tensor.insert_slice [[CONV0]] into [[LOOP_OUT0]][0, 0, 0, [[LOOP_ITER0]]] [1, 32, 540, [[INSERT_SIZE0]]] [1, 1, 1, 1] : tensor<1x32x540x?xf16, {order = #NHWC}> into tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT0]] : tensor<1x32x540x960xf16, {order = #NHWC}> + + //CHECK: [[SIGN:%.+]] = VPU.Sign([[LOOP0]]) : tensor<1x32x540x960xf16, {order = #NHWC}> -> tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP_OUTPUT1:%.+]] = tensor.empty() : 
tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: [[LOOP1:%.+]] = scf.for + //CHECK-SAME: [[LOOP_ITER1:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP1]] + //CHECK-SAME: iter_args([[LOOP_OUT1:%arg[0-9]]] = [[LOOP_OUTPUT1]]) -> (tensor<1x32x540x960xf16, {order = #NHWC}>) + + //CHECK: [[INSERT_SIZE1:%.+]] = affine.min #[[$MAP6]]([[LOOP_ITER1]]) + //CHECK: [[TEMP_VALUE2:%.+]] = affine.max #[[$MAP1]]([[LOOP_ITER0]]) + //CHECK: [[TEMP_VALUE3:%.+]] = affine.min #[[$MAP2]]([[LOOP_ITER0]]) + //CHECK: [[PAD_LOW2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE3]]] + //CHECK: [[TEMP_VALUE4:%.+]] = affine.min #[[$MAP4]]([[LOOP_ITER1]], [[TEMP_VALUE2]]) + //CHECK: [[PAD_HIGH2:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE4]]] + //CHECK: [[SLICE_OFFSET1:%.+]] = affine.max #[[$MAP1]]([[TEMP_VALUE2]]) + //CHECK: [[TEMP_VALUE5:%.+]] = affine.min #[[$MAP2]]([[TEMP_VALUE2]]) + //CHECK: [[PAD_LOW1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE5]]] + //CHECK: [[SLICE_SIZE1:%.+]] = affine.apply #[[$MAP7]]([[INSERT_SIZE1]]) + //CHECK: [[TEMP_VALUE6:%.+]] = affine.min #[[$MAP4]]([[TEMP_VALUE2]], [[SLICE_OFFSET1]]) + //CHECK: [[PAD_HIGH1:%.+]] = affine.max #[[$MAP3]]()[[[TEMP_VALUE6]]] + + //CHECK: [[SLICE1:%.+]] = tensor.extract_slice [[SIGN]][0, 0, 0, [[SLICE_OFFSET1]]] [1, 32, 540, [[SLICE_SIZE1]]] [1, 1, 1, 1] : tensor<1x32x540x960xf16, {order = #NHWC}> to tensor<1x32x540x?xf16, {order = #NHWC}> + + //CHECK: [[PAD1:%.+]] = tensor.pad [[SLICE1]] low[0, 0, 1, [[PAD_LOW1]]] high[0, 0, 1, [[PAD_HIGH1]]] + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + //CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[PAD1]] + + //CHECK: [[PAD2:%.+]] = tensor.pad [[CONV1]] low[0, 0, 1, [[PAD_LOW2]]] high[0, 0, 1, [[PAD_HIGH2]]] { + //CHECK: tensor.yield [[PAD_VALUE]] : f16 + //CHECK: tensor<1x32x540x?xf16, {order = #NHWC}> to tensor<1x32x542x?xf16, {order = #NHWC}> + //CHECK: [[CONV2:%.+]] = 
VPU.NCE.Convolution([[PAD2]] + + //CHECK: [[INSERT1:%.+]] = tensor.insert_slice [[CONV2]] into [[LOOP_OUT1]][0, 0, 0, [[LOOP_ITER1]]] [1, 32, 540, [[INSERT_SIZE1]]] [1, 1, 1, 1] : tensor<1x32x540x?xf16, {order = #NHWC}> into tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT1]] : tensor<1x32x540x960xf16, {order = #NHWC}> + //CHECK: return [[LOOP1]] : tensor<1x32x540x960xf16, {order = #NHWC}> +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + + // CHECK-LABEL: @MergeWithLayoutCast + // CHECK-SAME: [[INPUT0:%arg[0-9]]]: tensor<1x16x256x140xf16>, + // CHECK-SAME: [[INPUT1:%arg[0-9]]]: tensor<1x16x256x140xf16>) + func.func @MergeWithLayoutCast( + %arg0: tensor<1x16x256x140xf16>, + %arg1: tensor<1x16x256x140xf16> + ) -> tensor<1x16x256x140xf16> { + %0 = VPU.LayoutCast(%arg0) {dst_order = #NHWC} : tensor<1x16x256x140xf16> -> tensor<1x16x256x140xf16, {order = #NHWC}> + %1 = VPU.LayoutCast(%arg1) {dst_order = #NHWC} : tensor<1x16x256x140xf16> -> tensor<1x16x256x140xf16, {order = #NHWC}> + %2 = VPU.NCE.Eltwise(%0, %1) { + is_inplace = true, + multiClusterStrategy = #VPU.multi_cluster_strategy, + op_type = #VPU.eltwise_type, + ppe = #VPU.PPEInt< + mode = , + clamp_low = -2147483648 : i64, + clamp_high = 2147483647 : i64, + lrelu_mult = 1 : i64, + lrelu_shift = 0 : i64, + quant_scale = [1.000000e+00], + fp_prelu_alpha = 1.000000e+00 : f64 + >, + tilingStrategy = [1, 1, 1, 2] + } -> tensor<1x16x256x140xf16, {order = #NHWC}> + + %3 = VPU.LayoutCast(%2) {dst_order = #NCHW} : tensor<1x16x256x140xf16, {order = #NHWC}> -> tensor<1x16x256x140xf16> + + return %3 : tensor<1x16x256x140xf16> + + + //CHECK: [[LOOP_STEP:%.+]] = arith.constant 70 : index + //CHECK: [[LOOP_END:%.+]] = arith.constant 140 : index + //CHECK: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index + //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty() : tensor<1x16x256x140xf16, {order = #NHWC}> + + //CHECK: [[LOOP:%.+]] = 
scf.for + //CHECK-SAME: [[LOOP_ITER:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END]] step [[LOOP_STEP]] + //CHECK-SAME: iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x16x256x140xf16, {order = #NHWC}>) + + //CHECK: [[SLICE0:%.+]] = tensor.extract_slice [[INPUT0]][0, 0, 0, [[LOOP_ITER]]] [1, 16, 256, 70] [1, 1, 1, 1] : tensor<1x16x256x140xf16> to tensor<1x16x256x70xf16> + //CHECK: [[CAST0:%.+]] = VPU.LayoutCast([[SLICE0]]) {dst_order = #NHWC} : tensor<1x16x256x70xf16> -> tensor<1x16x256x70xf16, {order = #NHWC}> + //CHECK: [[SLICE1:%.+]] = tensor.extract_slice [[INPUT1]][0, 0, 0, [[LOOP_ITER]]] [1, 16, 256, 70] [1, 1, 1, 1] : tensor<1x16x256x140xf16> to tensor<1x16x256x70xf16> + //CHECK: [[CAST1:%.+]] = VPU.LayoutCast([[SLICE1]]) {dst_order = #NHWC} : tensor<1x16x256x70xf16> -> tensor<1x16x256x70xf16, {order = #NHWC}> + //CHECK: [[ELTWISE:%.+]] = VPU.NCE.Eltwise([[CAST0]], [[CAST1]]) + //CHECK: [[INSERT:%.+]] = tensor.insert_slice [[ELTWISE]] into [[LOOP_OUT]][0, 0, 0, [[LOOP_ITER]]] [1, 16, 256, 70] [1, 1, 1, 1] : tensor<1x16x256x70xf16, {order = #NHWC}> into tensor<1x16x256x140xf16, {order = #NHWC}> + //CHECK: scf.yield [[INSERT]] : tensor<1x16x256x140xf16, {order = #NHWC}> + + // pure view-like op doesn't have tilingStrategy, it cannot be tiled. it might be used to continue VF further, but we cannot start VF with that unfortunately + //CHECK: [[CAST2:%.+]] = VPU.LayoutCast([[LOOP]]) {dst_order = #NCHW} : tensor<1x16x256x140xf16, {order = #NHWC}> -> tensor<1x16x256x140xf16> + //CHECK: return [[CAST2]] : tensor<1x16x256x140xf16> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/set_target_independent_pass_options.mlir b/tests/lit/NPU/dialect/VPU/passes/set_target_independent_pass_options.mlir new file mode 100644 index 0000000000..8e6e6fd522 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/set_target_independent_pass_options.mlir @@ -0,0 +1,46 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --vpu-arch=%arch% --split-input-file --set-target-independent-options="enable-auto-padding-odu enable-auto-padding-idu" %s | FileCheck --check-prefix=CHECK-PAD %s --strict-whitespace +// REQUIRES: arch-NPU37XX || arch-NPU40XX +// RUN: vpux-opt --vpu-arch=%arch% --split-input-file --set-target-independent-options="enable-is-reduce-supported " %s | FileCheck --check-prefix=CHECK-REDUCE %s --strict-whitespace +// REQUIRES: arch-NPU37XX || arch-NPU40XX +// RUN: vpux-opt --vpu-arch=%arch% --split-input-file --set-target-independent-options="allow-custom-values=true" %s | FileCheck --check-prefix=CHECK-CUSTOM %s --strict-whitespace +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +module @mainModule attributes {} { +} + +// CHECK-PAD: module @mainModule +// CHECK-PAD: config.PipelineOptions @Options +// CHECK-PAD: config.Option @VPU.AutoPaddingODU : true +// CHECK-PAD: config.Option @VPU.AutoPaddingIDU : true +// CHECK-REDUCE: config.Option @VPU.ReduceSupported : true + +// ----- + +module @mainModule attributes {} { +} + + +// CHECK-REDUCE: module @mainModule +// CHECK-REDUCE: config.PipelineOptions @Options +// CHECK-REDUCE: config.Option @VPU.ReduceSupported : true +// CHECK-PAD: config.Option @VPU.AutoPaddingIDU : true + +// ----- + +module @NoInsertionNeeded { + config.PipelineOptions @Options { + config.Option @VPU.MyOptions: false + } +} + + +// CHECK-CUSTOM: module @NoInsertionNeeded +// CHECK-CUSTOM: config.PipelineOptions @Options +// CHECK-REDUCE: config.Option @VPU.ReduceSupported : true +// CHECK-PAD: config.Option @VPU.AutoPaddingIDU : true +// CHECK-CUSTOM: config.Option @VPU.MyOptions : false diff --git a/tests/lit/NPU/dialect/VPU/passes/setup_channels_auto_padding.mlir b/tests/lit/NPU/dialect/VPU/passes/setup_channels_auto_padding.mlir deleted file mode 100644 index cb8318d880..0000000000 --- a/tests/lit/NPU/dialect/VPU/passes/setup_channels_auto_padding.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// 
-// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --vpu-arch=%arch% --setup-channels-auto-padding="enable-auto-padding-odu enable-auto-padding-idu" %s | FileCheck %s --strict-whitespace -// REQUIRES: arch-NPU37XX || arch-NPU40XX - -module @mainModule attributes {} { -} - -// CHECK: module @mainModule -// CHECK: config.PipelineOptions @Options -// CHECK: config.Option @VPU.AutoPaddingODU : true -// CHECK: config.Option @VPU.AutoPaddingIDU : true diff --git a/tests/lit/NPU/dialect/VPU/passes/setup_is_reduce_supported.mlir b/tests/lit/NPU/dialect/VPU/passes/setup_is_reduce_supported.mlir deleted file mode 100644 index 0cc8b25770..0000000000 --- a/tests/lit/NPU/dialect/VPU/passes/setup_is_reduce_supported.mlir +++ /dev/null @@ -1,14 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --vpu-arch=%arch% --setup-is-reduce-supported="enable-is-reduce-supported" %s | FileCheck %s --strict-whitespace -// REQUIRES: arch-NPU37XX || arch-NPU40XX - -module @mainModule attributes {} { -} - -// CHECK: module @mainModule -// CHECK: config.PipelineOptions @Options -// CHECK: config.Option @VPU.ReduceSupported : true diff --git a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_37XX.mlir b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_37XX.mlir index ac609154a0..6c55d1c3ac 100644 --- a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_37XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_37XX.mlir @@ -7,7 +7,7 @@ // RUN: vpux-opt --vpu-arch=%arch% --setup-npu-constraint %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @mainModule attributes { VPU.arch = #VPU.arch_kind } { +module @mainModule attributes { config.arch = #config.arch_kind } { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU diff 
--git a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX.mlir index f4ad948b9b..713cc2ed5f 100644 --- a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX.mlir @@ -7,7 +7,7 @@ // RUN: vpux-opt --vpu-arch=%arch% --setup-npu-constraint %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @mainModule attributes { VPU.arch = #VPU.arch_kind } { +module @mainModule attributes { config.arch = #config.arch_kind } { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU diff --git a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX_wlm.mlir b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX_wlm.mlir index a0d5a24162..65846e90ac 100644 --- a/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX_wlm.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/setup_per_barrier_variant_constraint_40XX_wlm.mlir @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --vpu-arch=%arch% --setup-npu-constraint="workload-management-enable=true enable-sw-kernel-fifo-per-shave-engine=true" %s | FileCheck %s +// RUN: vpux-opt --vpu-arch=%arch% --setup-npu-constraint="workload-management-status=ENABLED enable-sw-kernel-fifo-per-shave-engine=true" %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @mainModule attributes { VPU.arch = #VPU.arch_kind } { +module @mainModule attributes { config.arch = #config.arch_kind } { IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU @@ -15,11 +15,12 @@ module @mainModule attributes { VPU.arch = #VPU.arch_kind } { // CHECK: module @mainModule attributes // CHECK: config.PipelineOptions @Options -// CHECK: config.Option 
@VPU.UseDedicatedFifoPerShaveEngine : true +// CHECK-DAG: config.Option @VPU.UseDedicatedFifoPerShaveEngine : true // Currently, still non-WLM barrier configuration settings are present even for WLM enabled mode. To be updated to WLM values in E#155846 -// CHECK: config.Option @VPU.BarrierMaxVariantSum : 64 -// CHECK: config.Option @VPU.BarrierMaxVariantCount : 128 -// CHECK: config.Option @VPU.MetadataMaxVariantCount : 128 -// CHECK: config.Option @VPU.MetadataMaxInvariantCount : 64 -// CHECK: config.Option @VPU.MetadataMaxKernelInvocationCount : 32 -// CHECK: config.Option @VPU.MetadataMaxKernelRangeCount : 32 +// CHECK-DAG: config.Option @VPU.BarrierMaxVariantSum : 64 +// CHECK-DAG: config.Option @VPU.BarrierMaxVariantCount : 128 +// CHECK-DAG: config.Option @VPU.MetadataMaxVariantCount : 128 +// CHECK-DAG: config.Option @VPU.MetadataMaxInvariantCount : 64 +// CHECK-DAG: config.Option @VPU.MetadataMaxKernelInvocationCount : 32 +// CHECK-DAG: config.Option @VPU.MetadataMaxKernelRangeCount : 32 +// CHECK-DAG: config.Option @VPU.WorkloadManagementStatus : "ENABLED" diff --git a/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_3720.mlir b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_3720.mlir new file mode 100644 index 0000000000..487b33807f --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_3720.mlir @@ -0,0 +1,27 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --sw-kernel-data-prefetch-reserve-mem %s | FileCheck %s +// REQUIRES: arch-NPU37XX + +module @SimpleGraph { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // reserve dummy memory at the end of CMX + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: SWKernelPrefetchingReservedMemory + // CHECK: IE.MemoryResource 256 bytes of @CMX_NN offset 1982208 +} diff --git a/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_40XX+.mlir new file mode 100644 index 0000000000..af102704b7 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_data_prefetch_reserve_mem_40XX+.mlir @@ -0,0 +1,214 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --sw-kernel-data-prefetch-reserve-mem %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +module @SimpleGraph { + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + } + + module @VPU.SW { + func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // reserve dummy memory at the end of CMX + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: SWKernelPrefetchingReservedMemory + // CHECK: IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 +} + +// ----- + +module @SimpleGraphWithReservedMem { + module @VPU.SW { + func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + builtin.module @ReservedMemory { + module @CustomReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 1474048 + } + } + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : 
tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // Reserve additional memory + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: SWKernelPrefetchingReservedMemory + // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1473536 + // CHECK: CustomReservedMemory + // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1474048 +} + +// ----- + +module @SimpleGraphWithReservedMemHasEnoughSize { + module @VPU.SW { + func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + builtin.module @ReservedMemory { + module @CustomReservedMemory { + IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 + } + } + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // no need to change the reserved memory size, just put it at the end of CMX + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: CustomReservedMemory + // CHECK: IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 +} + +// ----- + +module @SimpleGraphWith2ReservedMem { + module @VPU.SW { + func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, 
f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + builtin.module @ReservedMemory { + module @CustomReservedMemory1 { + IE.MemoryResource 512 bytes of @CMX_NN offset 1473984 + } + + module @CustomReservedMemory2 { + IE.MemoryResource 64 bytes of @CMX_NN offset 1474496 + } + } + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // Reserve missing chunk of memory + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: SWKernelPrefetchingReservedMemory + // CHECK: IE.MemoryResource 448 bytes of @CMX_NN offset 1473536 + // CHECK: CustomReservedMemory1 + // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1473984 + // CHECK: CustomReservedMemory2 + // CHECK: IE.MemoryResource 64 bytes of @CMX_NN offset 1474496 +} + +// ----- + +module @SimpleGraphWith2ReservedMemHaveEnoughTotalSize { + module @VPU.SW { + func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + builtin.module @ReservedMemory { + module @CustomReservedMemory1 { + IE.MemoryResource 128 bytes of @CMX_NN offset 1473536 + } + + 
module @CustomReservedMemory2 { + IE.MemoryResource 896 bytes of @CMX_NN offset 1473664 + } + } + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK: CustomReservedMemory1 + // CHECK: IE.MemoryResource 128 bytes of @CMX_NN offset 1473536 + // CHECK: CustomReservedMemory2 + // CHECK: IE.MemoryResource 896 bytes of @CMX_NN offset 1473664 +} + +// ----- + +module @SimpleGraphNoSWKernel { + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + return %arg0 : tensor<1x16x4x4xf16> + } + // not change if no SW Kernel + + // CHECK-NOT: ReservedMemory +} diff --git a/tests/lit/NPU/dialect/VPU/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.mlir b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.mlir new file mode 100644 index 0000000000..31ef59c173 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/sw_kernel_instruction_prefetch_reserve_mem_for_dummy_kernels.mlir @@ -0,0 +1,79 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --sw-kernel-instruction-prefetch-reserve-mem-for-dummy-kernels %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +module @SimpleGraphAddFirstResMem { + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK-NEXT: DummySWKernelsForInstructionPrefetchReservedMemory + // CHECK-NEXT: IE.MemoryResource 8 bytes of @CMX_NN offset 1474552 +} + +// ----- + +module @SimpleGraphAddSecondResMem { + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + builtin.module @ReservedMemory { + module @CustomReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 1474048 + } + } + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + %results = VPU.Gelu(%arg0) : tensor<1x16x4x4xf16> -> tensor<1x16x4x4xf16> + return %results: tensor<1x16x4x4xf16> + } + + // CHECK: IE.TileResource + // CHECK: ReservedMemory + // CHECK-NEXT: DummySWKernelsForInstructionPrefetchReservedMemory + // CHECK-NEXT: IE.MemoryResource 8 bytes of @CMX_NN offset 1474040 + // CHECK: CustomReservedMemory + // CHECK-NEXT: IE.MemoryResource 512 bytes of @CMX_NN offset 
1474048 +} + +// ----- + +module @SimpleGraphNotAddResMem { + IE.TileResource 1 of @NCE at 1.300000e+03 MHz { + IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x16x4x4xf16> + } outputsInfo : { + DataInfo "prob" : tensor<1x16x4x4xf16> + } + func.func @main(%arg0: tensor<1x16x4x4xf16>) -> tensor<1x16x4x4xf16> { + return %arg0: tensor<1x16x4x4xf16> + } + + // CHECK: IE.TileResource + // CHECK-NOT: ReservedMemory + // CHECK-NOT: DummySWKernelsForInstructionPrefetchReservedMemory +} \ No newline at end of file diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX+.mlir index ff42be0fe6..e3ab3cd871 100644 --- a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX+.mlir @@ -10,7 +10,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSwConvOverOC @@ -50,7 +50,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSwMaxPoolOverH @@ -88,7 +88,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of 
@CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSwAddOverC @@ -116,7 +116,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitAddSameInputOverC @@ -142,7 +142,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverC @@ -176,7 +176,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverCNoCommonFactor @@ -211,7 +211,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverHW @@ -261,28 +261,26 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 
bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NoTilingClusterNCEConv -// CHECK-SAME: [[INPUT:%arg[0-9]]]: !VPU.DistributedTensor<1x32x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> +// CHECK-SAME: [[INPUT:%.+]]: !VPU.DistributedTensor<1x32x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> func.func @NoTilingClusterNCEConv(%arg0: !DistributedTensor0) -> !DistributedTensor1 { %weights = const.Declare tensor<128x32x3x3xf16, {mem_space = @CMX_NN, order = #NHWC}> = dense<1.000000e+00> : tensor<128x32x3x3xf16, {mem_space = @CMX_NN}>, [#const.Reorder<#NHWC>] - %weights_table = const.Declare tensor<128x1x1x4xsi32, {mem_space = @CMX_NN, order = #NCHW}> = dense<10> : tensor<128x1x1x4xsi32, {mem_space = @CMX_NN}> - %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + %0 = VPU.NCE.Convolution(%arg0, %weights) { ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [128, 32, 3, 3], strides = [1, 1] - } : !VPU.DistributedTensor<1x32x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, tensor<128x32x3x3xf16, {mem_space = @CMX_NN, order = #NHWC}>, tensor<128x1x1x4xsi32, {mem_space = @CMX_NN, order = #NCHW}> -> !VPU.DistributedTensor<1x128x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + } : !VPU.DistributedTensor<1x32x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, tensor<128x32x3x3xf16, {mem_space = @CMX_NN, order = #NHWC}> -> !VPU.DistributedTensor<1x128x100x100xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> return %0 : !VPU.DistributedTensor<1x128x100x100xf16, #NHWC, @CMX_NN, {mode = 
"SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<128x32x3x3xf16, {mem_space = @CMX_NN, order = #NHWC}> - // CHECK-DAG: [[WEIGHT_TABLE:%.+]] = const.Declare tensor<128x1x1x4xsi32, {mem_space = @CMX_NN, order = #NCHW}> - // CHECK: [[NCE_CONV:%.*]] = VPU.NCE.Convolution(%arg0, [[WEIGHTS]], [[WEIGHT_TABLE]]) + // CHECK: [[NCE_CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS]]) // CHECK-SAME: pad = #VPU.Padding // CHECK-SAME: strides = [1, 1] // CHECK-NOT: tilingStrategy @@ -299,7 +297,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GatherSplit @@ -324,7 +322,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GatherSplitWithBatchDims @@ -349,7 +347,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GatherSplitOptimize @@ -374,7 +372,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 
1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @Yuv2RGBSplit @@ -400,7 +398,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GatherNDSplit @@ -426,7 +424,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GatherNDSplitIndices @@ -452,7 +450,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotSplitGatherForLargeSizeOnGatherAxis @@ -477,7 +475,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotSplitGatherForLargeIORatio @@ -502,7 +500,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} 
+IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @Gather4DSplit @@ -529,7 +527,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @Gather4DSplitWithBatchDims @@ -556,7 +554,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @Gather4DSplitOptimize @@ -583,7 +581,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotSplitGather4DForLargeSizeOnGatherAxis @@ -610,7 +608,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotSplitGather4DForLargeIORatioUseDDRAccess @@ -639,7 +637,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 
64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @DepthToSpaceBlocksFirstSplit @@ -666,7 +664,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @DepthToSpaceDepthFirstSplit @@ -691,7 +689,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SpaceToDepthBlockFirstSplit @@ -711,7 +709,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SpaceToDepthDepthFirstSplit @@ -738,31 +736,28 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEConvOverOH // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x32x64x48xf16, {order = #NHWC}> func.func @SplitNCEConvOverOH(%arg0: tensor<1x32x64x48xf16, {order = #NHWC}>) -> 
tensor<1x256x64x48xf16, {order = #NHWC}> { %weights = const.Declare tensor<256x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<256x32x3x3xf16>, [#const.Reorder<#NHWC>] - %weights_table = const.Declare tensor<256x1x1x4xsi32> = dense<1> : tensor<256x1x1x4xsi32> - %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + %0 = VPU.NCE.Convolution(%arg0, %weights) { ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [256, 32, 3, 3], strides = [1, 1] - } : tensor<1x32x64x48xf16, {order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}>, tensor<256x1x1x4xsi32> -> tensor<1x256x64x48xf16, {order = #NHWC}> + } : tensor<1x32x64x48xf16, {order = #NHWC}>, tensor<256x32x3x3xf16, {order = #NHWC}> -> tensor<1x256x64x48xf16, {order = #NHWC}> return %0 : tensor<1x256x64x48xf16, {order = #NHWC}> // CHECK-DAG: [[FILTER:%.+]] = const.Declare tensor<256x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> // CHECK-SAME: : tensor<256x32x3x3xf16>, [#const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<256x1x1x4xsi32> = dense<1> - // CHECK-SAME: : tensor<256x1x1x4xsi32> - // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[FILTER]], [[WEIGHTS_TABLE]]) + // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[FILTER]]) // CHECK-SAME: pad = #VPU.Padding, // CHECK-SAME: rawFilterShape = [256, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1] // CHECK-SAME: -> tensor<1x256x64x48xf16, {order = #NHWC}> @@ -785,31 +780,28 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitQuantNCEConvOverOC // CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x32x64x64x!qElemType, {order = #NHWC}> func.func @SplitQuantNCEConvOverOC(%arg0: 
tensor<1x32x64x64x!qElemType, {order = #NHWC}>) -> tensor<1x512x64x64x!qElemType1, {order = #NHWC}> { %weights = const.Declare tensor<512x32x3x3x!qElemType2, {order = #NHWC}> = dense<1.000000e+00> : tensor<512x32x3x3xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] - %weights_table = const.Declare tensor<512x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<512x1x1x4xsi32> - %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + %0 = VPU.NCE.Convolution(%arg0, %weights) { ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [512, 32, 3, 3], strides = [1, 1] - } : tensor<1x32x64x64x!qElemType, {order = #NHWC}>, tensor<512x32x3x3x!qElemType2, {order = #NHWC}>, tensor<512x1x1x4xsi32, {order = #NCHW}> -> tensor<1x512x64x64x!qElemType1, {order = #NHWC}> + } : tensor<1x32x64x64x!qElemType, {order = #NHWC}>, tensor<512x32x3x3x!qElemType2, {order = #NHWC}> -> tensor<1x512x64x64x!qElemType1, {order = #NHWC}> return %0 : tensor<1x512x64x64x!qElemType1, {order = #NHWC}> // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<512x32x3x3x!qElemType2, {order = #NHWC}> = dense<1.000000e+00> // CHECK-SAME: : tensor<512x32x3x3xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] - // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<512x1x1x4xsi32, {order = #NCHW}> = dense<10> - // CHECK-SAME: : tensor<512x1x1x4xsi32> - // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) + // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS]]) // CHECK-SAME: pad = #VPU.Padding, // CHECK-SAME: rawFilterShape = [512, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1] // CHECK-SAME: -> tensor<1x512x64x64x!qElemType1, {order = #NHWC}> @@ -824,57 +816,11 @@ func.func @SplitQuantNCEConvOverOC(%arg0: tensor<1x32x64x64x!qElemType, {order = #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -!qElemType = !quant.uniform - module 
@Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -} - -// CHECK-LABEL: @SplitI4QuantNCEConvOverOC -// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x128x256x4xf16, {order = #NHWC}> -func.func @SplitI4QuantNCEConvOverOC(%arg0: tensor<1x128x256x4xf16, {order = #NHWC}>) -> tensor<1x6320x256x4xf16, {order = #NHWC}> { - %weights = const.Declare tensor<6320x128x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<6320x128x1x1xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] - %weights_table = const.Declare tensor<6320x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<6320x1x1x4xsi32> - - %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { - ppe = #VPU.PPEStub<>, - pad = #VPU.Padding, - rawFilterShape = [6320, 128, 1, 1], strides = [1, 1] - } : tensor<1x128x256x4xf16, {order = #NHWC}>, tensor<6320x128x1x1x!qElemType, {order = #NHWC}>, tensor<6320x1x1x4xsi32, {order = #NCHW}> -> tensor<1x6320x256x4xf16, {order = #NHWC}> - - return %0 : tensor<1x6320x256x4xf16, {order = #NHWC}> - - // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare tensor<6320x128x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> - // CHECK-SAME: : tensor<6320x128x1x1xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] - - // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<6320x1x1x4xsi32, {order = #NCHW}> = dense<10> - // CHECK-SAME: : tensor<6320x1x1x4xsi32> - - // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) - // CHECK-SAME: pad = #VPU.Padding, - // CHECK-SAME: rawFilterShape = [6320, 128, 1, 1], - // CHECK-SAME: strides = [1, 1], - // CHECK-SAME: tilingStrategy = [1, 12, 1, 1]} - // CHECK-SAME: -> tensor<1x6320x256x4xf16, {order = #NHWC}> - - // CHECK: return [[CONV]] : tensor<1x6320x256x4xf16, {order = #NHWC}> -} - -} - -// ----- - -#NCHW = 
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - -module @Test { - -IE.TileResource 1 of @NCE { -IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEMaxPoolOverH @@ -907,7 +853,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitNCEEltwiseAddOverC @@ -942,7 +888,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEEltwiseAddSameInput @@ -973,7 +919,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @ConvertU8F32SplitOverW @@ -998,7 +944,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, 
config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SigmoidSplitOverW @@ -1022,7 +968,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @TanhSplitOverW @@ -1046,7 +992,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @ExpSplitOverW @@ -1070,7 +1016,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SqrtSplitOverW @@ -1093,7 +1039,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @EluSplitOverW @@ -1116,7 +1062,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: 
@ClampSplitOverW @@ -1139,7 +1085,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @ReLUSplitOverW @@ -1162,7 +1108,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @HSwishSplitOverW @@ -1185,7 +1131,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitDivideEltwiseSw @@ -1211,7 +1157,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @MemPermuteSplitNCHWToNHWC2Part @@ -1235,7 +1181,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @AvgPoolSwSplit2Part @@ -1262,7 +1208,7 
@@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitSparseNCEConvOverOH @@ -1272,14 +1218,13 @@ func.func @SplitSparseNCEConvOverOH(%arg0: tensor<1x32x80x60xf16, {order = #NHWC %weights_sm = const.Declare tensor<160x1x1x384xi1> = dense<1.000000e+00> : tensor<160x32x3x3xf16>, [#const.Reorder<#NHWC>, #const.GetSparsityMap] %weights_sparse = VPU.GroupSparseTensor(%weights, %weights_sm) {is_weights} -> !VPU.SparseTensor, sparsity_map=tensor<160x1x1x384xi1>, is_weights> - %weights_table = const.Declare tensor<160x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<160x1x1x4xsi32> - %0 = VPU.NCE.Convolution(%arg0, %weights_sparse, %weights_table) { + %0 = VPU.NCE.Convolution(%arg0, %weights_sparse) { ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [160, 32, 3, 3], strides = [1, 1] - } : tensor<1x32x80x60xf16, {order = #NHWC}>, !VPU.SparseTensor, sparsity_map=tensor<160x1x1x384xi1>, is_weights>, tensor<160x1x1x4xsi32, {order = #NCHW}> -> tensor<1x160x80x60xf16, {order = #NHWC}> + } : tensor<1x32x80x60xf16, {order = #NHWC}>, !VPU.SparseTensor, sparsity_map=tensor<160x1x1x384xi1>, is_weights> -> tensor<1x160x80x60xf16, {order = #NHWC}> return %0 : tensor<1x160x80x60xf16, {order = #NHWC}> @@ -1293,10 +1238,7 @@ func.func @SplitSparseNCEConvOverOH(%arg0: tensor<1x32x80x60xf16, {order = #NHWC // CHECK-SAME: data=tensor<160x32x3x3xf16, {order = #NHWC}>, // CHECK-SAME: sparsity_map=tensor<160x1x1x384xi1>, is_weights - // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<160x1x1x4xsi32, {order = #NCHW}> = dense<10> - // CHECK-SAME: : tensor<160x1x1x4xsi32> - - // CHECK: [[OUTPUT:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS_SPARSE]], [[WEIGHTS_TABLE]]) + // 
CHECK: [[OUTPUT:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS_SPARSE]]) // CHECK-SAME: pad = #VPU.Padding, // CHECK-SAME: rawFilterShape = [160, 32, 3, 3], // CHECK-SAME: strides = [1, 1], tilingStrategy = [1, 1, 2, 1] @@ -1320,7 +1262,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitSparseQuantNCEConvOverOH @@ -1330,14 +1272,13 @@ func.func @SplitSparseQuantNCEConvOverOH(%arg0: tensor<1x32x80x80x!qElemType, {o %weights_sm = const.Declare tensor<320x1x1x384xi1> = dense<1.000000e+00> : tensor<320x32x3x3xf16>, [#const.Reorder<#NHWC>, #const.GetSparsityMap] %weights_sparse = VPU.GroupSparseTensor(%weights, %weights_sm) {is_weights} -> !VPU.SparseTensor, sparsity_map=tensor<320x1x1x384xi1>, is_weights> - %weights_table = const.Declare tensor<320x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<320x1x1x4xsi32> - %0 = VPU.NCE.Convolution(%arg0, %weights_sparse, %weights_table) { + %0 = VPU.NCE.Convolution(%arg0, %weights_sparse) { ppe = #VPU.PPEStub<>, pad = #VPU.Padding, rawFilterShape = [320, 32, 3, 3], strides = [1, 1] - } : tensor<1x32x80x80x!qElemType, {order = #NHWC}>, !VPU.SparseTensor, sparsity_map=tensor<320x1x1x384xi1>, is_weights>, tensor<320x1x1x4xsi32, {order = #NCHW}> -> tensor<1x320x80x80x!qElemType1, {order = #NHWC}> + } : tensor<1x32x80x80x!qElemType, {order = #NHWC}>, !VPU.SparseTensor, sparsity_map=tensor<320x1x1x384xi1>, is_weights> -> tensor<1x320x80x80x!qElemType1, {order = #NHWC}> return %0 : tensor<1x320x80x80x!qElemType1, {order = #NHWC}> @@ -1351,10 +1292,7 @@ func.func @SplitSparseQuantNCEConvOverOH(%arg0: tensor<1x32x80x80x!qElemType, {o // CHECK-SAME: data=tensor<320x32x3x3x!qElemType2, {order = #NHWC}>, // CHECK-SAME: 
sparsity_map=tensor<320x1x1x384xi1>, is_weights - // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<320x1x1x4xsi32, {order = #NCHW}> = dense<10> - // CHECK-SAME: : tensor<320x1x1x4xsi32> - - // CHECK: [[OUTPUT:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS_SPARSE]], [[WEIGHTS_TABLE]]) + // CHECK: [[OUTPUT:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS_SPARSE]]) // CHECK-SAME: pad = #VPU.Padding, // CHECK-SAME: rawFilterShape = [320, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1] // CHECK-SAME: -> tensor<1x320x80x80x!qElemType1, {order = #NHWC}> @@ -1372,7 +1310,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEAveragePoolOverW @@ -1396,7 +1334,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitAveragePoolOverW @@ -1423,7 +1361,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1NormalizeSplit @@ -1448,7 +1386,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : 
f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1NormalizeSplitOverH @@ -1466,48 +1404,13 @@ func.func @MVN1NormalizeSplitOverH(%arg0: tensor<1x512x256x256xf16, {order = #NH // ----- -module @Test { - -IE.TileResource 1 of @NCE { -IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -} - -// CHECK-LABEL: func.func @MVNTileOverCEvenly -// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x18x93200x1xf16> - -func.func @MVNTileOverCEvenly(%arg0: tensor<1x18x93200x1xf16>) -> tensor<1x18x93200x1xf16> { - %0 = VPU.MVN(%arg0) { - across_channels = false, - eps = 9.9999997473787516E-6 : f64, - normalize_variance = true - } : tensor<1x18x93200x1xf16> -> tensor<1x18x93200x1xf16> - - return %0 : tensor<1x18x93200x1xf16> - - // CHECK: [[MVN:%.+]] = VPU.MVN([[INPUT]]) { - // CHECK-SAME: across_channels = false, - // CHECK-SAME: eps = 9.9999997473787516E-6 : f64, - // CHECK-SAME: normalize_variance = true, - // CHECK-SAME: tilingStrategy = [1, 9, 1, 1] - // CHECK-NOT: tilingStrategy = [1, 6, 1, 1] - // CHECK-SAME: } : tensor<1x18x93200x1xf16> -> tensor<1x18x93200x1xf16> - - // CHECK: return [[MVN]] : tensor<1x18x93200x1xf16> - -} - -} - -// ----- - #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1MeanVarSplitTileAtC @@ -1540,7 +1443,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 
: i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1MeanVarSplitTileAtN @@ -1571,7 +1474,7 @@ module @Test { IE.TileResource 6 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NCEMatMulSOGAndGTile @@ -1605,7 +1508,7 @@ module @Test { IE.TileResource 6 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NCEMatMulSOGAndHTile @@ -1635,7 +1538,7 @@ module @Test { IE.TileResource 6 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NCEMatMulSOGAndGHTile @@ -1664,7 +1567,7 @@ func.func @NCEMatMulSOGAndGHTile(%arg0: tensor<12x1x512x512xf16>, %arg1: tensor< module @executors { IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @DynamicDequantizeSplitOverH @@ -1706,7 +1609,7 @@ module @ClampTilingNumForAlignment { module @executors { IE.TileResource 1 
of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @MultiplyNotAlign // CHECK-SAME: ([[INPUT0:%.+]]: tensor<1x512x48x336xf16>, @@ -1833,7 +1736,7 @@ module @Test { IE.TileResource 6 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @DepthToSpaceDepthFirstSplitWithMultiCluster @@ -1856,7 +1759,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @AcoshOpSplitOverC @@ -1879,7 +1782,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @AcosOpSplitOverC @@ -1902,7 +1805,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } 
// CHECK-LABEL: func.func @AsinhOpSplitOverC @@ -1925,7 +1828,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @AsinOpSplitOverC @@ -1948,7 +1851,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @AtanhOpSplitOverC @@ -1971,7 +1874,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @AtanOpSplitOverC @@ -1994,7 +1897,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SeluOpSplitOverC @@ -2018,7 +1921,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : 
f64} } // CHECK-LABEL: func.func @CosOpSplitOverC @@ -2041,35 +1944,7 @@ module @Test { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} -} - -// CHECK-LABEL: func.func @GruGatesSplitH -// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x1x200x76800xf16>, -// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x1x200x25600xf16>, -// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x200x76800xf16>) -func.func @GruGatesSplitH(%arg0: tensor<1x1x200x76800xf16>, %arg1: tensor<1x1x200x25600xf16>, %arg2: tensor<1x1x200x76800xf16>) -> (tensor<1x1x200x25600xf16>) { - %cst= const.Declare tensor<1x1x1x102400xf16> = dense<1.0> : tensor<1x1x1x102400xf16> - %0 = VPU.GRUGates(%arg0, %arg1, %arg2, %cst) : tensor<1x1x200x76800xf16>, tensor<1x1x200x25600xf16>, tensor<1x1x200x76800xf16>, tensor<1x1x1x102400xf16> -> tensor<1x1x200x25600xf16> - - return %0 : tensor<1x1x200x25600xf16> - - // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1x1x1x102400xf16> = dense<1.000000e+00> : tensor<1x1x1x102400xf16> - // CHECK: [[GRUGATES:%.+]] = VPU.GRUGates([[INPUT]], [[INPUT_0]], [[INPUT_1]], [[CST]]) { - // CHECK-SAME: tilingStrategy = [1, 1, 100, 1]} : tensor<1x1x200x76800xf16>, tensor<1x1x200x25600xf16>, tensor<1x1x200x76800xf16>, tensor<1x1x1x102400xf16> -> tensor<1x1x200x25600xf16> - - // CHECK: return [[GRUGATES]] : tensor<1x1x200x25600xf16> -} - -} - -// ----- - -module @Test { - -IE.TileResource 1 of @NCE { - IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @LSTMGatesSplitH @@ -2086,4 +1961,40 @@ func.func @LSTMGatesSplitH(%arg0: tensor<1x1x1536x2048xf16>, %arg1: tensor<1x1x1 // CHECK: return [[LSTMGATES_0:%.+]], 
[[LSTMGATES_1:%.+]] : tensor<1x1x1536x512xf16>, tensor<1x1x1536x512xf16> } +// CHECK-LABEL: func.func @AssignTilingStrategyAvgPool5D +// CHECK-SAME: [[INPUT0:%arg[0-7]]]: tensor<1x24x8x56x56xf16> +func.func @AssignTilingStrategyAvgPool5D(%arg0: tensor<1x24x8x56x56xf16>) -> tensor<1x24x4x56x56xf16> { + %0 = VPU.AvgPool(%arg0) {exclude_pads, kernel_size = [2, 1, 1], pads_begin = [0, 0, 0], pads_end = [0, 0, 0], rounding_type = #IE.rounding_type, strides = [2, 1, 1]} : tensor<1x24x8x56x56xf16> -> tensor<1x24x4x56x56xf16> + return %0 : tensor<1x24x4x56x56xf16> + + // CHECK: [[AVG_POOL:%.+]] = VPU.AvgPool([[INPUT0]]) { + // CHECK-SAME: exclude_pads, kernel_size = [2, 1, 1], pads_begin = [0, 0, 0], pads_end = [0, 0, 0], rounding_type = #IE.rounding_type, strides = [2, 1, 1], tilingStrategy = [1, 1, 1, 2, 1]} : + // CHECK-SAME: tensor<1x24x8x56x56xf16> -> tensor<1x24x4x56x56xf16> + // CHECK: return [[AVG_POOL]] : tensor<1x24x4x56x56xf16> +} + +} + + +// ----- + +module @MultiplyWithSOK { + IE.TileResource 4 of @NCE at 1.700000e+03 MHz { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + } + // CHECK-LABEL: func.func @MultiplyWithSOK + // CHECK-SAME: [[INPUT:%.+]]: tensor<1x801x60x384xf16> + func.func @MultiplyWithSOK(%arg0: tensor<1x801x60x384xf16>) -> tensor<1x801x60x384xf16> { + %cst = const.Declare tensor<1x1x1x384xf16> = dense<0.200000e+00> : tensor<1x1x1x384xf16> + %0 = VPU.Multiply(%arg0, %cst) {auto_broadcast = #IE.auto_broadcast_type, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x801x60x384xf16>, tensor<1x1x1x384xf16> -> tensor<1x801x60x384xf16> + + return %0 : tensor<1x801x60x384xf16> + // CHECK: [[CST:%.+]] = const.Declare tensor<1x1x1x384xf16> = dense<1.999510e-01> : tensor<1x1x1x384xf16> + // CHECK: [[MULTIPLY:%.+]] = VPU.Multiply([[INPUT]], [[CST]]) { + // CHECK-SAME: auto_broadcast = #IE.auto_broadcast_type, + // 
CHECK-SAME: multiClusterStrategy = #VPU.multi_cluster_strategy, + // CHECK-SAME: tilingStrategy = [1, 48, 1, 1]} : tensor<1x801x60x384xf16>, tensor<1x1x1x384xf16> -> tensor<1x801x60x384xf16> + // CHECK: return [[MULTIPLY]] : tensor<1x801x60x384xf16> + } } diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX.mlir index 79d41a11da..8dfab88830 100644 --- a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX.mlir @@ -607,3 +607,40 @@ func.func @SEPDWConvCTile(%arg0: tensor<1x768x32x32xf16, {order = #NHWC}>) -> te // CHECK: VPU.NCE.DepthConvolution // CHECK-SAME: tilingStrategy = [1, 2, 1, 1] } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform + +// CHECK-LABEL: @SplitI4QuantNCEConvOverOC +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x128x256x4xf16, {order = #NHWC}> +func.func @SplitI4QuantNCEConvOverOC(%arg0: tensor<1x128x256x4xf16, {order = #NHWC}>) -> tensor<1x6320x256x4xf16, {order = #NHWC}> { + %weights = const.Declare tensor<6320x128x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<6320x128x1x1xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] + %weights_table = const.Declare tensor<6320x1x1x4xsi32, {order = #NCHW}> = dense<10> : tensor<6320x1x1x4xsi32> + + %0 = VPU.NCE.Convolution(%arg0, %weights, %weights_table) { + ppe = #VPU.PPEStub<>, + pad = #VPU.Padding, + rawFilterShape = [6320, 128, 1, 1], strides = [1, 1] + } : tensor<1x128x256x4xf16, {order = #NHWC}>, tensor<6320x128x1x1x!qElemType, {order = #NHWC}>, tensor<6320x1x1x4xsi32, {order = #NCHW}> -> tensor<1x6320x256x4xf16, {order = #NHWC}> + + return %0 : tensor<1x6320x256x4xf16, {order = #NHWC}> + + // CHECK-DAG: [[WEIGHTS:%.+]] = const.Declare 
tensor<6320x128x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> + // CHECK-SAME: : tensor<6320x128x1x1xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] + + // CHECK-DAG: [[WEIGHTS_TABLE:%.+]] = const.Declare tensor<6320x1x1x4xsi32, {order = #NCHW}> = dense<10> + // CHECK-SAME: : tensor<6320x1x1x4xsi32> + + // CHECK: [[CONV:%.+]] = VPU.NCE.Convolution([[INPUT]], [[WEIGHTS]], [[WEIGHTS_TABLE]]) + // CHECK-SAME: pad = #VPU.Padding, + // CHECK-SAME: rawFilterShape = [6320, 128, 1, 1], + // CHECK-SAME: strides = [1, 1], + // CHECK-SAME: tilingStrategy = [1, 12, 1, 1]} + // CHECK-SAME: -> tensor<1x6320x256x4xf16, {order = #NHWC}> + + // CHECK: return [[CONV]] : tensor<1x6320x256x4xf16, {order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX_extended.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX_extended.mlir new file mode 100644 index 0000000000..8817a1a765 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_isolated_40XX_extended.mlir @@ -0,0 +1,69 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --tiling-strategy-assignment="tiling-mode=ISOLATED" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +module @Test { + +IE.TileResource 1 of @NCE { +IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware +IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +} + +// CHECK-LABEL: func.func @MVNTileOverCEvenly +// CHECK-SAME: [[INPUT:%arg[0-9]]]: tensor<1x18x93200x1xf16> + +func.func @MVNTileOverCEvenly(%arg0: tensor<1x18x93200x1xf16>) -> tensor<1x18x93200x1xf16> { + %0 = VPU.MVN(%arg0) { + across_channels = false, + eps = 9.9999997473787516E-6 : f64, + normalize_variance = true + } : tensor<1x18x93200x1xf16> -> tensor<1x18x93200x1xf16> + + return %0 : tensor<1x18x93200x1xf16> + + // CHECK: [[MVN:%.+]] = VPU.MVN([[INPUT]]) { + // CHECK-SAME: across_channels = false, + // CHECK-SAME: eps = 9.9999997473787516E-6 : f64, + // CHECK-SAME: normalize_variance = true, + // CHECK-SAME: tilingStrategy = [1, 9, 1, 1] + // CHECK-NOT: tilingStrategy = [1, 6, 1, 1] + // CHECK-SAME: } : tensor<1x18x93200x1xf16> -> tensor<1x18x93200x1xf16> + + // CHECK: return [[MVN]] : tensor<1x18x93200x1xf16> + +} + +} + +// ----- + +module @Test { + +IE.TileResource 1 of @NCE { + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +} + +// CHECK-LABEL: func.func @GruGatesSplitH +// CHECK-SAME: ([[INPUT:%.+]]: tensor<1x1x200x76800xf16>, +// CHECK-SAME: [[INPUT_0:%.+]]: tensor<1x1x200x25600xf16>, +// CHECK-SAME: [[INPUT_1:%.+]]: tensor<1x1x200x76800xf16>) +func.func @GruGatesSplitH(%arg0: tensor<1x1x200x76800xf16>, %arg1: tensor<1x1x200x25600xf16>, %arg2: tensor<1x1x200x76800xf16>) -> (tensor<1x1x200x25600xf16>) { + %cst= const.Declare tensor<1x1x1x102400xf16> = dense<1.0> : 
tensor<1x1x1x102400xf16> + %0 = VPU.GRUGates(%arg0, %arg1, %arg2, %cst) : tensor<1x1x200x76800xf16>, tensor<1x1x200x25600xf16>, tensor<1x1x200x76800xf16>, tensor<1x1x1x102400xf16> -> tensor<1x1x200x25600xf16> + + return %0 : tensor<1x1x200x25600xf16> + + // CHECK-DAG: [[CST:%.+]] = const.Declare tensor<1x1x1x102400xf16> = dense<1.000000e+00> : tensor<1x1x1x102400xf16> + // CHECK: [[GRUGATES:%.+]] = VPU.GRUGates([[INPUT]], [[INPUT_0]], [[INPUT_1]], [[CST]]) { + // CHECK-SAME: tilingStrategy = [1, 1, 100, 1]} : tensor<1x1x200x76800xf16>, tensor<1x1x200x25600xf16>, tensor<1x1x200x76800xf16>, tensor<1x1x1x102400xf16> -> tensor<1x1x200x25600xf16> + + // CHECK: return [[GRUGATES]] : tensor<1x1x200x25600xf16> +} + +} + diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX+.mlir index 77c5479814..cb88c71bb0 100644 --- a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX+.mlir @@ -9,7 +9,7 @@ module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSwConvOverOC @@ -46,7 +46,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSwMaxPoolOverH @@ -81,7 +81,7 @@ module 
@executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSoftMaxWithSoK @@ -102,7 +102,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitSoftMaxOverW @@ -123,7 +123,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverC @@ -157,7 +157,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverHW @@ -190,7 +190,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, 
VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @InterpSplitOverCNoCommonFactor @@ -221,7 +221,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitPReluOverW @@ -245,7 +245,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitLeakyReluOverW @@ -268,7 +268,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @GenericTiling @@ -322,7 +322,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NoTileWithSOH @@ -367,7 +367,7 @@ 
module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @TileWithSOH @@ -410,7 +410,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NoTileWithSOK @@ -456,7 +456,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @LargeConstPipeliningSOKFor @@ -502,7 +502,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitNCEEltwise @@ -534,7 +534,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 
1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NoPrefetchingForEltwise @@ -583,7 +583,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitAveragePoolOverW @@ -609,7 +609,7 @@ module @executors { module @executors { IE.TileResource 1 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1NormalizeSplit @@ -635,7 +635,7 @@ module @executors { module @executors { IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @MVN1NormalizeSplitOverH @@ -656,7 +656,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @ClampSplitOverW @@ -677,7 +677,7 @@ module @executors { module @executors { 
IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @ReLUSplitOverW @@ -700,7 +700,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @LogSplitOverW @@ -721,7 +721,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @AbsSplitOverW @@ -742,7 +742,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitFloorModEltwiseSw @@ -762,7 +762,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN 
{config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitModEltwiseSw @@ -782,7 +782,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitPowerEltwiseSw @@ -803,7 +803,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitLogicalOrEltwiseSw @@ -824,7 +824,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitLogicalXorEltwiseSw @@ -845,7 +845,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitEqualEltwiseSw @@ -866,7 +866,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 
bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNotEqualEltwiseSw @@ -887,7 +887,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitLessEltwiseSw @@ -908,7 +908,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitLessEqualEltwiseSw @@ -929,7 +929,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitGreaterEltwiseSw @@ -950,7 +950,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 
1.000000e+00 : f64} } // CHECK-LABEL: @SplitGreaterEqualEltwiseSw @@ -973,7 +973,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitErfOverW @@ -994,7 +994,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitFloorOverW @@ -1017,7 +1017,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @TanSplitOverW @@ -1038,7 +1038,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SwishSplitOverW @@ -1059,7 +1059,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 
bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @HSigmoidSplitOverW @@ -1080,7 +1080,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNegativeActivationSw @@ -1101,7 +1101,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitCeilingActivationSw @@ -1122,7 +1122,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitSignActivationSw @@ -1143,7 +1143,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: 
@SplitSelectEltwiseSw @@ -1164,7 +1164,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitAddEltwiseSw @@ -1185,7 +1185,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitRoundActivationSw @@ -1206,7 +1206,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitGeluActivationSw @@ -1227,7 +1227,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitTopK @@ -1250,7 +1250,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN 
{VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitStridedSliceOverW @@ -1271,7 +1271,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitLogicalNotEltwiseSw @@ -1297,7 +1297,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @PrefetchTilingWithParentConsidered @@ -1342,7 +1342,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitDequantizeWithSoH @@ -1373,7 +1373,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // 
CHECK-LABEL: func.func @SplitPReluOverH @@ -1403,7 +1403,7 @@ module @executors { module @executors { IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NCEMatMulSOGAndPipelined func.func @NCEMatMulSOGAndPipelined(%arg0: tensor<1x32x4x32xf16, {order = #NHWC}>) -> tensor<32x1x1408x1x1xf16, {order = #GNHWC}>{ @@ -1437,7 +1437,7 @@ module @executors { module @executors { IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @SplitConvWithLargeFilter diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX.mlir index 3038d8d027..911526f823 100644 --- a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_40XX.mlir @@ -147,7 +147,7 @@ func.func @NCEMatMulSOGAndGHTile(%arg0: tensor<12x1x512x512xf16>, %arg1: tensor< module @executors { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @pipeliningTilingForBigFilter @@ -168,7 +168,7 @@ module @executors { 
module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEConvOverOH @@ -212,7 +212,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitI4QuantNCEConvOverOC @@ -255,7 +255,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @TileOverCWithBigC @@ -299,7 +299,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEPoolOverH @@ -331,7 +331,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + 
IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @TileWithSOK @@ -380,7 +380,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitSparseNCEConvOverOH @@ -433,7 +433,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitSparseNCEConvOverOH @@ -485,7 +485,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCEAveragePoolOverW @@ -509,7 +509,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @SplitNCECompressConv @@ -548,7 +548,7 @@ module @executors { module @executors { IE.TileResource 6 of @NCE at 
1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @PrefetchTilingWithSOHParentConsidered diff --git a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_VPUNN_40XX.mlir b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_VPUNN_40XX.mlir index 91cd0a4e04..6176bff16e 100644 --- a/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_VPUNN_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/tiling_strategy_assignment_prefetch_VPUNN_40XX.mlir @@ -207,7 +207,7 @@ func.func @SplitNCEMaxPoolOverW(%arg0: tensor<1x128x1024x28xf16, {order = #NHWC} module @executors { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @pipeliningTilingForBigFilter diff --git a/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining.mlir b/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining.mlir index ecb43f8b43..ca8002973d 100644 --- a/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining.mlir @@ -629,7 +629,7 @@ module @VerticalFusionWithConcatOpOutlining { // CHECK: } // CHECK: return [[VF]] : tensor<1x48x1024x4xf16, {order = #NHWC}> -// CHECK: func.func private @main_vf4([[ARG0:%.+]]: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG2:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>) -> tensor<1x96x1024x4xf16, {order = 
#NHWC}> attributes {pure_vertical_fusion_region} { +// CHECK: func.func private @main_vf4([[ARG0:%.+]]: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG2:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>) -> tensor<1x96x1024x4xf16, {order = #NHWC}> { // CHECK: [[CST:%.+]] = const.Declare tensor<48x1x1x4xsi32> = dense<1> : tensor<48x1x1x4xsi32> // CHECK: [[VF:%.+]] = VPU.VerticalFusion ([[ARG1]] as {{[^:]+}}: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG0]] as {{[^:]+}}: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[CST]] as {{[^:]+}}: tensor<48x1x1x4xsi32>) attributes {tilingStrategy = [1, 1, 15, 1]} -> tensor<1x48x1024x4xf16, {order = #NHWC}> { // CHECK: [[OP:%.+]] = VPU.NCE.Convolution({{[^:]+}}, {{[^:]+}}, {{[^:]+}}) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [48, 4096, 1, 1], strides = [1, 1]} @@ -880,7 +880,7 @@ module @OutliningMixedCase { // CHECK: } // CHECK: return [[VF]] : tensor<1x48x1024x4xf16, {order = #NHWC}> -// CHECK: func.func private @main_vf5([[ARG0:%.+]]: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG2:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>, [[ARG3:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>, [[ARG4:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>) -> tensor<1x190x1024x4xf16, {order = #NHWC}> attributes {pure_vertical_fusion_region} { +// CHECK: func.func private @main_vf5([[ARG0:%.+]]: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG2:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>, [[ARG3:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>, [[ARG4:%.+]]: tensor<1x48x1024x4xf16, {order = #NHWC}>) -> tensor<1x190x1024x4xf16, {order = #NHWC}> { // CHECK: [[CST:%.+]] = const.Declare tensor<48x1x1x4xsi32> = dense<1> : tensor<48x1x1x4xsi32> // CHECK: [[VF:%.+]] = VPU.VerticalFusion ([[ARG1]] as 
{{[^:]+}}: tensor<1x4096x1024x4xf16, {order = #NHWC}>, [[ARG0]] as {{[^:]+}}: tensor<48x4096x1x1xf16, {order = #NHWC}>, [[CST]] as {{[^:]+}}: tensor<48x1x1x4xsi32>) attributes {tilingStrategy = [1, 1, 15, 1]} -> tensor<1x48x1024x4xf16, {order = #NHWC}> { // CHECK: [[OP:%.+]] = VPU.NCE.Convolution({{[^:]+}}, {{[^:]+}}, {{[^:]+}}) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [48, 4096, 1, 1], strides = [1, 1]} diff --git a/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining_simple.mlir b/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining_simple.mlir index 1a0ad271ec..493ba332b2 100644 --- a/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining_simple.mlir +++ b/tests/lit/NPU/dialect/VPU/passes/vertical_fusion_outlining_simple.mlir @@ -116,7 +116,7 @@ module @ParallelConcatInput { // CHECK: DataInfo "output0" : tensor<1x64x256x256xf16> // CHECK: DataInfo "output1" : tensor<1x64x256x256xf16> -// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}>) attributes {pure_vertical_fusion_region} { +// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}>) { // CHECK: [[CST0:%.+]] = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x32x3x3xf16, {order = #NHWC}> // CHECK: [[CST1:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> // CHECK: [[OP0:%.+]] = VPU.NCE.Convolution([[ARG]], [[CST0]], [[CST1]]) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1]} @@ -150,6 +150,91 @@ module @ParallelConcatInput { // ----- +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, 
d3, d1)> +module @ParallelConcatInputWithMultiTilingDim { + net.NetworkInfo entryPoint : @main + inputsInfo : { + DataInfo "input" : tensor<1x32x256x256xf16> + } + outputsInfo : { + DataInfo "output0" : tensor<1x64x256x256xf16> + DataInfo "output1" : tensor<1x64x256x256xf16> + } + + func.func @main(%arg0: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}>) { + %cst_0 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf16, {order = #NHWC}> + %cst_1 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + %cst_2 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf16, {order = #NHWC}> + %cst_3 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> + + %0 = VPU.NCE.Convolution(%arg0, %cst_0, %cst_1) + {multiClusterStrategy = #VPU.multi_cluster_strategy, + ppe = #VPU.PPEStub<>, + pad = #VPU.Padding, + rawFilterShape = [32, 32, 3, 3], strides = [1, 1], + tilingStrategy = [1, 1, 2, 1]} + : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x256x256xf16, {order = #NHWC}> + + %1 = VPU.NCE.Convolution(%arg0, %cst_0, %cst_1) + {multiClusterStrategy = #VPU.multi_cluster_strategy, + ppe = #VPU.PPEStub<>, + pad = #VPU.Padding, + rawFilterShape = [32, 32, 3, 3], strides = [1, 1], + tilingStrategy = [1, 1, 2, 1]} + : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x256x256xf16, {order = #NHWC}> + + %2 = VPU.Concat(%0, %1) {static_offsets = [[0, 0, 0, 0], [0, 32, 0, 0]]} : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<1x32x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> + + %3 = VPU.VerticalFusion (%2 as %arg1: tensor<1x64x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 4]} -> tensor<1x64x256x256xf16, {order 
= #NHWC}> { + %6 = VPU.SoftMax(%arg1) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x64x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> + VPU.Yield %6 + } + + %4 = VPU.Concat(%0, %1) {static_offsets = [[0, 0, 0, 0], [0, 32, 0, 0]]} : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<1x32x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> + + %5 = VPU.VerticalFusion (%4 as %arg1: tensor<1x64x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 4]} -> tensor<1x64x256x256xf16, {order = #NHWC}> { + %6 = VPU.SoftMax(%arg1) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x64x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> + VPU.Yield %6 + } + + return %3, %5 : tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}> + } +} + +// CHECK-LABEL: @ParallelConcatInputWithMultiTilingDim + +// CHECK: DataInfo "input" : tensor<1x32x256x256xf16> +// CHECK: DataInfo "output0" : tensor<1x64x256x256xf16> +// CHECK: DataInfo "output1" : tensor<1x64x256x256xf16> + +// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}>) { +// CHECK: [[CST0:%.+]] = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<32x32x3x3xf16, {order = #NHWC}> +// CHECK: [[CST1:%.+]] = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32> +// CHECK: [[OP0:%.+]] = VPU.NCE.Convolution([[ARG]], [[CST0]], [[CST1]]) {multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1]} +// CHECK-SAME: -> tensor<1x32x256x256xf16, {order = #NHWC}> +// CHECK: [[OP1:%.+]] = VPU.NCE.Convolution([[ARG]], [[CST0]], [[CST1]]) {multiClusterStrategy = 
#VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEStub<>, rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 2, 1]} +// CHECK-SAME: -> tensor<1x32x256x256xf16, {order = #NHWC}> +// CHECK: [[OP2:%.+]] = VPU.Concat([[OP0]], [[OP1]]) +// CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 32, 0, 0]]} +// CHECK-SAME: : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<1x32x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> +// CHECK: [[VF0:%.+]] = VPU.VerticalFusion ([[OP2]] as {{[^:]+}}: tensor<1x64x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 4]} -> tensor<1x64x256x256xf16, {order = #NHWC}> { +// CHECK: [[VFOP0:%.+]] = VPU.SoftMax({{[^:]+}}) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x64x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> +// CHECK: VPU.Yield [[VFOP0]] +// CHECK: } +// CHECK: [[OP4:%.+]] = VPU.Concat([[OP0]], [[OP1]]) +// CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 32, 0, 0]]} +// CHECK: [[VF1:%.+]] = VPU.VerticalFusion ([[OP4]] as {{[^:]+}}: tensor<1x64x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 4]} -> tensor<1x64x256x256xf16, {order = #NHWC}> { +// CHECK: [[VFOP1:%.+]] = VPU.SoftMax({{[^:]+}}) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x64x256x256xf16, {order = #NHWC}> -> tensor<1x64x256x256xf16, {order = #NHWC}> +// CHECK: VPU.Yield [[VFOP1]] +// CHECK: } +// CHECK: return [[VF0]], [[VF1]] : tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}> + +// CHECK: func.func @main([[INPUT:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}>) { +// CHECK: [[CALL0:%.+]]:2 = call @main_vf1([[INPUT]]) : (tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x64x256x256xf16, {order = #NHWC}>, 
tensor<1x64x256x256xf16, {order = #NHWC}>) +// CHECK: return [[CALL0]]#0, [[CALL0]]#1 : tensor<1x64x256x256xf16, {order = #NHWC}>, tensor<1x64x256x256xf16, {order = #NHWC}> + +// ----- + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> module @ConcatSliceUsers { net.NetworkInfo entryPoint : @main @@ -185,7 +270,7 @@ module @ConcatSliceUsers { // CHECK: DataInfo "input" : tensor<1x32x256x256xf16> // CHECK: DataInfo "output" : tensor<1x48x256x256xf16> -// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<1x16x256x256xf16, {order = #NHWC}>) attributes {pure_vertical_fusion_region} { +// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> (tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<1x16x256x256xf16, {order = #NHWC}>) { // CHECK: [[VF:%.+]] = VPU.VerticalFusion ([[ARG]] as {{[^:]+}}: tensor<1x32x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 1]} -> tensor<1x32x256x256xf16, {order = #NHWC}> { // CHECK: [[VFOP:%.+]] = VPU.SoftMax({{[^:]+}}) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x32x256x256xf16, {order = #NHWC}> -> tensor<1x32x256x256xf16, {order = #NHWC}> // CHECK: VPU.Yield [[VFOP]] @@ -243,7 +328,7 @@ module @ConcatMultiUsers { // CHECK: DataInfo "input" : tensor<1x32x256x256xf16> // CHECK: DataInfo "output" : tensor<1x80x256x256xf16> -// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> tensor<1x48x256x256xf16, {order = #NHWC}> attributes {pure_vertical_fusion_region} { +// CHECK: func.func private @main_vf1([[ARG:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>) -> tensor<1x48x256x256xf16, {order = #NHWC}> { // CHECK: [[VF:%.+]] = VPU.VerticalFusion ([[ARG]] as {{[^:]+}}: tensor<1x32x256x256xf16, {order = #NHWC}>) attributes {tilingStrategy = [1, 1, 4, 1]} -> tensor<1x32x256x256xf16, {order = #NHWC}> { // 
CHECK: [[VFOP:%.+]] = VPU.SoftMax({{[^:]+}}) {axisInd = 1 : i64, multiClusterStrategy = #VPU.multi_cluster_strategy} : tensor<1x32x256x256xf16, {order = #NHWC}> -> tensor<1x32x256x256xf16, {order = #NHWC}> // CHECK: VPU.Yield [[VFOP]] @@ -315,3 +400,87 @@ module @OutliningWithQuantizeCast { // CHECK: [[CALL:%.+]] = call @main_vf1([[INPUT1]], [[INPUT2]], [[INPUT3]]) : (tensor<1x48x1024x4x!qElemType, {order = #NHWC}>, tensor<4096x48x1x1x!qElemType1, {order = #NHWC}>, tensor<48x4096x1x1x!qElemType1, {order = #NHWC}>) -> tensor<1x48x1024x4x!qElemType1, {order = #NHWC}> // CHECK: return [[CALL]] : tensor<1x48x1024x4x!qElemType1, {order = #NHWC}> + +// ----- + +!qElemType = !quant.uniform +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!SparseTensorType = !VPU.SparseTensor< + data=tensor<16x16x11x11x!qElemType, {order = #NHWC}>, + sparsity_map=tensor<16x1x1x2048xi1>, + is_weights, + #VPU.SparsityCompression< + axis = 0 : i64, + numElems = dense<[121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]> : tensor<16xi64>, + alignment = 16 : i64>> + +module @OutliningWithGroupSparseTensor { + net.NetworkInfo entryPoint : @main + inputsInfo : { + DataInfo "input1" : tensor<1x16x1034x2058xf16> + DataInfo "input2" : tensor<1x16x1024x2048xf16> + DataInfo "input3" : tensor<1x16x1024x2048xf16> + DataInfo "input4" : tensor<1x16x1034x2058xf16> + } + outputsInfo : { + DataInfo "output" : tensor<1x16x1024x2048xf16> + } + + func.func @main(%arg0: tensor<1x16x1034x2058xf16, {order = #NHWC}>, %arg1: tensor<1x16x1024x2048xf16, {order = #NHWC}>, %arg2: tensor<1x16x1024x2048xf16, {order = #NHWC}>, %arg3: tensor<1x16x1034x2058xf16, {order = #NHWC}>) -> tensor<1x16x1024x2048xf16, {order = #NHWC}> { + %cst_0 = const.Declare tensor<16x1x1x2048xi1> = dense<1> : tensor<16x16x11x11xsi8, {order = #NHWC}>, [#const.CastElemType, #const.GetSparsityMap] + %cst_1 = const.Declare tensor<16x16x11x11x!qElemType, {order = #NHWC}> = dense<1> : tensor<16x16x11x11xsi8, {order = #NHWC}>, 
[#const.CastElemType, #const.Sparsify] + %cst_2 = const.Declare tensor<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> + %0 = VPU.GroupSparseTensor(%cst_1, %cst_0) {is_weights, sparsity_compression = #VPU.SparsityCompression : tensor<16xi64>, alignment = 16 : i64>} -> !SparseTensorType + %1 = VPU.GroupSparseTensor(%cst_1, %cst_0) {is_weights, sparsity_compression = #VPU.SparsityCompression : tensor<16xi64>, alignment = 16 : i64>} -> !SparseTensorType + + %2 = VPU.NCE.Convolution(%arg0, %1, %cst_2) { + mpe_engine = #VPU.MPEEngine37XX>, + multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 0.0069205216132104397 : f64>, + rawFilterShape = [16, 16, 11, 11], strides = [1, 1], tilingStrategy = [1, 1, 47, 1] + } : tensor<1x16x1034x2058xf16, {order = #NHWC}>, !SparseTensorType, tensor<16x1x1x4xsi32> -> tensor<1x16x1024x2048xf16, {order = #NHWC}> + + %3 = VPU.VerticalFusion (%2 as %arg4: tensor<1x16x1024x2048xf16, {order = #NHWC}>, %arg1 as %arg5: tensor<1x16x1024x2048xf16, {order = #NHWC}>, %arg2 as %arg6: tensor<1x16x1024x2048xf16, {order = #NHWC}>) attributes {scenario = #VPU.vf_scenario, tilingStrategy = [1, 1, 1, 98]} -> tensor<1x16x1024x2048xf16, {order = #NHWC}> { + %4 = VPU.NCE.Eltwise(%arg4, %arg5) {is_inplace = true, multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x1024x2048xf16, {order = #NHWC}> + %5 = VPU.NCE.Eltwise(%4, %arg6) {is_inplace = true, multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, 
quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x1024x2048xf16, {order = #NHWC}> + VPU.Yield %5 + } + + %6 = VPU.NCE.Convolution(%arg3, %0, %cst_2) { + mpe_engine = #VPU.MPEEngine37XX>, + multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 0.0069205216132104397 : f64>, + rawFilterShape = [16, 16, 11, 11], strides = [1, 1], tilingStrategy = [1, 1, 47, 1] + } : tensor<1x16x1034x2058xf16, {order = #NHWC}>, !SparseTensorType, tensor<16x1x1x4xsi32> -> tensor<1x16x1024x2048xf16, {order = #NHWC}> + + %7 = VPU.NCE.Eltwise(%6, %3) {is_inplace = true, multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEInt, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, quant_scale = [1.000000e+00], fp_prelu_alpha = 1.000000e+00 : f64>} -> tensor<1x16x1024x2048xf16, {order = #NHWC}> + return %7 : tensor<1x16x1024x2048xf16, {order = #NHWC}> + } +} + +// CHECK-LABEL: @OutliningWithGroupSparseTensor + +// CHECK: func.func private @main_vf1([[ARG0:%.+]]: tensor<1x16x1034x2058xf16, {order = #NHWC}>, [[ARG1:%.+]]: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[ARG2:%.+]]: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[ARG3:%.+]]: tensor<1x16x1034x2058xf16, {order = #NHWC}>) -> tensor<1x16x1024x2048xf16, {order = #NHWC}> { +// CHECK: [[CST0:%.+]] = const.Declare tensor<16x1x1x2048xi1> +// CHECK: [[CST1:%.+]] = const.Declare tensor<16x16x11x11x!qElemType, {order = #NHWC}> +// CHECK: [[CST2:%.+]] = const.Declare tensor<16x1x1x4xsi32> +// CHECK: [[GROUP_SPARSE1:%.+]] = VPU.GroupSparseTensor([[CST1]], [[CST0]]) +// CHECK: [[GROUP_SPARSE2:%.+]] = VPU.GroupSparseTensor([[CST1]], [[CST0]]) +// CHECK: [[CONV1:%.+]] = VPU.NCE.Convolution([[ARG0]], [[GROUP_SPARSE2]], [[CST2]]) +// CHECK: [[VF:%.+]] = 
VPU.VerticalFusion ([[CONV1]] as {{[^:]+}}: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[ARG1]] as {{[^:]+}}: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[ARG2]] as {{[^:]+}}: tensor<1x16x1024x2048xf16, {order = #NHWC}>) +// CHECK: [[ELTWISE1:%.+]] = VPU.NCE.Eltwise({{[^:]+}}, {{[^:]+}}) +// CHECK: [[ELTWISE2:%.+]] = VPU.NCE.Eltwise([[ELTWISE1]], {{[^:]+}}) +// CHECK: VPU.Yield [[ELTWISE2]] +// CHECK: [[CONV2:%.+]] = VPU.NCE.Convolution([[ARG3]], [[GROUP_SPARSE1]], [[CST2]]) +// CHECK: [[ELTWISE3:%.+]] = VPU.NCE.Eltwise([[CONV2]], [[VF]]) +// CHECK: return [[ELTWISE3]] + +// CHECK-NOT: func.func private @main_vf2 + +// CHECK: func.func @main([[INPUT1:%.+]]: tensor<1x16x1034x2058xf16, {order = #NHWC}>, [[INPUT2:%.+]]: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[INPUT3:%.+]]: tensor<1x16x1024x2048xf16, {order = #NHWC}>, [[INPUT4:%.+]]: tensor<1x16x1034x2058xf16, {order = #NHWC}>) -> tensor<1x16x1024x2048xf16, {order = #NHWC}> { +// CHECK: [[CALL:%.+]] = call @main_vf1([[INPUT1]], [[INPUT2]], [[INPUT3]], [[INPUT4]]) + +// CHECK: return [[CALL]] diff --git a/tests/lit/NPU/dialect/VPU/passes/vf_tiling_barrier_fifo_wlm_mode.mlir b/tests/lit/NPU/dialect/VPU/passes/vf_tiling_barrier_fifo_wlm_mode.mlir new file mode 100644 index 0000000000..704c62ca20 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/vf_tiling_barrier_fifo_wlm_mode.mlir @@ -0,0 +1,69 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --vertical-fusion-tiling="workload-management-mode=PWLM_V1_BARRIER_FIFO" %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform + +// CHECK-LABEL: @VfTilingWithMultiTilingDims +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x320x64x64x!qElemType, {order = #NHWC}> +// CHECK-SAME: [[INPUT1:%.+]]: tensor<1x320x64x64xf16, {order = #NHWC}> +func.func @VfTilingWithMultiTilingDims(%arg0: tensor<1x320x64x64x!qElemType, {order = #NHWC}>, %arg1: tensor<1x320x64x64xf16, {order = #NHWC}>) -> tensor<1x320x64x64xf16, {order = #NHWC}> { + %cst = const.Declare tensor<320x320x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<320x320x1x1xf16>, [#const.CastElemType, #const.CastElemType, #const.Reorder<#NHWC>] + %cst_0 = const.Declare tensor<320x1x1x4xsi32> = dense<1> : tensor<320x1x1x4xsi32> + %0 = VPU.VerticalFusion (%arg0 as %arg2: tensor<1x320x64x64x!qElemType, {order = #NHWC}>, %cst as %arg3: tensor<320x320x1x1x!qElemType, {order = #NHWC}>, %cst_0 as %arg4: tensor<320x1x1x4xsi32>, %arg1 as %arg5: tensor<1x320x64x64xf16, {order = #NHWC}>) attributes {scenario = #VPU.vf_scenario, tilingStrategy = [1, 1, 2, 4]} -> tensor<1x320x64x64xf16, {order = #NHWC}> { + %1 = VPU.NCE.Convolution(%arg2, %arg3, %arg4) {mpe_engine = #VPU.MPEEngine37XX>, multiClusterStrategy = #VPU.multi_cluster_strategy, pad = #VPU.Padding, ppe = #VPU.PPEFp, clamp_low = -3.4028234663852886E+38 : f64, clamp_high = 3.4028234663852886E+38 : f64, prelu_alpha = [1.000000e+00], adder = 0.000000e+00 : f64>, rawFilterShape = [320, 320, 1, 1], strides = [1, 1]} : tensor<1x320x64x64x!qElemType, {order = #NHWC}>, tensor<320x320x1x1x!qElemType, {order = #NHWC}>, tensor<320x1x1x4xsi32> -> tensor<1x320x64x64xf16, {order = #NHWC}> + %2 = VPU.NCE.Eltwise(%1, %arg5) {is_inplace = true, 
multiClusterStrategy = #VPU.multi_cluster_strategy, op_type = #VPU.eltwise_type, ppe = #VPU.PPEFp, clamp_low = -3.4028234663852886E+38 : f64, clamp_high = 3.4028234663852886E+38 : f64, scale = 1.000000e+00 : f64, prelu_alpha = [1.000000e+00], bias = 0.000000e+00 : f64, adder = 0.000000e+00 : f64>} -> tensor<1x320x64x64xf16, {order = #NHWC}> + VPU.Yield %2 + } + return %0 : tensor<1x320x64x64xf16, {order = #NHWC}> + + // CHECK: [[INPUT0_SLICE00:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 0, 0] [1, 320, 32, 16] + // CHECK: [[CONV_00:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE00]] + // CHECK: [[INPUT1_SLICE00:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 0, 0] [1, 320, 32, 16] + // CHECK: [[ELTWISE_00:%.+]] = VPU.NCE.Eltwise([[CONV_00]], [[INPUT1_SLICE00]]) + + // CHECK: [[INPUT0_SLICE01:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 0, 16] [1, 320, 32, 16] + // CHECK: [[CONV_01:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE01]] + // CHECK: [[INPUT1_SLICE01:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 0, 16] [1, 320, 32, 16] + // CHECK: [[ELTWISE_01:%.+]] = VPU.NCE.Eltwise([[CONV_01]], [[INPUT1_SLICE01]]) + + // CHECK: [[INPUT0_SLICE02:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 0, 32] [1, 320, 32, 16] + // CHECK: [[CONV_02:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE02]] + // CHECK: [[INPUT1_SLICE02:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 0, 32] [1, 320, 32, 16] + // CHECK: [[ELTWISE_02:%.+]] = VPU.NCE.Eltwise([[CONV_02]], [[INPUT1_SLICE02]]) + + // CHECK: [[INPUT0_SLICE03:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 0, 48] [1, 320, 32, 16] + // CHECK: [[CONV_03:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE03]] + // CHECK: [[INPUT1_SLICE03:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 0, 48] [1, 320, 32, 16] + // CHECK: [[ELTWISE_03:%.+]] = VPU.NCE.Eltwise([[CONV_03]], [[INPUT1_SLICE03]]) + + // CHECK: [[INPUT0_SLICE10:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 32, 0] [1, 320, 32, 16] + // CHECK: [[CONV_10:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE10]] + // CHECK: [[INPUT1_SLICE10:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 32, 0] [1, 320, 32, 16] + // CHECK: 
[[ELTWISE_10:%.+]] = VPU.NCE.Eltwise([[CONV_10]], [[INPUT1_SLICE10]]) + + // CHECK: [[INPUT0_SLICE11:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 32, 16] [1, 320, 32, 16] + // CHECK: [[CONV_11:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE11]] + // CHECK: [[INPUT1_SLICE11:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 32, 16] [1, 320, 32, 16] + // CHECK: [[ELTWISE_11:%.+]] = VPU.NCE.Eltwise([[CONV_11]], [[INPUT1_SLICE11]]) + + // CHECK: [[INPUT0_SLICE12:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 32, 32] [1, 320, 32, 16] + // CHECK: [[CONV_12:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE12]] + // CHECK: [[INPUT1_SLICE12:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 32, 32] [1, 320, 32, 16] + // CHECK: [[ELTWISE_12:%.+]] = VPU.NCE.Eltwise([[CONV_12]], [[INPUT1_SLICE12]]) + + // CHECK: [[INPUT0_SLICE13:%.+]] = VPU.Slice [[INPUT0]] [0, 0, 32, 48] [1, 320, 32, 16] + // CHECK: [[CONV_13:%.+]] = VPU.NCE.Convolution([[INPUT0_SLICE13]] + // CHECK: [[INPUT1_SLICE13:%.+]] = VPU.Slice [[INPUT1]] [0, 0, 32, 48] [1, 320, 32, 16] + // CHECK: [[ELTWISE_13:%.+]] = VPU.NCE.Eltwise([[CONV_13]], [[INPUT1_SLICE13]]) + + // CHECK: [[CONCAT:%.+]] = VPU.Concat([[ELTWISE_00]], [[ELTWISE_01]], [[ELTWISE_02]], [[ELTWISE_03]], [[ELTWISE_10]], [[ELTWISE_11]], [[ELTWISE_12]], [[ELTWISE_13]]) + // CHECK-SAME{LITERAL}: {static_offsets = [[0, 0, 0, 0], [0, 0, 0, 16], [0, 0, 0, 32], [0, 0, 0, 48], [0, 0, 32, 0], [0, 0, 32, 16], [0, 0, 32, 32], [0, 0, 32, 48]]} + // CHECK: return [[CONCAT]] +} diff --git a/tests/lit/NPU/dialect/VPU/passes/wrap_verticalfusion_barrier_fifo_wlm_mode.mlir b/tests/lit/NPU/dialect/VPU/passes/wrap_verticalfusion_barrier_fifo_wlm_mode.mlir new file mode 100644 index 0000000000..03f2f08308 --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/passes/wrap_verticalfusion_barrier_fifo_wlm_mode.mlir @@ -0,0 +1,34 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --wrap-in-vertical-fusion="workload-management-mode=PWLM_V1_BARRIER_FIFO" %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + + +// CHECK-LABEL: @WrapNCETiledTaskWith2dTiling +// CHECK-SAME: [[INPUT0:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}> +// CHECK-SAME: [[WT:%.+]]: tensor<32x1x1x4xsi32> +// CHECK-SAME: [[WEIGHTS:%.+]]: tensor<32x32x3x3xf16, {order = #NHWC}> +func.func @WrapNCETiledTaskWith2dTiling(%arg0: tensor<1x32x256x256xf16, {order = #NHWC}>, %wt: tensor<32x1x1x4xsi32>, %weights: tensor<32x32x3x3xf16, {order = #NHWC}>) -> tensor<1x32x256x256xf16, {order = #NHWC}> { + %0 = VPU.NCE.Convolution(%arg0, %weights, %wt) + {multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + rawFilterShape = [32, 32, 3, 3], + strides = [1, 1], + tilingStrategy = [1, 1, 2, 2]} : tensor<1x32x256x256xf16, {order = #NHWC}>, tensor<32x32x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> tensor<1x32x256x256xf16, {order = #NHWC}> + return %0 : tensor<1x32x256x256xf16, {order = #NHWC}> + + //CHECK: VPU.VerticalFusion ([[INPUT0]] as [[ARG0:%.+]]: tensor<1x32x256x256xf16, {order = #NHWC}>, [[WEIGHTS]] as [[ARG1:%.+]]: tensor<32x32x3x3xf16, {order = #NHWC}>, [[WT]] as [[ARG2:%.+]]: tensor<32x1x1x4xsi32>) + //CHECK-SAME: attributes {tilingStrategy = [1, 1, 2, 2]} -> tensor<1x32x256x256xf16, {order = #NHWC}> { + //CHECK: VPU.NCE.Convolution([[ARG0]], [[ARG1]], [[ARG2]]) + //CHECK-SAME: multiClusterStrategy = #VPU.multi_cluster_strategy, + //CHECK-SAME: pad = #VPU.Padding, + //CHECK-SAME: rawFilterShape = [32, 32, 3, 3], strides = [1, 1]} + //CHECK-SAME: -> tensor<1x32x256x256xf16, {order = #NHWC}> + //CHECK: VPU.Yield +} diff --git a/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_37XX.mlir 
b/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_37XX.mlir index 80ae065f5d..4dd28a103f 100644 --- a/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_37XX.mlir +++ b/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_37XX.mlir @@ -9,7 +9,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> // CHECK-LABEL: @Convolution -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Convolution attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x3x62x62xf16> } outputsInfo : { @@ -407,13 +407,13 @@ module @VerticalFusionOutlining { module @AdjustMemorySpaceAndOptimizeSharedInputCopyForConcat1T { IE.TileResource 1 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" tensorNames = ["input"] : tensor<1x3x128x128xf32> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_40XX.mlir index ba269a84d0..30e4f18f7b 100644 --- a/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPU/pipelines/default_hw_mode_40XX.mlir @@ -9,7 +9,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) 
-> (d0, d2, d3, d1)> // CHECK-LABEL: @Convolution -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Convolution attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x3x62x62xf16> } outputsInfo : { @@ -109,6 +109,13 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi // CHECK-LABEL: @SoftMax module @SoftMax { + // CHECK-DAG: {{ }}IE.TileResource + // CHECK-DAG: {{ }}module @DummySWKernelsForInstructionPrefetchReservedMemory + // CHECK-NEXT: {{ }}IE.MemoryResource 8 bytes of @CMX_NN offset 1473528 + // CHECK-DAG: {{ }}module @SWKernelPrefetchingReservedMemory + // CHECK-NEXT: {{ }}IE.MemoryResource 512 bytes of @CMX_NN offset 1473536 + // CHECK-DAG: {{ }}module @DmaProfilingReservedMemory + // CHECK-NEXT: {{ }}IE.MemoryResource 512 bytes of @CMX_NN offset 1474048 net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1000xf16> } outputsInfo : { @@ -590,13 +597,13 @@ module @VerticalFusionOutlining { module @AdjustMemorySpaceAndOptimizeSharedInputCopyForConcat1T { IE.TileResource 1 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" tensorNames = 
["input"] : tensor<1x3x128x128xf32> } outputsInfo : { @@ -692,13 +699,13 @@ module @AdjustMemorySpaceAndOptimizeSharedInputCopyForConcat1T { module @AdjustMemorySpaceAndOptimizeSharedInputCopyForConcat2T { IE.TileResource 2 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" tensorNames = ["input"] : tensor<1x3x128x128xf32> } outputsInfo : { @@ -778,3 +785,40 @@ module @AdjustMemorySpaceAndOptimizeSharedInputCopyForConcat2T { } } + +// ----- + +// CHECK-LABEL: @SoftMax +module @SoftMax { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<3x3x16x8xf16> + } outputsInfo : { + DataInfo "output" : tensor<8x16x3x3xf16> + } + + // CHECK: func.func @main([[ARG0:%.+]]: tensor<3x3x16x8xf16>) -> tensor<8x16x3x3xf16> + func.func @main(%arg0: tensor<3x3x16x8xf16>) -> tensor<8x16x3x3xf16> { + %2088 = VPU.MemPermute(%arg0) {dst_order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, mem_perm = affine_map<(d0, d1, d2, d3) -> (d3, d2, d0, d1)>} : tensor<3x3x16x8xf16> -> tensor<8x16x3x3xf16> + return %2088 : tensor<8x16x3x3xf16> + } + + // CHECK: [[IN:%.+]] = VPU.Copy([[ARG0]]) {out_mem_space = @CMX_NN} : tensor<3x3x16x8xf16> + // CHECK-SAME: -> !VPU.DistributedTensor<3x3x16x8xf16, #NCHW, @CMX_NN, + // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], 
num_clusters = 6 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 2, 8], [3, 3, 2, 8]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 0, 12, 0], [0, 0, 14, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 3, 8], [3, 3, 2, 8], [3, 3, 2, 8]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 6, 0], [0, 0, 9, 0], [0, 0, 12, 0], [0, 0, 14, 0]]}> + + // CHECK: [[MEM_PERMUTE:%.+]] = VPU.MemPermute([[IN]]) + // CHECK-SAME: dst_order = #NCHW, mem_perm = #map + + // CHECK: [[OUT:%.+]] = VPU.Copy([[MEM_PERMUTE]]) : !VPU.DistributedTensor<8x16x3x3xf16, #NCHW, @CMX_NN, + // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 6, 1, 1], num_clusters = 6 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[8, 3, 3, 3], [8, 3, 3, 3], [8, 3, 3, 3], [8, 3, 3, 3], [8, 2, 3, 3], [8, 2, 3, 3]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[8, 3, 3, 3], [8, 3, 3, 3], [8, 3, 3, 3], [8, 3, 3, 3], [8, 2, 3, 3], [8, 2, 3, 3]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 3, 0, 0], [0, 6, 0, 0], [0, 9, 0, 0], [0, 12, 0, 0], [0, 14, 0, 0]]}> -> tensor<8x16x3x3xf16> + + // CHECK: return [[OUT]] : tensor<8x16x3x3xf16> +} diff --git a/tests/lit/NPU/dialect/VPU/pipelines/reference_sw_mode_37xx.mlir b/tests/lit/NPU/dialect/VPU/pipelines/reference_sw_mode_37xx.mlir new file mode 100644 index 0000000000..11f5f94ede --- /dev/null +++ b/tests/lit/NPU/dialect/VPU/pipelines/reference_sw_mode_37xx.mlir @@ -0,0 +1,31 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW allow-custom-values=true" --mlir-elide-elementsattrs-if-larger 8 --reference-sw-mode-vpu %s | FileCheck %s --strict-whitespace +// REQUIRES: arch-NPU37XX + +// CHECK-LABEL: @SoftMax +module @SoftMax { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1000xf16> + } outputsInfo : { + DataInfo "softmax" : tensor<1x1000xf16> + } + + // CHECK: func.func @main([[ARG0:%.+]]: tensor<1x1000xf16>) -> tensor<1x1000xf16> + func.func @main(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf16> { + %0 = VPU.AffineReshape(%arg0) {dim_mapping = [[0, 1, 2], [3]], shape_value = [1, 1, 1, 1000]} : tensor<1x1000xf16> -> tensor<1x1x1x1000xf16> + %1 = VPU.SoftMax(%0) {axisInd = 3 : i64} : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + %2 = VPU.AffineReshape(%1) {dim_mapping = [[0], [0], [0], [1]], shape_value = [1, 1000]} : tensor<1x1x1x1000xf16> -> tensor<1x1000xf16> + return %2 : tensor<1x1000xf16> + + // CHECK: [[RESHAPE:%.+]] = VPU.AffineReshape([[ARG0]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0, 1, 2], [3]], shape_value = [1, 1, 1, 1000]} : tensor<1x1000xf16> -> tensor<1x1x1x1000xf16> + // CHECK: [[SOFTMAX:%.+]] = VPU.SoftMax([[RESHAPE]]) {axisInd = 3 : i64} : tensor<1x1x1x1000xf16> -> tensor<1x1x1x1000xf16> + + // CHECK: [[OUT:%.+]] = VPU.AffineReshape([[SOFTMAX]]) + // CHECK-SAME{LITERAL}: {dim_mapping = [[0], [0], [0], [1]], shape_value = [1, 1000]} : tensor<1x1x1x1000xf16> -> tensor<1x1000xf16> + } +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_compiler_bar_programming_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_compiler_bar_programming_40XX+.mlir index 16f934b348..166c776d86 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_compiler_bar_programming_40XX+.mlir +++ 
b/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_compiler_bar_programming_40XX+.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --add-start-barrier="enable-compiler-barrier-programming=true" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --add-start-barrier %s | FileCheck %s // REQUIRES: arch-NPU40XX // ----- diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_fw_bar_programming_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_fw_bar_programming_40XX+.mlir deleted file mode 100644 index f1d14de07f..0000000000 --- a/tests/lit/NPU/dialect/VPUIP/passes/add_start_barrier_fw_bar_programming_40XX+.mlir +++ /dev/null @@ -1,319 +0,0 @@ -// -// Copyright (C) 2022-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --add-start-barrier="enable-compiler-barrier-programming=false" %s | FileCheck %s -// REQUIRES: arch-NPU40XX - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -//CHECK-LABEL: @AddStartBarrierWithConstInput -func.func @AddStartBarrierWithConstInput() -> !DDRType { - %0 = const.Declare !DDRType = dense<1.000000e+00> : tensor<1x3x224x224xf16> - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - - VPURT.Task { - %2 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - return %1 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -//CHECK-LABEL: @AddStartBarrierWithNonConstInput 
-func.func @AddStartBarrierWithNonConstInput() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - - VPURT.Task { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - VPURT.Task { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%1 : !DDRType) outputs(%2 : !DDRType) -> !DDRType - } - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task - // CHECK: VPUIP.NNDMA -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -//CHECK-LABEL: @AddStartBarrierWithTwoSyncDMA -func.func @AddStartBarrierWithTwoSyncDMA() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - - VPURT.Task { - %4 = VPUIP.NNDMA {port = 1 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - VPURT.Task { - %4 = VPUIP.NNDMA {port = 1 : i64} inputs(%1 : !DDRType) outputs(%2 : !DDRType) -> !DDRType - } - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task - // CHECK: VPUIP.NNDMA -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -//CHECK-LABEL: @NotAddStartBarrier -func.func @NotAddStartBarrier() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 
= VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - - %b = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - - VPURT.Task updates(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - VPURT.Task waits(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%1 : !DDRType) outputs(%2 : !DDRType) -> !DDRType - } - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK-NOT: VPUIP.SyncDMA - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -VPURT.SW.Runtime - entryPoint: @VPU.SW::@runtime - stack_configuration: [4096, 4096, 4096, 4096] - -module @VPU.SW { - func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) - attributes { - VPU.kernel_code = "activation_relu.cpp", - VPU.kernel_entry = "activation_relu", - VPU.task_type = @COMPUTE - } - -func.func private @runtime() - attributes { - VPU.kernel_code = "nnActEntry" - } -} - -//CHECK-LABEL: @AddStartBarrierBecauseBarrierIsConsumedByNonDma -func.func @AddStartBarrierBecauseBarrierIsConsumedByNonDma() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - %in_ddr = VPURT.DeclareBuffer <0> -> memref<1x1x1x1000xf16, @DDR> - %out_ddr = VPURT.DeclareBuffer <2000> -> memref<1x1x1x1000xf16, @DDR> - - %b = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - - VPURT.Task updates(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - VPURT.Task waits(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%1 : 
!DDRType) outputs(%2 : !DDRType) -> !DDRType - } - - VPURT.Task waits(%b : !VPURT.Barrier) { - VPUIP.SW.Kernel {resultSegmentSizes = array} - @VPU.SW::@builtin_relu - inputs(%in_ddr as %arg2: memref<1x1x1x1000xf16, @DDR>) - outputs(%out_ddr as %arg3: memref<1x1x1x1000xf16, @DDR>) - on tile 0 -> memref<1x1x1x1000xf16, @DDR> { - VPUIP.SW.Kernel.run {attrs = [false, true, 6.0892105102539063E-4]} (%arg2, %arg3) - : memref<1x1x1x1000xf16, @DDR> - , memref<1x1x1x1000xf16, @DDR> - } - } - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: [[BAR1:%.*]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) updates([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.SW.Kernel -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -VPURT.SW.Runtime - entryPoint: @VPU.SW::@runtime - stack_configuration: [4096, 4096, 4096, 4096] - -module @VPU.SW { - func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) - attributes { - VPU.kernel_code = "activation_relu.cpp", - VPU.kernel_entry = "activation_relu", - VPU.task_type = @COMPUTE - } - -func.func private @runtime() - attributes { - VPU.kernel_code = "nnActEntry" - } -} - -//CHECK-LABEL: @DoNotAddStartBarrierIfThereIsABarrierNotConsumedByNonDma -func.func @DoNotAddStartBarrierIfThereIsABarrierNotConsumedByNonDma() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - %in_ddr = VPURT.DeclareBuffer <0> -> memref<1x1x1x1000xf16, @DDR> - %out_ddr = VPURT.DeclareBuffer <2000> -> 
memref<1x1x1x1000xf16, @DDR> - - %b0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - %b1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - - VPURT.Task updates(%b0, %b1 : !VPURT.Barrier, !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - - VPURT.Task waits(%b0 : !VPURT.Barrier) { - VPUIP.SW.Kernel {resultSegmentSizes = array} - @VPU.SW::@builtin_relu - inputs(%in_ddr as %arg2: memref<1x1x1x1000xf16, @DDR>) - outputs(%out_ddr as %arg3: memref<1x1x1x1000xf16, @DDR>) - on tile 0 -> memref<1x1x1x1000xf16, @DDR> { - VPUIP.SW.Kernel.run {attrs = [false, true, 6.0892105102539063E-4]} (%arg2, %arg3) - : memref<1x1x1x1000xf16, @DDR> - , memref<1x1x1x1000xf16, @DDR> - } - } - - VPURT.Task waits(%b1 : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 1 : i64} inputs(%1 : !DDRType) outputs(%2 : !DDRType) -> !DDRType - } - - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - // CHECK: [[BAR1:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]], [[BAR1]] : !VPURT.Barrier, !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SW.Kernel - // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA -} - -// ----- - -#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> - -!DDRType = memref<1x3x224x224xf16, #NCHW, @DDR> - -VPURT.SW.Runtime - entryPoint: @VPU.SW::@runtime - stack_configuration: [4096, 4096, 4096, 4096] - -module @VPU.SW { - func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) - attributes { - VPU.kernel_code = "activation_relu.cpp", - VPU.kernel_entry = "activation_relu", - VPU.task_type = @COMPUTE - } - -func.func private @runtime() - attributes { - VPU.kernel_code = "nnActEntry" - } -} - -//CHECK-LABEL: @AddStartBarrierBecauseBarrierDependsOnNonDmaTask -func.func 
@AddStartBarrierBecauseBarrierDependsOnNonDmaTask() -> !DDRType { - %0 = VPURT.DeclareBuffer <150528> -> !DDRType - %1 = VPURT.DeclareBuffer <150528> -> !DDRType - %2 = VPURT.DeclareBuffer <301056> -> !DDRType - %in_ddr = VPURT.DeclareBuffer <0> -> memref<1x1x1x1000xf16, @DDR> - %out_ddr = VPURT.DeclareBuffer <2000> -> memref<1x1x1x1000xf16, @DDR> - - %b = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - - VPURT.Task updates(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%0 : !DDRType) outputs(%1 : !DDRType) -> !DDRType - } - - VPURT.Task updates(%b : !VPURT.Barrier) { - VPUIP.SW.Kernel {resultSegmentSizes = array} - @VPU.SW::@builtin_relu - inputs(%in_ddr as %arg2: memref<1x1x1x1000xf16, @DDR>) - outputs(%out_ddr as %arg3: memref<1x1x1x1000xf16, @DDR>) - on tile 0 -> memref<1x1x1x1000xf16, @DDR> { - VPUIP.SW.Kernel.run {attrs = [false, true, 6.0892105102539063E-4]} (%arg2, %arg3) - : memref<1x1x1x1000xf16, @DDR> - , memref<1x1x1x1000xf16, @DDR> - } - } - - VPURT.Task waits(%b : !VPURT.Barrier) { - %4 = VPUIP.NNDMA {port = 0 : i64} inputs(%1 : !DDRType) outputs(%2 : !DDRType) -> !DDRType - } - - return %2 : !DDRType - - // CHECK: [[BAR0:%.*]] = VPURT.DeclareVirtualBarrier {isStartBarrier} -> !VPURT.Barrier - // CHECK: [[BAR1:%.*]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier - // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) - // CHECK: VPUIP.SyncDMA - // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) updates([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA - // CHECK: VPURT.Task updates([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.SW.Kernel - // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) - // CHECK: VPUIP.NNDMA -} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_40XX.mlir index 5166165ed5..c8a8e4cd23 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_40XX.mlir +++ 
b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_40XX.mlir @@ -18,7 +18,7 @@ // Other : [ SyncDMA ] | // -module @subgraph attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @subgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} @@ -27,18 +27,18 @@ module @subgraph attributes {VPU.arch = #VPU.arch_kind, config.compilat } IE.TileResource {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { - module @DmaProfilingReservedMemory { - IE.MemoryResource 512 bytes of @CMX_NN offset 0 + module @DummySWKernelsForInstructionPrefetchReservedMemory { + IE.MemoryResource 8 bytes of @CMX_NN offset 1474552 } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : { DataInfo "data" : tensor<1x3x62x62xui8> } outputsInfo : { @@ -83,7 +83,7 @@ module @subgraph attributes {VPU.arch = #VPU.arch_kind, config.compilat VPURT.Task 
waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) { %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T } - + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T } @@ -142,7 +142,7 @@ module @subgraph attributes {VPU.arch = #VPU.arch_kind, config.compilat // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { // CHECK-NEXT: VPUIP.NNDMA - + // CHECK: VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) { // CHECK: VPUIP.SW.Kernel // CHECK-SAME: @VPU.SW::@builtin_SoftMax diff --git a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers.mlir b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers.mlir index 9c89b0554b..43db90d493 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers.mlir @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --assign-physical-barriers="num-barriers=4" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% workload-management-enable=false" --assign-physical-barriers="num-barriers=4" %s | FileCheck %s // REQUIRES: arch-NPU37XX || arch-NPU40XX -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { // CHECK-LABEL: @LinearDMA func.func @LinearDMA(%arg0: memref<10xf16>, %arg1: memref<10xf16>) -> memref<10xf16> { // CHECK-NOT: VPURT.DeclareVirtualBarrier @@ -46,14 +45,12 @@ func.func @LinearDMA(%arg0: memref<10xf16>, %arg1: memref<10xf16>) -> memref<10x } return %arg1 : memref<10xf16> } -} // ----- #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPUIP.wlm_status = 
#VPUIP.wlm_status} { // CHECK-LABEL: @MultipleExecutors func.func @MultipleExecutors(%arg0: memref<1x16x32x32xf16>, %arg1: memref<1x16x32x32xf16>) -> memref<1x16x32x32xf16> { %cst0 = const.Declare memref<16x16x1x1xf16, #NHWC> = @@ -319,4 +316,3 @@ func.func @MultipleExecutors(%arg0: memref<1x16x32x32xf16>, %arg1: memref<1x16x3 return %arg1 : memref<1x16x32x32xf16> } -} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm.mlir b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm.mlir index 00cca39bb9..c2f5499dfd 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm.mlir @@ -6,9 +6,7 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --assign-physical-barriers="num-barriers=4 workload-management-barrier-count-threshold=1" %s | FileCheck %s // REQUIRES: arch-NPU37XX || arch-NPU40XX -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { -// CHECK: attributes -// CHECK-SAME: VPUIP.wlm_status = #VPUIP.wlm_status +// CHECK: config.Option @VPU.WorkloadManagementStatus : "FAILED" // CHECK-LABEL: @LinearDMA func.func @LinearDMA(%arg0: memref<10xf16>, %arg1: memref<10xf16>) -> memref<10xf16> { // CHECK-NOT: attributes @@ -49,4 +47,3 @@ func.func @LinearDMA(%arg0: memref<10xf16>, %arg1: memref<10xf16>) -> memref<10x } return %arg1 : memref<10xf16> } -} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm_pages_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm_pages_40XX+.mlir index 4062a3e256..957173c63a 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm_pages_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/assign_physical_barriers_wlm_pages_40XX+.mlir @@ -8,7 +8,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { +module { func.func 
@DmaAndDpuGraph() -> memref<1x16x8x32xf16, #NHWC, [@CMX_NN, 0]> { %bar0 = VPURT.DeclareVirtualBarrier {isStartBarrier, wlmPage = 0 : i64} -> !VPURT.Barrier %bar1 = VPURT.DeclareVirtualBarrier {wlmPage = 0 : i64} -> !VPURT.Barrier diff --git a/tests/lit/NPU/dialect/VPUIP/passes/batch_matmul_to_matmul.mlir b/tests/lit/NPU/dialect/VPUIP/passes/batch_matmul_to_matmul.mlir index eaa59b7a2d..76e957317d 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/batch_matmul_to_matmul.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/batch_matmul_to_matmul.mlir @@ -10,7 +10,7 @@ // CHECK-LABEL: @BatchMatMulToMatMul module @BatchMatMulToMatMul attributes { - VPU.arch = #VPU.arch_kind + config.arch = #config.arch_kind } { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { @@ -110,7 +110,7 @@ func.func @main() -> () { // CHECK-LABEL-DAG: @QuantBatchMatMulToMatMul // CHECK-DAG: [[Q_ELEM_TYPE:!.+]] = !quant.uniform + config.arch = #config.arch_kind } { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { @@ -202,7 +202,7 @@ func.func @main() -> () { // CHECK-LABEL: @MatMulWithBatch1ToMatMul module @MatMulWithBatch1ToMatMul attributes { - VPU.arch = #VPU.arch_kind + config.arch = #config.arch_kind } { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { diff --git a/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost.mlir b/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost.mlir index 1178263c61..79679bd208 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost.mlir @@ -21,10 +21,10 @@ module @AddCycleCostForSWMultiCluster attributes {config.compilationMode = #conf IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth 
= 32 : i64, config.derateFactor = 1.000000e+00 : f64} } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} module @VPU.SW { func.func private @builtin_MVN(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "mvn1.cpp", VPU.kernel_entry = "mvn1"} func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} @@ -78,7 +78,7 @@ module @AddCycleCostForSWSingleCluster { IE.TileResource 1 of @NCE at 1.300000e+03 MHz { IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @DMA_NN diff --git a/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost_40xx.mlir b/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost_40xx.mlir index b04ea0f325..4bee248f23 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost_40xx.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/calculate_async_region_cycle_cost_40xx.mlir @@ -21,7 +21,7 @@ module @AddCycleCostForDistributedBuffers attributes {config.compilationMode = # IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} } IE.ExecutorResource 2 of @DMA_NN diff --git 
a/tests/lit/NPU/dialect/VPUIP/passes/compute_halo_region_for_dpu_task_op_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/compute_halo_region_for_dpu_task_op_40XX+.mlir index 0f21c15828..c00e8fb26c 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/compute_halo_region_for_dpu_task_op_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/compute_halo_region_for_dpu_task_op_40XX+.mlir @@ -1509,7 +1509,7 @@ func.func @main(%arg0: memref<1x16x20x16xf16>, %arg1: memref<1x16x20x16xf16>, ]> // CHECK-LABEL: @SOK_ODUPermute -module @SOK_ODUPermute attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @SOK_ODUPermute attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main diff --git a/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_37XX.mlir index 0da8a31171..f7b8efdae3 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_37XX.mlir @@ -6,8 +6,6 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW" --convert-to-dma --canonicalize %s | FileCheck %s // REQUIRES: arch-NPU37XX -// ----- - #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> diff --git a/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma.mlir b/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_40XX+.mlir similarity index 57% rename from tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma.mlir rename to tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_40XX+.mlir index 82d03bea80..ce83f0abb5 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/convert_to_dma_40XX+.mlir @@ -1,12 +1,10 @@ // -// Copyright (C) 2022-2025 Intel Corporation. 
+// Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW allow-custom-values=true" --convert-to-dma --canonicalize %s | FileCheck %s -// REQUIRES: arch-NPU37XX || arch-NPU40XX - -// ----- +// REQUIRES: arch-NPU40XX VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] module @VPU.SW { @@ -40,21 +38,17 @@ func.func @ConvertMemPermuteNCHWToNHCW(%arg0: memref<6x4x8x512xf16, @DDR>) return %5: memref<6x8x4x512xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<6x4x8x512xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<6x8x4x512xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[COPY0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) -> memref<1x24x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x8x24x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x24x8x512xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x8x24x512xf16, [@CMX_NN, 0]>) -> memref<1x8x24x512xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x8x24x512xf16, [@CMX_NN, 0]>) -> memref<1x8x6x2048xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x8x6x2048xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<6x8x4x512xf16, [@CMX_NN, 0]>) -> memref<6x8x4x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY_BUFF3:%.+]] = memref.alloc() : memref<6x8x4x512xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: 
inputs([[PERMUTEDMA1]] : memref<6x8x4x512xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[COPY_BUFF3]] : memref<6x8x4x512xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<6x8x4x512xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<6x8x4x512xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<6x8x4x512xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -93,22 +87,17 @@ func.func @ConvertMemPermuteNCHWToNHCWWithDifferentDimsOrder(%arg0: memref<6x4x8 return %5: memref<6x512x8x4xf16, #NHWC, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<6x4x8x512xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<6x512x8x4xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[PERMUTECAST:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NCHW} inputs([[COPY0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) -> memref<6x512x4x8xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[PERMUTECAST]] : memref<6x512x4x8xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x512x24x8xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x512x8x24xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x512x24x8xf16, #NHWC, [@CMX_NN, 0]>) 
outputs([[PERMUTEDMA_BUFF2]] : memref<1x512x8x24xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x512x8x24xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x512x8x24xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x2048x8x6xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x2048x8x6xf16, #NHWC, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<6x512x8x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<6x512x8x4xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[COPY_BUFF3:%.+]] = memref.alloc() : memref<6x512x8x4xf16, #NHWC, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[PERMUTEDMA1]] : memref<6x512x8x4xf16, #NHWC, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[COPY_BUFF3]] : memref<6x512x8x4xf16, #NHWC, @DDR>) - // CHECK: return [[COPY1]] : memref<6x512x8x4xf16, #NHWC, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<6x512x8x4xf16, #NHWC, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<6x512x8x4xf16, #NHWC, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -147,24 +136,17 @@ func.func @ConvertMemPermuteNCHWToHCNW(%arg0: memref<6x4x8x512xf16, @DDR>) return %5: memref<8x4x6x512xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<6x4x8x512xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<6x4x8x512xf16, 
[@CMX_NN, 0]>) - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<8x4x6x512xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[COPY0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) -> memref<1x24x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x8x24x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x24x8x512xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x8x24x512xf16, [@CMX_NN, 0]>) -> memref<1x8x24x512xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x8x24x512xf16, [@CMX_NN, 0]>) -> memref<1x48x4x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<1x4x48x512xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x48x4x512xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF3]] : memref<1x4x48x512xf16, [@CMX_NN, 0]>) -> memref<1x4x48x512xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<1x4x48x512xf16, [@CMX_NN, 0]>) -> memref<1x4x8x3072xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA2:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE2]] : memref<1x4x8x3072xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<8x4x6x512xf16, [@CMX_NN, 0]>) -> memref<8x4x6x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY_BUFF4:%.+]] = memref.alloc() : memref<8x4x6x512xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[PERMUTEDMA2]] : memref<8x4x6x512xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[COPY_BUFF4]] : memref<8x4x6x512xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<8x4x6x512xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: 
inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<8x4x6x512xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<8x4x6x512xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -203,24 +185,17 @@ func.func @ConvertMemPermuteNCHWToNWHC(%arg0: memref<6x4x8x512xf16, @DDR>) return %5: memref<6x512x8x4xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<6x4x8x512xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<6x512x8x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[COPY0]] : memref<6x4x8x512xf16, [@CMX_NN, 0]>) -> memref<1x6x4x4096xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x6x4096x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NCWH} inputs([[GENERIC_RESHAPE0]] : memref<1x6x4x4096xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x6x4096x4xf16, [@CMX_NN, 0]>) -> memref<1x6x4096x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x6x4096x4xf16, [@CMX_NN, 0]>) -> memref<1x48x512x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<1x512x48x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x48x512x4xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF3]] : memref<1x512x48x4xf16, [@CMX_NN, 0]>) -> 
memref<1x512x48x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<1x512x48x4xf16, [@CMX_NN, 0]>) -> memref<1x512x6x32xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA2:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE2]] : memref<1x512x6x32xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<6x512x8x4xf16, [@CMX_NN, 0]>) -> memref<6x512x8x4xf16, [@CMX_NN, 0]> - // CHECK: [[COPY_BUFF4:%.+]] = memref.alloc() : memref<6x512x8x4xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[PERMUTEDMA2]] : memref<6x512x8x4xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[COPY_BUFF4]] : memref<6x512x8x4xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<6x512x8x4xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<6x4x8x512xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<6x512x8x4xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NWHC} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<6x512x8x4xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -257,25 +232,17 @@ func.func @ConvertMemPermuteNCHWToCWNH(%arg0: memref<86x4x256x4xf16, @DDR>) return %5: memref<4x4x86x256xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<86x4x256x4xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<86x4x256x4xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<86x4x256x4xf16, [@CMX_NN, 0]>) - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<86x256x4x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = 
VPUIP.GenericReshape inputs([[COPY0]] : memref<86x4x256x4xf16, [@CMX_NN, 0]>) -> memref<1x344x256x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x256x344x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x344x256x4xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x256x344x4xf16, [@CMX_NN, 0]>) -> memref<1x256x344x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x256x344x4xf16, [@CMX_NN, 0]>) -> memref<1x256x86x16xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x256x86x16xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<86x256x4x4xf16, [@CMX_NN, 0]>) -> memref<86x256x4x4xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<86x256x4x4xf16, [@CMX_NN, 0]>) -> memref<1x22016x16x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<1x16x22016x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA2:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE2]] : memref<1x22016x16x1xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF3]] : memref<1x16x22016x1xf16, [@CMX_NN, 0]>) -> memref<1x16x22016x1xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE3:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA2]] : memref<1x16x22016x1xf16, [@CMX_NN, 0]>) -> memref<4x4x86x256xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF4:%.+]] = memref.alloc() : memref<4x4x86x256xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[GENERIC_RESHAPE3]] : memref<4x4x86x256xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[PERMUTEDMA_BUFF4]] : memref<4x4x86x256xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<4x4x86x256xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<86x4x256x4xf16, [@CMX_NN, 0]> + // 
CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<4x4x86x256xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<4x4x86x256xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -312,22 +279,17 @@ func.func @ConvertMemPermuteNCHWToHNWC(%arg0: memref<4x4x86x256xf16, @DDR>) return %5: memref<86x4x256x4xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<4x4x86x256xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<4x4x86x256xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<4x4x86x256xf16, [@CMX_NN, 0]>) - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[COPY0]] : memref<4x4x86x256xf16, [@CMX_NN, 0]>) -> memref<1x16x86x256xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<1x86x16x256xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x16x86x256xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<1x86x16x256xf16, [@CMX_NN, 0]>) -> memref<1x86x16x256xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x86x16x256xf16, [@CMX_NN, 0]>) -> memref<1x344x4x256xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x344x256x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NCWH} inputs([[GENERIC_RESHAPE1]] : memref<1x344x4x256xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x344x256x4xf16, [@CMX_NN, 0]>) -> memref<1x344x256x4xf16, [@CMX_NN, 
0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<1x344x256x4xf16, [@CMX_NN, 0]>) -> memref<86x4x256x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<86x4x256x4xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[GENERIC_RESHAPE2]] : memref<86x4x256x4xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[PERMUTEDMA_BUFF3]] : memref<86x4x256x4xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<86x4x256x4xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<4x4x86x256xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<86x4x256x4xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<86x4x256x4xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -403,22 +365,17 @@ func.func @ConvertMemPermuteNCHWToWCHN(%arg0: memref<4x2x121x3xf16, @DDR>) return %5: memref<3x2x121x4xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<4x2x121x3xf16, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<4x2x121x3xf16, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<4x2x121x3xf16, [@CMX_NN, 0]>) - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[COPY0]] : memref<4x2x121x3xf16, [@CMX_NN, 0]>) -> memref<1x4x242x3xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<1x242x4x3xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x4x242x3xf16, [@CMX_NN, 0]>) 
outputs([[PERMUTEDMA_BUFF1]] : memref<1x242x4x3xf16, [@CMX_NN, 0]>) -> memref<1x242x4x3xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x242x4x3xf16, [@CMX_NN, 0]>) -> memref<1x968x3x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x3x968x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x968x3x1xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x3x968x1xf16, [@CMX_NN, 0]>) -> memref<1x3x968x1xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<1x3x968x1xf16, [@CMX_NN, 0]>) -> memref<3x2x121x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<3x2x121x4xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[GENERIC_RESHAPE2]] : memref<3x2x121x4xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[PERMUTEDMA_BUFF3]] : memref<3x2x121x4xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<3x2x121x4xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<4x2x121x3xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<3x2x121x4xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<3x2x121x4xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -457,23 +414,17 @@ func.func @ConvertMemPermuteNCHWToWCHNWithDifferentDimsOrder(%arg0: memref<4x3x2 return %5: memref<3x2x121x4xf16, @DDR> - // CHECK: [[COPY_BUFF0:%.+]] = memref.alloc() : memref<4x3x2x121xf16, 
#NHWC, [@CMX_NN, 0]> - // CHECK: [[COPY0:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[INPUT]] : memref<4x3x2x121xf16, #NHWC, @DDR>) - // CHECK-SAME: outputs([[COPY_BUFF0]] : memref<4x3x2x121xf16, #NHWC, [@CMX_NN, 0]>) - // CHECK: [[PERMUTE_CAST0:%.+]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NCHW} inputs([[COPY0]] : memref<4x3x2x121xf16, #NHWC, [@CMX_NN, 0]>) -> memref<4x2x121x3xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE0:%.+]] = VPUIP.GenericReshape inputs([[PERMUTE_CAST0]] : memref<4x2x121x3xf16, [@CMX_NN, 0]>) -> memref<1x4x242x3xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF1:%.+]] = memref.alloc() : memref<1x242x4x3xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE0]] : memref<1x4x242x3xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF1]] : memref<1x242x4x3xf16, [@CMX_NN, 0]>) -> memref<1x242x4x3xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE1:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA0]] : memref<1x242x4x3xf16, [@CMX_NN, 0]>) -> memref<1x968x3x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF2:%.+]] = memref.alloc() : memref<1x3x968x1xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA1:%.+]] = VPUIP.PermuteDMA {mem_perm = #NHCW} inputs([[GENERIC_RESHAPE1]] : memref<1x968x3x1xf16, [@CMX_NN, 0]>) outputs([[PERMUTEDMA_BUFF2]] : memref<1x3x968x1xf16, [@CMX_NN, 0]>) -> memref<1x3x968x1xf16, [@CMX_NN, 0]> - // CHECK: [[GENERIC_RESHAPE2:%.+]] = VPUIP.GenericReshape inputs([[PERMUTEDMA1]] : memref<1x3x968x1xf16, [@CMX_NN, 0]>) -> memref<3x2x121x4xf16, [@CMX_NN, 0]> - // CHECK: [[PERMUTEDMA_BUFF3:%.+]] = memref.alloc() : memref<3x2x121x4xf16, @DDR> - // CHECK: [[COPY1:%.+]] = VPUIP.Copy - // CHECK-SAME: inputs([[GENERIC_RESHAPE2]] : memref<3x2x121x4xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[PERMUTEDMA_BUFF3]] : memref<3x2x121x4xf16, @DDR>) - // CHECK: return [[COPY1]] : memref<3x2x121x4xf16, @DDR> + // CHECK: [[COPY_BUFF_0:%.+]] = memref.alloc() : memref<4x3x2x121xf16, #NHWC, 
[@CMX_NN, 0]> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] + // CHECK-SAME: outputs([[COPY_BUFF_0]] + // CHECK: [[PERMUTE_DMA_BUFF_0:%.+]] = memref.alloc() : memref<3x2x121x4xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTE_DMA_0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY_0]]{{.*}}) outputs([[PERMUTE_DMA_BUFF_0]]{{.*}}) + // CHECK: [[COPY_BUFF_1:%.+]] = memref.alloc() : memref<3x2x121x4xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[PERMUTE_DMA_0]] + // CHECK-SAME: outputs([[COPY_BUFF_1]] + // CHECK: return [[COPY_1]] } // ----- @@ -556,7 +507,7 @@ func.func @NotMoveUpsamplingDMAInCMXWithLargeSize(%arg0: memref<1x64x128x128xf16 IE.TileResource 3 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -566,43 +517,43 @@ VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 409 } // CHECK-LABEL: @ConvertPerAxisTileToDMAOutputDDR -// CHECK-SAME: [[INPUT_0:%.+]]: memref<1x1x1x2000xf16, @DDR> -// CHECK-SAME: [[INPUT_1:%.+]]: memref<1x22x16x2000xf16, @DDR> -func.func @ConvertPerAxisTileToDMAOutputDDR(%arg0: memref<1x1x1x2000xf16, @DDR>, %arg1: memref<1x22x16x2000xf16, @DDR>) -> memref<1x22x16x2000xf16, @DDR> { - %cmx_buf_0 = memref.alloc() : memref<1x1x1x2000xf16, [@CMX_NN, 0]> - %0 = VPUIP.Copy inputs(%arg0 : memref<1x1x1x2000xf16, @DDR>) outputs(%cmx_buf_0 : memref<1x1x1x2000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x2000xf16, [@CMX_NN, 0]> - %cmx_buf_1 = memref.alloc() : memref<1x22x16x2000xf16, [@CMX_NN, 0]> +// CHECK-SAME: [[INPUT_0:%.+]]: memref<1x1x1x1000xf16, @DDR> +// CHECK-SAME: [[INPUT_1:%.+]]: memref<1x22x16x1000xf16, @DDR> +func.func 
@ConvertPerAxisTileToDMAOutputDDR(%arg0: memref<1x1x1x1000xf16, @DDR>, %arg1: memref<1x22x16x1000xf16, @DDR>) -> memref<1x22x16x1000xf16, @DDR> { + %cmx_buf_0 = memref.alloc() : memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %0 = VPUIP.Copy inputs(%arg0 : memref<1x1x1x1000xf16, @DDR>) outputs(%cmx_buf_0 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %cmx_buf_1 = memref.alloc() : memref<1x22x16x1000xf16, [@CMX_NN, 0]> %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Tile - inputs(%0 as %arg2: memref<1x1x1x2000xf16, [@CMX_NN, 0]>) - outputs(%cmx_buf_1 as %arg3: memref<1x22x16x2000xf16, [@CMX_NN, 0]>) on tile 0 - -> memref<1x22x16x2000xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run {attrs = [4, [1, 22, 16, 1]]}(%arg2, %arg3) : memref<1x1x1x2000xf16, [@CMX_NN, 0]>, memref<1x22x16x2000xf16, [@CMX_NN, 0]> + inputs(%0 as %arg2: memref<1x1x1x1000xf16, [@CMX_NN, 0]>) + outputs(%cmx_buf_1 as %arg3: memref<1x22x16x1000xf16, [@CMX_NN, 0]>) on tile 0 + -> memref<1x22x16x1000xf16, [@CMX_NN, 0]>{ + VPUIP.SW.Kernel.run {attrs = [4, [1, 22, 16, 1]]}(%arg2, %arg3) : memref<1x1x1x1000xf16, [@CMX_NN, 0]>, memref<1x22x16x1000xf16, [@CMX_NN, 0]> } - %1 = VPUIP.Copy inputs(%results : memref<1x22x16x2000xf16, [@CMX_NN, 0]>) outputs(%arg1 : memref<1x22x16x2000xf16, @DDR>) -> memref<1x22x16x2000xf16, @DDR> - return %1 : memref<1x22x16x2000xf16, @DDR> + %1 = VPUIP.Copy inputs(%results : memref<1x22x16x1000xf16, [@CMX_NN, 0]>) outputs(%arg1 : memref<1x22x16x1000xf16, @DDR>) -> memref<1x22x16x1000xf16, @DDR> + return %1 : memref<1x22x16x1000xf16, @DDR> - // CHECK: [[CMX_BUF_0:%.+]] = memref.alloc() : memref<1x1x1x2000xf16, [@CMX_NN, 0]> - // CHECK: [[COPY_IN:%.+]] = VPUIP.Copy inputs([[INPUT_0]] : memref<1x1x1x2000xf16, @DDR>) outputs([[CMX_BUF_0]] : memref<1x1x1x2000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x2000xf16, [@CMX_NN, 0]> - // CHECK: [[DDR_BUF:%.+]] = memref.alloc() : memref<1x1x16x2000xf16, @DDR> + // CHECK: [[CMX_BUF_0:%.+]] = 
memref.alloc() : memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_IN:%.+]] = VPUIP.Copy inputs([[INPUT_0]] : memref<1x1x1x1000xf16, @DDR>) outputs([[CMX_BUF_0]] : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: [[CMX_BUF_1:%.+]] = memref.alloc() : memref<1x1x16x1000xf16, [@CMX_NN, 0]> // CHECK: [[PER_AXIS_TILE_DMA_0:%.+]] = VPUIP.PerAxisTileDMA {axis = 2 : i64, tiles = 16 : i64} - // CHECK-SAME: inputs([[COPY_IN]] : memref<1x1x1x2000xf16, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[DDR_BUF]] : memref<1x1x16x2000xf16, @DDR>) -> memref<1x1x16x2000xf16, @DDR> - // CHECK: [[CMX_BUF_1:%.+]] = memref.alloc() : memref<1x22x16x2000xf16, [@CMX_NN, 0]> + // CHECK-SAME: inputs([[COPY_IN]] : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) + // CHECK-SAME: outputs([[CMX_BUF_1]] : memref<1x1x16x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x16x1000xf16, [@CMX_NN, 0]> + // CHECK: [[CMX_BUF_2:%.+]] = memref.alloc() : memref<1x22x16x1000xf16, [@CMX_NN, 0]> // CHECK: [[PER_AXIS_TILE_DMA_1:%.+]] = VPUIP.PerAxisTileDMA {axis = 1 : i64, tiles = 22 : i64} - // CHECK-SAME: inputs([[PER_AXIS_TILE_DMA_0]] : memref<1x1x16x2000xf16, @DDR>) - // CHECK-SAME: outputs([[CMX_BUF_1]] : memref<1x22x16x2000xf16, [@CMX_NN, 0]>) -> memref<1x22x16x2000xf16, [@CMX_NN, 0]> - // CHECK: [[COPY_OUT:%.+]] = VPUIP.Copy inputs([[PER_AXIS_TILE_DMA_1]] : memref<1x22x16x2000xf16, [@CMX_NN, 0]>) outputs([[INPUT_1]] : memref<1x22x16x2000xf16, @DDR>) -> memref<1x22x16x2000xf16, @DDR> + // CHECK-SAME: inputs([[PER_AXIS_TILE_DMA_0]] : memref<1x1x16x1000xf16, [@CMX_NN, 0]>) + // CHECK-SAME: outputs([[CMX_BUF_2]] : memref<1x22x16x1000xf16, [@CMX_NN, 0]>) -> memref<1x22x16x1000xf16, [@CMX_NN, 0]> + // CHECK: [[COPY_OUT:%.+]] = VPUIP.Copy inputs([[PER_AXIS_TILE_DMA_1]] : memref<1x22x16x1000xf16, [@CMX_NN, 0]>) outputs([[INPUT_1]] : memref<1x22x16x1000xf16, @DDR>) -> memref<1x22x16x1000xf16, @DDR> - // CHECK: return [[COPY_OUT]] : memref<1x22x16x2000xf16, @DDR> + // CHECK: return [[COPY_OUT]] : 
memref<1x22x16x1000xf16, @DDR> } // ----- IE.TileResource 3 of @NCE { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware -IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} +IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -644,3 +595,50 @@ func.func @ConvertPerAxisTileToDMAInLogicOrder(%arg0: memref<1x1x1x512xf16, @DDR // CHECK: outputs([[INPUT_1]] : memref<1x2x512x512xf16, @DDR>) -> memref<1x2x512x512xf16, @DDR> // CHECK: return [[COPY_OUT]] : memref<1x2x512x512xf16, @DDR> } + +// ----- + +VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_MemPermute(memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, none) attributes {VPU.kernel_code = "reorder.cpp", VPU.kernel_entry = "reorder"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + +// CHECK-LABEL: @ConvertMemPermuteNCHWToHCWN +// CHECK-SAME: [[INPUT:%.+]]: memref<128x2x36x68xf16, @DDR> +func.func @ConvertMemPermuteNCHWToHCWN(%arg0: memref<128x2x36x68xf16, @DDR>) + -> memref<36x2x68x128xf16, @DDR> { + %0 = memref.alloc() : memref<128x2x36x68xf16, [@CMX_NN, 0]> + %1 = VPUIP.Copy + inputs(%arg0 : memref<128x2x36x68xf16, @DDR>) + outputs(%0 : memref<128x2x36x68xf16, [@CMX_NN, 0]>) -> memref<128x2x36x68xf16, [@CMX_NN, 0]> + + %2 = memref.alloc() : memref<36x2x68x128xf16, [@CMX_NN, 0]> + %3 = VPUIP.SW.Kernel {resultSegmentSizes = array} + @VPU.SW::@builtin_MemPermute + inputs(%1 as %arg2: memref<128x2x36x68xf16, [@CMX_NN, 0]>) + outputs(%2 as %arg3: memref<36x2x68x128xf16, [@CMX_NN, 0]>) + on tile 0 -> memref<36x2x68x128xf16, [@CMX_NN, 0]>{ + VPUIP.SW.Kernel.run {attrs = [[3, 0, 2, 1]]}(%arg2, %arg3) + : memref<128x2x36x68xf16, [@CMX_NN, 0]>, 
memref<36x2x68x128xf16, [@CMX_NN, 0]> + } + + %4 = memref.alloc() : memref<36x2x68x128xf16, @DDR> + %5 = VPUIP.Copy + inputs(%3 : memref<36x2x68x128xf16, [@CMX_NN, 0]>) + outputs(%4 : memref<36x2x68x128xf16, @DDR>) -> memref<36x2x68x128xf16, @DDR> + + return %5: memref<36x2x68x128xf16, @DDR> + + // CHECK: [[ALLOC:%.+]] = memref.alloc() : memref<128x2x36x68xf16, [@CMX_NN, 0]> + // CHECK: [[COPY0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[INPUT]] : memref<128x2x36x68xf16, @DDR>) + // CHECK-SAME: outputs([[ALLOC]] : memref<128x2x36x68xf16, [@CMX_NN, 0]>) + // CHECK: [[ALLOC_0:%.+]] = memref.alloc() : memref<36x2x68x128xf16, [@CMX_NN, 0]> + // CHECK: [[PERMUTEDMA0:%.+]] = VPUIP.PermuteDMA {mem_perm = #map} inputs([[COPY0]] : memref<128x2x36x68xf16, [@CMX_NN, 0]>) outputs([[ALLOC_0]] : memref<36x2x68x128xf16, [@CMX_NN, 0]>) -> memref<36x2x68x128xf16, [@CMX_NN, 0]> + // CHECK: [[ALLOC_1:%.+]] = memref.alloc() : memref<36x2x68x128xf16, @DDR> + // CHECK: [[COPY2:%.+]] = VPUIP.Copy inputs([[PERMUTEDMA0]] : memref<36x2x68x128xf16, [@CMX_NN, 0]>) outputs([[ALLOC_1]] : memref<36x2x68x128xf16, @DDR>) -> memref<36x2x68x128xf16, @DDR> + + // CHECK: return [[COPY2]] : memref<36x2x68x128xf16, @DDR> +} + diff --git a/tests/lit/NPU/dialect/VPUIP/passes/convert_view_ops_to_declarations.mlir b/tests/lit/NPU/dialect/VPUIP/passes/convert_view_ops_to_declarations.mlir index 0a4ed1caa1..aab3d30db3 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/convert_view_ops_to_declarations.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/convert_view_ops_to_declarations.mlir @@ -599,3 +599,41 @@ func.func @ExtractFlatSlice() -> !OutType { // CHECK: return [[NEW_SOURCE]] // CHECK-NOT: VPUIP.ExtractFlatSlice } + +// ----- + +// CHECK: func.func @ReinterpretCast([[ARG0:%.+]]: memref<8xi8, @DDR>, [[ARG1:%.+]]: memref<8xi8, @DDR>) +// CHECK-SAME: -> memref<8xi8, @DDR> +func.func @ReinterpretCast(%arg0: memref<8xi8, @DDR>, %arg1: memref<8xi8, @DDR>) + -> memref<8xi8, @DDR> { + %in = VPURT.DeclareBuffer [0] 
<0> -> memref<8xi8, @DDR> + %out = VPURT.DeclareBuffer [0] <0> -> memref<8xi8, @DDR> + + %cast_in = Core.ReinterpretCast(%in) : memref<8xi8, @DDR> -> memref<4x1x1x1xf16, @DDR> + %tmp = VPURT.DeclareBuffer <0> -> memref<4x1x1x1xf16, @DDR> + %copy = VPUIP.NNDMA {port = 0 : i64} + inputs(%cast_in : memref<4x1x1x1xf16, @DDR>) outputs(%tmp : memref<4x1x1x1xf16, @DDR>) + -> memref<4x1x1x1xf16, @DDR> + + %cast_out = Core.ReinterpretCast(%copy) : memref<4x1x1x1xf16, @DDR> -> memref<8xi8, @DDR> + %res = VPUIP.NNDMA {port = 0 : i64} + inputs(%cast_out : memref<8xi8, @DDR>) outputs(%out : memref<8xi8, @DDR>) + -> memref<8xi8, @DDR> + + return %arg1 : memref<8xi8, @DDR> + + // CHECK-DAG: [[IN:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<4x1x1x1xf16, @DDR> + // CHECK-DAG: [[OUT:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<8xi8, @DDR> + + // CHECK: [[TMP:%.+]] = VPURT.DeclareBuffer <0> -> memref<4x1x1x1xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.NNDMA + // CHECK-SAME: inputs([[IN]] + // CHECK-SAME: outputs([[TMP]] + + // CHECK: [[CAST_BUF:%.+]] = VPURT.DeclareBuffer <0> -> memref<8xi8, @DDR> + // CHECK: [[RES:%.+]] = VPUIP.NNDMA + // CHECK-SAME: inputs([[CAST_BUF]] + // CHECK-SAME: outputs([[OUT]] + + // CHECK: return [[ARG1]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/copy_op_tiling_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/copy_op_tiling_37XX.mlir index 68e54bec96..d6deb04c1d 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/copy_op_tiling_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/copy_op_tiling_37XX.mlir @@ -366,9 +366,9 @@ func.func @SplitByChannelsSparseBuffers(%arg0: !SparseType, %arg1: !SparseType) // CHECK: [[ARG_0_TILE_1:%.*]] = VPUIP.SubView %arg0 [0, 160, 0, 0] [1, 160, 32, 16] : // CHECK-SAME: !VPUIP.SparseBuffer, - // CHECK-SMAE: sparsity_map=memref<1x320x32x16xi1, {order = #NCHW, strides = [655360, 2048, 32, 1]}>> to + // CHECK-SAME: sparsity_map=memref<1x320x32x16xi1, {order = #NCHW, strides = [655360, 2048, 32, 1]}>> to // CHECK-SAME: 
!VPUIP.SparseBuffer, - // CHECK-SMAE: sparsity_map=memref<1x160x32x16xi1, {order = #NCHW, strides = [655360, 2048, 32, 1]}>> + // CHECK-SAME: sparsity_map=memref<1x160x32x16xi1, {order = #NCHW, strides = [655360, 2048, 32, 1]}>> // CHECK: [[ARG_1_TILE_1:%.*]] = VPUIP.SubView %arg1 [0, 160, 0, 0] [1, 160, 32, 16] : // CHECK-SAME: !VPUIP.SparseBuffer, diff --git a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_after_barrier_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_after_barrier_37XX.mlir index d3b401e48c..80c448f67a 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_after_barrier_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/dma_task_profiling_after_barrier_37XX.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --dma-task-profiling-after-barrier="dma-profiling=true" %s | FileCheck %s +// RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --dma-task-profiling-after-barrier %s | FileCheck %s // REQUIRES: arch-NPU37XX !dataType = memref<1x16x4x4xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, [@CMX_NN, 0]> diff --git a/tests/lit/NPU/dialect/VPUIP/passes/feasible_allocation_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/feasible_allocation_40XX+.mlir index 43af7c9251..3947b0683f 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/feasible_allocation_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/feasible_allocation_40XX+.mlir @@ -18,7 +18,7 @@ module @Spilling { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -179,7 +179,7 @@ func.func @main(%in: !act_type_DDR, %out: !act_type_DDR) -> !act_type_DDR { module 
@SpillingOpWith2Outputs { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -357,7 +357,7 @@ func.func @main(%in: !act_type_DDR, %out: !act_type_DDR) -> !act_type_DDR { module @SpillingOfSubViewBuffer { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -650,7 +650,7 @@ func.func @main(%in: !act_type_DDR, %out: !act_type_DDR) -> !act_type_DDR { module @ControlEdgeOverlapMemory { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -793,7 +793,7 @@ func.func @main(%in: !act_type_DDR, %out0: !act_type_DDR, %out1: !act_type_DDR) module @ControlEdgeOverlapMemoryCheckProdCons { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1018,7 +1018,7 @@ func.func @main(%in: !act_type_DDR, %out: !act_type_DDR) -> !act_type_DDR { module @SingleConvWithClusteringAndDmaPortDistribution { IE.ExecutorResource 2 of @DMA_NN 
IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1172,7 +1172,7 @@ func.func @main(%input: !Input_DDR) -> !Output_DDR { module @SpillingWithClustering { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1366,7 +1366,7 @@ func.func @main(%input: !BufMemrefDDR) -> !BufMemrefDDR { module @Prefetching { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1507,7 +1507,7 @@ func.func @main(%in: memref<1x32x16x16xf16, #NHWC>, %out: memref<1x128x4x4xf16, module @PipelineShaveAct { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1526,7 +1526,7 @@ VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 409 module @VPU.SW { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + 
IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1700,7 +1700,7 @@ func.func @main(%in0: memref<1x32x48x48xf16, #NHWC>, %in1: memref<1x32x48x48xf16 module @PrefetchNoActSpillAtEndAndWrongOrder { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1717,7 +1717,7 @@ VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 409 module @VPU.SW { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1825,7 +1825,7 @@ func.func @main(%arg0: memref<1x1x1x1000xf16, @DDR>, %arg1: memref<1x1x1x1000xf1 module @MultiBufferSpillingFromOp { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1843,7 +1843,7 @@ VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 409 module @VPU.SW { IE.ExecutorResource 2 of @DMA_NN IE.TileResource 6 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 
1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1894,24 +1894,24 @@ func.func @main(%arg0: memref<1x1x640x128xf16, @DDR>, %arg1: memref<1x1x640x128x async.yield %10 : memref<4xsi32, [@CMX_NN, 0]> } %token_6, %results_7:2 = async.execute [%token, %token_0, %token_2, %token_4] ( - %results as %arg2: !async.value>, - %results_1 as %arg3: !async.value>, - %results_3 as %arg4: !async.value>, - %results_5 as %arg5: !async.value>) - -> (!async.value>, - !async.value>) - attributes {VPUIP.executor = @SHAVE_ACT, "async-deps-index" = 4 : i64, cycleCost = 1 : i64} { - %results_20, %dynamicOutputShapes = VPUIP.SW.Kernel {dynamicInputShapesMap = array, dynamicOutputShapesMap = array, resultSegmentSizes = array} @VPU.SW::@builtin_Concat + %results as %arg2: !async.value>, + %results_1 as %arg3: !async.value>, + %results_3 as %arg4: !async.value>, + %results_5 as %arg5: !async.value>) + -> (!async.value>, + !async.value>) + attributes {VPUIP.executor = @SHAVE_ACT, "async-deps-index" = 4 : i64, cycleCost = 1 : i64} { + %results_20, %dynamicOutputShapes = VPUIP.SW.Kernel {dynamicInputShapesMap = array, dynamicOutputShapesMap = array, resultSegmentSizes = array} @VPU.SW::@builtin_Concat inputs( - %arg2 as %arg10: memref<1x1x640x128xf16, [@CMX_NN, 0]>, - %arg3 as %arg11: memref<1x1x640x128xf16, [@CMX_NN, 0]>) + %arg2 as %arg10: memref<1x1x640x128xf16, [@CMX_NN, 0]>, + %arg3 as %arg11: memref<1x1x640x128xf16, [@CMX_NN, 0]>) dynamicInputShapes( - %arg4 : memref<4xsi32, [@CMX_NN, 0]>, - %arg5 : memref<4xsi32, [@CMX_NN, 0]>) + %arg4 : memref<4xsi32, [@CMX_NN, 0]>, + %arg5 : memref<4xsi32, [@CMX_NN, 0]>) outputs( - %out_CMX_0 as %arg12: memref<1x2x640x128xf16, [@CMX_NN, 0]>) + %out_CMX_0 as %arg12: memref<1x2x640x128xf16, [@CMX_NN, 0]>) dynamicOutputShapes( - %ds_out_CMX_0 : memref<4xsi32, [@CMX_NN, 0]>) + %ds_out_CMX_0 : memref<4xsi32, [@CMX_NN, 0]>) on tile 0 -> (memref<1x2x640x128xf16, [@CMX_NN, 0]>, memref<4xsi32, [@CMX_NN, 0]>){ VPUIP.SW.Kernel.run {attrs = [[0, 0, 0, 0], [0, 
0, 1, 0]]}(%arg10, %arg11, %arg12) : memref<1x1x640x128xf16, [@CMX_NN, 0]>, memref<1x1x640x128xf16, [@CMX_NN, 0]>, memref<1x2x640x128xf16, [@CMX_NN, 0]> } @@ -1958,10 +1958,10 @@ func.func @main(%arg0: memref<1x1x640x128xf16, @DDR>, %arg1: memref<1x1x640x128x // CHECK: [[BUF1:%.*]] = VPURT.DeclareBuffer [0] <327680> -> memref<4xsi32, [@CMX_NN, 0]> // CHECK: [[T0:%.+]], [[R0:%.+]] = async.execute - // CHECK: VPUIP.SW.Kernel + // CHECK: VPUIP.SW.Kernel // CHECK: inputs( // CHECK: dynamicInputShapes( - // CHECK: outputs([[BUF0]] as %arg9: memref<1x2x640x128xf16, [@CMX_NN, 0]>) + // CHECK: outputs([[BUF0]] as %arg9: memref<1x2x640x128xf16, [@CMX_NN, 0]>) // CHECK: dynamicOutputShapes([[BUF1]] : memref<4xsi32, [@CMX_NN, 0]>) // check dynamic spill writes for outputs of the same operation diff --git a/tests/lit/NPU/dialect/VPUIP/passes/legalize_schedule_for_wlm_fetch_dmas_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/legalize_schedule_for_wlm_fetch_dmas_40XX.mlir index 8c922b6456..10041764de 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/legalize_schedule_for_wlm_fetch_dmas_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/legalize_schedule_for_wlm_fetch_dmas_40XX.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --split-input-file --legalize-schedule-for-wlm --canonicalize %s | FileCheck %s +// RUN: vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --split-input-file --legalize-schedule-for-partial-wlm --canonicalize %s | FileCheck %s // REQUIRES: arch-NPU40XX !qElemType = !quant.uniform @@ -21,13 +21,13 @@ module @NoLegalizationDueToFIFOOrdering attributes {config.compilationMode = #co } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN 
{config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -137,13 +137,13 @@ module @LegalizeWithJustDMAs attributes {config.compilationMode = #config.compil } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -256,13 +256,13 @@ module @LegalizeGroupThreeWithSharedBarrier attributes {config.compilationMode = } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 
of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -374,13 +374,13 @@ module @LegalizeGroupsWithBothSharedBarriers attributes {config.compilationMode } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -516,13 +516,13 @@ module @LegalizeWithJustDMAWithWaitingDMABeforeGrandParent attributes {config.co } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + 
IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -645,13 +645,13 @@ module @LegalizeWithBarrierAndDMAWithWaitingDMABeforeGrandParent attributes {con } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -770,13 +770,13 @@ module @LegalizeGroupThreeWithSharedBarrierMultiTile attributes {config.compilat } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 4 of @SHAVE_ACT IE.ExecutorResource 2 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : 
{ DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -942,13 +942,13 @@ module @LegalizeWithoutAnyDMAPresent attributes {config.compilationMode = #confi } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUIP/passes/move_pure_view_op_before_copy.mlir b/tests/lit/NPU/dialect/VPUIP/passes/move_pure_view_op_before_copy.mlir index 4b31d7dcfe..9b6252ef1d 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/move_pure_view_op_before_copy.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/move_pure_view_op_before_copy.mlir @@ -30,15 +30,15 @@ func.func @MovePureViewOpBeforeCopyMultipleConsumers( return %3, %4 : memref<1x16x56x224xf16, #NHWC, @DDR>, memref<1x16x112x112xf16, #NCHW, @DDR> - // CHECK: [[PERMUTECAST:%.*]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NWCH} inputs(%arg0 : memref<1x16x112x112xf16, #NHWC, @CMX>) -> memref<1x16x112x112xf16, @CMX> - // CHECK: [[ALLOC0:%.*]] = memref.alloc() : memref<1x16x112x112xf16, @DDR> - // CHECK: [[COPY0:%.*]] = VPUIP.Copy inputs([[PERMUTECAST]] : memref<1x16x112x112xf16, @CMX>) outputs([[ALLOC0]] : memref<1x16x112x112xf16, @DDR>) -> memref<1x16x112x112xf16, @DDR> + // CHECK: [[PERMUTECAST:%.+]] = VPUIP.PermuteCast {dst_order = 
#NCHW, mem_perm = #NWCH} inputs(%arg0 : memref<1x16x112x112xf16, #NHWC, @CMX>) -> memref<1x16x112x112xf16, @CMX> + // CHECK: [[ALLOC0:%.+]] = memref.alloc() : memref<1x16x112x112xf16, @DDR> + // CHECK: [[COPY0:%.+]] = VPUIP.Copy inputs([[PERMUTECAST]] : memref<1x16x112x112xf16, @CMX>) outputs([[ALLOC0]] : memref<1x16x112x112xf16, @DDR>) -> memref<1x16x112x112xf16, @DDR> - // CHECK: [[GENERICRESHAPE:%.*]] = VPUIP.GenericReshape inputs(%arg0 : memref<1x16x112x112xf16, #NHWC, @CMX>) -> memref<1x16x56x224xf16, #NHWC, @CMX> - // CHECK: [[ALLOC1:%.*]] = memref.alloc() : memref<1x16x56x224xf16, #NHWC, @DDR> - // CHECK: [[COPY1:%.*]] = VPUIP.Copy inputs([[GENERICRESHAPE]] : memref<1x16x56x224xf16, #NHWC, @CMX>) outputs([[ALLOC1]] : memref<1x16x56x224xf16, #NHWC, @DDR>) -> memref<1x16x56x224xf16, #NHWC, @DDR> + // CHECK: [[GENERICRESHAPE:%.+]] = VPUIP.GenericReshape inputs(%arg0 : memref<1x16x112x112xf16, #NHWC, @CMX>) -> memref<1x16x56x224xf16, #NHWC, @CMX> + // CHECK: [[ALLOC1:%.+]] = memref.alloc() : memref<1x16x56x224xf16, #NHWC, @DDR> + // CHECK: [[COPY1:%.+]] = VPUIP.Copy inputs([[GENERICRESHAPE]] : memref<1x16x56x224xf16, #NHWC, @CMX>) outputs([[ALLOC1]] : memref<1x16x56x224xf16, #NHWC, @DDR>) -> memref<1x16x56x224xf16, #NHWC, @DDR> - // CHECK: [[COPY2:%.*]] = VPUIP.Copy inputs([[COPY0]] : memref<1x16x112x112xf16, @DDR>) outputs(%arg1 : memref<1x16x112x112xf16, @DDR>) -> memref<1x16x112x112xf16, @DDR> + // CHECK: [[COPY2:%.+]] = VPUIP.Copy inputs([[COPY0]] : memref<1x16x112x112xf16, @DDR>) outputs(%arg1 : memref<1x16x112x112xf16, @DDR>) -> memref<1x16x112x112xf16, @DDR> // CHECK: return [[COPY1]], [[COPY2]] : memref<1x16x56x224xf16, #NHWC, @DDR>, memref<1x16x112x112xf16, @DDR> } @@ -145,10 +145,10 @@ func.func @QuantizeCastBeforeDistributedCopy(%arg0: memref<1x128x8x8x!qElemType, return %2 : memref<1x128x8x8x!qElemType1, #NHWC, @CMX_NN> - // CHECK: [[VAR0:%.*]] = VPUIP.QuantizeCast inputs(%arg0 : - // CHECK: [[VAR1:%.*]] = VPUIP.Copy + // CHECK: [[VAR0:%.+]] = 
VPUIP.QuantizeCast inputs(%arg0 : + // CHECK: [[VAR1:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[VAR0]] - // CHECK: [[VAR2:%.*]] = VPUIP.Copy + // CHECK: [[VAR2:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[VAR1]] // CHECK: return [[VAR2]] } @@ -165,14 +165,14 @@ func.func @MoveSubviewToTheFrontOfCopy(%arg0: memref<1x16x2x2xf16, @DDR>, %arg1: return %4 : memref<1x8x2x2xf16, @DDR> - // CHECK: [[VAR0:%.*]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 8, 2, 2] : + // CHECK: [[VAR0:%.+]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 8, 2, 2] : // CHECK-SAME: memref<1x16x2x2xf16, @DDR> to memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR> - // CHECK: [[VAR1:%.*]] = memref.alloc() : memref<1x8x2x2xf16, @DDR> - // CHECK: [[VAR2:%.*]] = VPUIP.Copy inputs([[VAR0]] : memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR>) + // CHECK: [[VAR1:%.+]] = memref.alloc() : memref<1x8x2x2xf16, @DDR> + // CHECK: [[VAR2:%.+]] = VPUIP.Copy inputs([[VAR0]] : memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR>) // CHECK-SAME: outputs([[VAR1]] : memref<1x8x2x2xf16, @DDR>) - // CHECK: [[VAR3:%.*]] = VPUIP.Copy inputs([[VAR2]] : memref<1x8x2x2xf16, @DDR>) + // CHECK: [[VAR3:%.+]] = VPUIP.Copy inputs([[VAR2]] : memref<1x8x2x2xf16, @DDR>) // CHECK-SAME: outputs(%arg1 : memref<1x8x2x2xf16, @DDR>) // CHECK: return [[VAR3]] : memref<1x8x2x2xf16, @DDR> @@ -193,12 +193,12 @@ func.func @DoNotMoveSubviewToTheFrontOfSparseCopy(%arg0: !VPUIP.SparseBuffer, sparsity_map=memref<1x64x17x56xi1, {order = #NHWC, strides = [200704, 1, 3584, 64]}>> - // CHECK: [[ALLOC0:%.*]] = memref.alloc() : memref<1x64x56x56x!qElemType, #NHWC> - // CHECK: [[ALLOC1:%.*]] = memref.alloc() : memref<1x64x56x56xi1, #NHWC> - // CHECK: [[SPARSEBUFFER:%.*]] = VPUIP.GroupSparseBuffer([[ALLOC0]], [[ALLOC1]]) - // CHECK: [[COPY:%.*]] = VPUIP.Copy inputs(%arg0 + // CHECK: [[ALLOC0:%.+]] = memref.alloc() : memref<1x64x56x56x!qElemType, #NHWC> + // CHECK: [[ALLOC1:%.+]] = memref.alloc() : memref<1x64x56x56xi1, 
#NHWC> + // CHECK: [[SPARSEBUFFER:%.+]] = VPUIP.GroupSparseBuffer([[ALLOC0]], [[ALLOC1]]) + // CHECK: [[COPY:%.+]] = VPUIP.Copy inputs(%arg0 // CHECK-SAME: outputs([[SPARSEBUFFER]] - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView [[COPY]] [0, 0, 0, 0] [1, 64, 17, 56] + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView [[COPY]] [0, 0, 0, 0] [1, 64, 17, 56] // CHECK: return [[SUBVIEW]] } @@ -232,8 +232,8 @@ func.func @DoNotMoveSubviewToTheFrontOfSparseCopy(%arg0: !VPUIP.SparseBuffer (d0, d2, d3, d1)>, strides = [270336, 1, 5632, 64]}, @DDR>) outputs(%5 : memref<1x32x48x88x!qElemType, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR>) -> memref<1x32x48x88x!qElemType, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR> return %6 : memref<1x32x48x88x!qElemType, #NHWC, @DDR> - // CHECK: [[BUFF_0:%.*]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x64x48x88x!qElemType, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}> - // CHECK: [[ADD_0:%.*]] = VPUIP.NCEClusterTask {minimumHardwareExecutionCost = 1081 : i64, task_type = #VPUIP.nce_task_type} + // CHECK: [[BUFF_0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x64x48x88x!qElemType, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}> + // CHECK: [[ADD_0:%.+]] = VPUIP.NCEClusterTask {minimumHardwareExecutionCost = 1081 : i64, task_type = #VPUIP.nce_task_type} // CHECK-SAME: input(%arg0 : memref<1x64x48x88x!qElemType, #NHWC, @CMX_NN>) // CHECK-SAME: weights(%arg1 : memref<1x64x48x88x!qElemType, #NHWC, @CMX_NN>) // CHECK-SAME: parent_input(%arg0 : memref<1x64x48x88x!qElemType, #NHWC, @CMX_NN>) @@ -244,15 +244,15 @@ func.func @DoNotMoveSubviewToTheFrontOfSparseCopy(%arg0: !VPUIP.SparseBuffer} // CHECK: } - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView [[ADD_0]] [0, 0, 0, 0] [1, 32, 48, 88] : + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView [[ADD_0]] [0, 0, 0, 0] [1, 32, 48, 88] : // CHECK-SAME: !VPUIP.DistributedBuffer<1x64x48x88x!qElemType, 
#NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}> to !VPUIP.DistributedBuffer<1x32x48x88x!qElemType, {order = #NHWC, strides = [270336, 1, 5632, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}> - // CHECK: [[BUFF_1:%.*]] = memref.alloc() : memref<1x32x48x88x!qElemType, #NHWC, @DDR> - // CHECK: [[Tilling_COPY:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_1:%.+]] = memref.alloc() : memref<1x32x48x88x!qElemType, #NHWC, @DDR> + // CHECK: [[Tilling_COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SUBVIEW]] : !VPUIP.DistributedBuffer<1x32x48x88x!qElemType, {order = #NHWC, strides = [270336, 1, 5632, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}>) // CHECK-SAME: outputs([[BUFF_1]] : memref<1x32x48x88x!qElemType, #NHWC, @DDR>) -> memref<1x32x48x88x!qElemType, #NHWC, @DDR> - // CHECK: [[BUFF_2:%.*]] = memref.alloc() : memref<1x32x48x88x!qElemType, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_2:%.+]] = memref.alloc() : memref<1x32x48x88x!qElemType, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[Tilling_COPY]] : memref<1x32x48x88x!qElemType, #NHWC, @DDR>) // CHECK-SAME: outputs([[BUFF_2]] : memref<1x32x48x88x!qElemType, #NHWC, @DDR>) // CHECK: return [[COPY]] : memref<1x32x48x88x!qElemType, #NHWC, @DDR> @@ -310,13 +310,13 @@ func.func @NoChangesForStridedCopy( return %3 : memref<16x4xf16, @DDR> - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 1, 16, 4] : memref<1x2x16x4xf16, @DDR> to memref<1x1x16x4xf16, {order = #NCHW, strides = [128, 64, 4, 1]}, @DDR> + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 1, 16, 4] : memref<1x2x16x4xf16, @DDR> to memref<1x1x16x4xf16, {order = #NCHW, strides = [128, 64, 4, 1]}, @DDR> - // CHECK: [[BUFF_0:%.*]] = memref.alloc() : memref<1x1x16x4xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_0:%.+]] = memref.alloc() : 
memref<1x1x16x4xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SUBVIEW]] : memref<1x1x16x4xf16, {order = #NCHW, strides = [128, 64, 4, 1]}, @DDR>) // CHECK-SAME: outputs([[BUFF_0]] : memref<1x1x16x4xf16, @DDR>) - // CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape inputs([[COPY]] : memref<1x1x16x4xf16, @DDR>) -> memref<16x4xf16, @DDR> + // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[COPY]] : memref<1x1x16x4xf16, @DDR>) -> memref<16x4xf16, @DDR> // CHECK: return [[RESHAPE]] : memref<16x4xf16, @DDR> } @@ -346,9 +346,9 @@ func.func @MoveQuantizeCastBeforeTilingCopy(%arg0: !InputDistributed) -> memref< %1 = VPUIP.QuantizeCast inputs(%0 : memref<1x16x32x32x!qElemType, #NHWC, @DDR>) -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> return %1 : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[QUANTCAST:%.*]] = VPUIP.QuantizeCast inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> - // CHECK: [[BUF_0:%.*]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[QUANTCAST:%.+]] = VPUIP.QuantizeCast inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> + // CHECK: [[BUF_0:%.+]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTCAST]] : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) // 
CHECK-SAME: outputs([[BUF_0]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR>) -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> // CHECK: return [[COPY]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> @@ -380,9 +380,9 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankShrinks( return %1 : memref<1x16x1024xf16, @DDR> - // CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x1024xf16, #CHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[BUF_0:%.*]] = memref.alloc() : memref<1x16x1024xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x1024xf16, #CHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[BUF_0:%.+]] = memref.alloc() : memref<1x16x1024xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x16x1024xf16, #CHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUF_0]] : memref<1x16x1024xf16, @DDR>) -> memref<1x16x1024xf16, @DDR> // CHECK: return [[COPY]] : memref<1x16x1024xf16, @DDR> @@ -416,9 +416,9 @@ func.func @MovePermuteCastBeforeTilingCopy( return %1 : memref<1x16x32x32xf16, @DDR> - // CHECK: [[PERMUTE:%.*]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NWCH} inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = 
[1, 2, 1, 1], num_clusters = 2 : i64}> - // CHECK: [[BUFF_0:%.*]] = memref.alloc() : memref<1x16x32x32xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[PERMUTE:%.+]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NWCH} inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> + // CHECK: [[BUFF_0:%.+]] = memref.alloc() : memref<1x16x32x32xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[PERMUTE]] : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUFF_0]] : memref<1x16x32x32xf16, @DDR>) -> memref<1x16x32x32xf16, @DDR> // CHECK: return [[COPY]] : memref<1x16x32x32xf16, @DDR> @@ -445,11 +445,11 @@ func.func @DoNotMovePermuteCastBeforeTilingCopySegmentedForNonTrivialReorder(%ar return %1 : memref<1x16x32x32xf16, @DDR> - // CHECK: [[OUT_BUFF:%.*]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUT_BUFF:%.+]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 // CHECK-SAME: outputs([[OUT_BUFF]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[PERMUTE:%.*]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NWCH} inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x32x32xf16, @DDR> + // CHECK: [[PERMUTE:%.+]] = VPUIP.PermuteCast {dst_order = #NCHW, mem_perm = #NWCH} inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x32x32xf16, @DDR> // CHECK: return [[PERMUTE]] : memref<1x16x32x32xf16, @DDR> } @@ -474,11 +474,11 @@ func.func 
@DoNotMovePermuteCastBeforeTilingCopySegmentedForUnmatchedOutShape(%ar return %1 : memref<1x32x16x32xf16, #NHWC, @DDR> - // CHECK: [[OUT_BUFF:%.*]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUT_BUFF:%.+]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 // CHECK-SAME: outputs([[OUT_BUFF]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[PERMUTE:%.*]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NCHW} inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x32x16x32xf16, #NHWC, @DDR> + // CHECK: [[PERMUTE:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NCHW} inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x32x16x32xf16, #NHWC, @DDR> // CHECK: return [[PERMUTE]] : memref<1x32x16x32xf16, #NHWC, @DDR> } @@ -503,9 +503,9 @@ func.func @MoveShapeCastBeforeTilingCopySegmented(%arg0: !InputDistributed) -> m return %1 : memref<1x4x64x64xf16, #NHWC, @DDR> - //CHECK: [[SHAPECAST:%.*]] = VPUIP.ShapeCast {shape = [1, 4, 64, 64]} inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x4x64x64xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[SHAPECAST:%.+]] = VPUIP.ShapeCast {shape = [1, 4, 64, 64]} inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x4x64x64xf16, #NHWC, @DDR> + // CHECK: 
[[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SHAPECAST]] : !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x4x64x64xf16, #NHWC, @DDR>) -> memref<1x4x64x64xf16, #NHWC, @DDR> //CHECK: return [[COPY]] : memref<1x4x64x64xf16, #NHWC, @DDR> @@ -537,11 +537,11 @@ func.func @NotMoveShapeCastBeforeTilingCopySegmented(%arg0: memref<1x16x9x3xf16, return %4 : memref<1x16x3x9xf16, #NHWC, @DDR> - // CHECK: [[ALLOC_0:%.*]] = VPURT.AllocDistributed - // CHECK: [[CLUSTER_TILING_0:%.*]] = VPUIP.NCEClusterTask - // CHECK: [[ALLOC_1:%.*]] = memref.alloc() - // CHECK: [[CLUSTER_TILING_1:%.*]] = VPUIP.Copy - // CHECK: [[SHAPE_CAST:%.*]] = VPUIP.ShapeCast {shape = [1, 16, 3, 9]} + // CHECK: [[ALLOC_0:%.+]] = VPURT.AllocDistributed + // CHECK: [[CLUSTER_TILING_0:%.+]] = VPUIP.NCEClusterTask + // CHECK: [[ALLOC_1:%.+]] = memref.alloc() + // CHECK: [[CLUSTER_TILING_1:%.+]] = VPUIP.Copy + // CHECK: [[SHAPE_CAST:%.+]] = VPUIP.ShapeCast {shape = [1, 16, 3, 9]} // CHECK: return [[SHAPE_CAST]] : memref<1x16x3x9xf16, #NHWC, @DDR> } @@ -565,9 +565,9 @@ func.func @MoveGenericReshapeBeforeTilingCopySegmented(%arg0: !InputDistributed) return %1 : memref<1x4x64x64xf16, #NHWC, @DDR> - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x4x64x64xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode 
= "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x4x64x64xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x4x64x64xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x4x64x64xf16, #NHWC, @DDR>) -> memref<1x4x64x64xf16, #NHWC, @DDR> //CHECK: return [[COPY]] : memref<1x4x64x64xf16, #NHWC, @DDR> @@ -593,11 +593,11 @@ func.func @DoNotMoveGenericReshapeBeforeTilingCopySegmented(%arg0: !InputDistrib return %1 : memref<1x16x1024xf16, @DDR> - // CHECK: [[OUT_BUFF:%.*]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUT_BUFF:%.+]] = memref.alloc() : memref<1x16x32x32xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 // CHECK-SAME: outputs([[OUT_BUFF]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x32x32xf16, #NHWC, @DDR> - // CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x1024xf16, @DDR> + // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[COPY]] : memref<1x16x32x32xf16, #NHWC, @DDR>) -> memref<1x16x1024xf16, @DDR> // CHECK: return [[RESHAPE]] : memref<1x16x1024xf16, @DDR> } @@ -622,11 +622,11 @@ func.func @DoNotMoveGenericReshapeWithDifferentOrderBeforeTilingCopySegmented(%a return %1 : memref<1x1x32x1024xf16, #NCWH, @DDR> - // CHECK: [[OUT_BUFF:%.*]] = memref.alloc() : memref<1x32x32x32xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUT_BUFF:%.+]] = memref.alloc() : memref<1x32x32x32xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 // CHECK-SAME: outputs([[OUT_BUFF]] : memref<1x32x32x32xf16, #NHWC, @DDR>) -> memref<1x32x32x32xf16, #NHWC, @DDR> - // CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape inputs([[COPY]] : 
memref<1x32x32x32xf16, #NHWC, @DDR>) -> memref<1x1x32x1024xf16, #NCWH, @DDR> + // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[COPY]] : memref<1x32x32x32xf16, #NHWC, @DDR>) -> memref<1x1x32x1024xf16, #NCWH, @DDR> // CHECK: return [[RESHAPE]] : memref<1x1x32x1024xf16, #NCWH, @DDR> } @@ -653,9 +653,9 @@ func.func @MoveQuantizeCastBeforeTilingCopySegmentedOverH(%arg0: !InputDistribut return %1 : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - //CHECK: [[QUANTIZE:%.*]] = VPUIP.QuantizeCast inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[QUANTIZE:%.+]] = VPUIP.QuantizeCast inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTIZE]] : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR>) -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> //CHECK: return [[COPY]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> @@ -687,14 +687,14 @@ func.func @MoveQuantizeCastBeforeTilingCopySegmentedOverK(%arg0: !InputDistribut return %1 : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - //CHECK: [[QUANTIZE:%.*]] = VPUIP.QuantizeCast + //CHECK: [[QUANTIZE:%.+]] 
= VPUIP.QuantizeCast //CHECK-SAME: inputs([[ARG0]] : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) //CHECK-SAME: -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTIZE]] : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR>) -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> @@ -731,14 +731,14 @@ func.func @MoveQuantizeCastBeforeTilingCopyOverlapped(%arg0: !InputDistributed) return %1 : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[QUANTIZE:%.*]] = VPUIP.QuantizeCast + // CHECK: [[QUANTIZE:%.+]] = VPUIP.QuantizeCast // CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], // CHECK-SAME: kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], // CHECK-SAME: kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}> - // CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTIZE]] : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType1, 
#NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR>) -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> // CHECK: return [[COPY]] : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> @@ -773,14 +773,14 @@ func.func @MoveQuantizeCastBeforeTilingCopyOverlappedMixedTypes(%arg0: !InputDis return %1 : memref<1x16x32x32xui8, #NHWC, @DDR> - // CHECK: [[QUANTIZE:%.*]] = VPUIP.QuantizeCast + // CHECK: [[QUANTIZE:%.+]] = VPUIP.QuantizeCast // CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], // CHECK-SAME: kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x16x32x32xui8, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], // CHECK-SAME: kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}> - // CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x16x32x32xui8, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x16x32x32xui8, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTIZE]] : !VPUIP.DistributedBuffer<1x16x32x32xui8, #NHWC, @CMX_NN, {mode = "OVERLAPPED", num_tiles = [1, 1, 2, 1], kernel = [1, 1], pads = #VPU.Padding, strides = [1, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x16x32x32xui8, #NHWC, @DDR>) -> memref<1x16x32x32xui8, #NHWC, @DDR> // CHECK: return [[COPY]] : memref<1x16x32x32xui8, #NHWC, @DDR> @@ -822,13 +822,13 @@ func.func @SkipPerChannelOverlappedQuantizeCast(%arg0: !InputDistributed) -> mem return %1 : memref<1x16x32x32x!qElemType1, #NHWC, @DDR> - // CHECK: [[ALLOCATE:%.*]] = memref.alloc() : memref<1x16x32x32x!qElemType, #NHWC, @DDR> - // CHECK: [[TILING:%.*]] = VPUIP.Copy + // 
CHECK: [[ALLOCATE:%.+]] = memref.alloc() : memref<1x16x32x32x!qElemType, #NHWC, @DDR> + // CHECK: [[TILING:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x32x32x!qElemType, #NHWC, @CMX_NN // CHECK-SAME: outputs([[ALLOCATE]] : memref<1x16x32x32x!qElemType, #NHWC, @DDR>) // CHECK-SAME: -> memref<1x16x32x32x!qElemType, #NHWC, @DDR> - // CHECK: [[QUANTIZE_CAST:%.*]] = VPUIP.QuantizeCast + // CHECK: [[QUANTIZE_CAST:%.+]] = VPUIP.QuantizeCast // CHECK-SAME: inputs([[TILING]] : memref<1x16x32x32x!qElemType, #NHWC, @DDR>) // CHECK-SAME: -> memref<1x16x32x32x!qElemType1, #NHWC, @DDR> @@ -863,10 +863,11 @@ func.func @SkipPerChannelOverlappedQuantizeCast(%arg0: !InputDistributed) -> mem alignment = [1, 1, 4, 1]}> // CHECK-LABEL: @DoNotMoveShapeCastWhenDistributedNotCompatibleAfterShapeChange +// CHECK-SAME: [[ARG0:%.+]]: !VPUIP.DistributedBuffer<1x16x3x9xf16, #NWCH, @CMX_NN func.func @DoNotMoveShapeCastWhenDistributedNotCompatibleAfterShapeChange( %arg0: !Distributed0) -> !Distributed2 { - %0 = VPUIP.WorkloadCast inputs(%arg0 : !Distributed0) -> !Distributed1 + %0 = VPUIP.ViewOp %arg0 : !Distributed0 to !Distributed1 %1 = memref.alloc() : memref<1x3x9x16xf16, #NHWC, @DDR> %2 = VPUIP.Copy inputs(%0 : !Distributed1) @@ -879,26 +880,25 @@ func.func @DoNotMoveShapeCastWhenDistributedNotCompatibleAfterShapeChange( return %5 : !Distributed2 - // CHECK: [[WORKLOAD_CAST:%.*]] = VPUIP.WorkloadCast - // CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer<1x16x3x9xf16, #NWCH, @CMX_NN, - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 2], num_clusters = 2 : i64, equal_memory_and_compute_view}>) - // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x3x9x16xf16, #NHWC, @CMX_NN, - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[VIEWOP:%.+]] = VPUIP.ViewOp [[ARG0]] : !VPUIP.DistributedBuffer<1x16x3x9xf16, #NWCH, @CMX_NN, + // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 2], num_clusters = 2 : i64, 
equal_memory_and_compute_view}> + // CHECK-SAME: to !VPUIP.DistributedBuffer<1x3x9x16xf16, #NHWC, @CMX_NN, + // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[MEMREF_ALLOC:%.*]] = memref.alloc() : memref<1x3x9x16xf16, #NHWC, @DDR> + // CHECK: [[MEMREF_ALLOC:%.+]] = memref.alloc() : memref<1x3x9x16xf16, #NHWC, @DDR> - // CHECK: [[COPY0:%.*]] = VPUIP.Copy - // CHECK-SAME: inputs([[WORKLOAD_CAST]] : !VPUIP.DistributedBuffer<1x3x9x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + // CHECK: [[COPY0:%.+]] = VPUIP.Copy + // CHECK-SAME: inputs([[VIEWOP]] : !VPUIP.DistributedBuffer<1x3x9x16xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[MEMREF_ALLOC]] : memref<1x3x9x16xf16, #NHWC, @DDR>) -> memref<1x3x9x16xf16, #NHWC, @DDR> - // CHECK: [[SHAPE_CAST:%.*]] = VPUIP.ShapeCast {shape = [1, 48, 3, 3]} inputs([[COPY0]] : memref<1x3x9x16xf16, #NHWC, @DDR>) + // CHECK: [[SHAPE_CAST:%.+]] = VPUIP.ShapeCast {shape = [1, 48, 3, 3]} inputs([[COPY0]] : memref<1x3x9x16xf16, #NHWC, @DDR>) // CHECK-SAME: -> memref<1x48x3x3xf16, #NHWC, @DDR> - // CHECK: [[ALLOC_DISTRIBUTED:%.*]] = VPURT.AllocDistributed + // CHECK: [[ALLOC_DISTRIBUTED:%.+]] = VPURT.AllocDistributed // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x48x3x3xf16, #NHWC, @CMX_NN, - // cHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, alignment = [1, 1, 4, 1]}> + // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, alignment = [1, 1, 4, 1]}> - // CHECK: [[COPY1:%.*]] = VPUIP.Copy + // CHECK: [[COPY1:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SHAPE_CAST]] : memref<1x48x3x3xf16, #NHWC, @DDR>) // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED]] : !VPUIP.DistributedBuffer<1x48x3x3xf16, #NHWC, @CMX_NN // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x48x3x3xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 
2, 1], num_clusters = 2 : i64, alignment = [1, 1, 4, 1]}> @@ -1002,10 +1002,10 @@ func.func @MoveQuantizeCastBeforeTilingCopyMultipleConsumers(%in0: !InputDistrib } return %7, %9 : memref<1x64x32x32x!qElemType3, #NHWC, @DDR>, !OutputDistributed - // CHECK: [[NCE_OUT:%.*]] = VPUIP.NCEClusterTask - // CHECK: [[QUANTCAST:%.*]] = VPUIP.QuantizeCast inputs([[NCE_OUT]] : !VPUIP.DistributedBuffer<1x64x32x32x!qElemType5, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x64x32x32x!qElemType3, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[BUF:%.*]] = memref.alloc() : memref<1x64x32x32x!qElemType3, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + // CHECK: [[NCE_OUT:%.+]] = VPUIP.NCEClusterTask + // CHECK: [[QUANTCAST:%.+]] = VPUIP.QuantizeCast inputs([[NCE_OUT]] : !VPUIP.DistributedBuffer<1x64x32x32x!qElemType5, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x64x32x32x!qElemType3, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[BUF:%.+]] = memref.alloc() : memref<1x64x32x32x!qElemType3, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[QUANTCAST]] : !VPUIP.DistributedBuffer<1x64x32x32x!qElemType3, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUF]] : memref<1x64x32x32x!qElemType3, #NHWC, @DDR>) -> memref<1x64x32x32x!qElemType3, #NHWC, @DDR> // CHECK: VPUIP.NCEClusterTask @@ -1059,18 +1059,18 @@ func.func @MoveSubViewWithPerAxisQuantization(%arg0: memref<1x8x2x2x!qElemType, return %OUT_COPY : memref<1x6x2x2x!qElemType1, @DDR> - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 6, 2, 2] : + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 6, 2, 2] : // CHECK-SAME: memref<1x8x2x2x[[QUANT_8_CHAN]], @DDR> // 
CHECK-SAME: to memref<1x6x2x2x[[QUANT_6_CHAN]], {order = #NCHW, strides = [32, 4, 2, 1]}, @DDR> - // CHECK: [[ALLOC:%.*]] = memref.alloc() : memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR> + // CHECK: [[ALLOC:%.+]] = memref.alloc() : memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR> - // CHECK: [[IN_COPY:%.*]] = VPUIP.Copy + // CHECK: [[IN_COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SUBVIEW]] : memref<1x6x2x2x[[QUANT_6_CHAN]], {order = #NCHW, strides = [32, 4, 2, 1]}, @DDR>) // CHECK-SAME: outputs([[ALLOC]] : memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR>) // CHECK-SAME: -> memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR> - // CHECK: [[OUT_COPY:%.*]] = VPUIP.Copy + // CHECK: [[OUT_COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[IN_COPY]] : memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR>) // CHECK-SAME: outputs(%arg1 : memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR>) -> memref<1x6x2x2x[[QUANT_6_CHAN]], @DDR> @@ -1116,23 +1116,23 @@ func.func @DoNotMoveShapeCastWhenCompressConv( return %7 : memref<1x32x104x208xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[ALLOC_WEIGHTS:%.*]] = memref.alloc() : memref<32x1x1x32xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[W_CMX:%.*]] = VPUIP.Copy inputs(%arg1 : memref<32x1x1x32xf16, #NHWC>) + // CHECK: [[ALLOC_WEIGHTS:%.+]] = memref.alloc() : memref<32x1x1x32xf16, #NHWC, [@CMX_NN, 0]> + // CHECK: [[W_CMX:%.+]] = VPUIP.Copy inputs(%arg1 : memref<32x1x1x32xf16, #NHWC>) // CHECK-SAME: outputs([[ALLOC_WEIGHTS]] : memref<32x1x1x32xf16, #NHWC, [@CMX_NN, 0]>) -> memref<32x1x1x32xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[ALLOC_WEIGHTSTABLE:%.*]] = memref.alloc() : memref<32x1x1x4xsi32, [@CMX_NN, 0]> - // CHECK: [[WT_CMX:%.*]] = VPUIP.Copy inputs(%arg2 : memref<32x1x1x4xsi32>) + // CHECK: [[ALLOC_WEIGHTSTABLE:%.+]] = memref.alloc() : memref<32x1x1x4xsi32, [@CMX_NN, 0]> + // CHECK: [[WT_CMX:%.+]] = VPUIP.Copy inputs(%arg2 : memref<32x1x1x4xsi32>) // CHECK-SAME: outputs([[ALLOC_WEIGHTSTABLE]] : memref<32x1x1x4xsi32, [@CMX_NN, 0]>) -> memref<32x1x1x4xsi32, [@CMX_NN, 0]> - // CHECK: [[SHAPECAST_WEIGHTS:%.*]] = 
VPUIP.ShapeCast {shape = [32, 16, 3, 3]} + // CHECK: [[SHAPECAST_WEIGHTS:%.+]] = VPUIP.ShapeCast {shape = [32, 16, 3, 3]} // CHECK-SAME: inputs([[W_CMX]] : memref<32x1x1x32xf16, #NHWC, [@CMX_NN, 0]>) -> memref<32x16x3x3xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[ALLOC_OUTPUT:%.*]] = memref.alloc() : memref<1x32x104x208xf16, #NHWC, [@CMX_NN, 0]> + // CHECK: [[ALLOC_OUTPUT:%.+]] = memref.alloc() : memref<1x32x104x208xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[SHAPECAST_INPUT:%.*]] = VPUIP.ShapeCast {shape = [1, 16, 208, 416]} + // CHECK: [[SHAPECAST_INPUT:%.+]] = VPUIP.ShapeCast {shape = [1, 16, 208, 416]} // CHECK-SAME: inputs(%arg0 : memref<1x4x208x416xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x16x208x416xf16, #NHWC, [@CMX_NN, 0]> - // CHECK: [[COMPRESS_CONV:%.*]] = VPUIP.NCEClusterTask + // CHECK: [[COMPRESS_CONV:%.+]] = VPUIP.NCEClusterTask // CHECK-SAME: {cm_sp_pattern = 15 : i64, input_channels_compression, // CHECK-SAME: kernel_padding = #VPU.Padding, // CHECK-SAME: kernel_size = [3, 3], kernel_strides = [2, 2], @@ -1167,17 +1167,17 @@ func.func @MoveSubviewToTheFrontOfCopyMultipleConsumers(%arg0: memref<1x16x2x2xf return %6 : memref<1x8x2x2xf16, @DDR> - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 8, 2, 2] : + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView %arg0 [0, 0, 0, 0] [1, 8, 2, 2] : // CHECK-SAME: memref<1x16x2x2xf16, @DDR> to memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR> - // CHECK: [[BUF_0:%.*]] = memref.alloc() : memref<1x8x2x2xf16, @DDR> - // CHECK: [[COPY_0:%.*]] = VPUIP.Copy inputs([[SUBVIEW]] : memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR>) + // CHECK: [[BUF_0:%.+]] = memref.alloc() : memref<1x8x2x2xf16, @DDR> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy inputs([[SUBVIEW]] : memref<1x8x2x2xf16, {order = #NCHW, strides = [64, 4, 2, 1]}, @DDR>) // CHECK-SAME: outputs([[BUF_0]] : memref<1x8x2x2xf16, @DDR>) -> memref<1x8x2x2xf16, @DDR> - // CHECK: [[BUF_1:%.*]] = memref.alloc() : memref<1x16x2x2xf16, @DDR> 
- // CHECK: [[COPY_1:%.*]] = VPUIP.Copy inputs(%arg0 : memref<1x16x2x2xf16, @DDR>) outputs([[BUF_1]] : memref<1x16x2x2xf16, @DDR>) -> memref<1x16x2x2xf16, @DDR> + // CHECK: [[BUF_1:%.+]] = memref.alloc() : memref<1x16x2x2xf16, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy inputs(%arg0 : memref<1x16x2x2xf16, @DDR>) outputs([[BUF_1]] : memref<1x16x2x2xf16, @DDR>) -> memref<1x16x2x2xf16, @DDR> - // CHECK: [[COPY_2:%.*]] = VPUIP.Copy inputs([[COPY_0]] : memref<1x8x2x2xf16, @DDR>) outputs(%arg1 : memref<1x8x2x2xf16, @DDR>) -> memref<1x8x2x2xf16, @DDR> + // CHECK: [[COPY_2:%.+]] = VPUIP.Copy inputs([[COPY_0]] : memref<1x8x2x2xf16, @DDR>) outputs(%arg1 : memref<1x8x2x2xf16, @DDR>) -> memref<1x8x2x2xf16, @DDR> // CHECK: return [[COPY_2]] : memref<1x8x2x2xf16, @DDR> } @@ -1219,8 +1219,8 @@ func.func @MoveSubviewToTheFrontOfTillingCopyMultipleConsumers(%in0 : memref<1x3 return %6, %8 : memref<1x24x128x128xf16, #NHWC, @DDR>, memref<1x32x128x128xf16, #NHWC, @DDR> - // CHECK: [[BUFF_0:%.*]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x32x128x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[ADD_0:%.*]] = VPUIP.NCEClusterTask {is_inplace = true, minimumHardwareExecutionCost = 20758 : i64, task_type = #VPUIP.nce_task_type} + // CHECK: [[BUFF_0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x32x128x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[ADD_0:%.+]] = VPUIP.NCEClusterTask {is_inplace = true, minimumHardwareExecutionCost = 20758 : i64, task_type = #VPUIP.nce_task_type} // CHECK-SAME: input(%arg0 : memref<1x32x128x128xf16, #NHWC, @CMX_NN>) // CHECK-SAME: weights(%arg1 : memref<1x32x128x128xf16, #NHWC, @CMX_NN>) // CHECK-SAME: parent_input(%arg0 : memref<1x32x128x128xf16, #NHWC, @CMX_NN>) @@ -1233,23 +1233,23 @@ func.func @MoveSubviewToTheFrontOfTillingCopyMultipleConsumers(%in0 : memref<1x3 // CHECK: PPETask {ppe = #VPU.PPEStub<>} 
// CHECK: } - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView [[ADD_0]] [0, 0, 0, 0] [1, 24, 128, 128] : + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView [[ADD_0]] [0, 0, 0, 0] [1, 24, 128, 128] : // CHECK-SAME: !VPUIP.DistributedBuffer<1x32x128x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> to !VPUIP.DistributedBuffer<1x24x128x128xf16, {order = #NHWC, strides = [524288, 1, 4096, 32]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[BUFF_1:%.*]] = memref.alloc() : memref<1x24x128x128xf16, #NHWC, @DDR> - // CHECK: [[Tilling_COPY_0:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_1:%.+]] = memref.alloc() : memref<1x24x128x128xf16, #NHWC, @DDR> + // CHECK: [[Tilling_COPY_0:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[SUBVIEW]] : !VPUIP.DistributedBuffer<1x24x128x128xf16, {order = #NHWC, strides = [524288, 1, 4096, 32]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUFF_1]] : memref<1x24x128x128xf16, #NHWC, @DDR>) -> memref<1x24x128x128xf16, #NHWC, @DDR> - // CHECK: [[BUFF_2:%.*]] = memref.alloc() : memref<1x24x128x128xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy inputs([[Tilling_COPY_0]] : memref<1x24x128x128xf16, #NHWC, @DDR>) outputs([[BUFF_2]] : memref<1x24x128x128xf16, #NHWC, @DDR>) -> memref<1x24x128x128xf16, #NHWC, @DDR> + // CHECK: [[BUFF_2:%.+]] = memref.alloc() : memref<1x24x128x128xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy inputs([[Tilling_COPY_0]] : memref<1x24x128x128xf16, #NHWC, @DDR>) outputs([[BUFF_2]] : memref<1x24x128x128xf16, #NHWC, @DDR>) -> memref<1x24x128x128xf16, #NHWC, @DDR> - // CHECK: [[BUFF_3:%.*]] = memref.alloc() : memref<1x32x128x128xf16, #NHWC, @DDR> - // CHECK: [[Tilling_COPY_1:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_3:%.+]] = memref.alloc() : memref<1x32x128x128xf16, #NHWC, @DDR> + // CHECK: [[Tilling_COPY_1:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[ADD_0]] : 
!VPUIP.DistributedBuffer<1x32x128x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUFF_3]] : memref<1x32x128x128xf16, #NHWC, @DDR>) -> memref<1x32x128x128xf16, #NHWC, @DDR> - // CHECK: return [[COPY]], [[Tilling_COPY_1:%.*]] : memref<1x24x128x128xf16, #NHWC, @DDR>, memref<1x32x128x128xf16, #NHWC, @DDR> + // CHECK: return [[COPY]], [[Tilling_COPY_1:%.+]] : memref<1x24x128x128xf16, #NHWC, @DDR>, memref<1x32x128x128xf16, #NHWC, @DDR> } // ----- @@ -1284,8 +1284,8 @@ func.func @NotMoveSubviewToTheFrontOfTillingCopyForIncompatibleDistributedBuffer return %6 : memref<1x32x128x128xf16, #NHWC, @DDR> - // CHECK: [[BUFF_0:%.*]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x32x129x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> - // CHECK: [[ADD_0:%.*]] = VPUIP.NCEClusterTask {is_inplace = true, minimumHardwareExecutionCost = 20758 : i64, task_type = #VPUIP.nce_task_type} + // CHECK: [[BUFF_0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x32x129x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[ADD_0:%.+]] = VPUIP.NCEClusterTask {is_inplace = true, minimumHardwareExecutionCost = 20758 : i64, task_type = #VPUIP.nce_task_type} // CHECK-SAME: input(%arg0 : memref<1x32x129x128xf16, #NHWC, @CMX_NN>) // CHECK-SAME: weights(%arg1 : memref<1x32x129x128xf16, #NHWC, @CMX_NN>) // CHECK-SAME: parent_input(%arg0 : memref<1x32x129x128xf16, #NHWC, @CMX_NN>) @@ -1298,17 +1298,17 @@ func.func @NotMoveSubviewToTheFrontOfTillingCopyForIncompatibleDistributedBuffer // CHECK: PPETask {ppe = #VPU.PPEStub<>} // CHECK: } - // CHECK: [[BUFF_1:%.*]] = memref.alloc() : memref<1x32x129x128xf16, #NHWC, @DDR> - // CHECK: [[Tilling_COPY_0:%.*]] = VPUIP.Copy + // CHECK: [[BUFF_1:%.+]] = memref.alloc() : memref<1x32x129x128xf16, #NHWC, @DDR> + // CHECK: [[Tilling_COPY_0:%.+]] = VPUIP.Copy // CHECK-SAME: 
inputs([[ADD_0]] : !VPUIP.DistributedBuffer<1x32x129x128xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUFF_1]] : memref<1x32x129x128xf16, #NHWC, @DDR>) // CHECK-SAME: -> memref<1x32x129x128xf16, #NHWC, @DDR> - // CHECK: [[SUBVIEW:%.*]] = VPUIP.SubView [[Tilling_COPY_0]] [0, 0, 0, 0] [1, 32, 128, 128] : + // CHECK: [[SUBVIEW:%.+]] = VPUIP.SubView [[Tilling_COPY_0]] [0, 0, 0, 0] [1, 32, 128, 128] : // CHECK-SAME: memref<1x32x129x128xf16, #NHWC, @DDR> to memref<1x32x128x128xf16, {order = #NHWC, strides = [528384, 1, 4096, 32]}, @DDR> - // CHECK: [[BUFF_2:%.*]] = memref.alloc() : memref<1x32x128x128xf16, #NHWC, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy inputs([[SUBVIEW]] : memref<1x32x128x128xf16, {order = #NHWC, strides = [528384, 1, 4096, 32]}, @DDR>) outputs([[BUFF_2]] : memref<1x32x128x128xf16, #NHWC, @DDR>) -> memref<1x32x128x128xf16, #NHWC, @DDR> + // CHECK: [[BUFF_2:%.+]] = memref.alloc() : memref<1x32x128x128xf16, #NHWC, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy inputs([[SUBVIEW]] : memref<1x32x128x128xf16, {order = #NHWC, strides = [528384, 1, 4096, 32]}, @DDR>) outputs([[BUFF_2]] : memref<1x32x128x128xf16, #NHWC, @DDR>) -> memref<1x32x128x128xf16, #NHWC, @DDR> // CHECK: return [[COPY]] : memref<1x32x128x128xf16, #NHWC, @DDR> } @@ -1357,7 +1357,7 @@ func.func @MoveShapeCastBeforeTilingCopyExplicitSegDuplicated(%arg0: !InputDistr return %2 : !OutputDistributed - //CHECK: [[SHAPECAST:%.*]] = VPUIP.ShapeCast {shape = [1, 4, 96, 128]} + //CHECK: [[SHAPECAST:%.+]] = VPUIP.ShapeCast {shape = [1, 4, 96, 128]} //CHECK-SAME: inputs([[ARG0]] : !VPUIP.DistributedBuffer<1x48x32x32xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], //CHECK-SAME: num_clusters = 2 : i64, alignment = [1, 16, 1, 1], @@ -1372,8 +1372,8 @@ func.func @MoveShapeCastBeforeTilingCopyExplicitSegDuplicated(%arg0: !InputDistr //CHECK-SAME{LITERAL}: memory_shapes = [[1, 4, 96, 128], 
[1, 4, 96, 128]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x4x96x128xf16, #NHWC, @DDR> - //CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x4x96x128xf16, #NHWC, @DDR> + //CHECK: [[COPY:%.+]] = VPUIP.Copy //CHECK-SAME: inputs([[SHAPECAST]] : !VPUIP.DistributedBuffer<1x4x96x128xf16, #NHWC, @CMX_NN //CHECK-SAME: outputs([[OUTBUFF]] : memref<1x4x96x128xf16, #NHWC, @DDR>) //CHECK-SAME: -> memref<1x4x96x128xf16, #NHWC, @DDR> @@ -1422,7 +1422,7 @@ func.func @MoveGenericReshapeBeforeTilingCopyExplicitSegMulticasted(%arg0: !Inpu return %2 : !OutputDistributed - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs([[ARG0]] : !VPUIP.DistributedBuffer<1x48x32x32xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "SEGMENTED|MULTICASTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64, //CHECK-SAME{LITERAL}: compute_shapes = [[1, 48, 16, 32], [1, 48, 16, 32]], @@ -1436,8 +1436,8 @@ func.func @MoveGenericReshapeBeforeTilingCopyExplicitSegMulticasted(%arg0: !Inpu //CHECK-SAME{LITERAL}: memory_shapes = [[1, 384, 1, 128], [1, 384, 1, 128]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x384x1x128xf16, #NHWC, @DDR> - //CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x384x1x128xf16, #NHWC, @DDR> + //CHECK: [[COPY:%.+]] = VPUIP.Copy //CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x384x1x128xf16, #NHWC, @CMX_NN //CHECK-SAME: outputs([[OUTBUFF]] : memref<1x384x1x128xf16, #NHWC, @DDR>) //CHECK-SAME: -> memref<1x384x1x128xf16, #NHWC, @DDR> @@ -1491,7 +1491,7 @@ func.func @MoveMultipleViewOpsBeforeTilingCopyExplicitSegDuplicated(%arg0: !Inpu return %3 : !OutputDistributed - //CHECK: [[PERMUTECAST:%.*]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHCW} + //CHECK: 
[[PERMUTECAST:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHCW} //CHECK-SAME: inputs([[ARG0]] : !VPUIP.DistributedBuffer<1x48x1x16xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], //CHECK-SAME: num_clusters = 2 : i64, alignment = [1, 16, 1, 1], @@ -1507,7 +1507,7 @@ func.func @MoveMultipleViewOpsBeforeTilingCopyExplicitSegDuplicated(%arg0: !Inpu //CHECK-SAME{LITERAL}: memory_shapes = [[1, 48, 16, 1], [1, 48, 16, 1]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs([[PERMUTECAST]] : !VPUIP.DistributedBuffer<1x48x16x1xf16, #NHWC, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], //CHECK-SAME: num_clusters = 2 : i64, alignment = [1, 16, 1, 1], @@ -1522,8 +1522,8 @@ func.func @MoveMultipleViewOpsBeforeTilingCopyExplicitSegDuplicated(%arg0: !Inpu //CHECK-SAME{LITERAL}: memory_shapes = [[1, 16, 16, 3], [1, 16, 16, 3]], //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x16x16x3xf16, #NHWC, @DDR> - //CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x16x16x3xf16, #NHWC, @DDR> + //CHECK: [[COPY:%.+]] = VPUIP.Copy //CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x16x16x3xf16, #NHWC, @CMX_NN, //CHECK-SAME: outputs([[OUTBUFF]] : memref<1x16x16x3xf16, #NHWC, @DDR>) //CHECK-SAME: -> memref<1x16x16x3xf16, #NHWC, @DDR> @@ -1552,7 +1552,7 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChanged(%arg0: !InputDistribute return %1 : memref<1x4x64xf16, @DDR> - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer< //CHECK-SAME: 4x64x1x1xf16, #NCHW, @CMX_NN, { //CHECK-SAME: mode = "DUPLICATED|SEGMENTED", @@ -1570,8 +1570,8 @@ func.func 
@MoveGenericReshapeBeforeTilingCopyRankChanged(%arg0: !InputDistribute //CHECK-SAME: uniform_distributed_segments //CHECK-SAME: }> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x4x64xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x4x64xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x4x64xf16, #CHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 1, 2], num_clusters = 2 : i64, alignment = [1, 1, 16], uniform_distributed_segments}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x4x64xf16, @DDR>) -> memref<1x4x64xf16, @DDR> @@ -1603,7 +1603,7 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedExplicitDistribution(%ar return %1 : memref<1x4x64xf16, @DDR> - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer< //CHECK-SAME: 4x64x1x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", @@ -1623,8 +1623,8 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedExplicitDistribution(%ar //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0], [0, 0, 0]] //CHECK-SAME: }> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<1x4x64xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<1x4x64xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<1x4x64xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64 // CHECK-SAME: outputs([[OUTBUFF]] : memref<1x4x64xf16, @DDR>) -> memref<1x4x64xf16, @DDR> @@ -1654,7 +1654,7 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedAndSplitHigherDim(%arg0: return %1 : memref<2x2x64xf16, @DDR> - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer< //CHECK-SAME: 4x64x1x1xf16, 
#NCHW, @CMX_NN, { //CHECK-SAME: mode = "DUPLICATED|SEGMENTED", @@ -1672,8 +1672,8 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedAndSplitHigherDim(%arg0: //CHECK-SAME: uniform_distributed_segments //CHECK-SAME: }> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<2x2x64xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<2x2x64xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<2x2x64xf16, #CHW, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 1, 2], num_clusters = 2 : i64, alignment = [1, 1, 16], uniform_distributed_segments}>) // CHECK-SAME: outputs([[OUTBUFF]] : memref<2x2x64xf16, @DDR>) -> memref<2x2x64xf16, @DDR> @@ -1705,7 +1705,7 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedAndSplitHigherDimExplici return %1 : memref<2x2x64xf16, @DDR> - //CHECK: [[RESHAPE:%.*]] = VPUIP.GenericReshape + //CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape //CHECK-SAME: inputs(%arg0 : !VPUIP.DistributedBuffer< //CHECK-SAME: 4x64x1x1xf16, #NCHW, @CMX_NN, //CHECK-SAME: {mode = "DUPLICATED|SEGMENTED", @@ -1725,8 +1725,8 @@ func.func @MoveGenericReshapeBeforeTilingCopyRankChangedAndSplitHigherDimExplici //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0], [0, 0, 0]] //CHECK-SAME: }> - //CHECK: [[OUTBUFF:%.*]] = memref.alloc() : memref<2x2x64xf16, @DDR> - // CHECK: [[COPY:%.*]] = VPUIP.Copy + //CHECK: [[OUTBUFF:%.+]] = memref.alloc() : memref<2x2x64xf16, @DDR> + // CHECK: [[COPY:%.+]] = VPUIP.Copy // CHECK-SAME: inputs([[RESHAPE]] : !VPUIP.DistributedBuffer<2x2x64xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64 // CHECK-SAME: outputs([[OUTBUFF]] : memref<2x2x64xf16, @DDR>) -> memref<2x2x64xf16, @DDR> diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies.mlir index 649b8d974c..66b5a12929 100644 --- 
a/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies.mlir @@ -2017,19 +2017,19 @@ func.func @MoveConcatViewWithClusteredCopyToCMX_ReshapeChangesShapeRank( // CHECK: [[BUFFER_CMX:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // CHECK: [[SUBVIEW0:%.+]] = VPUIP.SubView [[BUFFER_CMX]] // CHECK-SAME: [0, 0, 0] [1, 49, 49] : !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> to - // CHECK-SAME: !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-SAME: !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // CHECK: [[TILING_COPY0:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 : memref<1x49x49xf16, @DDR>) - // CHECK-SAME: outputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-SAME: outputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // CHECK: [[SUBVIEW1:%.+]] = VPUIP.SubView [[BUFFER_CMX]] // CHECK-SAME: [1, 0, 0] [1, 49, 49] : !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> to - // CHECK-SAME: !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-SAME: !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // 
CHECK: [[TILING_COPY1:%.+]] = VPUIP.Copy // CHECK-SAME: inputs(%arg1 : memref<1x49x49xf16, @DDR>) - // CHECK-SAME: outputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-SAME: outputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView - // CHECK-SAME: inputs([[TILING_COPY0]], [[TILING_COPY1]] : !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, !VPUIP.DistributedBuffer<1x49x49xf16, {order = #CHW, strides = [2401, 49, 1]}, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + // CHECK-SAME: inputs([[TILING_COPY0]], [[TILING_COPY1]] : !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, !VPUIP.DistributedBuffer<1x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) // CHECK-SAME: outputs([[BUFFER_CMX]] : !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[CONCAT]] : !VPUIP.DistributedBuffer<2x49x49xf16, #CHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) @@ -5780,11 +5780,11 @@ func.func @SplitUnbalancedConcatWithoutReshapeConsumer(%arg0 : !Arg0T, %arg1 : ! 
mode = "DUPLICATED|SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, alignment = [1, 16, 1, 1]} > -!DistribConsumerTypeLeft = !VPUIP.DistributedBuffer<1x64x16x503xf16, {order = #NHWC, strides = [515072, 1, 32192, 64]}, @CMX_NN, { +!DistribConsumerTypeLeft = !VPUIP.DistributedBuffer<1x64x16x503xf16, #NHWC, @CMX_NN, { mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64} > -!DistribConsumerTypeRight = !VPUIP.DistributedBuffer<1x64x16x502xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, { +!DistribConsumerTypeRight = !VPUIP.DistributedBuffer<1x64x16x502xf16, #NHWC, @CMX_NN, { mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64} > @@ -5853,20 +5853,20 @@ func.func @SplitUnbalancedConcatIfSubviewOnConcatAxis(%arg0 : !Arg0T, %arg1 : !A // CHECK: [[ALLOC_DISTRIBUTED0:%.+]] = VPURT.AllocDistributed // CHECK: [[SUBVIEW0:%.+]] = VPUIP.SubView [[PERMUTE1]] [0, 0, 0, 0] [1, 64, 16, 503] : memref<1x64x16x1004xf16, #NHWC, @DDR> to memref<1x64x16x503xf16, {order = #NHWC, strides = [1028096, 1, 64256, 64]}, @DDR> // CHECK: [[COPY1:%.+]] = VPUIP.Copy inputs([[SUBVIEW0]] : memref<1x64x16x503xf16, {order = #NHWC, strides = [1028096, 1, 64256, 64]}, @DDR>) - // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED0]] : !VPUIP.DistributedBuffer<1x64x16x503xf16, {order = #NHWC, strides = [515072, 1, 32192, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED0]] : !VPUIP.DistributedBuffer<1x64x16x503xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK: [[ALLOC_DISTRIBUTED1:%.+]] = VPURT.AllocDistributed // CHECK: [[SUBVIEW1:%.+]] = VPUIP.SubView [[PERMUTE1]] [0, 0, 0, 503] [1, 64, 16, 501] : memref<1x64x16x1004xf16, #NHWC, @DDR> to memref<1x64x16x501xf16, {order = #NHWC, strides = [1028096, 1, 64256, 64]}, @DDR> - // CHECK: [[SUBVIEW2:%.+]] = VPUIP.SubView [[ALLOC_DISTRIBUTED1]] [0, 0, 0, 0] [1, 64, 16, 501] : 
!VPUIP.DistributedBuffer<1x64x16x502xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[SUBVIEW2:%.+]] = VPUIP.SubView [[ALLOC_DISTRIBUTED1]] [0, 0, 0, 0] [1, 64, 16, 501] : !VPUIP.DistributedBuffer<1x64x16x502xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> // CHECK-SAME: to !VPUIP.DistributedBuffer<1x64x16x501xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> // CHECK: [[COPY2:%.+]] = VPUIP.Copy inputs([[SUBVIEW1]] : memref<1x64x16x501xf16, {order = #NHWC, strides = [1028096, 1, 64256, 64]}, @DDR>) // CHECK-SAME: outputs([[SUBVIEW2]] : !VPUIP.DistributedBuffer<1x64x16x501xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) - // CHECK: [[SUBVIEW3:%.+]] = VPUIP.SubView [[ALLOC_DISTRIBUTED1]] [0, 0, 0, 501] [1, 64, 16, 1] : !VPUIP.DistributedBuffer<1x64x16x502xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + // CHECK: [[SUBVIEW3:%.+]] = VPUIP.SubView [[ALLOC_DISTRIBUTED1]] [0, 0, 0, 501] [1, 64, 16, 1] : !VPUIP.DistributedBuffer<1x64x16x502xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> // CHECK-SAME: to !VPUIP.DistributedBuffer<1x64x16x1xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> // CHECK: [[COPY3:%.+]] = VPUIP.Copy inputs([[PERMUTE2]] : memref<1x64x16x1xf16, #NHWC, @DDR>) outputs([[SUBVIEW3]] : !VPUIP.DistributedBuffer<1x64x16x1xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK: [[CONCATVIEW:%.+]] = VPUIP.ConcatView 
inputs([[COPY2]], [[COPY3]] : !VPUIP.DistributedBuffer<1x64x16x501xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>, // CHECK-SAME: !VPUIP.DistributedBuffer<1x64x16x1xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) - // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED1]] : !VPUIP.DistributedBuffer<1x64x16x502xf16, {order = #NHWC, strides = [514048, 1, 32128, 64]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED1]] : !VPUIP.DistributedBuffer<1x64x16x502xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) // CHECK: return [[COPY1]], [[CONCATVIEW]] } @@ -7006,3 +7006,74 @@ func.func @NotAvoidConcatExtraChannelWithPermuteCastBecauseOfIncompatibleMemShap // CHECK: return [[COPY_2]] } + + + +// +// ----- +// +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + + +!Ret = !VPUIP.DistributedBuffer<4160x128x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, + alignment = [16, 1, 1, 1], uniform_distributed_segments, + compute_shapes = [[1040, 128, 1, 1], [1040, 128, 1, 1], [1040, 128, 1, 1], [1040, 128, 1, 1]], + compute_offsets = [[0, 0, 0, 0], [1040, 0, 0, 0], [2080, 0, 0, 0], [3120, 0, 0, 0]], + memory_shapes = [[1040, 128, 1, 1], [1040, 128, 1, 1], [1040, 128, 1, 1], [1040, 128, 1, 1]], + memory_offsets = [[0, 0, 0, 0], [1040, 0, 0, 0], [2080, 0, 0, 0], [3120, 0, 0, 0]] +}> + +!InputDistributed = !VPUIP.DistributedBuffer<1x8x1x128xf16, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 2, 1, 128], 
[1, 2, 1, 128], [1, 2, 1, 128], [1, 2, 1, 128]], + compute_offsets = [[0, 0, 0, 0], [0, 2, 0, 0], [0, 4, 0, 0], [0, 6, 0, 0]], + memory_shapes = [[1, 2, 1, 128], [1, 2, 1, 128], [1, 2, 1, 128], [1, 2, 1, 128]], + memory_offsets = [[0, 0, 0, 0], [0, 2, 0, 0], [0, 4, 0, 0], [0, 6, 0, 0]] +}> + + +// CHECK-LABEL: func.func @NotOptSplitUnbalancedConcatOnSameAxisForOffsetBiggerThanTilingDim + +func.func @NotOptSplitUnbalancedConcatOnSameAxisForOffsetBiggerThanTilingDim(%arg0 : memref<1x8x8319x128xf16, @DDR>, %arg1 : !InputDistributed) -> (!Ret) { + %alloc_0 = memref.alloc() : memref<1x8x1x128xf16, @DDR> + %0 = VPUIP.Copy inputs(%arg1 : !InputDistributed) outputs(%alloc_0 : memref<1x8x1x128xf16, @DDR>) -> memref<1x8x1x128xf16, @DDR> + + %alloc_1 = memref.alloc() : memref<1x8x8320x128xf16, @DDR> + %1 = VPUIP.SubView %alloc_1 [0, 0, 0, 0] [1, 8, 8319, 128] : memref<1x8x8320x128xf16, @DDR> to memref<1x8x8319x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR> + %2 = VPUIP.Copy inputs(%arg0 : memref<1x8x8319x128xf16, @DDR>) outputs(%1 : memref<1x8x8319x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR>) -> memref<1x8x8319x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR> + %3 = VPUIP.SubView %alloc_1 [0, 0, 8319, 0] [1, 8, 1, 128] : memref<1x8x8320x128xf16, @DDR> to memref<1x8x1x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR> + %4 = VPUIP.Copy inputs(%0 : memref<1x8x1x128xf16, @DDR>) outputs(%3 : memref<1x8x1x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR>) -> memref<1x8x1x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR> + %5 = VPUIP.ConcatView inputs(%2, %4 : memref<1x8x8319x128xf16, {order = 
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR>, memref<1x8x1x128xf16, {order = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, strides = [8519680, 1064960, 128, 1]}, @DDR>) outputs(%alloc_1 : memref<1x8x8320x128xf16, @DDR>) -> memref<1x8x8320x128xf16, @DDR> + %6 = VPUIP.GenericReshape inputs(%5 : memref<1x8x8320x128xf16, @DDR>) -> memref<66560x128x1x1xf16, @DDR> + + %7 = VPUIP.PermuteCast {dst_order = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>} inputs(%6 : memref<66560x128x1x1xf16, @DDR>) -> memref<66560x128x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR> + + %8 = VPUIP.SubView %7 [58240, 0, 0, 0] [4160, 128, 1, 1] : memref<66560x128x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR> to memref<4160x128x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR> + %9 = VPURT.AllocDistributed -> !Ret + %10 = VPUIP.Copy inputs(%8 : memref<4160x128x1x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @DDR>) outputs(%9 : !Ret) -> !Ret + + return %10 : !Ret + + // CHECK: [[ALLOC:%.+]] = memref.alloc() : memref<1x8x1x128xf16, @DDR> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy + // CHECK: [[ALLOC_0:%.+]] = memref.alloc() : memref<1x8x8320x128xf16, @DDR> + // CHECK: [[SUBVIEW_0:%.+]] = VPUIP.SubView [[ALLOC_0]] [0, 0, 0, 0] [1, 8, 8319, 128] : memref<1x8x8320x128xf16, @DDR> to memref<1x8x8319x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy inputs(%arg0 : memref<1x8x8319x128xf16, @DDR>) outputs([[SUBVIEW_0]] : memref<1x8x8319x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR>) -> memref<1x8x8319x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR> + + // CHECK: [[SUBVIEW_1:%.+]] = VPUIP.SubView [[ALLOC_0]] [0, 0, 8319, 0] [1, 8, 1, 128] : memref<1x8x8320x128xf16, @DDR> to memref<1x8x1x128xf16, {order = #NCHW, strides = 
[8519680, 1064960, 128, 1]}, @DDR> + // CHECK: [[COPY_2:%.+]] = VPUIP.Copy inputs([[COPY_0]] : memref<1x8x1x128xf16, @DDR>) outputs([[SUBVIEW_1]] : memref<1x8x1x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR>) -> memref<1x8x1x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR> + + // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[COPY_1]], [[COPY_2]] : memref<1x8x8319x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR>, memref<1x8x1x128xf16, {order = #NCHW, strides = [8519680, 1064960, 128, 1]}, @DDR>) outputs([[ALLOC_0]] : memref<1x8x8320x128xf16, @DDR>) -> memref<1x8x8320x128xf16, @DDR> + + // CHECK: [[GENERIC_RESHAPE_0:%.+]] = VPUIP.GenericReshape inputs([[CONCAT]] : memref<1x8x8320x128xf16, @DDR>) -> memref<66560x128x1x1xf16, @DDR> + // CHECK: [[PERMUTECAST_0:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHWC} inputs([[GENERIC_RESHAPE_0]] : memref<66560x128x1x1xf16, @DDR>) -> memref<66560x128x1x1xf16, #NHWC, @DDR> + + // CHECK: [[SUBVIEW_2:%.+]] = VPUIP.SubView [[PERMUTECAST_0]] [58240, 0, 0, 0] [4160, 128, 1, 1] : memref<66560x128x1x1xf16, #NHWC, @DDR> to memref<4160x128x1x1xf16, #NHWC, @DDR> + // CHECK: [[DISTRIB_BUFFER:%.+]] = VPURT.AllocDistributed + + // CHECK: [[COPY_3:%.+]] = VPUIP.Copy + // CHECK: return [[COPY_3]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies_40XX+.mlir index f77a9a2fd1..02d3d9e0db 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_concat_view_copies_40XX+.mlir @@ -366,3 +366,128 @@ func.func @SplitUnbalancedConcatOnDifferentAxisBranchInputIsDistributedDuplicate // CHECK-SAME{LITERAL}: memory_shapes = [[32, 1024, 1, 1], [32, 1024, 1, 1], [32, 1024, 1, 1], [32, 1024, 1, 1]], memory_offsets = [[0, 0, 0, 0], [32, 0, 0, 0], [64, 0, 0, 0], [96, 0, 0, 0]]}> // CHECK: return 
[[CONCATVIEW_0]], [[CONCATVIEW_1]] } + +// +// ----- +// + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!ResultT = !VPUIP.DistributedBuffer<1024x96x1x1xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [4, 1, 1, 1], + num_clusters = 4 : i64, + alignment = [16, 1, 1, 1], + uniform_distributed_segments, + compute_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], + compute_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]], + memory_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], + memory_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]] +}> + +!Arg0T = memref<1x32x768x96xf16, @DDR> +!Arg1T = !VPUIP.DistributedBuffer<1x32x256x96xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 4, 1, 1], + num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96]], + compute_offsets = [[0, 0, 0, 0], [0, 8, 0, 0], [0, 16, 0, 0], [0, 24, 0, 0]], + memory_shapes = [[1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96]], + memory_offsets = [[0, 0, 0, 0], [0, 8, 0, 0], [0, 16, 0, 0], [0, 24, 0, 0]] +}> + +// CHECK-LABEL: func.func @NotSplitUnbalancedDDRConcatOnSameAxisWhenNoLeftBranchDataOnTheLastCluster +// CHECK-SAME: [[ARG0:%.+]]: memref<1x32x768x96xf16, @DDR>, +// CHECK-SAME: [[ARG1:%.+]]: !VPUIP.DistributedBuffer<1x32x256x96xf16, #NCHW, @CMX_NN +func.func @NotSplitUnbalancedDDRConcatOnSameAxisWhenNoLeftBranchDataOnTheLastCluster(%arg0 : !Arg0T, %arg1 : !Arg1T) -> (!ResultT, !ResultT) { + %alloc = memref.alloc() : memref<1x32x1024x96xf16, @DDR> + // Left branch + %0 = VPUIP.SubView %alloc [0, 0, 0, 0] [1, 32, 768, 96] : memref<1x32x1024x96xf16, @DDR> to memref<1x32x768x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR> + %1 = VPUIP.Copy inputs(%arg0 : memref<1x32x768x96xf16, 
@DDR>) outputs(%0 : memref<1x32x768x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR>) -> memref<1x32x768x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR> + + // Right branch + %alloc_1 = memref.alloc() : memref<1x32x256x96xf16, @DDR> + %2 = VPUIP.Copy inputs(%arg1 : !Arg1T) outputs(%alloc_1 : memref<1x32x256x96xf16, @DDR>) -> memref<1x32x256x96xf16, @DDR> + + %3 = VPUIP.SubView %alloc [0, 0, 768, 0] [1, 32, 256, 96] : memref<1x32x1024x96xf16, @DDR> to memref<1x32x256x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR> + %4 = VPUIP.Copy inputs(%2 : memref<1x32x256x96xf16, @DDR>) outputs(%3 : memref<1x32x256x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR>) -> memref<1x32x256x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR> + %5 = VPUIP.ConcatView inputs(%1, %4 : memref<1x32x768x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR>, memref<1x32x256x96xf16, {order = #NCHW, strides = [3145728, 98304, 96, 1]}, @DDR>) outputs(%alloc : memref<1x32x1024x96xf16, @DDR>) -> memref<1x32x1024x96xf16, @DDR> + + %6 = VPUIP.GenericReshape inputs(%5 : memref<1x32x1024x96xf16, @DDR>) -> memref<32768x96x1x1xf16, @DDR> + %7 = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHWC} inputs(%6 : memref<32768x96x1x1xf16, @DDR>) -> memref<32768x96x1x1xf16, #NHWC, @DDR> + + %8 = VPUIP.SubView %7 [0, 0, 0, 0] [1024, 96, 1, 1] : memref<32768x96x1x1xf16, #NHWC, @DDR> to memref<1024x96x1x1xf16, #NHWC, @DDR> + %9 = VPUIP.SubView %7 [1024, 0, 0, 0] [1024, 96, 1, 1] : memref<32768x96x1x1xf16, #NHWC, @DDR> to memref<1024x96x1x1xf16, #NHWC, @DDR> + + %10 = VPURT.AllocDistributed -> !ResultT + %11 = VPUIP.Copy inputs(%8 : memref<1024x96x1x1xf16, #NHWC, @DDR>) outputs(%10 : !ResultT) -> !ResultT + + %12 = VPURT.AllocDistributed -> !ResultT + %13 = VPUIP.Copy inputs(%9 : memref<1024x96x1x1xf16, #NHWC, @DDR>) outputs(%12 : !ResultT) -> !ResultT + + return %11, %13 : !ResultT, !ResultT + + // CHECK: 
[[ALLOC:%.+]] = memref.alloc() : memref<1x32x256x96xf16, @DDR> + // CHECK: [[COPY_0:%.+]] = VPUIP.Copy inputs([[ARG1]] : !VPUIP.DistributedBuffer<1x32x256x96xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96]], compute_offsets = [[0, 0, 0, 0], [0, 8, 0, 0], [0, 16, 0, 0], [0, 24, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96], [1, 8, 256, 96]], memory_offsets = [[0, 0, 0, 0], [0, 8, 0, 0], [0, 16, 0, 0], [0, 24, 0, 0]]}>) + // CHECK-SAME: outputs([[ALLOC:%.+]] : memref<1x32x256x96xf16, @DDR>) -> memref<1x32x256x96xf16, @DDR> + + // CHECK: [[RESHAPE_0:%.+]] = VPUIP.GenericReshape inputs([[ARG0]] : memref<1x32x768x96xf16, @DDR>) -> memref<24576x96x1x1xf16, @DDR> + // CHECK: [[PERMUTECAST_0:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHWC} inputs([[RESHAPE_0]] : memref<24576x96x1x1xf16, @DDR>) -> memref<24576x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[RESHAPE_1:%.+]] = VPUIP.GenericReshape inputs([[COPY_0]] : memref<1x32x256x96xf16, @DDR>) -> memref<8192x96x1x1xf16, @DDR> + // CHECK: [[PERMUTECAST_1:%.+]] = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = #NHWC} inputs([[RESHAPE_1]] : memref<8192x96x1x1xf16, @DDR>) -> memref<8192x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[ALLOC_DISTRIBUTED_0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1024x96x1x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], compute_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], memory_offsets = 
[[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]]}> + + // CHECK: [[ALLOC_0:%.+]] = memref.alloc() : memref<1024x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[SUBVIEW_0:%.+]] = VPUIP.SubView [[PERMUTECAST_0]] [0, 0, 0, 0] [768, 96, 1, 1] : memref<24576x96x1x1xf16, #NHWC, @DDR> to memref<768x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[SUBVIEW_1:%.+]] = VPUIP.SubView [[ALLOC_0:%.+]] [0, 0, 0, 0] [768, 96, 1, 1] : memref<1024x96x1x1xf16, #NHWC, @DDR> to memref<768x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[COPY_1:%.+]] = VPUIP.Copy inputs([[SUBVIEW_0]] : memref<768x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[SUBVIEW_1]] : memref<768x96x1x1xf16, #NHWC, @DDR>) -> memref<768x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[SUBVIEW_2:%.+]] = VPUIP.SubView [[PERMUTECAST_1]] [0, 0, 0, 0] [256, 96, 1, 1] : memref<8192x96x1x1xf16, #NHWC, @DDR> to memref<256x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[SUBVIEW_3:%.+]] = VPUIP.SubView [[ALLOC_0]] [768, 0, 0, 0] [256, 96, 1, 1] : memref<1024x96x1x1xf16, #NHWC, @DDR> to memref<256x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[COPY_2:%.+]] = VPUIP.Copy inputs([[SUBVIEW_2]] : memref<256x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[SUBVIEW_3]] : memref<256x96x1x1xf16, #NHWC, @DDR>) -> memref<256x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[CONCATVIEW_0:%.+]] = VPUIP.ConcatView inputs([[COPY_1]], [[COPY_2]] : memref<768x96x1x1xf16, #NHWC, @DDR>, memref<256x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[ALLOC_0]] : memref<1024x96x1x1xf16, #NHWC, @DDR>) -> memref<1024x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[COPY_3:%.+]] = VPUIP.Copy inputs([[CONCATVIEW_0]] : memref<1024x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED_0]] : !VPUIP.DistributedBuffer<1024x96x1x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 
1]], compute_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], memory_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]]}> + + // CHECK: [[ALLOC_DISTRIBUTED_1:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1024x96x1x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], compute_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], memory_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]]}> + + // CHECK: [[ALLOC_1:%.+]] = memref.alloc() : memref<1024x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[SUBVIEW_4:%.+]] = VPUIP.SubView [[PERMUTECAST_0]] [768, 0, 0, 0] [768, 96, 1, 1] : memref<24576x96x1x1xf16, #NHWC, @DDR> to memref<768x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[SUBVIEW_5:%.+]] = VPUIP.SubView [[ALLOC_1:%.+]] [0, 0, 0, 0] [768, 96, 1, 1] : memref<1024x96x1x1xf16, #NHWC, @DDR> to memref<768x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[COPY_4:%.+]] = VPUIP.Copy inputs([[SUBVIEW_4]] : memref<768x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[SUBVIEW_5]] : memref<768x96x1x1xf16, #NHWC, @DDR>) -> memref<768x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[SUBVIEW_6:%.+]] = VPUIP.SubView [[PERMUTECAST_1]] [256, 0, 0, 0] [256, 96, 1, 1] : memref<8192x96x1x1xf16, #NHWC, @DDR> to memref<256x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[SUBVIEW_7:%.+]] = VPUIP.SubView [[ALLOC_1]] [768, 0, 0, 0] [256, 96, 1, 1] : memref<1024x96x1x1xf16, #NHWC, @DDR> to memref<256x96x1x1xf16, #NHWC, @DDR> + // CHECK: [[COPY_5:%.+]] = VPUIP.Copy inputs([[SUBVIEW_6]] : 
memref<256x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[SUBVIEW_7]] : memref<256x96x1x1xf16, #NHWC, @DDR>) -> memref<256x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[CONCATVIEW_1:%.+]] = VPUIP.ConcatView inputs([[COPY_4]], [[COPY_5]] : memref<768x96x1x1xf16, #NHWC, @DDR>, memref<256x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[ALLOC_1]] : memref<1024x96x1x1xf16, #NHWC, @DDR>) -> memref<1024x96x1x1xf16, #NHWC, @DDR> + + // CHECK: [[COPY_6:%.+]] = VPUIP.Copy inputs([[CONCATVIEW_1]] : memref<1024x96x1x1xf16, #NHWC, @DDR>) + // CHECK-SAME: outputs([[ALLOC_DISTRIBUTED_1]] : !VPUIP.DistributedBuffer<1024x96x1x1xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], compute_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1], [256, 96, 1, 1]], memory_offsets = [[0, 0, 0, 0], [256, 0, 0, 0], [512, 0, 0, 0], [768, 0, 0, 0]]}> + + // CHECK: return [[COPY_3]], [[COPY_6]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies.mlir index d9b21444f0..738843d2a9 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies.mlir @@ -3029,10 +3029,10 @@ func.func @MoveTilingCopyBeforeSubviewForSegmentedOnN(%arg0: !WeightsType) -> (! 
// CHECK: [[WEIGHTS_COPY:%.*]] = VPUIP.Copy // CHECK-SAME: inputs(%arg0 : memref<64x64x1x1xf16, #NHWC, @DDR>) // CHECK-SAME: outputs([[WEIGHTS_BUF_CMX]] : !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> - // CHECK: [[SUBVIEW0:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [0, 0, 0, 0] [32, 64, 1, 1] : !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<32x64x1x1xf16, {order = #NHWC, strides = [64, 1, 64, 64]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}> - // CHECK: [[CAST0:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<32x64x1x1xf16, {order = #NHWC, strides = [64, 1, 64, 64]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> - // CHECK: [[SUBVIEW1:%.*]] = VPUIP.SubView %1 [32, 0, 0, 0] [32, 64, 1, 1] : !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<32x64x1x1xf16, {order = #NHWC, strides = [64, 1, 64, 64]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}> - // CHECK: [[CAST1:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<32x64x1x1xf16, {order = #NHWC, strides = [64, 1, 64, 64]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], 
num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> + // CHECK: [[SUBVIEW0:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [0, 0, 0, 0] [32, 64, 1, 1] : !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}> + // CHECK: [[CAST0:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> + // CHECK: [[SUBVIEW1:%.*]] = VPUIP.SubView %1 [32, 0, 0, 0] [32, 64, 1, 1] : !VPUIP.DistributedBuffer<64x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}> + // CHECK: [[CAST1:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}> // CHECK: [[OUTBUF0:%.*]] = memref.alloc() : memref<32x64x1x1xf16, #NHWC, @DDR> // CHECK: [[COPY0:%.*]] = VPUIP.Copy // CHECK-SAME: inputs([[CAST0]] : 
!VPUIP.DistributedBuffer<32x64x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, alignment = [16, 1, 1, 1]}>) @@ -3102,10 +3102,10 @@ func.func @MoveTilingCopyBeforeSubviewWithExplicitShapesAndOffsetsOnN(%arg0: !We // CHECK: [[WEIGHTS_COPY:%.*]] = VPUIP.Copy inputs([[ARG0]] : memref<1568x32x1x1xf16, #NHWC, @DDR>) // CHECK-SAME: outputs([[WEIGHTS_BUF_CMX]] : !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}>) // CHECK-SAME: -> !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}> - // CHECK: [[SUBVIEW0:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [0, 0, 0, 0] [784, 32, 1, 1] : !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<784x32x1x1xf16, {order = #NHWC, strides = [32, 1, 32, 32]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}> - // CHECK: [[CAST0:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<784x32x1x1xf16, {order = #NHWC, strides = [32, 1, 32, 32]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, compute_shapes = {{\[\[}}208, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1]], compute_offsets = {{\[\[}}0, 0, 0, 0], [208, 0, 0, 0], [400, 0, 0, 0], [592, 0, 0, 0]], memory_shapes = {{\[\[}}784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1]], memory_offsets = {{\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 
0], [0, 0, 0, 0]]}> - // CHECK: [[SUBVIEW1:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [784, 0, 0, 0] [784, 32, 1, 1] : !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<784x32x1x1xf16, {order = #NHWC, strides = [32, 1, 32, 32]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}> - // CHECK: [[CAST1:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<784x32x1x1xf16, {order = #NHWC, strides = [32, 1, 32, 32]}, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, compute_shapes = {{\[\[}}208, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1]], compute_offsets = {{\[\[}}0, 0, 0, 0], [208, 0, 0, 0], [400, 0, 0, 0], [592, 0, 0, 0]], memory_shapes = {{\[\[}}784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1]], memory_offsets = {{\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> + // CHECK: [[SUBVIEW0:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [0, 0, 0, 0] [784, 32, 1, 1] : !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}> + // CHECK: [[CAST0:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW0]] : !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = 
"DUPLICATED|SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, compute_shapes = {{\[\[}}208, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1]], compute_offsets = {{\[\[}}0, 0, 0, 0], [208, 0, 0, 0], [400, 0, 0, 0], [592, 0, 0, 0]], memory_shapes = {{\[\[}}784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1]], memory_offsets = {{\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> + // CHECK: [[SUBVIEW1:%.*]] = VPUIP.SubView [[WEIGHTS_COPY]] [784, 0, 0, 0] [784, 32, 1, 1] : !VPUIP.DistributedBuffer<1568x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1]}> to !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}> + // CHECK: [[CAST1:%.*]] = VPUIP.DistributedCast inputs([[SUBVIEW1]] : !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [8, 1, 1, 1]}>) -> !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, alignment = [16, 1, 1, 1], uniform_distributed_segments, compute_shapes = {{\[\[}}208, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1], [192, 32, 1, 1]], compute_offsets = {{\[\[}}0, 0, 0, 0], [208, 0, 0, 0], [400, 0, 0, 0], [592, 0, 0, 0]], memory_shapes = {{\[\[}}784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1], [784, 32, 1, 1]], memory_offsets = {{\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}> // CHECK: [[OUTBUF0:%.*]] = memref.alloc() : memref<784x32x1x1xf16, #NHWC, @DDR> // CHECK: [[COPY0:%.*]] = VPUIP.Copy // CHECK-SAME: inputs([[CAST0]] : !VPUIP.DistributedBuffer<784x32x1x1xf16, #NHWC, @CMX_NN, @@ -5132,6 +5132,64 @@ func.func @FuseCopiesThroughReshape(%input : memref<1x32x50x28x28xf16, #map, @DD // 
----- +!qElemType = !quant.quantile + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#CHW = affine_map<(d0, d1, d2) -> (d0, d1, d2)> + +!OutputBufferType = !VPUIP.DistributedBuffer<1x2048x1x128x!qElemType, #NCHW, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + compute_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + compute_offsets = [[0, 0, 0, 0], [0, 512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]], + memory_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + memory_offsets = [[0, 0, 0, 0], [0, 512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]]}> + + +// CHECK-LABEL: @FuseCopiesThroughReshapeQuantType +// CHECK-SAME: [[INPUT:%.+]]: memref<2048x1x128x!qElemType, {order = #CHW, strides = [2048, 128, 1]}, @DDR> +func.func @FuseCopiesThroughReshapeQuantType(%input : memref<2048x1x128x!qElemType, {order = #CHW, strides = [2048, 128, 1]}, @DDR>) -> !OutputBufferType { + %0 = memref.alloc() : memref<2048x1x128x!qElemType, @DDR> + %1 = VPUIP.Copy inputs(%input : memref<2048x1x128x!qElemType, {order = #CHW, strides = [2048, 128, 1]}, @DDR>) outputs(%0 : memref<2048x1x128x!qElemType, @DDR>) -> memref<2048x1x128x!qElemType, @DDR> + %2 = VPUIP.GenericReshape inputs(%1 : memref<2048x1x128x!qElemType, @DDR>) -> memref<1x2048x1x128x!qElemType, @DDR> + %3 = VPURT.AllocDistributed -> !OutputBufferType + %4 = VPUIP.Copy inputs(%2 : memref<1x2048x1x128x!qElemType, @DDR>) outputs(%3 : !OutputBufferType) -> !OutputBufferType + return %4 : !OutputBufferType + + // CHECK: [[DDR_BUF:%.+]] = VPURT.AllocDistributed + // CHECK-SAME: -> !VPUIP.DistributedBuffer<2048x1x128x!qElemType, #CHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0], 
[512, 0, 0], [1024, 0, 0], [1536, 0, 0]] + // CHECK-SAME{LITERAL}: memory_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]] + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0], [512, 0, 0], [1024, 0, 0], [1536, 0, 0]]}> + + // CHECK: [[COPY:%.+]] = VPUIP.Copy inputs([[INPUT]] : memref<2048x1x128x!qElemType, {order = #CHW, strides = [2048, 128, 1]}, @DDR>) outputs([[DDR_BUF]] + // CHECK-SAME: -> !VPUIP.DistributedBuffer<2048x1x128x!qElemType, #CHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0], [512, 0, 0], [1024, 0, 0], [1536, 0, 0]] + // CHECK-SAME{LITERAL}: memory_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]] + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0], [512, 0, 0], [1024, 0, 0], [1536, 0, 0]]}> + + // CHECK: [[RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[COPY]] + // CHECK-SAME: !VPUIP.DistributedBuffer<2048x1x128x!qElemType, #CHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments + // CHECK-SAME{LITERAL}: compute_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0], [512, 0, 0], [1024, 0, 0], [1536, 0, 0]] + // CHECK-SAME{LITERAL}: memory_shapes = [[512, 1, 128], [512, 1, 128], [512, 1, 128], [512, 1, 128]] + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0], [512, 0, 0], [1024, 0, 0], [1536, 0, 0]]}> + // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x2048x1x128x!qElemType, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 
512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]]}> + + // CHECK: return [[RESHAPE]] : !VPUIP.DistributedBuffer<1x2048x1x128x!qElemType, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 4, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128], [1, 512, 1, 128]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 512, 0, 0], [0, 1024, 0, 0], [0, 1536, 0, 0]]}> +} + +// ----- + #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4, d1, d2)> diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies_40XX+.mlir index ce1d3a2a62..52c553214f 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_copies_40XX+.mlir @@ -8,7 +8,7 @@ IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotMoveTilingCopyBeforeSubviewByNotFixCMX @@ -92,7 +92,7 @@ func.func @NotMoveTilingCopyBeforeSubviewByNotFixCMX(%arg0: memref<11008x128x1x1 IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of 
@CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> @@ -129,7 +129,7 @@ func.func @NotFuseCMXCopyToTheFrontOfTillingCopyDueToCMXSizeLimitation() -> !Inp IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotEraseCMX2CMXCopyAfterSubviewDueToCMXSizeLimitation // CHECK-SAME: [[DATA:%.+]]: memref<8000x32xf16> @@ -156,7 +156,7 @@ func.func @NotEraseCMX2CMXCopyAfterSubviewDueToCMXSizeLimitation(%data : memref< IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> @@ -189,7 +189,7 @@ func.func @NotRemoveDistributedOpCMXToCMXCopyDueToSubView() IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @NotEraseCMX2CMXNonDistributedCopyWithoutNCEClusterTask // CHECK-SAME: [[DATA:%.+]]: memref<1x256x256x3x!qElemType, #NHWC, @DDR> @@ -227,7 +227,7 @@ func.func 
@NotEraseCMX2CMXNonDistributedCopyWithoutNCEClusterTask(%data : memref IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: func.func @EraseCMX2CMXNonDistributedCopyWithNCEClusterTask // CHECK-SAME: [[WEIGHTS:%.+]]: memref<1x256x256x3x!qElemType, #NHWC, [@CMX_NN, 0]> @@ -287,7 +287,7 @@ func.func @EraseCMX2CMXNonDistributedCopyWithNCEClusterTask(%weights: memref<1x2 IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } // CHECK-LABEL: @NotFuseDistributedOpCopiesThroughReshapeDueToNoAxisMapping diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_parallel_copies_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_parallel_copies_40XX+.mlir new file mode 100644 index 0000000000..d0bf30449e --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_parallel_copies_40XX+.mlir @@ -0,0 +1,463 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --optimize-parallel-copies %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +!Weights_table_CMX = memref<128x1x1x4xsi32, @CMX_NN> + +func.func @TwoAxisTilingConsiderDistanceSiblingSubview(%arg0: memref<1x1024x256xf16, @DDR>, %arg1: memref<1x1024x256xf16, @DDR>) { + %cst_0 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[0, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_1 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[128, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_2 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[256, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_3 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[384, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_4 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[512, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_5 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[640, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_6 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[768, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_7 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[896, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_8 = const.Declare 
memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[1024, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_9 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[1152, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_10 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[1280, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + %cst_11 = const.Declare memref<128x256x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1536x256x1x1xf16, {order = #NHWC}>, [#const.SubView<[1408, 0, 0, 0], [128, 256, 1, 1]>, #const.Sparsify] + + %table = memref.alloc() : !Weights_table_CMX + %0 = VPUIP.GenericReshape inputs(%arg0 : memref<1x1024x256xf16, @DDR>) -> memref<1024x256x1x1xf16, @DDR> + %1 = VPUIP.PermuteCast {dst_order = #NHWC, mem_perm = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>} inputs(%0 : memref<1024x256x1x1xf16, @DDR>) -> memref<1x256x1024x1xf16, #NHWC, @DDR> + %2 = VPUIP.GenericReshape inputs(%1 : memref<1x256x1024x1xf16, #NHWC, @DDR>) -> memref<1x256x256x4xf16, #NHWC, @DDR> + + + // ACT-COPY1 WEIGHT-COPY1 ACT-COPY24 WEIGHT-COPY24 + // \ / \ / + // NCE1 ... NCE24 + // + // => + // + // WEIGHT-COPY1 ACT-COPY1 WEIGHT-COPY6 + // \ / \ / + // NCE1 ... NCE6 + // + // WEIGHT-COPY7 ACT-COPY2 WEIGHT-COPY12 + // \ / \ / + // NCE7 ... NCE12 + // + // WEIGHT-COPY13 ACT-COPY1 WEIGHT-COPY18 + // \ / \ / + // NCE13 ... NCE18 + // + // WEIGHT-COPY19 ACT-COPY2 WEIGHT-COPY24 + // \ / \ / + // NCE19 ... 
NCE24 + + + // the first set of tiles + %3 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + %alloc = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %4 = VPUIP.Copy inputs(%cst_0 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_1 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %5 = VPUIP.Copy inputs(%3 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_1 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_2 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %6 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%5 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%4 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%5 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_2 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_2 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %7 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %7 will be fused to %3 + %alloc_3 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %8 = VPUIP.Copy inputs(%cst_1 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc_3 : 
memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_4 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %9 = VPUIP.Copy inputs(%7 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_4 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_5 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%9 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%8 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%9 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_5 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_5 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %11 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %11 will be fused to %3 + %alloc_6 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %12 = VPUIP.Copy inputs(%cst_2 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc_6 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_7 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %13 = VPUIP.Copy inputs(%11 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_7 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> 
memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_8 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %14 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%13 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%12 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%13 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_8 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_8 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %15 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %15 will be fused to %3 + %alloc_9 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %16 = VPUIP.Copy inputs(%cst_3 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc_9 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_10 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %17 = VPUIP.Copy inputs(%15 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_10 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_11 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %18 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%17 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%16 : memref<128x256x1x1xf16, {order = #NHWC}, 
[@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%17 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_11 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_11 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %19 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %19 will be fused to %3 + %alloc_12 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %20 = VPUIP.Copy inputs(%cst_4 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc_12 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_13 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %21 = VPUIP.Copy inputs(%19 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_13 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_14 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %22 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%21 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%20 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%21 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_14 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_14 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 
0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %23 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %23 will be fused to %3 + %alloc_15 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %24 = VPUIP.Copy inputs(%cst_5 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloc_15 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloc_16 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %25 = VPUIP.Copy inputs(%23 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloc_16 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloc_17 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %26 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%25 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%24 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%25 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloc_17 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_17 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + + // the second set of tiles + %27 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + %alloca_18 = memref.alloc() : 
memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %28 = VPUIP.Copy inputs(%cst_0 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_18 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %28 will not be fused, since it is beyond cost distance + %alloca_19 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %29 = VPUIP.Copy inputs(%27 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_19 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_20 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %30 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%29 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%28 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%29 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_20 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_20 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %31 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %31 will be fused to %27 + %alloca_21 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %32 = VPUIP.Copy inputs(%cst_1 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_21 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %32 will not be fused, since it 
is beyond cost distance + %alloca_22 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %33 = VPUIP.Copy inputs(%31 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_22 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_23 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %34 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%33 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%32 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%33 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_23 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_23 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %35 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %35 will be fused to %27 + %alloca_24 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %36 = VPUIP.Copy inputs(%cst_2 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_24 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %36 will not be fused, since it is beyond cost distance + %alloca_25 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %37 = VPUIP.Copy inputs(%35 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_25 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> 
memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_26 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %38 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%37 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%36 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%37 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_26 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_26 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %39 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %39 will be fused to %27 + %alloca_27 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %40 = VPUIP.Copy inputs(%cst_3 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_27 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %40 will not be fused, since it is beyond cost distance + %alloca_28 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %41 = VPUIP.Copy inputs(%39 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_28 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_29 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %42 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%41 : memref<1x256x128x4xf16, 
#NHWC, [@CMX_NN, 0]>) weights(%40 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%41 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_29 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_29 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %43 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %43 will be fused to %27 + %alloca_30 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %44 = VPUIP.Copy inputs(%cst_4 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_30 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %44 will not be fused, since it is beyond cost distance + %alloca_31 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %45 = VPUIP.Copy inputs(%43 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_31 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_32 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %46 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%45 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%44 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%45 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_32 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_32 : 
memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %47 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %47 will be fused to %27 + %alloca_33 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %48 = VPUIP.Copy inputs(%cst_5 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_33 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %48 will not be fused, since it is beyond cost distance + %alloca_34 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %49 = VPUIP.Copy inputs(%47 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_34 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_35 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %50 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%49 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%48 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%49 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_35 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_35 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + + // the third set of tiles + 
%51 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %51 will be fused to %3 + %alloca_36 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %52 = VPUIP.Copy inputs(%cst_6 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_36 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_37 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %53 = VPUIP.Copy inputs(%51 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_37 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_38 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %54 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%53 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%52 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%53 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_38 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_38 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %55 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %55 will be fused to %3 + %alloca_39 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %56 = VPUIP.Copy inputs(%cst_7 : memref<128x256x1x1xf16, {order = 
#NHWC}>) outputs(%alloca_39 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_40 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %57 = VPUIP.Copy inputs(%55 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_40 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_41 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %58 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%57 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%56 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%57 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_41 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_41 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %59 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %59 will be fused to %3 + %alloca_42 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %60 = VPUIP.Copy inputs(%cst_8 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_42 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_43 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %61 = VPUIP.Copy inputs(%59 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_43 : 
memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_44 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %62 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%61 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%60 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%61 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_44 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_44 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %63 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %63 will be fused to %3 + %alloca_45 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %64 = VPUIP.Copy inputs(%cst_9 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_45 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_46 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %65 = VPUIP.Copy inputs(%63 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_46 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_47 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %66 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%65 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 
0]>) weights(%64 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%65 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_47 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_47 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %67 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %67 will be fused to %3 + %alloca_48 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %68 = VPUIP.Copy inputs(%cst_10 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_48 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_49 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %69 = VPUIP.Copy inputs(%67 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_49 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_50 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %70 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%69 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%68 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%69 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_50 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_50 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 
0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %71 = VPUIP.SubView %2 [0, 0, 0, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %71 will be fused to %3 + %alloca_51 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %72 = VPUIP.Copy inputs(%cst_11 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_51 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %alloca_52 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %73 = VPUIP.Copy inputs(%71 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_52 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_53 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %74 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%73 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%72 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%73 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_53 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_53 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + + // the fourth set of tiles + %75 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = 
[262144, 1, 1024, 256]}, @DDR> + // %75 will be fused to %27 + %alloca_54 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %76 = VPUIP.Copy inputs(%cst_6 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_54 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %76 will not be fused, since it is beyond cost distance + %alloca_55 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %77 = VPUIP.Copy inputs(%75 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_55 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_56 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %78 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%77 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%76 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%77 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_56 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_56 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %79 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %79 will be fused to %27 + %alloca_57 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %80 = VPUIP.Copy inputs(%cst_7 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_57 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) 
-> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %80 will not be fused, since it is beyond cost distance + %alloca_58 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %81 = VPUIP.Copy inputs(%79 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_58 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_59 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %82 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%81 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%80 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%81 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_59 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_59 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %83 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %83 will be fused to %27 + %alloca_60 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %84 = VPUIP.Copy inputs(%cst_8 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_60 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %84 will not be fused, since it is beyond cost distance + %alloca_61 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %85 = VPUIP.Copy inputs(%83 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, 
@DDR>) outputs(%alloca_61 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_62 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %86 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%85 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%84 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%85 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_62 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_62 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %87 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %87 will be fused to %27 + %alloca_63 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %88 = VPUIP.Copy inputs(%cst_9 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_63 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %88 will not be fused, since it is beyond cost distance + %alloca_64 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %89 = VPUIP.Copy inputs(%87 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_64 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_65 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %90 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 
1], task_type = #VPUIP.nce_task_type} + input(%89 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%88 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%89 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_65 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_65 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %91 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %91 will be fused to %27 + %alloca_66 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %92 = VPUIP.Copy inputs(%cst_10 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_66 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %92 will not be fused, since it is beyond cost distance + %alloca_67 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %93 = VPUIP.Copy inputs(%91 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_67 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_68 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %94 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%93 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%92 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%93 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_68 : 
memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_68 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + } + %95 = VPUIP.SubView %2 [0, 0, 128, 0] [1, 256, 128, 4] : memref<1x256x256x4xf16, #NHWC, @DDR> to memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR> + // %95 will be fused to %27 + %alloca_69 = memref.alloc() : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + %96 = VPUIP.Copy inputs(%cst_11 : memref<128x256x1x1xf16, {order = #NHWC}>) outputs(%alloca_69 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // %96 will not be fused, since it is beyond cost distance + %alloca_70 = memref.alloc() : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %97 = VPUIP.Copy inputs(%95 : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs(%alloca_70 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + %alloca_71 = memref.alloc() : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> + %98 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} + input(%97 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) weights(%96 : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) weight_table(%table : !Weights_table_CMX) parent_input(%97 : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) parent_output(%alloca_71 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloca_71 : memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x128x128x4xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {inEnd = [3, 127, 255], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 127, 127], outStart = [0, 0, 0], 
pad = #VPU.Padding} + } PPE : { + } + + return + // CHECK: [[COPY_WEIGHT_1:%.+]] = VPUIP.Copy inputs([[CST_1:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_1:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[COPY_ACT_1:%.+]] = VPUIP.Copy inputs([[SUBVIEW_1:%.+]] : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs([[ALLOC_2:%.+]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + // CHECK: [[NCE_1:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_1]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_2:%.+]] = VPUIP.Copy inputs([[CST_2:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_2:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_2:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_2]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_3:%.+]] = VPUIP.Copy inputs([[CST_3:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_3:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_3:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_3]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_4:%.+]] = VPUIP.Copy inputs([[CST_4:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_4:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, 
[@CMX_NN, 0]> + // CHECK: [[NCE_4:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_4]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_5:%.+]] = VPUIP.Copy inputs([[CST_5:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_5:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_5:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_5]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_6:%.+]] = VPUIP.Copy inputs([[CST_6:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_6:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_6:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_6]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + + // CHECK: [[COPY_WEIGHT_7:%.+]] = VPUIP.Copy inputs([[CST_1:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_7:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[COPY_ACT_2:%.+]] = VPUIP.Copy inputs([[SUBVIEW_2:%.+]] : memref<1x256x128x4xf16, {order = #NHWC, strides = [262144, 1, 1024, 256]}, @DDR>) outputs([[ALLOC_2:%.+]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]> + // CHECK: [[NCE_7:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_7]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_8:%.+]] = VPUIP.Copy 
inputs([[CST_2:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_8:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_8:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_8]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_9:%.+]] = VPUIP.Copy inputs([[CST_3:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_9:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_9:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_9]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_10:%.+]] = VPUIP.Copy inputs([[CST_4:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_10:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_10:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_10]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_11:%.+]] = VPUIP.Copy inputs([[CST_5:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_11:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_11:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_11]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_12:%.+]] = VPUIP.Copy inputs([[CST_6:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) 
outputs([[ALLOC_12:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_12:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_12]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + + // CHECK: [[COPY_WEIGHT_13:%.+]] = VPUIP.Copy inputs([[CST_7:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_13:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_13:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_13]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_14:%.+]] = VPUIP.Copy inputs([[CST_8:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_14:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_14:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_14]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_15:%.+]] = VPUIP.Copy inputs([[CST_9:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_15:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_15:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_15]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_16:%.+]] = VPUIP.Copy inputs([[CST_10:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_16:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, 
[@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_16:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_16]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_17:%.+]] = VPUIP.Copy inputs([[CST_11:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_17:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_17:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_17]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_18:%.+]] = VPUIP.Copy inputs([[CST_12:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_18:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_18:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_1]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_18]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + + // CHECK: [[COPY_WEIGHT_19:%.+]] = VPUIP.Copy inputs([[CST_7:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_19:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_19:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_19]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_20:%.+]] = VPUIP.Copy inputs([[CST_8:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_20:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, 
[@CMX_NN, 0]> + // CHECK: [[NCE_20:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_20]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_21:%.+]] = VPUIP.Copy inputs([[CST_9:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_21:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_21:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_21]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_22:%.+]] = VPUIP.Copy inputs([[CST_10:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_22:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_22:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_22]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_23:%.+]] = VPUIP.Copy inputs([[CST_11:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_23:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_23:%.+]] = VPUIP.NCEClusterTask + // CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_23]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) + // CHECK: [[COPY_WEIGHT_24:%.+]] = VPUIP.Copy inputs([[CST_12:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}>) outputs([[ALLOC_24:%.+]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) -> memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]> + // CHECK: [[NCE_24:%.+]] = VPUIP.NCEClusterTask + // 
CHECK: input([[COPY_ACT_2]] : memref<1x256x128x4xf16, #NHWC, [@CMX_NN, 0]>) + // CHECK: weights([[COPY_WEIGHT_24]] : memref<128x256x1x1xf16, {order = #NHWC}, [@CMX_NN, 0]>) +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/optimize_subview_copies.mlir b/tests/lit/NPU/dialect/VPUIP/passes/optimize_subview_copies.mlir index d370175470..f3f4f440c1 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/optimize_subview_copies.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/optimize_subview_copies.mlir @@ -557,11 +557,11 @@ func.func @Optimize2SubviewCopyConvPatternWithOptimizableCopyIn( // CHECK: [[DISTRIB_CAST:%.+]] = VPUIP.DistributedCast // CHECK-SAME: inputs([[ARG0]] : !VPUIP.DistributedBuffer<1x3072x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED|SEGMENTED" - // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x3072x1x1xf16, {order = #NHWC, strides = [3072, 1, 3072, 3072]}, @CMX_NN, + // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x3072x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: mode = "DUPLICATED", num_clusters = 2 : i64 // CHECK: [[SUBVIEW0:%.+]] = VPUIP.SubView [[DISTRIB_CAST]] [0, 0, 0, 0] [1, 1536, 1, 1] - // CHECK-SAME: : !VPUIP.DistributedBuffer<1x3072x1x1xf16, {order = #NHWC, strides = [3072, 1, 3072, 3072]}, @CMX_NN, + // CHECK-SAME: : !VPUIP.DistributedBuffer<1x3072x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64 // CHECK-SAME{LITERAL}: memory_shapes = [[1, 3072, 1, 1], [1, 3072, 1, 1]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] // CHECK-SAME: to !VPUIP.DistributedBuffer<1x1536x1x1xf16, {order = #NHWC, strides = [3072, 1, 3072, 3072]}, @CMX_NN, @@ -569,7 +569,7 @@ func.func @Optimize2SubviewCopyConvPatternWithOptimizableCopyIn( // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1536, 1, 1], [1, 1536, 1, 1]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] // CHECK: [[SUBVIEW1:%.+]] = VPUIP.SubView [[DISTRIB_CAST]] [0, 1536, 0, 0] [1, 1536, 1, 1] - // CHECK-SAME: : !VPUIP.DistributedBuffer<1x3072x1x1xf16, {order = #NHWC, strides = [3072, 1, 3072, 
3072]}, @CMX_NN, + // CHECK-SAME: : !VPUIP.DistributedBuffer<1x3072x1x1xf16, #NHWC, @CMX_NN, // CHECK-SAME: {mode = "DUPLICATED", num_clusters = 2 : i64 // CHECK-SAME{LITERAL}: memory_shapes = [[1, 3072, 1, 1], [1, 3072, 1, 1]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]] // CHECK-SAME: to !VPUIP.DistributedBuffer<1x1536x1x1xf16, {order = #NHWC, strides = [3072, 1, 3072, 3072]}, @CMX_NN, @@ -2354,7 +2354,7 @@ func.func @NotOptimizeSubviewWithRMS( // CHECK: [[RMS_INPUT2:%.+]] = VPUIP.Copy inputs([[ARG1]] : memref<3072xf16, @DDR> // CHECK: [[RMS_RESULT:%.+]] = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_RMS - // CHECK-SAME: inputs([[RMS_INPUT1]] as %{{.+}}: memref<1x1x3072xf16, [@CMX_NN, 0]>, + // CHECK-SAME: inputs([[RMS_INPUT1]] as %{{.+}}: memref<1x1x3072xf16, [@CMX_NN, 0]>, // CHECK-SAME: [[RMS_INPUT2]] as %{{.+}}: memref<3072xf16, [@CMX_NN, 0]>) // CHECK: [[GENERIC_RESHAPE:%.+]] = VPUIP.GenericReshape inputs([[RMS_RESULT]] : memref<1x1x3072xf16, [@CMX_NN, 0]>) diff --git a/tests/lit/NPU/dialect/VPUIP/passes/profiling_actshave_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/profiling_actshave_37XX.mlir index 111eb59fcd..2d05070715 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/profiling_actshave_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/profiling_actshave_37XX.mlir @@ -176,16 +176,16 @@ module @ActShaveProfilingMulticluster { //CHECK-NEXT: DataInfo "actshave" : tensor<16xui32> //CHECK: @main(%arg0: memref<1x4x512x1xf16, #NCWH, @DDR>, %arg1: memref<1x4x512x1xf16, #NCWH, @DDR>, %arg2: memref<16xui32>) -> (memref<1x4x512x1xf16, #NCWH, @DDR>, memref<16xui32>) //CHECK: [[PROF_BUF:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> - //CHECK: [[PROF_BUF_SLOT:%.+]] = VPUIP.SubView [[PROF_BUF]] [0] [16] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, 
uniform_distributed_segments}> to !VPUIP.DistributedBuffer<16xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> + //CHECK: [[PROF_BUF_SLOT:%.+]] = VPUIP.SubView [[PROF_BUF]] [0] [16] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> to !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> //CHECK: [[OP_RESULT:%.*]], [[OP_RESULT_PROF:%.*]] = VPUIP.SW.Kernel //CHECK-SAME: @VPU.SW::@builtin_MVN - //CHECK-SAME: profiling_data([[PROF_BUF_SLOT]] : !VPUIP.DistributedBuffer<16xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) on tile 0 -> (!VPUIP.DistributedBuffer<1x4x512x1xf16, #NCWH, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, !VPUIP.DistributedBuffer<16xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) + //CHECK-SAME: profiling_data([[PROF_BUF_SLOT]] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) on tile 0 -> (!VPUIP.DistributedBuffer<1x4x512x1xf16, #NCWH, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) //CHECK-NEXT: VPUIP.SW.Kernel.run //CHECK: [[PROF_OUTPUT:%.+]] = VPUIP.SubView %arg2 [0] [16] : memref<16xui32> to memref<16xui32 //CHECK: [[CONCAT_PROF_RES:%.+]] = VPUIP.ConcatView - //CHECK-SAME: inputs([[OP_RESULT_PROF]] : !VPUIP.DistributedBuffer<16xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, 
uniform_distributed_segments}>) + //CHECK-SAME: inputs([[OP_RESULT_PROF]] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) //CHECK-SAME: outputs([[PROF_BUF]] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) //CHECK: [[NCE_RES_COPY:%.+]] = VPUIP.NNDMA {profiling_buffer_mgmt} inputs([[CONCAT_PROF_RES]] : !VPUIP.DistributedBuffer<16xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) outputs([[PROF_OUTPUT]] : memref<16xui32>) -> memref<16xui32> @@ -254,17 +254,17 @@ module @ActShaveProfilingMulticlusterMultitile { //CHECK-NEXT: DataInfo "actshave" : tensor<32xui32> //CHECK: @main(%arg0: memref<1x128x64x32xf16, #NWHC, @DDR>, %arg1: memref<1x128x64x32xf16, #NWHC, @DDR>, %arg2: memref<32xui32>) -> (memref<1x128x64x32xf16, #NWHC, @DDR>, memref<32xui32>) //CHECK: [[PROF_BUF:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> - //CHECK: [[PROF_BUF_SLOT:%.+]] = VPUIP.SubView [[PROF_BUF]] [0] [32] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> to !VPUIP.DistributedBuffer<32xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> + //CHECK: [[PROF_BUF_SLOT:%.+]] = VPUIP.SubView [[PROF_BUF]] [0] [32] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> to !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> //CHECK: [[OP_RESULT:%.*]], [[OP_RESULT_PROF:%.*]] = 
VPUIP.SW.Kernel //CHECK-SAME: @VPU.SW::@builtin_MVN - //CHECK-SAME: profiling_data([[PROF_BUF_SLOT]] : !VPUIP.DistributedBuffer<32xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> + //CHECK-SAME: profiling_data([[PROF_BUF_SLOT]] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}> //CHECK-NEXT: VPUIP.SW.Kernel.run //CHECK-NEXT: VPUIP.SW.Kernel.run //CHECK: [[PROF_OUTPUT:%.+]] = VPUIP.SubView %arg2 [0] [32] : memref<32xui32> to memref<32xui32 //CHECK: [[CONCAT_PROF_RES:%.+]] = VPUIP.ConcatView - //CHECK-SAME: inputs([[OP_RESULT_PROF]] : !VPUIP.DistributedBuffer<32xui32, {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) + //CHECK-SAME: inputs([[OP_RESULT_PROF]] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) //CHECK-SAME: outputs([[PROF_BUF]] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) //CHECK: [[NCE_RES_COPY:%.+]] = VPUIP.NNDMA {profiling_buffer_mgmt} inputs([[CONCAT_PROF_RES]] : !VPUIP.DistributedBuffer<32xui32, #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [2], num_clusters = 2 : i64, uniform_distributed_segments}>) outputs([[PROF_OUTPUT]] : memref<32xui32>) -> memref<32xui32> diff --git a/tests/lit/NPU/dialect/VPUIP/passes/profiling_dpu.mlir b/tests/lit/NPU/dialect/VPUIP/passes/profiling_dpu.mlir index a9fb2451e9..33f8d8fc1e 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/profiling_dpu.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/profiling_dpu.mlir @@ -152,7 +152,7 @@ module @DpuProfilingWithMulticlustering { //CHECK-SAME: weights(%arg1 : memref<48x16x3x3xf16, #NHWC, @CMX_NN>) //CHECK-SAME: weight_table(%arg2 : 
memref<48x1x1x4xsi32, #NHWC, @CMX_NN>) //CHECK-SAME: outputs([[OUTPUT_BUF_CMX]] : !VPUIP.DistributedBuffer<1x48x60x60xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 4, 1], num_clusters = 4 : i64}>) - //CHECK-SAME: profiling_data([[PROF_BUF_VIEW_CMX]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN + //CHECK-SAME: profiling_data([[PROF_BUF_VIEW_CMX]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN //CHECK: [[PROF_OUTPUT_VIEW:%.*]] = VPUIP.SubView %arg4 [0] [[[PROFDATA_INFO_TENSOR_SIZE]]] : memref<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]]> //CHECK: [[PROF_VIEW_CMX_CONCAT:%.*]] = VPUIP.ConcatView inputs([[NCE_RES]]#1 @@ -346,10 +346,10 @@ module @DpuProfilingMultipleOps { } //CHECK: [[PROF_VIEW_OP_4:%.+]] = VPUIP.SubView [[BUFFER_D]] [0] [[[PROF_VIEW_OP_4_SIZE:.*]]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> - //CHECK-SAME: to !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> + //CHECK-SAME: to !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> //CHECK: [[OP_RESULT_4:%[0-9]+]]:2 = VPUIP.NCEClusterTask //CHECK-SAME: profilingMetadata = #VPUIP.DpuProfilingMetadataAttr - //CHECK-SAME: profiling_data([[PROF_VIEW_OP_4]] : !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN + //CHECK-SAME: profiling_data([[PROF_VIEW_OP_4]] : !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, 
@CMX_NN %15 = VPURT.AllocDistributed -> !OutputDistributed %16 = VPUIP.NCEClusterTask { @@ -372,10 +372,10 @@ module @DpuProfilingMultipleOps { } //CHECK: [[PROF_VIEW_OP_5:%.+]] = VPUIP.SubView [[BUFFER_D]] [[[PROF_VIEW_OFFSET:.*]]] [[[PROF_VIEW_OP_5_SIZE:.*]]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> - //CHECK-SAME: to !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> + //CHECK-SAME: to !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> //CHECK: [[OP_RESULT_5:%[0-9]+]]:2 = VPUIP.NCEClusterTask //CHECK-SAME: profilingMetadata = #VPUIP.DpuProfilingMetadataAttr - //CHECK-SAME: profiling_data([[PROF_VIEW_OP_5]] : !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN + //CHECK-SAME: profiling_data([[PROF_VIEW_OP_5]] : !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN %17 = memref.alloc() : !Output_DDR @@ -385,8 +385,8 @@ module @DpuProfilingMultipleOps { //CHECK: [[DDR_VIEW_2:%.*]] = VPUIP.SubView %arg2 [[[DDR_VIEW_2_OFFSET:.*]]] [[[PROFDATA_INFO_TENSOR_SPLIT_0]]] : memref<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]]> to memref<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]]> //CHECK: [[PROF_CONCAT_3:%.*]] = VPUIP.ConcatView inputs([[OP_RESULT_4]]#1, [[OP_RESULT_5]]#1 - //CHECK-SAME: !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> - 
//CHECK-SAME: !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> + //CHECK-SAME: !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_4_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> + //CHECK-SAME: !VPUIP.DistributedBuffer<[[PROF_VIEW_OP_5_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN, {mode = "SEGMENTED", num_tiles = [3], num_clusters = 3 : i64, uniform_distributed_segments}> //CHECK: VPUIP.NNDMA //CHECK-SAME: inputs([[PROF_CONCAT_3]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN //CHECK-SAME: outputs([[DDR_VIEW_2]] : memref<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]]>) -> memref<[[PROFDATA_INFO_TENSOR_SPLIT_0]]x[[PROFDATA_INFO_TENSOR_TYPE]]> @@ -578,7 +578,7 @@ module @DpuProfilingSparseWithMulticlustering { //CHECK-SAME: weight_table(%arg2 : memref<48x1x1x4xsi32, #NHWC, @CMX_NN>) //CHECK-SAME: outputs([[OUTPUT_BUF_CMX]] : !VPUIP.DistributedBuffer<1x48x60x60xf16, #NHWC, @CMX_NN //CHECK-SAME: output_sparsity_map([[SPARSITY_MAP_BUF_CMX]] : !VPUIP.DistributedBuffer<1x48x60x60xi1, #NHWC, @CMX_NN - //CHECK-SAME: profiling_data([[PROF_BUF_VIEW_CMX]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], {order = #C, strides = [1]}, @CMX_NN + //CHECK-SAME: profiling_data([[PROF_BUF_VIEW_CMX]] : !VPUIP.DistributedBuffer<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]], #C, @CMX_NN //CHECK: [[PROF_OUTPUT_VIEW:%.*]] = VPUIP.SubView %arg4 [0] [[[PROFDATA_INFO_TENSOR_SIZE]]] : memref<[[PROFDATA_INFO_TENSOR_SIZE]]x[[PROFDATA_INFO_TENSOR_TYPE]]> //CHECK: [[PROF_VIEW_CMX_CONCAT:%.*]] = VPUIP.ConcatView inputs([[NCE_RES]]#2 diff --git 
a/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_no_shared_bars.mlir b/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_no_shared_bars.mlir index 1dba866407..c1a49fa611 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_no_shared_bars.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_no_shared_bars.mlir @@ -10,7 +10,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { +module { module @VPU.SW { func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_shared_bars.mlir b/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_shared_bars.mlir index 2b46a5e5c6..21b0441f23 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_shared_bars.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/reduce_exceeding_active_count_barriers_wlm_shared_bars.mlir @@ -10,7 +10,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { +module { module @VPU.SW { func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} @@ -174,7 +174,7 @@ module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module attributes {VPUIP.wlm_status = #VPUIP.wlm_status} { +module { module @VPU.SW { func.func private 
@builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/static_allocation.mlir b/tests/lit/NPU/dialect/VPUIP/passes/static_allocation.mlir index b765eb73a6..a367132da2 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/static_allocation.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/static_allocation.mlir @@ -111,12 +111,10 @@ module @LinearGraphWithReservedMem { builtin.module @ReservedMemory { module @CustomReservedMemory { - IE.MemoryResource 512 bytes of @DDR + IE.MemoryResource 512 bytes of @DDR offset 0 } } -// CHECK: IE.MemoryResource 512 bytes of @DDR offset 0 - net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_3720.mlir b/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_3720.mlir deleted file mode 100644 index b56d1ab510..0000000000 --- a/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_3720.mlir +++ /dev/null @@ -1,235 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. 
-// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --sw-kernel-prefetching-reserve-mem %s | FileCheck %s -// REQUIRES: arch-NPU37XX - -module @SimpleGraph { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // reserve dummy memory at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: SWKernelPrefetchingReservedMemory - // CHECK: IE.MemoryResource 256 bytes of @CMX_NN offset 1982208 -} - -// ----- - -module @SimpleGraphWithReservedMem { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code 
= "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - builtin.module @ReservedMemory { - module @CustomReservedMemory { - IE.MemoryResource 128 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // enlarge the original reserved memory and put it at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory - // CHECK: IE.MemoryResource 256 bytes of @CMX_NN offset 1982208 -} - -// ----- - -module @SimpleGraphWithReservedMemHasEnoughSize { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 
MHz { - builtin.module @ReservedMemory { - module @CustomReservedMemory { - IE.MemoryResource 512 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // no need to change the reserved memory size, just put it at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory - // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1981952 -} - -// ----- - -module @SimpleGraphWith2ReservedMem { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - builtin.module @ReservedMemory { - module @CustomReservedMemory1 { - IE.MemoryResource 128 bytes of @CMX_NN - } - - module @CustomReservedMemory2 { - IE.MemoryResource 64 bytes of 
@CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // enlarge reserved memory size, and put both of them at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory1 - // CHECK: IE.MemoryResource 128 bytes of @CMX_NN offset 1982336 - // CHECK: CustomReservedMemory2 - // CHECK: IE.MemoryResource 128 bytes of @CMX_NN offset 1982208 -} - -// ----- - -module @SimpleGraphWith2ReservedMemHaveEnoughTotalSize { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - builtin.module @ReservedMemory { - module @CustomReservedMemory1 { - IE.MemoryResource 128 bytes of @CMX_NN - } - - module @CustomReservedMemory2 { - IE.MemoryResource 256 bytes 
of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // not need to enlarge the reserved memory size, just put both of them at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory1 - // CHECK: IE.MemoryResource 128 bytes of @CMX_NN offset 1982336 - // CHECK: CustomReservedMemory2 - // CHECK: IE.MemoryResource 256 bytes of @CMX_NN offset 1982080 -} - -// ----- - -module @SimpleGraphNoSWKernel { - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - return %arg1 : memref<1x16x4x4xf16> - } - // not change if no SW Kernel - - // CHECK-NOT: ReservedMemory -} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_40XX+.mlir 
b/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_40XX+.mlir deleted file mode 100644 index 92d4920ead..0000000000 --- a/tests/lit/NPU/dialect/VPUIP/passes/sw_kernel_prefetching_reserve_mem_40XX+.mlir +++ /dev/null @@ -1,248 +0,0 @@ -// -// Copyright (C) 2024-2025 Intel Corporation. -// SPDX-License-Identifier: Apache-2.0 -// - -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --sw-kernel-prefetching-reserve-mem %s | FileCheck %s -// REQUIRES: arch-NPU40XX - -module @SimpleGraph { - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - } - - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> 
memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // reserve dummy memory at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: SWKernelPrefetchingReservedMemory - // CHECK: IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 -} - -// ----- - -module @SimpleGraphWithReservedMem { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - builtin.module @ReservedMemory { - module @CustomReservedMemory { - IE.MemoryResource 512 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // enlarge the original reserved 
memory and put it at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory - // CHECK: IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 -} - -// ----- - -module @SimpleGraphWithReservedMemHasEnoughSize { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - builtin.module @ReservedMemory { - module @CustomReservedMemory { - IE.MemoryResource 1024 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // no need to change the reserved memory size, just put it at the end of CMX - - // CHECK: IE.TileResource - 
// CHECK: ReservedMemory - // CHECK: CustomReservedMemory - // CHECK: IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 -} - -// ----- - -module @SimpleGraphWith2ReservedMem { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - builtin.module @ReservedMemory { - module @CustomReservedMemory1 { - IE.MemoryResource 512 bytes of @CMX_NN - } - - module @CustomReservedMemory2 { - IE.MemoryResource 64 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> - } - - // enlarge reserved memory size, and put both of them at the end of CMX - - // CHECK: IE.TileResource - // 
CHECK: ReservedMemory - // CHECK: CustomReservedMemory1 - // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1474048 - // CHECK: CustomReservedMemory2 - // CHECK: IE.MemoryResource 512 bytes of @CMX_NN offset 1473536 -} - -// ----- - -module @SimpleGraphWith2ReservedMemHaveEnoughTotalSize { - module @VPU.SW { - func.func private @builtin_Gelu(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i1, i1, f64) attributes {VPU.kernel_code = "activation_gelu.cpp", VPU.kernel_entry = "activation_gelu"} - func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} - } - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - builtin.module @ReservedMemory { - module @CustomReservedMemory1 { - IE.MemoryResource 128 bytes of @CMX_NN - } - - module @CustomReservedMemory2 { - IE.MemoryResource 896 bytes of @CMX_NN - } - } - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - %0 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %1 = VPUIP.Copy inputs(%arg0 : memref<1x16x4x4xf16>) outputs(%0 : memref<1x16x4x4xf16, [@CMX_NN, 0]>) -> memref<1x16x4x4xf16, [@CMX_NN, 0]> - %2 = memref.alloc() : memref<1x16x4x4xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Gelu inputs(%1 as %arg2: memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%2 as %arg3: memref<1x16x4x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x16x4x4xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x16x4x4xf16, [@CMX_NN, 0]>, memref<1x16x4x4xf16, [@CMX_NN, 0]> - } - %3 = VPUIP.Copy inputs(%results : memref<1x16x4x4xf16, [@CMX_NN, 0]>) outputs(%arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> - - return %arg1: memref<1x16x4x4xf16> 
- } - - // not need to enlarge the reserved memory size, just put both of them at the end of CMX - - // CHECK: IE.TileResource - // CHECK: ReservedMemory - // CHECK: CustomReservedMemory1 - // CHECK: IE.MemoryResource 128 bytes of @CMX_NN offset 1474432 - // CHECK: CustomReservedMemory2 - // CHECK: IE.MemoryResource 896 bytes of @CMX_NN offset 1473536 -} - -// ----- - -module @SimpleGraphNoSWKernel { - - IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} - } - - net.NetworkInfo entryPoint : @main inputsInfo : { - DataInfo "data" : tensor<1x16x4x4xf16> - } outputsInfo : { - DataInfo "prob" : tensor<1x16x4x4xf16> - } - func.func @main(%arg0: memref<1x16x4x4xf16>, %arg1: memref<1x16x4x4xf16>) -> memref<1x16x4x4xf16> { - return %arg1 : memref<1x16x4x4xf16> - } - // not change if no SW Kernel - - // CHECK-NOT: ReservedMemory -} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/swizzling_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/swizzling_37XX.mlir index 497f48bd33..a05bd71009 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/swizzling_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/swizzling_37XX.mlir @@ -632,7 +632,7 @@ func.func @SetSwizzlingForQuantConstantsSOK(%input : !Input_DDR) -> !Output_DDR #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> IE.TileResource 1 of @NCE at 1.300000e+03 MHz { - IE.MemoryResource 2100000 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2100000 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -1095,7 +1095,7 @@ func.func @SetSwizzlingForDpuToDpuBufferWithInplace(%in0 : memref<1x240x8x98xf16 #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> IE.TileResource 1 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1470000 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 
1.000000e+00 : f64} + IE.MemoryResource 1470000 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/dialect/VPUIP/passes/swizzling_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/swizzling_40XX+.mlir index ec1cf5b448..bf0509a1d2 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/swizzling_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/swizzling_40XX+.mlir @@ -9,7 +9,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> IE.TileResource 1 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1470000 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1470000 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -172,7 +172,7 @@ func.func @DoNotSwizzleDueToAlignmentMemIncrease(%in : memref<1x16x149x150xf16, !OutputStub_CMX = memref<1x16x148x148xf16, #NHWC, [@CMX_NN, 0]> IE.TileResource 1 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1470000 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1470000 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -289,7 +289,7 @@ func.func @SetSwizzlingForConstantButNotActivationDueToCmxSizeLimit(%input : !In #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> IE.TileResource 1 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1470000 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1470000 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } @@ -486,7 +486,7 @@ func.func @CannotSwizzledDueToMultiUserWhichCannotSwizzled(%arg0 : memref<1x256x }> IE.TileResource 1 of @NCE at 1.700000e+03 MHz { - IE.MemoryResource 1470000 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, 
VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1470000 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task.mlir b/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task.mlir index f37e41a0c3..ec86430280 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task.mlir @@ -2861,6 +2861,162 @@ func.func @TileDynamicDequantize(%arg0: memref<1x28x768x128x!qElemType>, %arg1: // ----- +!qElemType = !quant.uniform + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!InputDistributed = !VPUIP.DistributedBuffer<64x16x3x3x!qElemType, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], compute_offsets = [[0, 0, 0, 0], [32, 0, 0, 0]], + memory_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], memory_offsets = [[0, 0, 0, 0], [32, 0, 0, 0]]}> + +!OutputDistributed = !VPUIP.DistributedBuffer<64x16x3x3xf16, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], compute_offsets = [[0, 0, 0, 0], [32, 0, 0, 0]], + memory_shapes = [[32, 16, 3, 3], [32, 16, 3, 3]], memory_offsets = [[0, 0, 0, 0], [32, 0, 0, 0]]}> + +module @VPU.SW { + func.func private @builtin_Dequantize(memref<*x!qElemType, @CMX_NN>, memref<*xf16, @CMX_NN>, none) attributes {VPU.kernel_code = "dequantize.cpp", VPU.kernel_entry = "dequantize", VPU.kernel_name = "dequantize", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +// CHECK-DAG: [[QTYPE:!.+]] = !quant.uniform + +// CHECK: @DequantMultiClusterOffQuantAxis +func.func @DequantMultiClusterOffQuantAxis(%arg0: 
memref<64x16x3x3xui8, #NHWC>, %arg1: memref<64x16x3x3xf16, #NHWC>) -> memref<64x16x3x3xf16, #NHWC> { + %0 = VPUIP.QuantizeCast inputs(%arg0 : memref<64x16x3x3xui8, #NHWC>) -> memref<64x16x3x3x!qElemType, #NHWC> + %1 = VPURT.AllocDistributed -> !InputDistributed + %2 = VPUIP.Copy inputs(%0 : memref<64x16x3x3x!qElemType, #NHWC>) outputs(%1 : !InputDistributed) -> !InputDistributed + %3 = VPURT.AllocDistributed -> !OutputDistributed + + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Dequantize + inputs(%2 as %arg2: !InputDistributed) + outputs(%3 as %arg3: !OutputDistributed) -> !OutputDistributed + { + VPUIP.SW.Kernel.run {attrs = [[0, 16, 2963130708733665567, 3251366363510221414, 3435735286504893891, 3539601489976307753, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192]]}(%arg2, %arg3) : !InputDistributed, !OutputDistributed + } + + %alloc = memref.alloc() : memref<64x16x3x3xf16, #NHWC> + %4 = VPUIP.Copy inputs(%results : !OutputDistributed) outputs(%alloc : memref<64x16x3x3xf16, #NHWC>) -> memref<64x16x3x3xf16, #NHWC> + %5 = VPUIP.Copy inputs(%4 : memref<64x16x3x3xf16, #NHWC>) outputs(%arg1 : memref<64x16x3x3xf16, #NHWC>) -> memref<64x16x3x3xf16, #NHWC> + return %5 : memref<64x16x3x3xf16, #NHWC> + + // CHECK: VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Dequantize + // CHECK-SAME: inputs + // CHECK-SAME: !VPUIP.DistributedBuffer<32x16x3x3x[[QTYPE]], #NHWC, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<32x16x3x3x[[QTYPE]], #NHWC, @CMX_NN + // CHECK-SAME: outputs + // CHECK-SAME: !VPUIP.DistributedBuffer<32x16x3x3xf16, #NHWC, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<32x16x3x3xf16, #NHWC, @CMX_NN + // CHECK-SAME: { + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}0, 16, 2963130708733665567, + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}0, 16, 2963130708733665567, + // CHECK: } +} + +// ----- + +!qElemType = !quant.uniform + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, 
d3, d1)> + +!InputDistributed = !VPUIP.DistributedBuffer< + 32x48x3x3x!qElemType, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[16, 48, 3, 3], [16, 48, 3, 3]], compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]], + memory_shapes = [[16, 48, 3, 3], [16, 48, 3, 3]], memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]]}> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 32x48x3x3xf16, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[16, 48, 3, 3], [16, 48, 3, 3]], compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]], + memory_shapes = [[16, 48, 3, 3], [16, 48, 3, 3]], memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0]]}> + +module @VPU.SW { + func.func private @builtin_Dequantize(memref<*x!qElemType, @CMX_NN>, memref<*xf16, @CMX_NN>, none) attributes {VPU.kernel_code = "dequantize.cpp", VPU.kernel_entry = "dequantize", VPU.kernel_name = "dequantize", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +// CHECK-DAG: [[QTYPE_1:!.+]] = !quant.uniform +// CHECK-DAG: [[QTYPE_2:!.+]] = !quant.uniform + +// CHECK: @DequantMultiClusterOnQuantAxis +func.func @DequantMultiClusterOnQuantAxis(%arg0: memref<32x48x3x3x!qElemType, #NHWC>, %arg1: memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> { + %1 = VPURT.AllocDistributed -> !InputDistributed + %2 = VPUIP.Copy inputs(%arg0 : memref<32x48x3x3x!qElemType, #NHWC>) outputs(%1 : !InputDistributed) -> !InputDistributed + %3 = VPURT.AllocDistributed -> !OutputDistributed + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} + @VPU.SW::@builtin_Dequantize inputs(%2 as %arg2: !InputDistributed) outputs(%3 as %arg3: !OutputDistributed) on tile 0 -> !OutputDistributed + { + VPUIP.SW.Kernel.run {attrs = [[3, 32, 2963130708733665567, 3251366363510221414, 3435735286504893891, 3539601489976307753, 
3631645211836494193, 3723970412968293048, 3781673839774741504, 3677810277753894052, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192]]}(%arg2, %arg3) : !InputDistributed, !OutputDistributed + } + %alloc = memref.alloc() : memref<32x48x3x3xf16, #NHWC> + %4 = VPUIP.Copy inputs(%results : !OutputDistributed) outputs(%alloc : memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> + %5 = VPUIP.Copy inputs(%4 : memref<32x48x3x3xf16, #NHWC>) outputs(%arg1 : memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> + return %5 : memref<32x48x3x3xf16, #NHWC> + + // CHECK: VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Dequantize + // CHECK-SAME: inputs + // CHECK-SAME: !VPUIP.DistributedBuffer<16x48x3x3x[[QTYPE_1]], #NHWC, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<16x48x3x3x[[QTYPE_2]], #NHWC, @CMX_NN + // CHECK-SAME: outputs + // CHECK-SAME: !VPUIP.DistributedBuffer<16x48x3x3xf16, #NHWC, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<16x48x3x3xf16, #NHWC, @CMX_NN + // CHECK-SAME: { + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}3, 16, 2963130708733665567, 3251366363510221414, + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}3, 16, 3631645211836494193, 3723970412968293048, + // CHECK: } +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform + +module @VPU.SW { + func.func private @builtin_Dequantize(memref<*x!qElemType, @CMX_NN>, memref<*xf16, @CMX_NN>, none) attributes {VPU.kernel_code = "dequantize.cpp", VPU.kernel_entry = "dequantize", VPU.kernel_name = "dequantize", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +// CHECK-DAG: [[QTYPE_1:!.+]] = !quant.uniform +// CHECK-DAG: [[QTYPE_2:!.+]] = !quant.uniform + +// CHECK: @DequantSingleClusterOnQuantAxis +func.func @DequantSingleClusterOnQuantAxis(%arg0: 
memref<32x48x3x3xui8, #NHWC>, %arg1: memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> { + %0 = VPUIP.QuantizeCast inputs(%arg0 : memref<32x48x3x3xui8, #NHWC>) -> memref<32x48x3x3x!qElemType, #NHWC> + %alloc = memref.alloc() : memref<32x48x3x3x!qElemType, #NHWC, [@CMX_NN, 0]> + %1 = VPUIP.Copy inputs(%0 : memref<32x48x3x3x!qElemType, #NHWC>) outputs(%alloc : memref<32x48x3x3x!qElemType, #NHWC, [@CMX_NN, 0]>) -> memref<32x48x3x3x!qElemType, #NHWC, [@CMX_NN, 0]> + %alloc_0 = memref.alloc() : memref<32x48x3x3xf16, #NHWC, [@CMX_NN, 0]> + + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Dequantize inputs(%1 as %arg2: memref<32x48x3x3x!qElemType, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_0 as %arg3: memref<32x48x3x3xf16, #NHWC, [@CMX_NN, 0]>) on tile 0 -> memref<32x48x3x3xf16, #NHWC, [@CMX_NN, 0]>{ + VPUIP.SW.Kernel.run {attrs = [[3, 32, 2963130708733665567, 3251366363510221414, 3435735286504893891, 3539601489976307753, 3631645211836494193, 3723970412968293048, 3781673839774741504, 3827836440340673700, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192]]}(%arg2, %arg3) : memref<32x48x3x3x!qElemType, #NHWC, [@CMX_NN, 0]>, memref<32x48x3x3xf16, #NHWC, [@CMX_NN, 0]> + } + + %alloc_1 = memref.alloc() : memref<32x48x3x3xf16, #NHWC> + %2 = VPUIP.Copy inputs(%results : memref<32x48x3x3xf16, #NHWC, [@CMX_NN, 0]>) outputs(%alloc_1 : memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> + %3 = VPUIP.Copy inputs(%2 : memref<32x48x3x3xf16, #NHWC>) outputs(%arg1 : memref<32x48x3x3xf16, #NHWC>) -> memref<32x48x3x3xf16, #NHWC> + return %3 : memref<32x48x3x3xf16, #NHWC> + + // CHECK: VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Dequantize + // CHECK-SAME: inputs + // CHECK-SAME: memref<16x48x3x3x[[QTYPE_1]], #NHWC, [@CMX_NN, 0]> + // CHECK-SAME: memref<16x48x3x3x[[QTYPE_2]], #NHWC, [@CMX_NN, 0]> + // CHECK-SAME: 
outputs + // CHECK-SAME: memref<16x48x3x3xf16, #NHWC, [@CMX_NN, 0]> + // CHECK-SAME: memref<16x48x3x3xf16, #NHWC, [@CMX_NN, 0]> + // CHECK-SAME: { + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}3, 16, 2963130708733665567, + // CHECK: VPUIP.SW.Kernel.run {attrs = {{\[\[}}3, 16, 3631645211836494193, + // CHECK: } +} + +// ----- + module @VPU.SW { func.func private @builtin_GatherElements(memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, memref<*xf16, @CMX_NN>, i64) attributes {VPU.kernel_code = "gather_elements.cpp", VPU.kernel_entry = "gather_elements", VPU.task_type = @COMPUTE} func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} @@ -3138,6 +3294,65 @@ func.func @TileGridSample(%arg0: memref<1x32x48x720xf16>, %arg1: memref<1x48x720 // ----- +module @VPU.SW { + func.func private @builtin_GridSample(memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, memref<*xf16, @CMX_NN>, i64) attributes {VPU.kernel_code = "grid_sample.cpp", VPU.kernel_entry = "grid_sample", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +// CHECK-LABEL: @TileGridSampleOverN +// CHECK-SAME: [[INPUT_DATA:%.+]]: memref<16x16x20x30xf16>, +// CHECK-SAME: [[INPUT_GRID:%.+]]: memref<16x64x4x2xf16> +func.func @TileGridSampleOverN(%arg0: memref<16x16x20x30xf16>, %arg1: memref<16x64x4x2xf16>, %arg2: memref<16x16x64x4xf16>) -> memref<16x16x64x4xf16> { + %input_alloc = memref.alloc() : memref<16x16x20x30xf16, [@CMX_NN, 0]> + %input_copy = VPUIP.Copy inputs(%arg0 : memref<16x16x20x30xf16>) outputs(%input_alloc : memref<16x16x20x30xf16, [@CMX_NN, 0]>) -> memref<16x16x20x30xf16, [@CMX_NN, 0]> + + %coord_alloc = memref.alloc() : memref<16x64x4x2xf16, [@CMX_NN, 0]> + %coord_copy = VPUIP.Copy inputs(%arg1 : memref<16x64x4x2xf16>) outputs(%coord_alloc : memref<16x64x4x2xf16, [@CMX_NN, 0]>) -> memref<16x64x4x2xf16, [@CMX_NN, 0]> + + %output_alloc = memref.alloc() : memref<16x16x64x4xf16, [@CMX_NN, 0]> + + %results = VPUIP.SW.Kernel 
{resultSegmentSizes = array} @VPU.SW::@builtin_GridSample + inputs(%input_copy as %arg3: memref<16x16x20x30xf16, [@CMX_NN, 0]>, + %coord_copy as %arg4: memref<16x64x4x2xf16, [@CMX_NN, 0]>) + outputs(%output_alloc as %arg5: memref<16x16x64x4xf16, [@CMX_NN, 0]>) on tile 0 -> memref<16x16x64x4xf16, [@CMX_NN, 0]> + { + VPUIP.SW.Kernel.run {attrs = [0, 0, 0]}(%arg3, %arg4, %arg5) : memref<16x16x20x30xf16, [@CMX_NN, 0]>, memref<16x64x4x2xf16, [@CMX_NN, 0]>, memref<16x16x64x4xf16, [@CMX_NN, 0]> + } + %out_ddr = memref.alloc() : memref<16x16x64x4xf16> + %out_copy = VPUIP.Copy inputs(%results : memref<16x16x64x4xf16, [@CMX_NN, 0]>) outputs(%out_ddr : memref<16x16x64x4xf16>) -> memref<16x16x64x4xf16> + %out_final = VPUIP.Copy inputs(%out_copy : memref<16x16x64x4xf16>) outputs(%arg2 : memref<16x16x64x4xf16>) -> memref<16x16x64x4xf16> + return %out_final : memref<16x16x64x4xf16> + + // CHECK: [[I_ALLOC:%.+]] = memref.alloc() : memref<16x16x20x30xf16, [@CMX_NN, 0]> + // CHECK: [[I_COPY:%.+]] = VPUIP.Copy inputs([[INPUT_DATA]] : memref<16x16x20x30xf16>) outputs([[I_ALLOC]] : memref<16x16x20x30xf16, [@CMX_NN, 0]>) -> memref<16x16x20x30xf16, [@CMX_NN, 0]> + + // CHECK: [[G_ALLOC:%.+]] = memref.alloc() : memref<16x64x4x2xf16, [@CMX_NN, 0]> + // CHECK: [[G_COPY:%.+]] = VPUIP.Copy inputs([[INPUT_GRID]] : memref<16x64x4x2xf16>) outputs([[G_ALLOC]] : memref<16x64x4x2xf16, [@CMX_NN, 0]>) -> memref<16x64x4x2xf16, [@CMX_NN, 0]> + + // CHECK: [[O_ALLOC:%.+]] = memref.alloc() : memref<16x16x64x4xf16, [@CMX_NN, 0]> + + // CHECK: [[I_SLICE_0:%.+]] = VPUIP.SubView [[I_COPY]] [0, 0, 0, 0] [8, 16, 20, 30] : memref<16x16x20x30xf16, [@CMX_NN, 0]> to memref<8x16x20x30xf16, [@CMX_NN, 0]> + // CHECK: [[G_SLICE_0:%.+]] = VPUIP.SubView [[G_COPY]] [0, 0, 0, 0] [8, 64, 4, 2] : memref<16x64x4x2xf16, [@CMX_NN, 0]> to memref<8x64x4x2xf16, [@CMX_NN, 0]> + // CHECK: [[O_SLICE_0:%.+]] = VPUIP.SubView [[O_ALLOC]] [0, 0, 0, 0] [8, 16, 64, 4] : memref<16x16x64x4xf16, [@CMX_NN, 0]> to memref<8x16x64x4xf16, 
[@CMX_NN, 0]> + // CHECK: [[I_SLICE_1:%.+]] = VPUIP.SubView [[I_COPY]] [8, 0, 0, 0] [8, 16, 20, 30] : memref<16x16x20x30xf16, [@CMX_NN, 0]> to memref<8x16x20x30xf16, [@CMX_NN, 0]> + // CHECK: [[G_SLICE_1:%.+]] = VPUIP.SubView [[G_COPY]] [8, 0, 0, 0] [8, 64, 4, 2] : memref<16x64x4x2xf16, [@CMX_NN, 0]> to memref<8x64x4x2xf16, [@CMX_NN, 0]> + // CHECK: [[O_SLICE_1:%.+]] = VPUIP.SubView [[O_ALLOC]] [8, 0, 0, 0] [8, 16, 64, 4] : memref<16x16x64x4xf16, [@CMX_NN, 0]> to memref<8x16x64x4xf16, [@CMX_NN, 0]> + + // CHECK: [[GS:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_GridSample + // CHECK-SAME: inputs([[I_SLICE_0]] as [[IN_0:[^:]+]]: memref<8x16x20x30xf16, [@CMX_NN, 0]>, [[G_SLICE_0]] as [[GRID_0:[^:]+]]: memref<8x64x4x2xf16, [@CMX_NN, 0]>, + // CHECK-SAME: [[I_SLICE_1]] as [[IN_1:[^:]+]]: memref<8x16x20x30xf16, [@CMX_NN, 0]>, [[G_SLICE_1]] as [[GRID_1:[^:]+]]: memref<8x64x4x2xf16, [@CMX_NN, 0]>) + // CHECK-SAME: outputs([[O_SLICE_0]] as [[OUT_0:[^:]+]]: memref<8x16x64x4xf16, [@CMX_NN, 0]>, [[O_SLICE_1]] as [[OUT_1:[^:]+]]: memref<8x16x64x4xf16, [@CMX_NN, 0]>) + // CHECK-SAME: on tile 0 -> (memref<8x16x64x4xf16, [@CMX_NN, 0]>, memref<8x16x64x4xf16, [@CMX_NN, 0]>){ + // CHECK: VPUIP.SW.Kernel.run {attrs = [0, 0, 0]}([[IN_0]], [[GRID_0]], [[OUT_0]]) : memref<8x16x20x30xf16, [@CMX_NN, 0]>, memref<8x64x4x2xf16, [@CMX_NN, 0]>, memref<8x16x64x4xf16, [@CMX_NN, 0]> + // CHECK: VPUIP.SW.Kernel.run {attrs = [0, 0, 0]}([[IN_1]], [[GRID_1]], [[OUT_1]]) : memref<8x16x20x30xf16, [@CMX_NN, 0]>, memref<8x64x4x2xf16, [@CMX_NN, 0]>, memref<8x16x64x4xf16, [@CMX_NN, 0]> + // CHECK: } + + // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[GS]]#0, [[GS]]#1 : memref<8x16x64x4xf16, [@CMX_NN, 0]>, memref<8x16x64x4xf16, [@CMX_NN, 0]>) + // CHECK-SAME: outputs([[O_ALLOC]] : memref<16x16x64x4xf16, [@CMX_NN, 0]>) -> memref<16x16x64x4xf16, [@CMX_NN, 0]> +} + +// ----- + #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> !DistributedBuffer = 
!VPUIP.DistributedBuffer< diff --git a/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task_40XX+.mlir index 9ffa5a6cfb..c665f7a4ff 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/tile_act_shave_kernel_task_40XX+.mlir @@ -2521,61 +2521,51 @@ func.func @TileConvertWithI4Output(%arg0: memref<1x148x90x128xf16>) -> memref<1x // CHECK-SAME{LITERAL}: memory_shapes = [[1, 50, 90, 128], [1, 49, 90, 128], [1, 49, 90, 128]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]]}>) - // CHECK: [[CONVERT_IN_0:%.+]] = VPUIP.SubView [[IN_COPY]] [0, 75, 0, 0] [1, 73, 90, 128] - // CHECK-SAME: to !VPUIP.DistributedBuffer<1x73x90x128xf16, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN, - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments, - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 25, 90, 128], [1, 24, 90, 128], [1, 24, 90, 128]], - // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 49, 0, 0]], - // CHECK-SAME{LITERAL}: memory_shapes = [[1, 25, 90, 128], [1, 24, 90, 128], [1, 24, 90, 128]], - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 49, 0, 0]] - - // CHECK: [[CONVERT_IN_1:%.+]] = VPUIP.SubView [[IN_COPY]] [0, 0, 0, 0] [1, 75, 90, 128] - // CHECK-SAME: to !VPUIP.DistributedBuffer<1x75x90x128xf16, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN, - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments, - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 25, 90, 128], [1, 25, 90, 128], [1, 25, 90, 128]], - // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 50, 0, 0]], - // CHECK-SAME{LITERAL}: memory_shapes = [[1, 25, 90, 128], [1, 25, 90, 128], [1, 25, 90, 
128]], - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 50, 0, 0]] - - // CHECK: [[CONVERT_OUT:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x148x90x128xsi4, #NCHW, @CMX_NN, - // CHECK: {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments, - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 50, 90, 128], [1, 49, 90, 128], [1, 49, 90, 128]] - // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]] - // CHECK-SAME{LITERAL}: memory_shapes = [[1, 50, 90, 128], [1, 49, 90, 128], [1, 49, 90, 128]] - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]] - - // CHECK: [[CONVERT_OUT_0:%.+]] = VPUIP.SubView [[CONVERT_OUT]] [0, 75, 0, 0] [1, 73, 90, 128] - // CHECK-SAME: to !VPUIP.DistributedBuffer<1x73x90x128xsi4, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN, - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 25, 90, 128], [1, 24, 90, 128], [1, 24, 90, 128]], - // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 49, 0, 0]], - // CHECK-SAME{LITERAL}: memory_shapes = [[1, 25, 90, 128], [1, 24, 90, 128], [1, 24, 90, 128]], - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 49, 0, 0]] - - // CHECK: [[CONVERT_OUT_1:%.+]] = VPUIP.SubView [[CONVERT_OUT]] [0, 0, 0, 0] [1, 75, 90, 128] - // CHECK-SAME: to !VPUIP.DistributedBuffer<1x75x90x128xsi4, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN - // CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments - // CHECK-SAME{LITERAL}: compute_shapes = [[1, 25, 90, 128], [1, 25, 90, 128], [1, 25, 90, 128]] - // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 50, 0, 0]] - // CHECK-SAME{LITERAL}: memory_shapes = [[1, 25, 90, 
128], [1, 25, 90, 128], [1, 25, 90, 128]] - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 25, 0, 0], [0, 50, 0, 0]]}> - - // CHECK: [[CONVERT:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Convert - // CHECK-SAME: inputs([[CONVERT_IN_1]] as [[ARG_0:[^:]+]]: !VPUIP.DistributedBuffer<1x75x90x128xf16, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN - // CHECK-SAME: [[CONVERT_IN_0]] as [[ARG_1:[^:]+]]: !VPUIP.DistributedBuffer<1x73x90x128xf16, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN - // CHECK-SAME: outputs([[CONVERT_OUT_1]] as [[ARG_2:[^:]+]]: !VPUIP.DistributedBuffer<1x75x90x128xsi4, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN - // CHECK-SAME: [[CONVERT_OUT_0]] as [[ARG_3:[^:]+]]: !VPUIP.DistributedBuffer<1x73x90x128xsi4, {order = #NCHW, strides = [1704960, 11520, 128, 1]}, @CMX_NN - // CHECK: VPUIP.SW.Kernel.run {attrs = []}([[ARG_0]], [[ARG_2]]) - // CHECK: VPUIP.SW.Kernel.run {attrs = []}([[ARG_1]], [[ARG_3]]) - - // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[CONVERT]]#0, [[CONVERT]]#1 - - // CHECK: [[OUTPUT_BUFF:%.+]] = memref.alloc() : memref<1x148x90x128xsi4> - // CHECK: [[OUTPUT_COPY:%.+]] = VPUIP.Copy - // CHECK: inputs([[CONCAT]] - // CHECK: outputs([[OUTPUT_BUFF]] - - // CHECK: return [[OUTPUT_COPY]] : memref<1x148x90x128xsi4> + // CHECK: [[IN_SHAPECAST:%.+]] = VPUIP.ShapeCast + // CHECK-SAME-DAG{LITERAL}: explicit_output_offsets = [[0, 0, 0, 0], [0, 576000, 0, 0], [0, 1140480, 0, 0]] + // CHECK-SAME-DAG{LITERAL}: explicit_output_shapes = [[1, 576000, 1, 1], [1, 564480, 1, 1], [1, 564480, 1, 1]] + // CHECK-SAME-DAG{LITERAL}: shape = [1, 1704960, 1, 1] + + // CHECK-DAG: [[SUBVIEW_3:%.+]] = VPUIP.SubView [[IN_SHAPECAST]] [0, 864000, 0, 0] [1, 840960, 1, 1] + // CHECK-DAG: [[SUBVIEW_4:%.+]] = VPUIP.SubView [[IN_SHAPECAST]] [0, 0, 0, 0] [1, 864000, 1, 1] + + // CHECK: [[COPY_5:%.+]] = VPURT.AllocDistributed + // CHECK: -> !VPUIP.DistributedBuffer<1x1704960x1x1xsi4, 
#NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 576000, 1, 1], [1, 564480, 1, 1], [1, 564480, 1, 1]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 576000, 0, 0], [0, 1140480, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 576000, 1, 1], [1, 564480, 1, 1], [1, 564480, 1, 1]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 576000, 0, 0], [0, 1140480, 0, 0]]}> + + // CHECK-DAG: [[SUBVIEW_6:%.+]] = VPUIP.SubView [[COPY_5]] [0, 864000, 0, 0] [1, 840960, 1, 1] + // CHECK-DAG: [[SUBVIEW_7:%.+]] = VPUIP.SubView [[COPY_5]] [0, 0, 0, 0] [1, 864000, 1, 1] + + // CHECK: [[SW_KERNEL:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 288000, 1, 1], [1, 276480, 1, 1], [1, 276480, 1, 1]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 288000, 0, 0], [0, 564480, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 288000, 1, 1], [1, 276480, 1, 1], [1, 276480, 1, 1]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 288000, 0, 0], [0, 564480, 0, 0]]}>) + // CHECK: VPUIP.SW.Kernel.run + // CHECK-SAME: !VPUIP.DistributedBuffer<1x864000x1x1xf16, {order = #NCHW, strides = [1704960, 1, 1, 1]}, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<1x864000x1x1xsi4, {order = #NCHW, strides = [1704960, 1, 1, 1]}, @CMX_NN + // CHECK: VPUIP.SW.Kernel.run + // CHECK-SAME: !VPUIP.DistributedBuffer<1x840960x1x1xf16, {order = #NCHW, strides = [1704960, 1, 1, 1]}, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<1x840960x1x1xsi4, {order = #NCHW, strides = [1704960, 1, 1, 1]}, @CMX_NN + + // CHECK-DAG: [[CONCAT_8:%.+]] = VPUIP.ConcatView + + // CHECK: [[OUT_SHAPECAST:%.+]] = VPUIP.ShapeCast + // CHECK-SAME-DAG{LITERAL}: explicit_output_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]] + // CHECK-SAME-DAG{LITERAL}: explicit_output_shapes = [[1, 50, 90, 128], [1, 49, 90, 
128], [1, 49, 90, 128]] + // CHECK-SAME-DAG{LITERAL}: shape = [1, 148, 90, 128]} + + // CHECK: [[OUTPUT_DDR:%.+]] = memref.alloc() : memref<1x148x90x128xsi4> + // CHECK: [[OUT_COPY:%.+]] = VPUIP.Copy inputs([[OUT_SHAPECAST]] : !VPUIP.DistributedBuffer<1x148x90x128xsi4, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 3, 1, 1], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 50, 90, 128], [1, 49, 90, 128], [1, 49, 90, 128]], + // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]], + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 50, 90, 128], [1, 49, 90, 128], [1, 49, 90, 128]], + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 50, 0, 0], [0, 99, 0, 0]]}>) + + // CHECK: return [[OUT_COPY]] : memref<1x148x90x128xsi4> } @@ -3009,7 +2999,7 @@ module @VPU.SW { IE.TileResource 4 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } @@ -3191,22 +3181,22 @@ func.func @TileReverse(%arg0: memref<128x1x5x5xf16>, %arg1: memref<128x1x5x5xf16 // CHECK-SAME: [[SUBVIEW_IN_0]] as {{[^:]+}}: !VPUIP.DistributedBuffer<64x1x5x5xf16, // CHECK-SAME: outputs([[SUBVIEW_OUT_1]] as {{[^:]+}}: !VPUIP.DistributedBuffer<64x1x5x5xf16, // CHECK-SAME: [[SUBVIEW_OUT_0]] as {{[^:]+}}: !VPUIP.DistributedBuffer<64x1x5x5xf16, - // CHECK-SAME: -> (!VPUIP.DistributedBuffer<64x1x5x5xf16, {order = #NCHW, strides = [25, 25, 5, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME: -> (!VPUIP.DistributedBuffer<64x1x5x5xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : 
i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]]}>, - // CHECK-SAME: !VPUIP.DistributedBuffer<64x1x5x5xf16, {order = #NCHW, strides = [25, 25, 5, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME: !VPUIP.DistributedBuffer<64x1x5x5xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]], memory_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]]}>){ // CHECK: VPUIP.SW.Kernel.run {attrs = [3, 0, [1, 2, 3]]}({{[^:]+}}, {{[^:]+}}) // CHECK: VPUIP.SW.Kernel.run {attrs = [3, 0, [1, 2, 3]]}({{[^:]+}}, {{[^:]+}}) // CHECK: } - // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[REVERSE]]#0, [[REVERSE]]#1 : !VPUIP.DistributedBuffer<64x1x5x5xf16, {order = #NCHW, strides = [25, 25, 5, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[REVERSE]]#0, [[REVERSE]]#1 : !VPUIP.DistributedBuffer<64x1x5x5xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], // CHECK-SAME{LITERAL}: 
compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], - // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]]}>, !VPUIP.DistributedBuffer<64x1x5x5xf16, {order = #NCHW, strides = [25, 25, 5, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]]}>, !VPUIP.DistributedBuffer<64x1x5x5xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [4, 1, 1, 1], num_clusters = 4 : i64, uniform_distributed_segments, // CHECK-SAME{LITERAL}: compute_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], // CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [16, 0, 0, 0], [32, 0, 0, 0], [48, 0, 0, 0]], // CHECK-SAME{LITERAL}: memory_shapes = [[16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5], [16, 1, 5, 5]], @@ -3228,7 +3218,7 @@ func.func @TileReverse(%arg0: memref<128x1x5x5xf16>, %arg1: memref<128x1x5x5xf16 // CHECK: return [[RESULT]] : memref<128x1x5x5xf16> } - // ----- +// ----- IE.TileResource 6 of @NCE at 1.300000e+03 MHz { IE.ExecutorResource 1 of @DPU @@ -3600,6 +3590,17 @@ func.func @TileMemPermuteMultiClustered(%arg0 : memref<5x5x25x1xf16, #NHWC, [@CM #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// Distributed buffer type aliases +!InputDistributedBuffer = !VPUIP.DistributedBuffer< + 128x2x36x68xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], + memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}> + +!OutputDistributedBuffer = 
!VPUIP.DistributedBuffer< + 36x2x68x128xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, + compute_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], compute_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]], + memory_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]]}> + IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.ExecutorResource 1 of @DPU } @@ -3609,37 +3610,64 @@ module @VPU.SW { func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} } -// CHECK-LABEL: func.func @TileMemPermuteWithNCHWInAndOut -// CHECK-SAME: ([[ARG0:%.+]]: memref<64x2x36x68xf16, [@CMX_NN, 0]>) -> memref<36x2x68x64xf16, [@CMX_NN, 0]> { -func.func @TileMemPermuteWithNCHWInAndOut(%arg0 : memref<64x2x36x68xf16, [@CMX_NN, 0]>) -> memref<36x2x68x64xf16, [@CMX_NN, 0]> { - %alloc = memref.alloc() : memref<36x2x68x64xf16, [@CMX_NN, 0]> - %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_MemPermute inputs(%arg0 as %arg1: memref<64x2x36x68xf16, [@CMX_NN, 0]>) outputs(%alloc as %arg2: memref<36x2x68x64xf16, [@CMX_NN, 0]>) on tile 0 -> memref<36x2x68x64xf16, [@CMX_NN, 0]>{ - VPUIP.SW.Kernel.run {attrs = [[3, 0, 2, 1]]}(%arg1, %arg2) : memref<64x2x36x68xf16, [@CMX_NN, 0]>, memref<36x2x68x64xf16, [@CMX_NN, 0]> - } - - return %results : memref<36x2x68x64xf16, [@CMX_NN, 0]> - // CHECK: [[ALLOC:%.+]] = memref.alloc() : memref<36x2x68x64xf16, [@CMX_NN, 0]> - // CHECK: [[SUBVIEW0:%.+]] = VPUIP.SubView [[ARG0]] [0, 0, 0, 0] [64, 2, 36, 34] : memref<64x2x36x68xf16, [@CMX_NN, 0]> to memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]> - // CHECK: [[SUBVIEW1:%.+]] = VPUIP.SubView [[ALLOC]] [0, 0, 0, 0] [36, 2, 34, 64] : memref<36x2x68x64xf16, [@CMX_NN, 0]> to memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]> - // CHECK: [[SUBVIEW2:%.+]] = 
VPUIP.SubView [[ARG0]] [0, 0, 0, 34] [64, 2, 36, 34] : memref<64x2x36x68xf16, [@CMX_NN, 0]> to memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]> - // CHECK: [[SUBVIEW3:%.+]] = VPUIP.SubView [[ALLOC]] [0, 0, 34, 0] [36, 2, 34, 64] : memref<36x2x68x64xf16, [@CMX_NN, 0]> to memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]> +// CHECK-LABEL: func.func @TileMemPermuteWithNCHWInAndOutClustered +// CHECK-SAME: ([[ARG0:%.+]]: memref<128x2x36x68xf16, [@CMX_NN, 0]>) -> memref<36x2x68x128xf16, [@CMX_NN, 0]> { +func.func @TileMemPermuteWithNCHWInAndOutClustered(%arg0 : memref<128x2x36x68xf16, #NCHW, [@CMX_NN, 0]>) -> memref<36x2x68x128xf16, #NCHW, [@CMX_NN, 0]> { + %0 = VPURT.AllocDistributed -> !InputDistributedBuffer + %1 = VPUIP.Copy + inputs(%arg0 : memref<128x2x36x68xf16, #NCHW, [@CMX_NN, 0]>) + outputs(%0 : !InputDistributedBuffer) -> !InputDistributedBuffer + %3 = VPURT.AllocDistributed -> !OutputDistributedBuffer + + %4 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_MemPermute + inputs(%0 as %arg1: !InputDistributedBuffer) + outputs(%3 as %arg2: !OutputDistributedBuffer) on tile 0 + -> !OutputDistributedBuffer { + VPUIP.SW.Kernel.run {attrs = [[3, 0, 2, 1]]}(%arg1, %arg2) : !InputDistributedBuffer, !OutputDistributedBuffer + } + %5= memref.alloc() : memref<36x2x68x128xf16, #NCHW, [@CMX_NN, 0]> + %6 = VPUIP.Copy + inputs(%4 : !OutputDistributedBuffer) + outputs(%5 : memref<36x2x68x128xf16, #NCHW, [@CMX_NN, 0]>) -> memref<36x2x68x128xf16, #NCHW, [@CMX_NN, 0]> + + return %6 : memref<36x2x68x128xf16, #NCHW, [@CMX_NN, 0]> + + // CHECK: [[ALLOC0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<128x2x36x68xf16, #NCHW, @CMX_NN, + // CHECK{LITERAL}: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], 
memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}> + // CHECK: [[SUBVIEW1:%.+]] = VPUIP.SubView [[ALLOC0]] [0, 0, 0, 36] [128, 2, 36, 32] + // CHECK{LITERAL}: {explicit_output_shapes = [[128, 2, 36, 11], [128, 2, 36, 11], [128, 2, 36, 10]]} : !VPUIP.DistributedBuffer<128x2x36x68xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}> + // CHECK{LITERAL}: to !VPUIP.DistributedBuffer<128x2x36x32xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 11], [128, 2, 36, 11], [128, 2, 36, 10]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 11], [0, 0, 0, 22]], memory_shapes = [[128, 2, 36, 11], [128, 2, 36, 11], [128, 2, 36, 10]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 11], [0, 0, 0, 22]]}> + // CHECK: [[SUBVIEW0:%.+]] = VPUIP.SubView [[ALLOC0]] [0, 0, 0, 0] [128, 2, 36, 36] + // CHECK{LITERAL}: {explicit_output_shapes = [[128, 2, 36, 12], [128, 2, 36, 12], [128, 2, 36, 12]]} : !VPUIP.DistributedBuffer<128x2x36x68xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}> + // CHECK{LITERAL}: to !VPUIP.DistributedBuffer<128x2x36x36xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, 
@CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 12], [128, 2, 36, 12], [128, 2, 36, 12]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 12], [0, 0, 0, 24]], memory_shapes = [[128, 2, 36, 12], [128, 2, 36, 12], [128, 2, 36, 12]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 12], [0, 0, 0, 24]]}> + // CHECK: [[COPY0:%.+]] = VPUIP.Copy inputs([[ARG0]] : memref<128x2x36x68xf16, [@CMX_NN, 0]>) outputs([[ALLOC0]] : !VPUIP.DistributedBuffer<128x2x36x68xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK{LITERAL}: compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}>) -> + // CHECK{LITERAL}: !VPUIP.DistributedBuffer<128x2x36x68xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]], memory_shapes = [[128, 2, 36, 23], [128, 2, 36, 23], [128, 2, 36, 22]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 23], [0, 0, 0, 46]]}> + + // CHECK: [[ALLOC1:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<36x2x68x128xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, + // CHECK{LITERAL}: compute_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], compute_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]], memory_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]]}> + // CHECK: [[SUBVIEW3:%.+]] = VPUIP.SubView [[ALLOC1]] [0, 0, 36, 0] [36, 2, 
32, 128] + // CHECK{LITERAL}: {explicit_output_shapes = [[36, 2, 11, 128], [36, 2, 11, 128], [36, 2, 10, 128]]} : !VPUIP.DistributedBuffer<36x2x68x128xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], compute_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]], memory_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]]}> + // CHECK{LITERAL}: to !VPUIP.DistributedBuffer<36x2x32x128xf16, {order = #NCHW, strides = [17408, 8704, 128, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[36, 2, 11, 128], [36, 2, 11, 128], [36, 2, 10, 128]], compute_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0]], memory_shapes = [[36, 2, 11, 128], [36, 2, 11, 128], [36, 2, 10, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0]]}> + // CHECK: [[SUBVIEW2:%.+]] = VPUIP.SubView [[ALLOC1]] [0, 0, 0, 0] [36, 2, 36, 128] + // CHECK{LITERAL}: {explicit_output_shapes = [[36, 2, 12, 128], [36, 2, 12, 128], [36, 2, 12, 128]]} : !VPUIP.DistributedBuffer<36x2x68x128xf16, #NCHW, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], compute_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]], memory_shapes = [[36, 2, 23, 128], [36, 2, 23, 128], [36, 2, 22, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 23, 0], [0, 0, 46, 0]]}> + // CHECK{LITERAL}: to !VPUIP.DistributedBuffer<36x2x36x128xf16, {order = #NCHW, strides = [17408, 8704, 128, 1]}, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 3, 1], num_clusters = 3 : i64, uniform_distributed_segments, compute_shapes = [[36, 2, 12, 128], [36, 2, 12, 128], [36, 2, 12, 128]], compute_offsets = [[0, 0, 0, 0], 
[0, 0, 12, 0], [0, 0, 24, 0]], memory_shapes = [[36, 2, 12, 128], [36, 2, 12, 128], [36, 2, 12, 128]], memory_offsets = [[0, 0, 0, 0], [0, 0, 12, 0], [0, 0, 24, 0]]}> - // CHECK: [[RESULTS:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_MemPermute - // CHECK-SAME: inputs([[SUBVIEW0]] as [[ARG1:%.+]]: memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]>, - // CHECK-SAME: [[SUBVIEW2]] as [[ARG2:%.+]]: memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]>) - // CHECK-SAME: outputs([[SUBVIEW1]] as [[ARG3:%.+]]: memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]>, - // CHECK-SAME: [[SUBVIEW3]] as [[ARG4:%.+]]: memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]>) - // CHECK-SAME: on tile 0 - // CHECK{LITERAL}: VPUIP.SW.Kernel.run {attrs = [[3, 0, 2, 1]]} - // CHECK-SAME: ([[ARG1]], [[ARG3]]) : memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]>, memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]> - // CHECK{LITERAL}: VPUIP.SW.Kernel.run {attrs = [[3, 0, 2, 1]]} - // CHECK-SAME: ([[ARG2]], [[ARG4]]) : memref<64x2x36x34xf16, {order = #NCHW, strides = [4896, 2448, 68, 1]}, [@CMX_NN, 0]>, memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]> + // CHECK: [[RESULTS:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} + // CHECK-SAME: @VPU.SW::@builtin_MemPermute + // CHECK: VPUIP.SW.Kernel.run + // CHECK: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], + // CHECK: VPUIP.SW.Kernel.run + // CHECK: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], - // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[RESULTS]]#0, [[RESULTS]]#1 : memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]>, - // CHECK-SAME: memref<36x2x34x64xf16, {order = #NCHW, strides = [8704, 4352, 64, 1]}, [@CMX_NN, 0]>) outputs(%alloc : 
memref<36x2x68x64xf16, [@CMX_NN, 0]>) -> memref<36x2x68x64xf16, [@CMX_NN, 0]> + // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView inputs([[RESULTS]]#0, [[RESULTS]]#1 : !VPUIP.DistributedBuffer<36x2x36x128xf16, {order = #NCHW, strides = [17408, 8704, 128, 1]}, @CMX_NN, + // CHECK: outputs([[ALLOC1]] : !VPUIP.DistributedBuffer<36x2x68x128xf16, #NCHW, @CMX_NN, - // CHECK: return [[CONCAT]] : memref<36x2x68x64xf16, [@CMX_NN, 0]> + // CHECK: [[ALLOC:%.+]] = memref.alloc() : memref<36x2x68x128xf16, [@CMX_NN, 0]> + // CHECK: [[COPY1:%.+]] = VPUIP.Copy inputs([[CONCAT]] : !VPUIP.DistributedBuffer<36x2x68x128xf16, #NCHW, @CMX_NN, + // CHECK: outputs([[ALLOC]] : memref<36x2x68x128xf16, [@CMX_NN, 0]>) -> memref<36x2x68x128xf16, [@CMX_NN, 0]> + // CHECK: return [[COPY1]] : memref<36x2x68x128xf16, [@CMX_NN, 0]> } // ----- @@ -3753,3 +3781,177 @@ func.func @TileCumSum(%arg0: memref<1x6x32x32xf16>, %arg1: memref<1x6x32x32xf16> // CHECK: return [[RESULT]] : memref<1x6x32x32xf16> } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> +#C = affine_map<(d0) -> (d0)> + +VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] +module @VPU.SW { + func.func private @builtin_LSTMSequence(memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, i64) attributes {VPU.kernel_code = "lstm_sequence.cpp", VPU.kernel_entry = "lstm_sequence"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +!InputDistributed = !VPUIP.DistributedBuffer<1x2x36x512xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 36, 512], [1, 1, 36, 512]], 
compute_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]], + memory_shapes = [[1, 1, 36, 512], [1, 1, 36, 512]], memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]}> + +!InputDistributed1 = !VPUIP.DistributedBuffer<1x2x1x128xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 1, 128], [1, 1, 1, 128]], compute_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]], + memory_shapes = [[1, 1, 1, 128], [1, 1, 1, 128]], memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]}> + +!InputDistributed2 = !VPUIP.DistributedBuffer<2x4x128x128xf16, #NWHC, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 4, 128, 128], [1, 4, 128, 128]], compute_offsets = [[0, 0, 0, 0], [1, 0, 0, 0]], + memory_shapes = [[1, 4, 128, 128], [1, 4, 128, 128]], memory_offsets = [[0, 0, 0, 0], [1, 0, 0, 0]]}> + +!InputDistributed3 = !VPUIP.DistributedBuffer<1x2x4x128xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 4, 128], [1, 1, 4, 128]], compute_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]], + memory_shapes = [[1, 1, 4, 128], [1, 1, 4, 128]], memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]}> + +!InputDistributed4 = !VPUIP.DistributedBuffer<1x1x1x2xsi32, #NCHW, @CMX_NN, { + mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 1, 2], [1, 1, 1, 2]], compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[1, 1, 1, 2], [1, 1, 1, 2]], memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0]]}> + +!ShapeDistributed = !VPUIP.DistributedBuffer<4xsi32, #C, @CMX_NN, { + mode = "DUPLICATED", num_clusters = 2 : i64, uniform_distributed_segments}> + +!OutputDistributed1 = !VPUIP.DistributedBuffer<1x2x36x128xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, 
uniform_distributed_segments, + compute_shapes = [[1, 1, 36, 128], [1, 1, 36, 128]], compute_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]], + memory_shapes = [[1, 1, 36, 128], [1, 1, 36, 128]], memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]}> + +!OutputDistributed2 = !VPUIP.DistributedBuffer<1x2x1x128xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 2, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 1, 128], [1, 1, 1, 128]], compute_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]], + memory_shapes = [[1, 1, 1, 128], [1, 1, 1, 128]], memory_offsets = [[0, 0, 0, 0], [0, 1, 0, 0]]}> + + +func.func @LSTMSequence( + %input : !InputDistributed, %input_shape: !ShapeDistributed, + %output: !OutputDistributed1, %output_shape: !ShapeDistributed) +-> (!OutputDistributed1, !ShapeDistributed) { + %bounded_input = VPUIP.GroupBoundedBuffer(%input, %input_shape) : !InputDistributed, !ShapeDistributed + -> !VPUIP.BoundedBuffer + %bounded_output = VPUIP.GroupBoundedBuffer(%output, %output_shape) : !OutputDistributed1, !ShapeDistributed + -> !VPUIP.BoundedBuffer + + %input1 = VPURT.AllocDistributed -> !InputDistributed1 + %input2 = VPURT.AllocDistributed -> !InputDistributed1 + %input3 = VPURT.AllocDistributed -> !InputDistributed2 + %input4 = VPURT.AllocDistributed -> !InputDistributed3 + %input5 = VPURT.AllocDistributed -> !InputDistributed4 + + %output1 = VPURT.AllocDistributed -> !OutputDistributed2 + %output2 = VPURT.AllocDistributed -> !OutputDistributed2 + %out1, %out2, %out3 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_LSTMSequence + inputs( + %bounded_input as %arg16: !VPUIP.BoundedBuffer, + %input1 as %arg17: !InputDistributed1, + %input2 as %arg18: !InputDistributed1, + %input3 as %arg19: !InputDistributed2, + %input4 as %arg20: !InputDistributed3, + %input5 as %arg21: !InputDistributed4) + outputs( + %bounded_output as %arg22: !VPUIP.BoundedBuffer, + %output1 as %arg23: !OutputDistributed2, + %output2 as %arg24: 
!OutputDistributed2) on tile 0 + -> (!VPUIP.BoundedBuffer, + !OutputDistributed2, !OutputDistributed2) { + VPUIP.SW.Kernel.run {attrs = [2]}(%arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24) + : !VPUIP.BoundedBuffer, + !InputDistributed1, + !InputDistributed1, + !InputDistributed2, + !InputDistributed3, + !InputDistributed4, + !VPUIP.BoundedBuffer, + !OutputDistributed2, !OutputDistributed2 + } + + %result_data, %result_shape = VPUIP.UngroupBoundedBuffer(%out1) : + !VPUIP.BoundedBuffer + -> !OutputDistributed1, !ShapeDistributed + return %result_data, %result_shape : !OutputDistributed1, !ShapeDistributed + + // CHECK: VPUIP.SW.Kernel + // CHECK: VPUIP.SW.Kernel.run + // CHECK: VPUIP.SW.Kernel.run + + // CHECK: VPUIP.ConcatView + +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +module @VPU.SW { + func.func private @builtin_Convert(memref<*xf16, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert"} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +!Distributed = !VPUIP.DistributedBuffer<1x320x64x64xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, + compute_shapes = [[1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 10, 64], [1, 320, 10, 64]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]], + memory_shapes = [[1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 10, 64], [1, 320, 10, 64]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]]}> + +!Distributed1 = !VPUIP.DistributedBuffer<1x320x64x64xf32, #NHWC, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 1, 6, 1], num_clusters = 6 : i64, uniform_distributed_segments, + compute_shapes = [[1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 
64], [1, 320, 10, 64], [1, 320, 10, 64]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]], + memory_shapes = [[1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 10, 64], [1, 320, 10, 64]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]]}> + +// CHECK-LABEL: @BalanceTileAconvertOp +func.func @BalanceTileAconvertOp(%input: !Distributed) -> !Distributed1 { + + %0 = VPURT.AllocDistributed -> !Distributed1 + %1 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Convert + inputs(%input as %arg2: !Distributed) + outputs(%0 as %arg3:!Distributed1) on tile 0 + -> !Distributed1 { + VPUIP.SW.Kernel.run(%arg2, %arg3): !Distributed, !Distributed1 + } + + return %1: !Distributed1 + + // CHECK: [[IN_SHAPECAST:%.+]] = VPUIP.ShapeCast + // CHECK-SAME-DAG{LITERAL}: explicit_output_offsets = [[0, 0, 0, 0], [0, 0, 225280, 0], [0, 0, 450560, 0], [0, 0, 675840, 0], [0, 0, 901120, 0], [0, 0, 1105920, 0]] + // CHECK-SAME-DAG{LITERAL}: explicit_output_shapes = [[1, 1, 225280, 1], [1, 1, 225280, 1], [1, 1, 225280, 1], [1, 1, 225280, 1], [1, 1, 204800, 1], [1, 1, 204800, 1]] + // CHECK-SAME-DAG{LITERAL}: shape = [1, 1, 1310720, 1] + // CHECK-DAG: [[SUBVIEW_0:%.+]] = VPUIP.SubView [[IN_SHAPECAST]] [0, 0, 675840, 0] [1, 1, 634880, 1] + // CHECK-DAG: [[SUBVIEW_1:%.+]] = VPUIP.SubView [[IN_SHAPECAST]] [0, 0, 0, 0] [1, 1, 675840, 1] + + // CHECK: [[OUT_BUFF:%.+]] = VPURT.AllocDistributed + // CHECK-DAG: [[SUBVIEW_OUT_0:%.+]] = VPUIP.SubView [[OUT_BUFF]] [0, 0, 675840, 0] [1, 1, 634880, 1] + // CHECK-DAG: [[SUBVIEW_OUT_1:%.+]] = VPUIP.SubView [[OUT_BUFF]] [0, 0, 0, 0] [1, 1, 675840, 1] + + // CHECK: [[SW_KERNEL:%.+]]:2 = VPUIP.SW.Kernel {resultSegmentSizes + // CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1]] + // 
CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 112640, 0], [0, 0, 225280, 0], [0, 0, 337920, 0], [0, 0, 450560, 0], [0, 0, 563200, 0]] + // CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1], [1, 1, 112640, 1]] + // CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 112640, 0], [0, 0, 225280, 0], [0, 0, 337920, 0], [0, 0, 450560, 0], [0, 0, 563200, 0]] + // CHECK: VPUIP.SW.Kernel.run + // CHECK-SAME: !VPUIP.DistributedBuffer<1x1x675840x1xf16, {order = #NHWC, strides = [1310720, 1, 1, 1]}, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<1x1x675840x1xf32, {order = #NHWC, strides = [1310720, 1, 1, 1]}, @CMX_NN + // CHECK: VPUIP.SW.Kernel.run + // CHECK-SAME: !VPUIP.DistributedBuffer<1x1x634880x1xf16, {order = #NHWC, strides = [1310720, 1, 1, 1]}, @CMX_NN + // CHECK-SAME: !VPUIP.DistributedBuffer<1x1x634880x1xf32, {order = #NHWC, strides = [1310720, 1, 1, 1]}, @CMX_NN + + // CHECK: [[CONCAT:%.+]] = VPUIP.ConcatView + // CHECK: [[OUT_SHAPECAST:%.+]] = VPUIP.ShapeCast + // CHECK-SAME-DAG{LITERAL}: explicit_output_offsets = [[0, 0, 0, 0], [0, 0, 11, 0], [0, 0, 22, 0], [0, 0, 33, 0], [0, 0, 44, 0], [0, 0, 54, 0]] + // CHECK-SAME-DAG{LITERAL}: explicit_output_shapes = [[1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 11, 64], [1, 320, 10, 64], [1, 320, 10, 64]] + // CHECK-SAME-DAG{LITERAL}: shape = [1, 320, 64, 64] + + // CHECK: return [[OUT_SHAPECAST]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/ungroup_bounded_buffers.mlir b/tests/lit/NPU/dialect/VPUIP/passes/ungroup_bounded_buffers.mlir index 1917d0015c..4c8d5b05bc 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/ungroup_bounded_buffers.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/ungroup_bounded_buffers.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --ungroup-bounded-buffers %s | FileCheck %s // REQUIRES: arch-NPU37XX || 
arch-NPU40XX -module @TestCopy attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @TestCopy attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { // CHECK-LABEL: main net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Parameter_213" : tensor<2x4x20x20xf16> @@ -71,7 +71,7 @@ module @TestCopy attributes {VPU.arch = #VPU.arch_kind, config.compilat // ----- -module @TestSwKernel attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @TestSwKernel attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -181,7 +181,7 @@ module @TestSwKernel attributes {VPU.arch = #VPU.arch_kind, config.comp // ----- #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module @DynamicReshape attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @DynamicReshape attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_DynamicReshape(memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, i64) attributes {VPU.kernel_code = "dynamic_reshape.cpp", VPU.kernel_entry = "dynamic_reshape", VPU.task_type = @COMPUTE} @@ -198,13 +198,13 @@ module @DynamicReshape attributes {VPU.arch = #VPU.arch_kind, config.co } IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} 
+ IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} // CHECK-LABEL: main net.NetworkInfo entryPoint : @main inputsInfo : { @@ -270,7 +270,7 @@ module @DynamicReshape attributes {VPU.arch = #VPU.arch_kind, config.co // ----- #NWHC = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)> -module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.EnableSEPtrsOperations : false config.Option @VPU.EnableExperimentalSEPtrsOperations : false @@ -285,14 +285,14 @@ module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} module @VPU.SW { func.func private @builtin_LSTMSequence(memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xsi32, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, i64) attributes {VPU.kernel_code = "lstm_sequence.cpp", VPU.kernel_entry = "lstm_sequence"} func.func private @runtime() 
attributes {VPU.kernel_code = "nnActEntry"} } IE.TileResource 4 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_se_operation_soh_overlapped_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_se_operation_soh_overlapped_40XX+.mlir index 2a94dd39e6..bc8b4ea620 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_se_operation_soh_overlapped_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_se_operation_soh_overlapped_40XX+.mlir @@ -472,3 +472,355 @@ func.func @UnrollNceSoHSEPInterpolateOverlappedTwoClustersAndTwoUsers() -> (!Out // CHECK-SAME: [10241, 10241, 10241, 10241, 10241, 11265, 11265, 11265, 11265, 12289, 12289, 12289, 12289, 13313, 13313, 13313, 13313, 14337, 14337, 14337, 14337, 14337] // CHECK{LITERAL}: ]]]> : tensor<1x1x12x22xi32, {order = #NHWC}> } + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!InputDistributed = !VPUIP.DistributedBuffer< + 1x16x4x4xf16, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 3, 1], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 16, 1, 4], [1, 16, 2, 4], [1, 16, 1, 4]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 3, 0]], + memory_shapes = [[1, 16, 3, 4], [1, 16, 3, 4], [1, 16, 2, 4]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 2, 0]] +}> + +!InputSparseMapDistributed = !VPUIP.DistributedBuffer< + 1x16x6x6xi1, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 3, 1], + num_clusters = 3 : i64, + 
uniform_distributed_segments, + compute_shapes = [[1, 16, 2, 6], [1, 16, 2, 6], [1, 16, 2, 6]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 4, 0]], + memory_shapes = [[1, 16, 4, 6], [1, 16, 3, 6], [1, 16, 3, 6]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 3, 0]] +}> + +!InputSETableDistributed = !VPUIP.DistributedBuffer< + 1x1x6x6xi32, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 3, 1], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 1, 2, 6], [1, 1, 2, 6], [1, 1, 2, 6]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 4, 0]], + memory_shapes = [[1, 1, 4, 6], [1, 1, 3, 6], [1, 1, 3, 6]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 3, 0]] +}> + +!WeightsDistributed = !VPUIP.DistributedBuffer< + 16x16x3x3xf16, #NHWC, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!WeightsSparsityMapDistributed = !VPUIP.DistributedBuffer< + 16x1x1x256xi1, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 1, 1, 256], [16, 1, 1, 256], [16, 1, 1, 256]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 1, 1, 256], [16, 1, 1, 256], [16, 1, 1, 256]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!WeightsTableDistributed = !VPUIP.DistributedBuffer< + 16x1x1x4xsi32, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], + 
memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x16x4x4xf16, #NCHW, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 3, 1], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 16, 2, 4], [1, 16, 1, 4], [1, 16, 1, 4]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 3, 0]], + memory_shapes = [[1, 16, 2, 4], [1, 16, 1, 4], [1, 16, 1, 4]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 2, 0], [0, 0, 3, 0]] +}> + +!InputStub_CMX = memref<1x16x33x33xf16, #NHWC, @CMX_NN> +!InputSparseMapStub_CMX = memref<1x16x33x33xi1, #NHWC, @CMX_NN> +!InputSETableStub_CMX = memref<1x16x33x33xi32, #NHWC, @CMX_NN> +!OutputStub_CMX = memref<1x16x33x33xf16, #NHWC, @CMX_NN> +!OutputSparsityStub_CMX = memref<1x16x33x33xi1, #NHWC, @CMX_NN> +!WeightsStub_CMX = memref<16x16x3x3xf16, #NHWC, @CMX_NN> +!WeightsTableStub_CMX = memref<16x1x1x4xsi32, @CMX_NN> + +!Output_DDR = memref<1x16x4x4xf16, #NCHW, @DDR> + +//CHECK-LABEL: @UnrollNceSoHSEPOverlappedThreeClusters +func.func @UnrollNceSoHSEPOverlappedThreeClusters() -> !Output_DDR { + // Barriers + %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %seTable_cst = const.Declare memref<1x1x6x6xi32, #NHWC, @DDR> = dense<[[[ + [0, 0, 1024, 2048, 3072, 3072], + [0, 0, 1024, 2048, 3072, 3072], + [4096, 4096, 5120, 6144, 7168, 7168], + [8192, 8192, 9216, 10240, 11264, 11264], + [8193, 8193, 9217, 10241, 11265, 11265], + [8193, 8193, 9217, 10241, 11265, 11265] + ]]]> : tensor<1x1x6x6xi32, {order = #NHWC}> + %seTable_CMX = VPURT.DeclareBuffer <4160> -> !InputSETableDistributed + VPURT.Task updates(%bar0: !VPURT.Barrier) { + VPUIP.NNDMA inputs(%seTable_cst : memref<1x1x6x6xi32, #NHWC, @DDR>) outputs(%seTable_CMX : 
!InputSETableDistributed) -> !InputSETableDistributed + } + + %parent_out = VPURT.DeclareBuffer [0] <0> -> !Output_DDR + + %parent_input_cmx = VPURT.DeclareBuffer <0> -> !InputDistributed + %parent_input_sparsity_map = VPURT.DeclareBuffer <1024> -> !InputSparseMapDistributed + %weights = VPURT.DeclareBuffer [0, 1, 2] <2048> -> !WeightsDistributed + %weights_sparsity_map = VPURT.DeclareBuffer [0, 1, 2] <5120> -> !WeightsSparsityMapDistributed + %weights_table = VPURT.DeclareBuffer [0, 1, 2] <6144> -> !WeightsTableDistributed + %parent_out_cmx = VPURT.DeclareBuffer <7168> -> !OutputDistributed + + VPURT.Task waits(%bar0, %bar1, %bar2, %bar3 : !VPURT.Barrier, !VPURT.Barrier, !VPURT.Barrier, !VPURT.Barrier) updates(%bar4 : !VPURT.Barrier) { + %21 = VPUIP.NCEClusterTask {constantsFused = true, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], minimumHardwareExecutionCost = 425 : i64, mpe_engine = #VPU.MPEEngine37XX>, task_type = #VPUIP.nce_task_type} + input(%parent_input_cmx : !InputDistributed) + input_sparsity_map(%parent_input_sparsity_map : !InputSparseMapDistributed) + input_storage_element_table(%seTable_CMX: !InputSETableDistributed) + weights(%weights : !WeightsDistributed) + weights_sparsity_map(%weights_sparsity_map : !WeightsSparsityMapDistributed) + weight_table(%weights_table : !WeightsTableDistributed) + parent_input(%parent_input_cmx : !InputDistributed) + parent_input_sparsity_map(%parent_input_sparsity_map : !InputSparseMapDistributed) + parent_input_storage_element_table(%seTable_CMX: !InputSETableDistributed) + parent_output(%parent_out_cmx : !OutputDistributed) + outputs(%parent_out_cmx : !OutputDistributed) + -> !OutputDistributed + variants : { + DPUTask {cluster_id = 0 : i64, inEnd = [5, 3, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 1, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 1 : i64, inEnd = [5, 2, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, 
outEnd = [3, 0, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 2 : i64, inEnd = [5, 2, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [3, 0, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + PPETask {ppe = #VPU.PPEFp, clamp_low = -3.4028234663852886E+38 : f64, clamp_high = 3.4028234663852886E+38 : f64, scale = 1.000000e+00 : f64, prelu_alpha = [1.000000e+00], bias = 0.000000e+00 : f64, adder = 0.000000e+00 : f64>} + } + } + + VPURT.Task waits(%bar4: !VPURT.Barrier) { + VPUIP.NNDMA inputs(%parent_out_cmx: !OutputDistributed) outputs(%parent_out: !Output_DDR) -> !Output_DDR + } + + return %parent_out: !Output_DDR + + // CHECK{LITERAL}: const.Declare memref<1x1x4x6xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [0, 0, 1024, 2048, 3072, 3072] + // CHECK-SAME: [0, 0, 1024, 2048, 3072, 3072] + // CHECK-SAME: [4096, 4096, 5120, 6144, 7168, 7168] + // CHECK-SAME: [8192, 8192, 9216, 10240, 11264, 11264] + // CHECK{LITERAL}: ]]]> : tensor<1x1x4x6xi32, {order = #NHWC}> + + // CHECK{LITERAL}: const.Declare memref<1x1x3x6xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [1, 1, 1025, 2049, 3073, 3073] + // CHECK-SAME: [4097, 4097, 5121, 6145, 7169, 7169] + // CHECK-SAME: [8193, 8193, 9217, 10241, 11265, 11265] + // CHECK{LITERAL}: ]]]> : tensor<1x1x3x6xi32, {order = #NHWC}> + + // CHECK{LITERAL}: const.Declare memref<1x1x3x6xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [2, 2, 1026, 2050, 3074, 3074] + // CHECK-SAME: [4098, 4098, 5122, 6146, 7170, 7170] + // CHECK-SAME: [4098, 4098, 5122, 6146, 7170, 7170] + // CHECK{LITERAL}: ]]]> : tensor<1x1x3x6xi32, {order = #NHWC}> +} + +// ----- + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!InputDistributedSOW = !VPUIP.DistributedBuffer< + 1x16x4x4xf16, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 1, 3], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 16, 4, 1], [1, 
16, 4, 2], [1, 16, 4, 1]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 3]], + memory_shapes = [[1, 16, 4, 3], [1, 16, 4, 3], [1, 16, 4, 2]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2]] +}> + +!InputSparseMapDistributedSOW = !VPUIP.DistributedBuffer< + 1x16x6x6xi1, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 1, 3], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 16, 6, 2], [1, 16, 6, 2], [1, 16, 6, 2]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 4]], + memory_shapes = [[1, 16, 6, 4], [1, 16, 6, 3], [1, 16, 6, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 3]] +}> + +!InputSETableDistributedSOW = !VPUIP.DistributedBuffer< + 1x1x6x6xi32, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 1, 3], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 1, 6, 2], [1, 1, 6, 2], [1, 1, 6, 2]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 4]], + memory_shapes = [[1, 1, 6, 4], [1, 1, 6, 3], [1, 1, 6, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 3]] +}> + +!WeightsDistributed = !VPUIP.DistributedBuffer< + 16x16x3x3xf16, #NHWC, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 16, 3, 3], [16, 16, 3, 3], [16, 16, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!WeightsSparsityMapDistributed = !VPUIP.DistributedBuffer< + 16x1x1x256xi1, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 1, 1, 256], [16, 1, 1, 256], [16, 1, 1, 256]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 1, 1, 256], [16, 1, 1, 256], [16, 1, 1, 256]], + memory_offsets = [[0, 0, 0, 0], [0, 
0, 0, 0], [0, 0, 0, 0]] +}> + +!WeightsTableDistributed = !VPUIP.DistributedBuffer< + 16x1x1x4xsi32, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[16, 1, 1, 4], [16, 1, 1, 4], [16, 1, 1, 4]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!OutputDistributedSOW = !VPUIP.DistributedBuffer< + 1x16x4x4xf16, #NHWC, @CMX_NN, { + mode = "OVERLAPPED", + num_tiles = [1, 1, 1, 3], + num_clusters = 3 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 16, 4, 2], [1, 16, 4, 1], [1, 16, 4, 1]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 3]], + memory_shapes = [[1, 16, 4, 2], [1, 16, 4, 1], [1, 16, 4, 1]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 3]] +}> + +!Output_DDR = memref<1x16x4x4xf16, #NHWC, @DDR> + +//CHECK-LABEL: @UnrollNceSoWSEPOverlappedThreeClusters +func.func @UnrollNceSoWSEPOverlappedThreeClusters() -> !Output_DDR { + // Barriers + %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %bar4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %seTable_cst = const.Declare memref<1x1x6x6xi32, #NHWC, @DDR> = dense<[[[ + [0, 0, 1024, 2048, 2049, 2049], + [0, 0, 1024, 2048, 2049, 2049], + [4096, 4096, 5120, 6144, 6145, 6145], + [8192, 8192, 9216, 10240, 10241, 10241], + [12288, 12288, 13312, 14336, 14337, 14337], + [12288, 12288, 13312, 14336, 14337, 14337] + ]]]> : tensor<1x1x6x6xi32, {order = #NHWC}> + %seTable_CMX = VPURT.DeclareBuffer <4160> -> !InputSETableDistributedSOW + VPURT.Task updates(%bar0: !VPURT.Barrier) { + VPUIP.NNDMA inputs(%seTable_cst : memref<1x1x6x6xi32, #NHWC, @DDR>) outputs(%seTable_CMX : !InputSETableDistributedSOW) 
-> !InputSETableDistributedSOW + } + + %parent_out = VPURT.DeclareBuffer [0] <0> -> !Output_DDR + + %parent_input_cmx = VPURT.DeclareBuffer <0> -> !InputDistributedSOW + %parent_input_sparsity_map = VPURT.DeclareBuffer <1024> -> !InputSparseMapDistributedSOW + %weights = VPURT.DeclareBuffer [0, 1, 2] <2048> -> !WeightsDistributed + %weights_sparsity_map = VPURT.DeclareBuffer [0, 1, 2] <5120> -> !WeightsSparsityMapDistributed + %weights_table = VPURT.DeclareBuffer [0, 1, 2] <6144> -> !WeightsTableDistributed + %parent_out_cmx = VPURT.DeclareBuffer <7168> -> !OutputDistributedSOW + + VPURT.Task waits(%bar0, %bar1, %bar2, %bar3 : !VPURT.Barrier, !VPURT.Barrier, !VPURT.Barrier, !VPURT.Barrier) updates(%bar4 : !VPURT.Barrier) { + %21 = VPUIP.NCEClusterTask {constantsFused = true, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], minimumHardwareExecutionCost = 425 : i64, mpe_engine = #VPU.MPEEngine37XX>, task_type = #VPUIP.nce_task_type} + input(%parent_input_cmx : !InputDistributedSOW) + input_sparsity_map(%parent_input_sparsity_map : !InputSparseMapDistributedSOW) + input_storage_element_table(%seTable_CMX: !InputSETableDistributedSOW) + weights(%weights : !WeightsDistributed) + weights_sparsity_map(%weights_sparsity_map : !WeightsSparsityMapDistributed) + weight_table(%weights_table : !WeightsTableDistributed) + parent_input(%parent_input_cmx : !InputDistributedSOW) + parent_input_sparsity_map(%parent_input_sparsity_map : !InputSparseMapDistributedSOW) + parent_input_storage_element_table(%seTable_CMX: !InputSETableDistributedSOW) + parent_output(%parent_out_cmx : !OutputDistributedSOW) + outputs(%parent_out_cmx : !OutputDistributedSOW) + -> !OutputDistributedSOW + variants : { + DPUTask {cluster_id = 0 : i64, inEnd = [3, 5, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [1, 3, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 1 : i64, inEnd = [2, 5, 15], inStart = [0, 0, 0], mpe_mode = 
#VPU.mpe_mode, outEnd = [0, 3, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 2 : i64, inEnd = [2, 5, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, outEnd = [0, 3, 15], outStart = [0, 0, 0], pad = #VPU.Padding} + } PPE : { + PPETask {ppe = #VPU.PPEFp, clamp_low = -3.4028234663852886E+38 : f64, clamp_high = 3.4028234663852886E+38 : f64, scale = 1.000000e+00 : f64, prelu_alpha = [1.000000e+00], bias = 0.000000e+00 : f64, adder = 0.000000e+00 : f64>} + } + } + + VPURT.Task waits(%bar4: !VPURT.Barrier) { + VPUIP.NNDMA inputs(%parent_out_cmx: !OutputDistributedSOW) outputs(%parent_out: !Output_DDR) -> !Output_DDR + } + + return %parent_out: !Output_DDR + + // CHECK{LITERAL}: const.Declare memref<1x1x6x4xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [0, 0, 1024, 2048] + // CHECK-SAME: [0, 0, 1024, 2048] + // CHECK-SAME: [4096, 4096, 5120, 6144] + // CHECK-SAME: [8192, 8192, 9216, 10240] + // CHECK-SAME: [12288, 12288, 13312, 14336] + // CHECK-SAME: [12288, 12288, 13312, 14336] + // CHECK{LITERAL}: ]]]> : tensor<1x1x6x4xi32, {order = #NHWC}> + + // CHECK{LITERAL}: const.Declare memref<1x1x6x3xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [1, 1025, 2049] + // CHECK-SAME: [1, 1025, 2049] + // CHECK-SAME: [4097, 5121, 6145] + // CHECK-SAME: [8193, 9217, 10241] + // CHECK-SAME: [12289, 13313, 14337] + // CHECK-SAME: [12289, 13313, 14337] + // CHECK{LITERAL}: ]]]> : tensor<1x1x6x3xi32, {order = #NHWC}> + + // CHECK{LITERAL}: const.Declare memref<1x1x6x3xi32, #NHWC, @DDR> = dense<[[[ + // CHECK-SAME: [2, 1026, 1026] + // CHECK-SAME: [2, 1026, 1026] + // CHECK-SAME: [4098, 5122, 5122] + // CHECK-SAME: [8194, 9218, 9218] + // CHECK-SAME: [12290, 13314, 13314] + // CHECK-SAME: [12290, 13314, 13314] + // CHECK{LITERAL}: ]]]> : tensor<1x1x6x3xi32, {order = #NHWC}> +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_sw_kernel.mlir b/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_sw_kernel.mlir new file mode 100644 index 
0000000000..5836315fa4 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/unroll_distributed_ops_sw_kernel.mlir @@ -0,0 +1,70 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --unroll-distributed-ops --canonicalize %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +!qElemType = !quant.uniform +//CHECK-DAG: [[QTYPE_1:!.+]] = !quant.uniform +//CHECK-DAG: [[QTYPE_2:!.+]] = !quant.uniform + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!InputDistributed = !VPUIP.DistributedBuffer< + 16x48x3x3x!qElemType, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[8, 48, 3, 3], [8, 48, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [8, 0, 0, 0]], + memory_shapes = [[8, 48, 3, 3], [8, 48, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [8, 0, 0, 0]]} +> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 16x48x3x3xf16, #NHWC, @CMX_NN, + {mode = "SEGMENTED", num_tiles = [2, 1, 1, 1], num_clusters = 2 : i64, uniform_distributed_segments, + compute_shapes = [[8, 48, 3, 3], [8, 48, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [8, 0, 0, 0]], + memory_shapes = [[8, 48, 3, 3], [8, 48, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [8, 0, 0, 0]]} +> + + +module @VPU.SW { + func.func private @builtin_Dequantize(memref<*x!qElemType, @CMX_NN>, memref<*xf16, @CMX_NN>, none) attributes {VPU.kernel_code = "dequantize.cpp", VPU.kernel_entry = "dequantize", VPU.kernel_name = "dequantize", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +func.func @UnrollDequantOverQuantAxis(%arg0: memref<16x48x3x3xui8, #NHWC, @DDR>, %arg1: memref<16x48x3x3xf16, #NHWC, @DDR>) -> memref<16x48x3x3xf16, #NHWC, @DDR> { + %0 = VPURT.DeclareBuffer [0] <0> -> memref<16x48x3x3xf16, #NHWC, @DDR> + %1 = VPURT.DeclareBuffer <13824> -> 
!InputDistributed + %2 = VPURT.DeclareBuffer <0> -> !OutputDistributed + %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %4 = VPURT.DeclareBuffer [0] <0> -> memref<16x48x3x3x!qElemType, #NHWC, @DDR> + %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + VPURT.Task updates(%3 : !VPURT.Barrier) { + %6 = VPUIP.NNDMA {port = 0 : i64} inputs(%4 : memref<16x48x3x3x!qElemType, #NHWC, @DDR>) outputs(%1 : !InputDistributed) -> !InputDistributed + } + VPURT.Task waits(%3 : !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {listIndex = 0 : i64, resultSegmentSizes = array} + @VPU.SW::@builtin_Dequantize + inputs(%1 as %arg2: !InputDistributed) + outputs(%2 as %arg3: !OutputDistributed) -> !OutputDistributed + { + VPUIP.SW.Kernel.run {attrs = [[3, 16, 2963130708733665567, 3251366363510221414, 3435735286504893891, 3539601489976307753, 6341165033837320192, 6341165033837320192, 6341165033837320192, 6341165033837320192]]}(%arg2, %arg3) : !InputDistributed, !OutputDistributed + } + } + VPURT.Task waits(%5 : !VPURT.Barrier) { + %6 = VPUIP.NNDMA {port = 0 : i64} inputs(%2 : !OutputDistributed) outputs(%0 : memref<16x48x3x3xf16, #NHWC, @DDR>) -> memref<16x48x3x3xf16, #NHWC, @DDR> + } + return %arg1 : memref<16x48x3x3xf16, #NHWC, @DDR> + + //CHECK: @VPU.SW::@builtin_Dequantize + //CHECK-SAME: memref<8x48x3x3x[[QTYPE_1]], #NHWC, [@CMX_NN, 0]> + //CHECK{LITERAL}: VPUIP.SW.Kernel.run {attrs = [[3, 8, 2963130708733665567, 3251366363510221414, 6341165033837320192, 6341165033837320192]]} + + //CHECK: @VPU.SW::@builtin_Dequantize + //CHECK-SAME: memref<8x48x3x3x[[QTYPE_2]], #NHWC, [@CMX_NN, 1]> + //CHECK{LITERAL}: VPUIP.SW.Kernel.run {attrs = [[3, 8, 3435735286504893891, 3539601489976307753, 6341165033837320192, 6341165033837320192]]} +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/unroll_gather_dma_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/unroll_gather_dma_40XX+.mlir new file mode 100644 index 0000000000..f3eee19bc1 --- /dev/null +++ 
b/tests/lit/NPU/dialect/VPUIP/passes/unroll_gather_dma_40XX+.mlir @@ -0,0 +1,93 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --init-compiler="vpu-arch=%arch%" --unroll-gather-dma %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +!IndicesDistributed = !VPUIP.DistributedBuffer< + 1x1x1024x1xi64, #NCHW, @CMX_NN, { + mode = "DUPLICATED", num_clusters = 3 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[1, 1, 1024, 1], [1, 1, 1024, 1], [1, 1, 1024, 1]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] +}> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x1x1024x2048xf16, #NCHW, @CMX_NN, { + mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + compute_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]], + memory_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]] +}> + +// Case 1: Unroll ClusterD2SDMA with single-cluster input and multi-cluster(SEGMENTED) output +func.func @UnrollGatherDMA() -> !OutputDistributed { + %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %2 = VPURT.DeclareBuffer <0> -> memref<1x1x128256x2048xf16, @DDR> + %3 = VPURT.DeclareBuffer <16768> -> !IndicesDistributed + %4 = VPURT.DeclareBuffer <24960> -> !OutputDistributed + + VPURT.Task waits(%0 : !VPURT.Barrier) updates(%1 : !VPURT.Barrier) { + VPUIP.GatherDMA {channelType = 0 : i64, elementSize = 0 : i64, padding = 0 : i64, port = 0 : i64} + inputs(%2 : memref<1x1x128256x2048xf16, @DDR>) + indices(%3 : !IndicesDistributed) + outputs(%4 : 
!OutputDistributed) -> !OutputDistributed + } + + return %4: !OutputDistributed + + //CHECK: [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFF_0:%.+]] = VPURT.DeclareBuffer <0> -> memref<1x1x128256x683xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR> + //CHECK: [[INPUT_BUFF_1:%.+]] = VPURT.DeclareBuffer <1366> -> memref<1x1x128256x683xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR> + //CHECK: [[INPUT_BUFF_2:%.+]] = VPURT.DeclareBuffer <2732> -> memref<1x1x128256x682xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR> + + //CHECK: [[INDICES_BUFF_0:%.+]] = VPURT.DeclareBuffer [0] <16768> -> memref<1x1x1024x1xi64, [@CMX_NN, 0]> + //CHECK: [[INDICES_BUFF_1:%.+]] = VPURT.DeclareBuffer [1] <16768> -> memref<1x1x1024x1xi64, [@CMX_NN, 1]> + //CHECK: [[INDICES_BUFF_2:%.+]] = VPURT.DeclareBuffer [2] <16768> -> memref<1x1x1024x1xi64, [@CMX_NN, 2]> + + //CHECK: [[OUTPUT_BUFF:%.+]] = VPURT.DeclareBuffer <24960> + //CHECK-SAME: -> !VPUIP.DistributedBuffer<1x1x1024x2048xf16, #NCHW, @CMX_NN, + //CHECK-SAME: {mode = "SEGMENTED", num_tiles = [1, 1, 1, 3], num_clusters = 3 : i64, uniform_distributed_segments, + //CHECK-SAME{LITERAL}: compute_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + //CHECK-SAME{LITERAL}: compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]], + //CHECK-SAME{LITERAL}: memory_shapes = [[1, 1, 1024, 683], [1, 1, 1024, 683], [1, 1, 1024, 682]], + //CHECK-SAME{LITERAL}: memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 683], [0, 0, 0, 1366]]}> + + //CHECK: [[OUTPUT_BUFF_0:%.+]] = VPURT.DeclareBuffer [0] <24960> -> memref<1x1x1024x683xf16, [@CMX_NN, 0]> + //CHECK: [[OUTPUT_BUFF_1:%.+]] = VPURT.DeclareBuffer [1] <24960> -> memref<1x1x1024x683xf16, [@CMX_NN, 1]> + //CHECK: [[OUTPUT_BUFF_2:%.+]] = VPURT.DeclareBuffer [2] <24960> -> memref<1x1x1024x682xf16, 
[@CMX_NN, 2]> + + //CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.GatherDMA {channelType = 0 : i64, elementSize = 0 : i64, padding = 0 : i64, port = 0 : i64} + //CHECK-SAME: inputs([[INPUT_BUFF_0]] : memref<1x1x128256x683xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR>) + //CHECK-SAME: indices([[INDICES_BUFF_0]] : memref<1x1x1024x1xi64, [@CMX_NN, 0]>) + //CHECK-SAME: outputs([[OUTPUT_BUFF_0]] : memref<1x1x1024x683xf16, [@CMX_NN, 0]>) + //CHECK-SAME: -> memref<1x1x1024x683xf16, [@CMX_NN, 0]> + //CHECK: } + //CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.GatherDMA {channelType = 0 : i64, elementSize = 0 : i64, padding = 0 : i64, port = 0 : i64} + //CHECK-SAME: inputs([[INPUT_BUFF_1]] : memref<1x1x128256x683xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR>) + //CHECK-SAME: indices([[INDICES_BUFF_1]] : memref<1x1x1024x1xi64, [@CMX_NN, 1]>) + //CHECK-SAME: outputs([[OUTPUT_BUFF_1]] : memref<1x1x1024x683xf16, [@CMX_NN, 1]>) + //CHECK-SAME: -> memref<1x1x1024x683xf16, [@CMX_NN, 1]> + //CHECK: } + //CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.GatherDMA {channelType = 0 : i64, elementSize = 0 : i64, padding = 0 : i64, port = 0 : i64} + //CHECK-SAME: inputs([[INPUT_BUFF_2]] : memref<1x1x128256x682xf16, {order = #NCHW, strides = [262668288, 262668288, 2048, 1]}, @DDR>) + //CHECK-SAME: indices([[INDICES_BUFF_2]] : memref<1x1x1024x1xi64, [@CMX_NN, 2]>) + //CHECK-SAME: outputs([[OUTPUT_BUFF_2]] : memref<1x1x1024x682xf16, [@CMX_NN, 2]>) + //CHECK-SAME: -> memref<1x1x1024x682xf16, [@CMX_NN, 2]> + //CHECK: } + + //CHECK: return [[OUTPUT_BUFF]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_to_nndma_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_37XX.mlir similarity index 99% rename from 
tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_to_nndma_37XX.mlir rename to tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_37XX.mlir index 00620033f7..8907087ca4 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_to_nndma_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_37XX.mlir @@ -1,9 +1,9 @@ // -// Copyright (C) 2022-2025 Intel Corporation. +// Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --unroll-permute-to-nndma %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --unroll-permute-dma %s | FileCheck %s // REQUIRES: arch-NPU37XX #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> @@ -1257,8 +1257,8 @@ func.func @PermuteToDMAWithNCHWToNHWC2D() -> !OutputDistributed { #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -// CHECK-LABEL: @UniformPermuteDMAPlaneSizeRequiredTwoDMAs -func.func @UniformPermuteDMAPlaneSizeRequiredTwoDMAs() -> memref<1x16x1x261xf16, [@CMX_NN, 0]> { +// CHECK-LABEL: @UniformPermuteDMAPlaneSizeRequiresTwoDMAs +func.func @UniformPermuteDMAPlaneSizeRequiresTwoDMAs() -> memref<1x16x1x261xf16, [@CMX_NN, 0]> { %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier %input = VPURT.DeclareBuffer [0] <0> -> memref<1x16x1x261xf16, #NHWC, [@CMX_NN, 0]> @@ -1302,8 +1302,8 @@ func.func @UniformPermuteDMAPlaneSizeRequiredTwoDMAs() -> memref<1x16x1x261xf16, #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -// CHECK-LABEL: @UniformPermuteDMAPlaneSizeRequiredFourDMAs -func.func @UniformPermuteDMAPlaneSizeRequiredFourDMAs() -> memref<1x16x1x520xf16, [@CMX_NN, 0]> { +// CHECK-LABEL: @UniformPermuteDMAPlaneSizeRequiresFourDMAs +func.func @UniformPermuteDMAPlaneSizeRequiresFourDMAs() -> memref<1x16x1x520xf16, [@CMX_NN, 0]> { %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier %input = VPURT.DeclareBuffer [0] <0> -> memref<1x16x1x520xf16, #NHWC, [@CMX_NN, 0]> 
diff --git a/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_40XX+.mlir new file mode 100644 index 0000000000..44c5e7a63e --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/unroll_permute_dma_40XX+.mlir @@ -0,0 +1,1393 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --unroll-permute-dma %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!InputDistributed = !VPUIP.DistributedBuffer< + 1x1x960x4xf16, #NHWC, @CMX_NN, + { + mode = "DUPLICATED", + num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 1, 960, 4], [1, 1, 960, 4], [1, 1, 960, 4], [1, 1, 960, 4]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[1, 1, 960, 4], [1, 1, 960, 4], [1, 1, 960, 4], [1, 1, 960, 4]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] + } +> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x960x4x1xf16, #NHWC, @CMX_NN, + { + mode = "DUPLICATED", + num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 960, 4, 1], [1, 960, 4, 1], [1, 960, 4, 1], [1, 960, 4, 1]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[1, 960, 4, 1], [1, 960, 4, 1], [1, 960, 4, 1], [1, 960, 4, 1]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] + } +> + +// CHECK-LABEL: @PermuteDMAForShapeWithDuplicatedOutputWithExplicitShapesAndOffsets +func.func @PermuteDMAForShapeWithDuplicatedOutputWithExplicitShapesAndOffsets() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer <0> -> !InputDistributed + %output = VPURT.DeclareBuffer <16384> -> !OutputDistributed + + VPURT.Task updates(%BAR_0 : !VPURT.Barrier) 
{ + %80 = VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, port = 0 : i64} + inputs(%input : !InputDistributed) + outputs(%output : !OutputDistributed) + -> !OutputDistributed + } + + return %output: !OutputDistributed + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <3840> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <16384> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0, 1, 2, 3] <16384> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0, 1, 2, 3] <17344> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NHWC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NHWC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +!InputDistributed = !VPUIP.DistributedBuffer< + 64x64x3x3xf16, #NCHW, @CMX_NN, + { + mode = "DUPLICATED", + num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes 
= [[64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] + } +> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 64x64x3x3xf16, #NHWC, @CMX_NN, + { + mode = "DUPLICATED", + num_clusters = 4 : i64, + uniform_distributed_segments, + compute_shapes = [[64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3]], + compute_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + memory_shapes = [[64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3], [64, 64, 3, 3]], + memory_offsets = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]] + } +> + +// CHECK-LABEL: @PermuteDMAForLayoutWithDuplicatedOutputWithExplicitShapesAndOffsets +func.func @PermuteDMAForLayoutWithDuplicatedOutputWithExplicitShapesAndOffsets() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer <0> -> !InputDistributed + %output = VPURT.DeclareBuffer <689152> -> !OutputDistributed + + VPURT.Task updates(%BAR_0 : !VPURT.Barrier) { + %1 = VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, port = 0 : i64} + inputs(%input : !InputDistributed) + outputs(%output : !OutputDistributed) + -> !OutputDistributed + } + + return %output: !OutputDistributed + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <36864> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <689152> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0, 1, 2, 3] <689152> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0, 1, 2, 3] 
<726016> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform +!qElemType1 = !quant.uniform + +// CHECK: !qElemType = !quant.uniform +// CHECK: !qElemType1 = !quant.uniform + +// CHECK-LABEL: @PermuteDMAWithTileOverQuantAxisAndTransposeOfQuantAxis +func.func @PermuteDMAWithTileOverQuantAxisAndTransposeOfQuantAxis() -> memref<1x32x1x8x!qElemType, #NHWC, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x8x1x32x!qElemType1, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<1x32x1x8x!qElemType, #NHWC, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x8x1x32x!qElemType1, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x32x1x8x!qElemType, #NHWC, [@CMX_NN, 0]>) -> memref<1x32x1x8x!qElemType, #NHWC, [@CMX_NN, 0]> + } + + return %output: 
memref<1x32x1x8x!qElemType, #NHWC, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+!qElemType1.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <128> -> [[INPUT_TYPE_1:.+!qElemType1.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+!qElemType.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+!qElemType.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4112> -> [[OUTPUT_TYPE_1:.+!qElemType.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NWHC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NWHC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithNoSplitForNonByteAlignedElementType +func.func @PermuteDMAWithNoSplitForNonByteAlignedElementType() -> memref<4x3x3x3xui4, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<4x3x3x3xui4, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<4x3x3x3xui4, [@CMX_NN, 0]> + + 
VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>} + inputs(%input : memref<4x3x3x3xui4, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<4x3x3x3xui4, [@CMX_NN, 0]>) -> memref<4x3x3x3xui4, [@CMX_NN, 0]> + } + + return %output: memref<4x3x3x3xui4, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+ui4.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+ui4.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+ui4.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform + +// CHECK: !qElemType = !quant.uniform +// CHECK: !qElemType1 = !quant.uniform +// CHECK: !qElemType2 = !quant.uniform + +// CHECK-LABEL: @PermuteDMAWithTileOverQuantAxis +func.func @PermuteDMAWithTileOverQuantAxis() -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<4x3x3x3x!qElemType, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>} + inputs(%input : memref<4x3x3x3x!qElemType, #NHWC, [@CMX_NN, 0]>) + outputs(%output : 
memref<4x3x3x3x!qElemType, [@CMX_NN, 0]>) -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + } + + return %output: memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+!qElemType1.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <54> -> [[INPUT_TYPE_1:.+!qElemType2.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+!qElemType.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+!qElemType1.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4150> -> [[OUTPUT_TYPE_1:.+!qElemType2.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!qElemType = !quant.uniform + +// CHECK: !qElemType = !quant.uniform + +// CHECK-LABEL: @PermuteDMAWithTileOverNonQuantAxis +func.func @PermuteDMAWithTileOverNonQuantAxis() -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = 
VPURT.DeclareBuffer [0] <0> -> memref<4x3x3x3x!qElemType, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>} + inputs(%input : memref<4x3x3x3x!qElemType, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<4x3x3x3x!qElemType, [@CMX_NN, 0]>) -> memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + } + + return %output: memref<4x3x3x3x!qElemType, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+!qElemType.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <54> -> [[INPUT_TYPE_1:.+!qElemType.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+!qElemType.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+!qElemType.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4150> -> [[OUTPUT_TYPE_1:.+!qElemType.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : 
[[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithNHWCToNCHW +func.func @PermuteDMAWithNHWCToNCHW() -> memref<1x3x3x3xf16, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x3x3x3xf16, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<1x3x3x3xf16, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x3x3x3xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x3x3x3xf16, [@CMX_NN, 0]>) -> memref<1x3x3x3xf16, [@CMX_NN, 0]> + } + + return %output: memref<1x3x3x3xf16, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithNHWCToNCHW +func.func @PermuteDMAWithNHWCToNCHW() -> memref<1x8x16x16xf16, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x8x16x16xf16, #NHWC, 
[@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<1x8x16x16xf16, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x8x16x16xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x8x16x16xf16, [@CMX_NN, 0]>) -> memref<1x8x16x16xf16, [@CMX_NN, 0]> + } + + return %output: memref<1x8x16x16xf16, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <2048> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4352> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, 
d1)> + +// CHECK-LABEL: @PermuteDMAFromTranspose +func.func @PermuteDMAFromTranspose() -> memref<1x32x1x8xf16, #NHWC, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x8x1x32xf16, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <4096> -> memref<1x32x1x8xf16, #NHWC, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x8x1x32xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x32x1x8xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x32x1x8xf16, #NHWC, [@CMX_NN, 0]> + } + + return %output: memref<1x32x1x8xf16, #NHWC, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <256> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4128> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NWHC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = 
#NWHC + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithLargePlaneNumber +func.func @PermuteDMAWithLargePlaneNumber() -> memref<1x8x32x16xf16, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x8x32x16xf16, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <8192> -> memref<1x8x32x16xf16, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x8x32x16xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x8x32x16xf16, [@CMX_NN, 0]>) -> memref<1x8x32x16xf16, [@CMX_NN, 0]> + } + return %output: memref<1x8x32x16xf16, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <4096> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <8192> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <8192> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <8704> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + 
//CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096] + +module @VPU.SW { + func.func private @builtin_Convert(%input : memref<*xf16, [@CMX_NN, 0]>, %output : memref<*xf16, [@CMX_NN, 0]>) attributes { VPU.kernel_code = "convert_fp16.cpp", VPU.kernel_entry = "convert_fp16", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} +} + +// CHECK-LABEL: @UnrollDistributedPermuteDMA +func.func @UnrollDistributedPermuteDMA() -> memref<1x3x24x24xf16, #NHWC, @DDR> { + %result = VPURT.DeclareBuffer <0> -> memref<1x3x24x24xf16, #NHWC, @DDR> + %cst = const.Declare memref<1x1x1x16xui8> = dense<1> : tensor<1x1x1x16xui8> + %cst_0 = const.Declare memref<16x1x1x4xsi32> = dense<1> : tensor<16x1x1x4xsi32> + + %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %8 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %9 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %10 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %11 = 
VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %12 = VPURT.DeclareBuffer <0> -> memref<1x3x24x24xui8, @DDR> + %13 = VPURT.DeclareBuffer <0> -> memref<1x3x12x24xf16, #NHWC, @DDR> + %14 = VPURT.DeclareBuffer <1728> -> memref<1x3x12x24xf16, #NHWC, @DDR> + %15 = VPURT.DeclareBuffer [0] <3456> -> memref<1x3x24x24xui8, [@CMX_NN, 0]> + %16 = VPURT.DeclareBuffer [0] <0> -> memref<1x3x24x24xf16, [@CMX_NN, 0]> + %17 = VPURT.DeclareBuffer <0> -> memref<1x3x24x24xf16, @DDR> + %18 = VPURT.DeclareBuffer <3456> -> memref<1x16x24x24xf16, @DDR> + %19 = VPURT.DeclareBuffer <5440> -> !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %20 = VPURT.DeclareBuffer [0] <5440> -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 0]> + %21 = VPURT.DeclareBuffer [1] <5440> -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 1]> + %22 = VPURT.DeclareBuffer [0] <5184> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %23 = VPURT.DeclareBuffer [1] <5184> -> memref<16x1x1x4xsi32, [@CMX_NN, 1]> + %24 = VPURT.DeclareBuffer [0, 1] <5184> -> !VPUIP.DistributedBuffer<16x1x1x4xsi32, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %25 = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x16xui8, [@CMX_NN, 0]> + %26 = VPURT.DeclareBuffer [1] <0> -> memref<1x1x1x16xui8, [@CMX_NN, 1]> + %27 = VPURT.DeclareBuffer [0, 1] <0> -> !VPUIP.DistributedBuffer<1x1x1x16xui8, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %28 = VPURT.DeclareBuffer <14656> -> !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %29 = VPURT.DeclareBuffer [0] <14656> -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 0]> + %30 = VPURT.DeclareBuffer [1] <14656> -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 1]> + %31 = VPURT.DeclareBuffer <3456> -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %32 = VPURT.DeclareBuffer <6912> -> memref<1x3x24x24xf16, {order = 
#NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %33 = VPURT.DeclareBuffer <10368> -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %34 = VPURT.DeclareBuffer <13824> -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %35 = VPURT.DeclareBuffer <17280> -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %36 = VPURT.DeclareBuffer <0> -> memref<1x1x24x24xf16, {order = #NCHW, strides = [1728, 576, 24, 1]}, @DDR> + %37 = VPURT.DeclareBuffer <20736> -> memref<1x1x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + %38 = VPURT.DeclareBuffer [0] <14656> -> memref<1x3x12x24xf16, {order = #NHWC, strides = [9216, 1, 384, 16]}, [@CMX_NN, 0]> + %39 = VPURT.DeclareBuffer [1] <14656> -> memref<1x3x12x24xf16, {order = #NHWC, strides = [9216, 1, 384, 16]}, [@CMX_NN, 1]> + + VPURT.Task updates(%0 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%12 : memref<1x3x24x24xui8, @DDR>) outputs(%15 : memref<1x3x24x24xui8, [@CMX_NN, 0]>) -> memref<1x3x24x24xui8, [@CMX_NN, 0]> + } + VPURT.Task waits(%0 : !VPURT.Barrier) updates(%1 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%cst_0 : memref<16x1x1x4xsi32>) outputs(%24 : !VPUIP.DistributedBuffer<16x1x1x4xsi32, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<16x1x1x4xsi32, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + } + VPURT.Task waits(%0 : !VPURT.Barrier) updates(%1 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Convert inputs(%15 as %arg2: memref<1x3x24x24xui8, [@CMX_NN, 0]>) outputs(%16 as %arg3: memref<1x3x24x24xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x3x24x24xf16, [@CMX_NN, 0]>{ + VPUIP.SW.Kernel.run(%arg2, %arg3) : memref<1x3x24x24xui8, [@CMX_NN, 0]>, memref<1x3x24x24xf16, [@CMX_NN, 0]> + } + } + + // expand input 
+ VPURT.Task waits(%1 : !VPURT.Barrier) updates(%2 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%16 : memref<1x3x24x24xf16, [@CMX_NN, 0]>) outputs(%17 : memref<1x3x24x24xf16, @DDR>) -> memref<1x3x24x24xf16, @DDR> + } + VPURT.Task waits(%2 : !VPURT.Barrier) updates(%3 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%17 : memref<1x3x24x24xf16, @DDR>) outputs(%31 : memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + VPURT.Task waits(%3 : !VPURT.Barrier) updates(%4 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%17 : memref<1x3x24x24xf16, @DDR>) outputs(%32 : memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + VPURT.Task waits(%4 : !VPURT.Barrier) updates(%5 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%17 : memref<1x3x24x24xf16, @DDR>) outputs(%33 : memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + VPURT.Task waits(%5 : !VPURT.Barrier) updates(%6 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%17 : memref<1x3x24x24xf16, @DDR>) outputs(%34 : memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + VPURT.Task waits(%6 : !VPURT.Barrier) updates(%7 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%17 : memref<1x3x24x24xf16, @DDR>) outputs(%35 : memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x3x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + VPURT.Task waits(%7 
: !VPURT.Barrier) updates(%8 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%36 : memref<1x1x24x24xf16, {order = #NCHW, strides = [1728, 576, 24, 1]}, @DDR>) outputs(%37 : memref<1x1x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR>) -> memref<1x1x24x24xf16, {order = #NCHW, strides = [9216, 576, 24, 1]}, @DDR> + } + // permute + VPURT.Task waits(%8 : !VPURT.Barrier) updates(%9 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %41 = VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, port = 0 : i64, src_plane_stride = 0 : i64} inputs(%18 : memref<1x16x24x24xf16, @DDR>) outputs(%19 : !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + } + + VPURT.Task waits(%9 : !VPURT.Barrier) updates(%10 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%cst : memref<1x1x1x16xui8>) outputs(%27 : !VPUIP.DistributedBuffer<1x1x1x16xui8, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x1x1x16xui8, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + } + + // NCE task + VPURT.Task waits(%10 : !VPURT.Barrier) updates(%11 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NCEClusterTask {is_segmented, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} input(%20 : memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%22 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) parent_input(%19 : !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) parent_output(%28 : !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = 
"SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) outputs(%29 : memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 0]> variants : { + DPUTask {cluster_id = 0 : i64, outEnd = [23, 11, 15], mpe_mode = #VPU.mpe_mode, pad = #VPU.Padding, outStart = [0, 0, 0]} + } PPE : { + PPETask {ppe = #VPU.PPEStub<>} + } + } + VPURT.Task waits(%10 : !VPURT.Barrier) updates(%11 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NCEClusterTask {is_segmented, kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], task_type = #VPUIP.nce_task_type} input(%21 : memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 1]>) weight_table(%23 : memref<16x1x1x4xsi32, [@CMX_NN, 1]>) parent_input(%19 : !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) parent_output(%28 : !VPUIP.DistributedBuffer<1x16x24x24xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) outputs(%30 : memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 1]>) -> memref<1x16x12x24xf16, #NHWC, [@CMX_NN, 1]> variants : { + DPUTask {cluster_id = 1 : i64, outEnd = [23, 23, 15], mpe_mode = #VPU.mpe_mode, pad = #VPU.Padding, outStart = [0, 12, 0]} + } PPE : { + PPETask {ppe = #VPU.PPEStub<>} + } + } + // copy result + VPURT.Task waits(%11 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA inputs(%38 : memref<1x3x12x24xf16, {order = #NHWC, strides = [9216, 1, 384, 16]}, [@CMX_NN, 0]>) outputs(%13 : memref<1x3x12x24xf16, #NHWC, @DDR>) -> memref<1x3x12x24xf16, #NHWC, @DDR> + } + VPURT.Task waits(%11 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %40 = VPUIP.NNDMA {port = 1 : i64} inputs(%39 : memref<1x3x12x24xf16, {order = #NHWC, strides = [9216, 1, 384, 16]}, [@CMX_NN, 1]>) outputs(%14 : memref<1x3x12x24xf16, #NHWC, @DDR>) -> memref<1x3x12x24xf16, #NHWC, @DDR> + } + return %result : memref<1x3x24x24xf16, 
#NHWC, @DDR> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_8:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_9:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_10:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_11:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <3456> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <12672> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[INPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer <4032> -> [[INPUT_TYPE_2:.+]] + //CHECK: [[INPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer <13248> -> [[INPUT_TYPE_3:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <5440> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <5456> -> [[OUTPUT_TYPE_1:.+]] + //CHECK: [[OUTPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer [1] <5440> -> [[OUTPUT_TYPE_2:.+]] + //CHECK: [[OUTPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer [1] <5456> -> [[OUTPUT_TYPE_3:.+]] + + + //CHECK: VPURT.Task waits([[BAR_1]] : !VPURT.Barrier) updates([[BAR_2]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_2]] : !VPURT.Barrier) updates([[BAR_3]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_3]] : !VPURT.Barrier) updates([[BAR_4]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + 
//CHECK: VPURT.Task waits([[BAR_4]] : !VPURT.Barrier) updates([[BAR_5]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_5]] : !VPURT.Barrier) updates([[BAR_6]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_6]] : !VPURT.Barrier) updates([[BAR_7]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_7]] : !VPURT.Barrier) updates([[BAR_8]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + + + //CHECK: VPURT.Task waits([[BAR_8]] : !VPURT.Barrier) updates([[BAR_9]] : !VPURT.Barrier) + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_8]] : !VPURT.Barrier) updates([[BAR_9]] : !VPURT.Barrier) + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_8]] : !VPURT.Barrier) updates([[BAR_9]] : !VPURT.Barrier) + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_2]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_2]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_2]] + //CHECK: outputs([[OUTPUT_BUFFER_2]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_8]] : !VPURT.Barrier) updates([[BAR_9]] : !VPURT.Barrier) + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: 
#VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_3]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_3]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_3]] + //CHECK: outputs([[OUTPUT_BUFFER_3]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_9]] : !VPURT.Barrier) updates([[BAR_10]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA inputs(%cst_0 : memref<1x1x1x16xui8>) outputs(%33 : !VPUIP.DistributedBuffer<1x1x1x16xui8, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x1x1x16xui8, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + //CHECK: } + + // nce task + //CHECK: VPURT.Task waits([[BAR_10]] : !VPURT.Barrier) updates([[BAR_11]] : !VPURT.Barrier) + //CHECK: VPUIP.NCEClusterTask + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_10]] : !VPURT.Barrier) updates([[BAR_11]] : !VPURT.Barrier) + //CHECK: VPUIP.NCEClusterTask + //CHECK: } + + // copy back + //CHECK: VPURT.Task waits([[BAR_11]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA + //CHECK: } + //CHECK: VPURT.Task waits([[BAR_11]] : !VPURT.Barrier) + //CHECK: VPUIP.NNDMA {port = 1 : i64} + //CHECK: } + //CHECK: return %0 : memref<1x3x24x24xf16, #NHWC, @DDR> + +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithNCHWToNHWCForNetworkOutput +func.func @PermuteDMAWithNCHWToNHWCForNetworkOutput() -> memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer <0> -> memref<1x32x14x7xf16, @DDR> + %output = VPURT.DeclareBuffer [0] <6272> -> memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0 : !VPURT.Barrier) { + VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>} + inputs(%input : memref<1x32x14x7xf16, @DDR>) + outputs(%output : memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x32x14x7xf16, #NHWC, 
[@CMX_NN, 0]> + } + + return %output: memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <3136> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <6272> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <6272> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <6304> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @PermuteDMAWithNCHWToNHWCForNetworkInput +func.func @PermuteDMAWithNCHWToNHWCForNetworkInput() -> memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer <0> -> memref<1x32x14x7xf16, @DDR> + %output = VPURT.DeclareBuffer [0] <6272> -> memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0 : !VPURT.Barrier) { + 
VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>} + inputs(%input : memref<1x32x14x7xf16, @DDR>) + outputs(%output : memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]>) -> memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> + } + + return %output: memref<1x32x14x7xf16, #NHWC, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <3136> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <6272> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <6272> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <6304> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x32x14x7xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: 
@ClusterPermuteDMAWithNCHWToNHWCForNetworkOutput +func.func @ClusterPermuteDMAWithNCHWToNHWCForNetworkOutput() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %BAR_1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %0 = VPURT.DeclareBuffer <0> -> memref<1x32x14x7xf16, @DDR> + %1 = VPURT.DeclareBuffer <0> -> !OutputDistributed + + VPURT.Task waits(%BAR_0 : !VPURT.Barrier) updates(%BAR_1 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + VPUIP.PermuteDMA {mem_perm = #NHWC} + inputs(%0 : memref<1x32x14x7xf16, @DDR>) + outputs(%1 : !OutputDistributed) -> !OutputDistributed + } + return %1: !OutputDistributed + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <3136> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[INPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer <98> -> [[INPUT_TYPE_2:.+]] + //CHECK: [[INPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer <3234> -> [[INPUT_TYPE_3:.+]] + + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[RETURN_TYPE_0:.+]] + + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <32> -> [[OUTPUT_TYPE_1:.+]] + //CHECK: [[OUTPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer [1] <0> -> [[OUTPUT_TYPE_2:.+]] + //CHECK: [[OUTPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer [1] <32> -> [[OUTPUT_TYPE_3:.+]] + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: 
outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_2]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_2]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_2]] + //CHECK: outputs([[OUTPUT_BUFFER_2]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_3]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_3]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_3]] + //CHECK: outputs([[OUTPUT_BUFFER_3]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x32x14x7xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: @ClusterPermuteDMAWithNCHWToNHWCForNetworkInput +func.func @ClusterPermuteDMAWithNCHWToNHWCForNetworkInput() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %BAR_1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %0 = VPURT.DeclareBuffer <0> -> 
memref<1x32x14x7xf16, @DDR> + %1 = VPURT.DeclareBuffer <0> -> !OutputDistributed + + VPURT.Task waits(%BAR_0 : !VPURT.Barrier) updates(%BAR_1 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + VPUIP.PermuteDMA {mem_perm = #NHWC} + inputs(%0 : memref<1x32x14x7xf16, @DDR>) + outputs(%1 : !OutputDistributed) -> !OutputDistributed + } + return %1: !OutputDistributed + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <3136> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[INPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer <98> -> [[INPUT_TYPE_2:.+]] + //CHECK: [[INPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer <3234> -> [[INPUT_TYPE_3:.+]] + + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[RETURN_TYPE_0:.+]] + + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <32> -> [[OUTPUT_TYPE_1:.+]] + //CHECK: [[OUTPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer [1] <0> -> [[OUTPUT_TYPE_2:.+]] + //CHECK: [[OUTPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer [1] <32> -> [[OUTPUT_TYPE_3:.+]] + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType 
= [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_2]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_2]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_2]] + //CHECK: outputs([[OUTPUT_BUFFER_2]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_3]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_3]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_3]] + //CHECK: outputs([[OUTPUT_BUFFER_3]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NC = affine_map<(d0, d1) -> (d0, d1)> +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +!qElemType = !quant.uniform + +!InputDistributed = !VPUIP.DistributedBuffer< + 1x4x8x8x!qElemType, #NHWC, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 2 : i64 +}> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x4x8x8x!qElemType, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: @ClusterPermuteDMAWithDistributedInputAndOutput +func.func @ClusterPermuteDMAWithDistributedInputAndOutput() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %BAR_1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %0 = VPURT.DeclareBuffer <0> 
-> !InputDistributed + %1 = VPURT.DeclareBuffer <2000> -> !OutputDistributed + + VPURT.Task waits(%BAR_0 : !VPURT.Barrier) updates(%BAR_1 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + VPUIP.PermuteDMA {mem_perm = #map} + inputs(%0 : !InputDistributed) + outputs(%1 : !OutputDistributed) -> !OutputDistributed + } + return %1: !OutputDistributed + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + //CHECK: [[BAR_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <128> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <2000> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0, 1] <2000> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0, 1] <2032> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task waits([[BAR_0]] : !VPURT.Barrier) updates([[BAR_1]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NCHW 
= affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +!OutputDistributed = !VPUIP.DistributedBuffer< + 1x72x2x1xf16, #NHWC, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: @PermuteDMAWithNCHWToNHWC2D +func.func @PermuteDMAWithNCHWToNHWC2D() -> !OutputDistributed { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer <0> -> memref<1x72x2x1xf16, @DDR> + %output = VPURT.DeclareBuffer <0> -> !OutputDistributed + + VPURT.Task updates(%BAR_0 : !VPURT.Barrier) attributes {isTrailingSWLayer = false} { + %18 = VPUIP.PermuteDMA {mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, port = 0 : i64} inputs(%input : memref<1x72x2x1xf16, @DDR>) outputs(%output : !OutputDistributed) -> !OutputDistributed + } + return %output: !VPUIP.DistributedBuffer<1x72x2x1xf16, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer <144> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[INPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer <2> -> [[INPUT_TYPE_2:.+]] + //CHECK: [[INPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer <146> -> [[INPUT_TYPE_3:.+]] + + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer <0> -> [[RETURN_TYPE_0:.+]] + + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <72> -> [[OUTPUT_TYPE_1:.+]] + //CHECK: [[OUTPUT_BUFFER_2:%.+]] = VPURT.DeclareBuffer [1] <0> -> [[OUTPUT_TYPE_2:.+]] + //CHECK: [[OUTPUT_BUFFER_3:%.+]] = VPURT.DeclareBuffer [1] <72> -> [[OUTPUT_TYPE_3:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + 
//CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_2]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_2]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_2]] + //CHECK: outputs([[OUTPUT_BUFFER_2]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_3]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_3]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NCHW + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_3]] + //CHECK: outputs([[OUTPUT_BUFFER_3]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: @UniformPermuteDMAPlaneSizeRequiresTwoDMAs +func.func @UniformPermuteDMAPlaneSizeRequiresTwoDMAs() -> memref<1x16x1x261xf16, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x16x1x261xf16, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <8352> -> 
memref<1x16x1x261xf16, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x16x1x261xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x16x1x261xf16, [@CMX_NN, 0]>) -> memref<1x16x1x261xf16, [@CMX_NN, 0]> + } + + return %output: memref<1x16x1x261xf16, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <16> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <8352> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <8352> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <12528> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK-LABEL: 
@UniformPermuteDMAPlaneSizeDoesNotRequireFourDMAs +func.func @UniformPermuteDMAPlaneSizeDoesNotRequireFourDMAs() -> memref<1x16x1x520xf16, [@CMX_NN, 0]> { + %BAR_0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %input = VPURT.DeclareBuffer [0] <0> -> memref<1x16x1x520xf16, #NHWC, [@CMX_NN, 0]> + %output = VPURT.DeclareBuffer [0] <16640> -> memref<1x16x1x520xf16, [@CMX_NN, 0]> + + VPURT.Task updates(%BAR_0: !VPURT.Barrier) { + VPUIP.PermuteDMA {dst_stride = 0 : i64, mem_perm = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, port = 0 : i64, src_plane_stride = 0 : i64} + inputs(%input : memref<1x16x1x520xf16, #NHWC, [@CMX_NN, 0]>) + outputs(%output : memref<1x16x1x520xf16, [@CMX_NN, 0]>) -> memref<1x16x1x520xf16, [@CMX_NN, 0]> + } + + return %output: memref<1x16x1x520xf16, [@CMX_NN, 0]> + + //CHECK: [[BAR_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + //CHECK: [[INPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <0> -> [[INPUT_TYPE_0:.+]] + //CHECK: [[INPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <8320> -> [[INPUT_TYPE_1:.+]] + //CHECK: [[RETURN_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <16640> -> [[RETURN_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_0:%.+]] = VPURT.DeclareBuffer [0] <16640> -> [[OUTPUT_TYPE_0:.+]] + //CHECK: [[OUTPUT_BUFFER_1:%.+]] = VPURT.DeclareBuffer [0] <17160> -> [[OUTPUT_TYPE_1:.+]] + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_0]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_0]] + //CHECK-SAME: mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 0 + //CHECK: inputs([[INPUT_BUFFER_0]] + //CHECK: outputs([[OUTPUT_BUFFER_0]] + //CHECK: } + + //CHECK: VPURT.Task updates([[BAR_0]] : !VPURT.Barrier) { + //CHECK: VPUIP.PermuteDMA { + //CHECK-SAME: #VPUIP.InternalDataFlowAttr + //CHECK-SAME: inputType = [[INPUT_TYPE_1]] + //CHECK-SAME: outputType = [[OUTPUT_TYPE_1]] + //CHECK-SAME: 
mappingOrder = #NCHW + //CHECK-SAME: loopOrder = #NHWC + //CHECK-SAME: port = 1 + //CHECK: inputs([[INPUT_BUFFER_1]] + //CHECK: outputs([[OUTPUT_BUFFER_1]] + //CHECK: } + + //CHECK: return [[RETURN_BUFFER_0]] : [[RETURN_TYPE_0]] +} diff --git a/tests/lit/NPU/dialect/VPUIP/passes/wrap_with_permute_as_nndma_37XX+.mlir b/tests/lit/NPU/dialect/VPUIP/passes/wrap_with_permute_as_nndma_37XX+.mlir index 3e1e831b38..9e89bd74d2 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/wrap_with_permute_as_nndma_37XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/wrap_with_permute_as_nndma_37XX+.mlir @@ -1726,3 +1726,85 @@ func.func @FuseMemPermuteWithCopyNotApplicableDynamic(%arg0: !VPUIP.BoundedBuffe return %6 : !VPUIP.BoundedBuffer, dynamic_shape=memref<4xsi32, [@CMX_NN, 0]>> } + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +!OutDistributedType = !VPUIP.DistributedBuffer< + 1x232x48x84xf16, { + order = #NHWC, + strides = [967680, 1, 20160, 240]}, @CMX_NN, { + mode = "SEGMENTED", + num_tiles = [1, 1, 2, 1], + num_clusters = 2 : i64 +}> + +// CHECK-LABEL: @WrapExpandWithClusterCopyAsExpandDMAMultiChildren +// CHECK-SAME: [[INPUT:%.+]]: memref<1x232x48x84xf16, #NHWC> +func.func @WrapExpandWithClusterCopyAsExpandDMAMultiChildren(%arg0: memref<1x232x48x84xf16, #NHWC>) + -> (!OutDistributedType, !OutDistributedType) { + %alloc = memref.alloc() : memref<1x240x48x84xf16, #NHWC> + %alloc_0 = const.Declare memref<240x1x1x4xsi32> = dense<1> : tensor<240x1x1x4xsi32> + %alloc_1 = const.Declare memref<240x16x1x1xf16> = dense<1.0> : tensor<240x16x1x1xf16> + %alloc_2 = const.Declare memref<368x240x1x1xf16> = dense<1.0> : tensor<368x240x1x1xf16> + %alloc_3 = const.Declare memref<368x1x1x4xsi32> = dense<1> : tensor<368x1x1x4xsi32> + %0 = VPUIP.Expand {pads_begin = [0, 0, 0, 0], pads_end = [0, 8, 0, 0]} inputs(%arg0 : memref<1x232x48x84xf16, #NHWC>) outputs(%alloc : memref<1x240x48x84xf16, #NHWC>) -> memref<1x240x48x84xf16, #NHWC> + + %1 = VPURT.AllocDistributed -> 
!VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %2 = VPUIP.Copy + inputs(%0 : memref<1x240x48x84xf16, #NHWC>) + outputs(%1 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %3 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<240x16x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %4 = VPUIP.Copy + inputs(%alloc_1 : memref<240x16x1x1xf16>) + outputs(%3 : !VPUIP.DistributedBuffer<240x16x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<240x16x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %5 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<240x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %6 = VPUIP.Copy + inputs(%alloc_0 : memref<240x1x1x4xsi32>) + outputs(%5 : !VPUIP.DistributedBuffer<240x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<240x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %7 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %8 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], minimumHardwareExecutionCost = 36369 : i64, task_type = #VPUIP.nce_task_type} + input(%2 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + weights(%4 : 
!VPUIP.DistributedBuffer<240x16x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + weight_table(%6 : !VPUIP.DistributedBuffer<240x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + parent_input(%2 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + parent_output(%7 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + outputs(%7 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> variants : { + DPUTask {cluster_id = 0 : i64, mpe_mode = #VPU.mpe_mode, outEnd = [41, 11, 63], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 1 : i64, mpe_mode = #VPU.mpe_mode, outEnd = [41, 23, 63], outStart = [0, 12, 0], pad = #VPU.Padding} + } PPE : { + PPETask {ppe = #VPU.PPEStub<>} + } + %9 = VPUIP.SubView %8 [0, 0, 0, 0] [1, 232, 48, 84] : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> to !OutDistributedType + %10 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + %11 = VPUIP.NCEClusterTask {kernel_padding = #VPU.Padding, kernel_size = [1, 1], kernel_strides = [1, 1], minimumHardwareExecutionCost = 36369 : i64, task_type = #VPUIP.nce_task_type} + input(%2 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + weights(%4 : !VPUIP.DistributedBuffer<240x16x1x1xf16, #NHWC, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + 
weight_table(%6 : !VPUIP.DistributedBuffer<240x1x1x4xsi32, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + parent_input(%2 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + parent_output(%10 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + outputs(%10 : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> variants : { + DPUTask {cluster_id = 0 : i64, mpe_mode = #VPU.mpe_mode, outEnd = [41, 11, 63], outStart = [0, 0, 0], pad = #VPU.Padding} + DPUTask {cluster_id = 1 : i64, mpe_mode = #VPU.mpe_mode, outEnd = [41, 23, 63], outStart = [0, 12, 0], pad = #VPU.Padding} + } PPE : { + PPETask {ppe = #VPU.PPEStub<>} + } + %12 = VPUIP.SubView %11 [0, 0, 0, 0] [1, 232, 48, 84] : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> to !OutDistributedType + return %9, %12 : !OutDistributedType, !OutDistributedType + + // CHECK-NOT: VPUIP.Expand + // CHECK: [[OUT_BUFF:%.+]] = VPURT.AllocDistributed + // CHECK: [[EXPAND_DMA:%.+]] = VPUIP.ExpandDMA {pads_begin = [0, 0, 0, 0], pads_end = [0, 8, 0, 0]} + // CHECK-SAME: inputs([[INPUT]] : memref<1x232x48x84xf16, #NHWC>) + // CHECK-SAME: outputs([[OUT_BUFF]] : !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}>) + // CHECK-SAME: -> !VPUIP.DistributedBuffer<1x240x48x84xf16, #NHWC, @CMX_NN, {mode = "SEGMENTED", num_tiles = [1, 1, 2, 1], num_clusters = 2 : i64}> + + //CHECK: [[NCE1:%.*]] = VPUIP.NCEClusterTask + // CHECK-SAME: input([[EXPAND_DMA]] + 
//CHECK: [[NCE2:%.*]] = VPUIP.NCEClusterTask + // CHECK-SAME: input([[EXPAND_DMA]] +} + diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_37XX.mlir index c714c852d7..9a3f82c15b 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_37XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_37XX.mlir @@ -9,7 +9,7 @@ #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> // CHECK-LABEL: @SoftMax -module @SoftMax attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { // CHECK-DAG: {{ }}IE.TileResource VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -87,7 +87,7 @@ module @SoftMax attributes {VPU.arch = #VPU.arch_kind, config.compilati #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> // CHECK-LABEL: @TwoFunctions -module @TwoFunctions attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { // CHECK-DAG: {{ }}IE.TileResource VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -271,7 +271,7 @@ module @TwoFunctions attributes {VPU.arch = #VPU.arch_kind, config.comp // ----- // CHECK-LABEL: TestCopy -module @TestCopy attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @TestCopy attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Parameter_213" : tensor<2x4x20x20xf16> DataInfo "vpu_shape_Parameter_213" : tensor<4xsi32> diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir 
b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir index c89e0fb3a7..7b71b4c779 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir @@ -19,11 +19,7 @@ }> // CHECK-LABEL: @SoftMax -module @SoftMax attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - // CHECK-DAG: {{ }}IE.TileResource - // CHECK-DAG: {{ }}module @DmaProfilingReservedMemory - // CHECK-NEXT: {{ }}IE.MemoryResource 1024 bytes of @CMX_NN offset 1473536 - +module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} @@ -169,7 +165,7 @@ module @SoftMax attributes {VPU.arch = #VPU.arch_kind, config.compilati }> // CHECK-LABEL: @TwoFunctions -module @TwoFunctions attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { // CHECK-DAG: {{ }}IE.TileResource VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir index d6a520266d..dab62a46bf 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir @@ -8,11 +8,7 @@ // REQUIRES: arch-NPU40XX // CHECK-LABEL: 
@Gather -module @Gather attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - // CHECK-DAG: {{ }}IE.TileResource - // CHECK-DAG: {{ }}module @DmaProfilingReservedMemory - // CHECK-NEXT: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN - +module @Gather attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir index 940416311e..02e4a016c5 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW allow-custom-values=true" --mlir-elide-elementsattrs-if-larger 8 --default-hw-mode-vpuip="enable-sw-kernel-prefetching-reserve-mem=true" %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=DefaultHW allow-custom-values=true" --mlir-elide-elementsattrs-if-larger 8 --default-hw-mode-vpuip %s | FileCheck %s // REQUIRES: arch-NPU40XX #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> -module @VerticalFusionOutlining attributes {config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @VerticalFusionOutlining attributes {config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", 
VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/memory_allocation.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/memory_allocation.mlir index 2cdcd23b0c..09449ca17d 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/memory_allocation.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/memory_allocation.mlir @@ -365,12 +365,10 @@ module @ThreeFunctionsReservedMem { builtin.module @ReservedMemory { module @CustomReservedMemory { - IE.MemoryResource 512 bytes of @DDR + IE.MemoryResource 512 bytes of @DDR offset 0 } } - // CHECK: IE.MemoryResource 512 bytes of @DDR offset 0 - net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x8x60x60xf16> diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_37XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_37XX.mlir new file mode 100644 index 0000000000..c7bb0005d1 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_37XX.mlir @@ -0,0 +1,74 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW allow-custom-values=true" --mlir-elide-elementsattrs-if-larger 8 --reference-sw-mode-vpuip %s | FileCheck %s +// REQUIRES: arch-NPU37XX + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +// CHECK-LABEL: @SoftMax +module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1000xf16> + } outputsInfo : { + DataInfo "softmax" : tensor<1x1000xf16> + } + + func.func @main(%arg0: memref<1x1000xf16>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> { + %0 = VPUIP.GenericReshape inputs(%arg0 : memref<1x1000xf16>) -> memref<1x1x1x1000xf16> + %1 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %2 = VPUIP.Copy + inputs(%0 : memref<1x1x1x1000xf16>) + outputs(%1 : !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %3 = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + %4 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax + inputs(%2 as %arg4: !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + 
outputs(%3 as %arg5: !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) on tile 0 + -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg4, %arg5) : !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>, !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + } + %alloc = memref.alloc() : memref<1x1x1x1000xf16> + %5 = VPUIP.Copy inputs(%4 : !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) outputs(%alloc : memref<1x1x1x1000xf16>) -> memref<1x1x1x1000xf16> + %6 = VPUIP.GenericReshape inputs(%5 : memref<1x1x1x1000xf16>) -> memref<1x1000xf16> + %7 = VPUIP.Copy inputs(%6 : memref<1x1000xf16>) outputs(%arg1 : memref<1x1000xf16>) -> memref<1x1000xf16> + return %7 : memref<1x1000xf16> + + + // CHECK: [[BUFF0:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1000xf16, @DDR> + // CHECK: [[DISTR_BUFF0:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK: [[DISTR_BUFF1:%.+]] = VPURT.AllocDistributed -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK: [[BUFF1:%.+]] = VPURT.DeclareBuffer <0> -> memref<1x1x1x1000xf16, @DDR> + // CHECK: [[BAR0:%.+]] = VPURT.ConfigureBarrier<0> -> !VPURT.Barrier + // CHECK: [[BUFF2:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x1000xf16, @DDR> + // CHECK: [[BAR1:%.+]] = VPURT.ConfigureBarrier<1> -> !VPURT.Barrier + // CHECK: [[BAR2:%.+]] = VPURT.ConfigureBarrier<2> -> !VPURT.Barrier + // CHECK: [[BAR3:%.+]] = VPURT.ConfigureBarrier<3> {isFinalBarrier} -> !VPURT.Barrier + // CHECK: [[BUFF3:%.+]] = VPURT.DeclareBuffer <0> -> memref<1x1000xf16, @DDR> + + // CHECK: VPURT.Task 
updates([[BAR0]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA {port = 0 : i64} inputs([[BUFF2]] : memref<1x1x1x1000xf16, @DDR>) outputs([[DISTR_BUFF0]] : !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}> + // CHECK-NEXT: } + + // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) updates([[BAR1]] : !VPURT.Barrier) { + // CHECK: %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs([[DISTR_BUFF0]] as {{[^:]+}}: !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) + // CHECK-SAME outputs([[DISTR_BUFF1]] as {{[^:]+}}: !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) on tile 0 -> !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>{ + + // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) updates([[BAR2]] : !VPURT.Barrier) { + // CHECK: %10 = VPUIP.NNDMA {port = 0 : i64} inputs([[DISTR_BUFF1]] : !VPUIP.DistributedBuffer<1x1x1x1000xf16, #NCHW, @CMX_NN, {mode = "DUPLICATED", num_clusters = 2 : i64}>) outputs([[BUFF1]] : memref<1x1x1x1000xf16, @DDR>) -> memref<1x1x1x1000xf16, @DDR> + // CHECK: } + + // CHECK: VPURT.Task waits([[BAR2]] : !VPURT.Barrier) updates([[BAR3]] : !VPURT.Barrier) { + // CHECK: %10 = VPUIP.NNDMA {port = 0 : i64} inputs([[BUFF3]] : memref<1x1000xf16, @DDR>) outputs([[BUFF0]] : memref<1x1000xf16, @DDR>) -> memref<1x1000xf16, @DDR> + // CHECK: } + } +} diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_40XX.mlir new file mode 100644 index 0000000000..e5f97dd94d --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/reference_sw_mode_40XX.mlir @@ -0,0 +1,86 @@ +// +// Copyright (C) 2022-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% compilation-mode=ReferenceSW allow-custom-values=true" --mlir-elide-elementsattrs-if-larger 8 --reference-sw-mode-vpuip %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + +!DistributedBuffer = !VPUIP.DistributedBuffer< + 1x1x1x1000xf16, #NCHW, @CMX_NN, { + mode = "DUPLICATED", + num_clusters = 1 : i64, + uniform_distributed_segments, + compute_shapes = [[1, 1, 1, 1000]], + compute_offsets = [[0, 0, 0, 0]], + memory_shapes = [[1, 1, 1, 1000]], + memory_offsets = [[0, 0, 0, 0]] +}> + +module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1000xf16> + } outputsInfo : { + DataInfo "softmax" : tensor<1x1000xf16> + } + + IE.TileResource 1 of @NCE at 1.300000e+03 MHz + + func.func @main(%arg0: memref<1x1000xf16>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> { + %0 = VPUIP.GenericReshape inputs(%arg0 : memref<1x1000xf16>) -> memref<1x1x1x1000xf16> + %1 = memref.alloc() : memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %2 = VPUIP.Copy inputs(%0 : memref<1x1x1x1000xf16>) outputs(%1 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %3 = memref.alloc() : memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %4 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%2 as %arg2: 
memref<1x1x1x1000xf16,[@CMX_NN, 0]>) outputs(%3 as %arg3: memref<1x1x1x1000xf16,[@CMX_NN, 0]>) on tile 0 -> memref<1x1x1x1000xf16,[@CMX_NN, 0]>{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg2, %arg3) : memref<1x1x1x1000xf16,[@CMX_NN, 0]>, memref<1x1x1x1000xf16,[@CMX_NN, 0]> + } + %5 = memref.alloc() : memref<1x1x1x1000xf16, [@CMX_NN, 0]> + + %6 = VPUIP.Copy inputs(%4 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs(%5 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + %7 = VPUIP.GenericReshape inputs(%6 : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1000xf16, [@CMX_NN, 0]> + %8 = VPUIP.Copy inputs(%7 : memref<1x1000xf16, [@CMX_NN, 0]>) outputs(%arg1 : memref<1x1000xf16>) -> memref<1x1000xf16> + return %8 : memref<1x1000xf16> + + // CHECK: [[BAR0:%.+]] = VPURT.ConfigureBarrier<0> {isStartBarrier} -> !VPURT.Barrier + // CHECK: [[BUFF0:%.+]] = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + // CHECK: [[BUFF1:%.+]] = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + // CHECK: [[BUFF2:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1000xf16, @DDR> + // CHECK: [[BUFF3:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: [[BUFF4:%.+]] = VPURT.DeclareBuffer [0] <2048> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: [[BUFF5:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: [[BAR1:%.+]] = VPURT.ConfigureBarrier<1> -> !VPURT.Barrier + // CHECK: [[BUFF6:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1x1x1000xf16, @DDR> + // CHECK: [[BAR2:%.+]] = VPURT.ConfigureBarrier<2> -> !VPURT.Barrier + // CHECK: [[BAR3:%.+]] = VPURT.ConfigureBarrier<3> -> !VPURT.Barrier + // CHECK: [[BAR4:%.+]] = VPURT.ConfigureBarrier<4> {isFinalBarrier} -> !VPURT.Barrier + // CHECK: [[BUFF7:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<1x1000xf16, [@CMX_NN, 0]> + + // CHECK: VPURT.Task updates([[BAR0]] : !VPURT.Barrier) { + // CHECK: VPUIP.SyncDMA {port = 0 : i64} inputs([[BUFF0]] : 
memref<0x0x0x0xi32, @DDR>) outputs([[BUFF1]] : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR> + // CHECK: } + + // CHECK: VPURT.Task waits([[BAR0]] : !VPURT.Barrier) updates([[BAR1]] : !VPURT.Barrier) { + // CHECK: VPUIP.NNDMA {port = 0 : i64} inputs([[BUFF6]] : memref<1x1x1x1000xf16, @DDR>) outputs([[BUFF3]] : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: } + + // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) updates([[BAR2]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs([[BUFF3]] as %arg2: memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs([[BUFF4]] as %arg3: memref<1x1x1x1000xf16, [@CMX_NN, 0]>) on tile 0 -> memref<1x1x1x1000xf16, [@CMX_NN, 0]>{ + + // CHECK: VPURT.Task waits([[BAR2]] : !VPURT.Barrier) updates([[BAR3]] : !VPURT.Barrier) { + // CHECK: VPUIP.NNDMA {port = 0 : i64} inputs([[BUFF4]] : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) outputs([[BUFF5]] : memref<1x1x1x1000xf16, [@CMX_NN, 0]>) -> memref<1x1x1x1000xf16, [@CMX_NN, 0]> + // CHECK: } + + // CHECK: VPURT.Task waits([[BAR3]] : !VPURT.Barrier) updates([[BAR4]] : !VPURT.Barrier) { + // CHECK: VPUIP.NNDMA {port = 0 : i64} inputs([[BUFF7]] : memref<1x1000xf16, [@CMX_NN, 0]>) outputs([[BUFF2]] : memref<1x1000xf16, @DDR>) -> memref<1x1000xf16, @DDR> + // CHECK: } + } +} diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_all_dma_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_all_dma_40XX+.mlir index 8a85a14be1..7262285b96 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_all_dma_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_all_dma_40XX+.mlir @@ -8,7 +8,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} 
{ +module @Convolution attributes {config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_initial_dma_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_initial_dma_40XX+.mlir index 5035584de2..d0e2e4b971 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_initial_dma_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_initial_dma_40XX+.mlir @@ -8,7 +8,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @Convolution attributes {config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module 
@DmaProfilingReservedMemory { @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_no_dma_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_no_dma_40XX+.mlir index 1d6e0e30a4..485cf03551 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_no_dma_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_barrier_configuration_ops_no_dma_40XX+.mlir @@ -8,7 +8,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { +module @Convolution attributes {config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN 
{VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_40xx+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_40xx+.mlir index 64a3b12a30..a55d3bb79c 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_40xx+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_40xx+.mlir @@ -61,7 +61,7 @@ module { module { IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @DmaAndVarint inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -114,7 +114,7 @@ module { // ----- -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @single_hswish inputsInfo : { @@ -171,7 +171,7 @@ module attributes {VPU.arch = #VPU.arch_kind} { // ----- -module attributes {VPU.arch = #VPU.arch_kind} { +module attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE at 6.000000e+02 MHz net.NetworkInfo 
entryPoint : @single_hswish_shave_idx1 inputsInfo : { @@ -215,10 +215,10 @@ module attributes {VPU.arch = #VPU.arch_kind} { VPUMI40XX.OpRanges } - // CHECK: VPURegMapped.Enqueue - // CHECK-SAME: <0:0:0> -> <0:0:1> - // CHECK-SAME: taskType = #VPURegMapped.task_type // CHECK: VPURegMapped.Enqueue // CHECK-SAME: <0:1:0> -> <0:1:0> // CHECK-SAME: taskType = #VPURegMapped.task_type + // CHECK: VPURegMapped.Enqueue + // CHECK-SAME: <0:0:0> -> <0:0:1> + // CHECK-SAME: taskType = #VPURegMapped.task_type } diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_full_wlm_40xx+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_full_wlm_40xx+.mlir new file mode 100644 index 0000000000..8298488cf5 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_bootstrap_work_items_full_wlm_40xx+.mlir @@ -0,0 +1,63 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --add-bootstrap-work-items="workload-management-mode=FWLM_V1_PAGES" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module { + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @DpusEnqueuedByDma inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @DpusEnqueuedByDma(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %0 = 
VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, @DDR> + %1 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x14x14xf16, @DDR> + %2 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, [@CMX_NN, 0]> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %5 = VPURT.DeclareBuffer [0] <16896> -> memref<1x1x1x4864xui8, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 2 : ui8} <1, -1> -> !VPURegMapped.Index<0:0:1> + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <2, -1> -> !VPURegMapped.Index<0:0:2> + %13 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, isFinalBarrier, producer_count = 1 : ui8} <3, -1> -> !VPURegMapped.Index<0:0:3> + %14 = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%0 : memref<1x16x16x16xf16, @DDR>) outputs(%2 : memref<1x16x16x16xf16, [@CMX_NN, 0]>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %15 = VPUMI40XX.NNDMA {is_out_of_order, port = 0 : i64} inputs(%cst : memref<1x1x1x4864xui8>) outputs(%5 : memref<1x1x1x4864xui8, [@CMX_NN, 0]>) previousDMA(%14 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %16 = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 
1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%15 : !VPURegMapped.Index<0:0:1>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + + %17 = VPUMI40XX.DPUInvariant {clean_after = 0 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 0 : ui64} input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %18 = VPUMI40XX.DPUInvariant {clean_after = 0 : ui64, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 0 : ui64} previousTask(%17 : !VPURegMapped.Index<0:0:0>) input(%7 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) outputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%12 : !VPURegMapped.Index<0:0:2>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %19 = VPUMI40XX.DPUVariant calls(%17 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %20 = VPUMI40XX.DPUVariant previousTask(%19 : !VPURegMapped.Index<0:0:0>) calls(%18 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], 
mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %21 = VPUMI40XX.MappedInference dmas((%14) : (!VPURegMapped.Index<0:0:0>)) invariants(%17 : !VPURegMapped.Index<0:0:0>) variants(%19 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) dmaCount([[3]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(4) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } + + // New bootstrap enqueue op added only for DMA tasks. + // DPUs are enqueued by enqueue DMA + + // CHECK: VPURegMapped.Enqueue + // CHECK-SAME <0:0:0> -> <0:0:1> + // CHECK-SAME taskType = #VPURegMapped.task_type + + // CHECK: workItemCount(1) + // CHECK-SAME: bootsrapWorkItemsCount(1) +} \ No newline at end of file diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_40XX+.mlir index 8cfcf83c33..e56e1ede26 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_40XX+.mlir @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : 
tensor<1x16x16x16xf16> } outputsInfo : { @@ -88,13 +88,13 @@ module @WaitAndUpdateBarrierSafetyForEnqueue attributes {config.compilationMode } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -170,13 +170,13 @@ module @EnqTaskUsingLcaBasedOnPrevBarsUsers { } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -236,13 +236,13 @@ module @TestSoftmax { } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of 
@CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1000x1x1xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_predefined_enqueue_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_predefined_enqueue_40XX+.mlir index ea063c1efd..af3605f355 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_predefined_enqueue_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_enqueue_ops_predefined_enqueue_40XX+.mlir @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -87,13 +87,13 @@ module @TwoDmaFifosEnqueueOpsForSameBarrierNotNextToEachOther attributes {config } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of 
@CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -163,13 +163,13 @@ module @Softmax attributes {config.compilationMode = #config.compilation_mode } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/add_fetch_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/add_fetch_ops_40XX+.mlir index d1d28543bb..edeeb9fa90 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/add_fetch_ops_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/add_fetch_ops_40XX+.mlir @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -94,13 +94,13 @@ module @Softmax attributes 
{config.compilationMode = #config.compilation_mode } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/barrier_computation_40XX.mlir b/tests/lit/NPU/dialect/VPUMI40XX/barrier_computation_40XX.mlir index ff0499d430..3794ddd39a 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/barrier_computation_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/barrier_computation_40XX.mlir @@ -6,7 +6,7 @@ // RUN: vpux-opt --split-input-file --vpu-arch=%arch% --barrier-computation-VPUMI40XX %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @Test attributes {VPU.arch = #VPU.arch_kind} { +module @Test attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 6 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { @@ -488,7 +488,7 @@ func.func @main(%arg0: memref<1x1x2x1000xf16>, %arg1: memref<1x1x2x1000xf16>) -> // ----- -module @Test attributes {VPU.arch = #VPU.arch_kind} { +module @Test attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 6 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { @@ -530,7 +530,7 @@ func.func @main(%arg0: memref<1x1x2x1000xf16>, %arg1: memref<1x1x2x1000xf16>) -> // ----- -module @TestCleanAfterSettingInCaseMultUpdBars attributes {VPU.arch = #VPU.arch_kind} { +module @TestCleanAfterSettingInCaseMultUpdBars attributes {config.arch = #config.arch_kind} { IE.ExecutorResource 1 of @DMA_NN IE.TileResource 6 of @NCE at 6.000000e+02 MHz net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/barrier_topological_mapping_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/barrier_topological_mapping_40XX+.mlir index d50ff05712..58269297de 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/barrier_topological_mapping_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/barrier_topological_mapping_40XX+.mlir @@ -17,13 +17,13 @@ module @Convolution { } } IE.MemoryResource 1327104 bytes of 
@CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -109,13 +109,13 @@ module @Convolution { } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/dump_statistics_of_wlm_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/dump_statistics_of_wlm_ops_40XX+.mlir new file mode 100644 index 0000000000..71724821b7 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUMI40XX/dump_statistics_of_wlm_ops_40XX+.mlir @@ -0,0 +1,85 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: env OV_NPU_LOG_LEVEL=LOG_INFO env IE_NPU_LOG_FILTER=dump-statistics-of-wlm-ops vpux-opt --init-compiler="vpu-arch=%arch% allow-custom-values=true" --dump-statistics-of-wlm-ops -o /dev/null %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @SimpleIrToCheckWlmStats attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %0 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, @DDR> + %1 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x14x14xf16, @DDR> + %2 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, [@CMX_NN, 0]> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %5 = VPURT.DeclareBuffer [0] <16896> -> memref<1x1x1x4864xui8, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> 
memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <4, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <0, -1> -> !VPURegMapped.Index<0:0:1> + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 2 : ui8}(%11 : !VPURegMapped.Index<0:0:1>) <1, -1> -> !VPURegMapped.Index<0:0:2> + %13 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8}(%12 : !VPURegMapped.Index<0:0:2>) <2, -1> -> !VPURegMapped.Index<0:0:3> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, isFinalBarrier, producer_count = 1 : ui8}(%13 : !VPURegMapped.Index<0:0:3>) <3, -1> -> !VPURegMapped.Index<0:0:4> + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %19 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%12 : !VPURegMapped.Index<0:0:2>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %20 = VPUMI40XX.DPUInvariant {clean_after = 3 : ui64, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 
3], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 4 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%19 : !VPURegMapped.Index<0:0:0>) input(%7 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) outputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) waits(%12 : !VPURegMapped.Index<0:0:2>) updates(%13 : !VPURegMapped.Index<0:0:3>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %21 = VPUMI40XX.DPUVariant taskLocation(%17 : !VPURegMapped.Index<0:0:0>) calls(%19 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %22 = VPUMI40XX.DPUVariant taskLocation(%18 : !VPURegMapped.Index<0:0:1>) previousTask(%21 : !VPURegMapped.Index<0:0:0>) calls(%20 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %23 = VPURT.DeclareBuffer <0> -> memref<1x1x1x1xi32, @DDR> + %24 = VPURT.DeclareBuffer <0> -> memref<1x1x1x1xi32, @DDR> + %25 = VPURegMapped.ViewTaskRange(%19 -> %20 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8> + %26 = VPURegMapped.ViewTaskRange(%15 -> %16 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8, [@CMX_NN, 0]> + %27 = VPURegMapped.ViewTaskRange(%21 -> %22 : <0:0:0> -> <0:0:1>) -> memref<2x224xui8> + %28 = VPURegMapped.ViewTaskRange(%17 -> %18 : <0:0:0> -> <0:0:1>) -> memref<2x224xui8, [@CMX_NN, 0]> + %29 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 1 : i64} inputs(%25 : memref<2x352xui8>) outputs(%26 : memref<2x352xui8, 
[@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %30 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 1 : i64} inputs(%27 : memref<2x224xui8>) outputs(%28 : memref<2x224xui8, [@CMX_NN, 0]>) previousDMA(%29 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %31 = VPUMI40XX.NNDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 1 : i64} inputs(%23 : memref<1x1x1x1xi32, @DDR>) outputs(%24 : memref<1x1x1x1xi32, @DDR>) previousDMA(%30 : !VPURegMapped.Index<0:0:1>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(1) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + %32 = VPUMI40XX.NNDMA {port = 1 : i64} inputs(%0 : memref<1x16x16x16xf16, @DDR>) outputs(%2 : memref<1x16x16x16xf16, [@CMX_NN, 0]>) previousDMA(%31 : !VPURegMapped.Index<0:0:2>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(2) clean_after(1) acceleration_mode() -> !VPURegMapped.Index<0:0:3> + %33 = VPUMI40XX.NNDMA {is_out_of_order, port = 1 : i64} inputs(%cst : memref<1x1x1x4864xui8>) outputs(%5 : memref<1x1x1x4864xui8, [@CMX_NN, 0]>) previousDMA(%32 : !VPURegMapped.Index<0:0:3>) updates(%12 : !VPURegMapped.Index<0:0:2>) start_after(3) clean_after(2) acceleration_mode() -> !VPURegMapped.Index<0:0:4> + %34 = VPUMI40XX.NNDMA {port = 1 : i64} inputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) outputs(%1 : memref<1x16x14x14xf16, @DDR>) waits(%13 : !VPURegMapped.Index<0:0:3>) updates(%14 : !VPURegMapped.Index<0:0:4>) start_after(5) clean_after(4) acceleration_mode() -> !VPURegMapped.Index<0:1:0> + %35 = VPURegMapped.Enqueue at(%10 : !VPURegMapped.Index<0:0:0>) (%21 -> %21 : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %36 = VPURegMapped.Enqueue previousTaskIdx(%35 : !VPURegMapped.Index<0:0:0>) at(%10 : !VPURegMapped.Index<0:0:0>) (%22 -> %22 : <0:0:1> -> <0:0:1>) -> 
!VPURegMapped.Index<0:0:1> {taskType = #VPURegMapped.task_type} + %37 = VPURegMapped.Enqueue previousTaskIdx(%36 : !VPURegMapped.Index<0:0:1>) at(%10 : !VPURegMapped.Index<0:0:0>) (%34 -> %34 : <0:1:0> -> <0:1:0>) -> !VPURegMapped.Index<0:0:2> {taskType = #VPURegMapped.task_type} + %38 = VPUMI40XX.MappedInference dmas((%29, %34) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:1:0>)) invariants(%19 : !VPURegMapped.Index<0:0:0>) variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%35 : !VPURegMapped.Index<0:0:0>) dmaCount([[5, 1]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(5) workItemCount(3) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +// CHECK: Fetch DMA count - 2 +// CHECK: DPUVariant - 1 +// CHECK: DPUInvariant - 1 +// CHECK: WorkItem count - 3 +// CHECK: DMA - 1 +// CHECK: DPUVariant - 2 diff --git a/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_ops_for_same_barrier_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_ops_for_same_barrier_40XX+.mlir index d48e265001..6235d79be7 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_ops_for_same_barrier_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_ops_for_same_barrier_40XX+.mlir @@ -17,13 +17,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + 
IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_targets_dpu_full_wlm_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_targets_dpu_full_wlm_40XX+.mlir new file mode 100644 index 0000000000..78f8ef2c51 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUMI40XX/link_enqueue_targets_dpu_full_wlm_40XX+.mlir @@ -0,0 +1,126 @@ +// +// Copyright (C) 2023-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --link-enqueue-targets="workload-management-mode=FWLM_V1_PAGES" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @DpusAndEnqueueDmaOp attributes {config.compilationMode = #config.compilation_mode} { + config.PipelineOptions @Options { + config.Option @VPU.UseDedicatedFifoPerShaveEngine : false + } + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: 
memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 4 : ui8, producer_count = 1 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <1, -1> -> !VPURegMapped.Index<0:0:1> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 4 : ui8}(%11 : !VPURegMapped.Index<0:0:1>) <2, -1> -> !VPURegMapped.Index<0:0:2> + + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %19 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %20 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %21 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %22 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %enq_dma0 = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : 
memref<0x0x0x0xi32, @DDR>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %enq_dma1 = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 2 : i64, endTask = 3 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%enq_dma0 : !VPURegMapped.Index<0:0:0>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + + %23 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %24 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%23 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %25 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%17 : 
!VPURegMapped.Index<0:0:2>) previousTask(%24 : !VPURegMapped.Index<0:0:1>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:2> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %26 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%18 : !VPURegMapped.Index<0:0:3>) previousTask(%25 : !VPURegMapped.Index<0:0:2>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:3> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + + %27 = VPUMI40XX.DPUVariant taskLocation(%19 : !VPURegMapped.Index<0:0:0>) calls(%23 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %28 = VPUMI40XX.DPUVariant taskLocation(%20 : !VPURegMapped.Index<0:0:1>) previousTask(%27 : !VPURegMapped.Index<0:0:0>) calls(%24 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %29 = VPUMI40XX.DPUVariant taskLocation(%21 : !VPURegMapped.Index<0:0:2>) previousTask(%28 : !VPURegMapped.Index<0:0:1>) calls(%25 : <0:0:2>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = 
[15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:2> + %30 = VPUMI40XX.DPUVariant taskLocation(%22 : !VPURegMapped.Index<0:0:3>) previousTask(%29 : !VPURegMapped.Index<0:0:2>) calls(%26 : <0:0:3>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:3> + + %31 = VPURegMapped.Enqueue (%enq_dma0 -> %enq_dma1 : <0:0:0> -> <0:0:1>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + + %33 = VPUMI40XX.MappedInference dmas((%enq_dma0) : (!VPURegMapped.Index<0:0:0>)) invariants(%23 : !VPURegMapped.Index<0:0:0>) variants(%27 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%31 : !VPURegMapped.Index<0:0:0>) dmaCount([[1]]) invariantCount([4]) variantCount([4]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(3) workItemCount(1) bootsrapWorkItemsCount(1) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +//CHECK: [[DMA_ENQ0:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64 +//CHECK-NOT: taskLinkAttrName +//CHECK-SAME: -> !VPURegMapped.Index[[DMA_ENQ0_IDX:.+]] + +//CHECK: [[DMA_ENQ1:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 2 : i64, endTask = 3 : i64>, port = 0 : i64 +//CHECK-SAME: taskLinkAttrName = #VPURegMapped.IndexType<[[DMA_ENQ0_IDX]]> +//CHECK-SAME: previousDMA([[DMA_ENQ0]] : !VPURegMapped.Index[[DMA_ENQ0_IDX]]) +//CHECK-SAME: -> 
!VPURegMapped.Index[[DMA_ENQ1_IDX:.+]] + +//CHECK: VPUMI40XX.DPUInvariant +//CHECK-NOT: taskLinkAttrName + +//CHECK: VPUMI40XX.DPUInvariant +//CHECK-NOT: taskLinkAttrName + +//CHECK: VPUMI40XX.DPUInvariant +//CHECK-NOT: taskLinkAttrName + +//CHECK: VPUMI40XX.DPUInvariant +//CHECK-NOT: taskLinkAttrName + +//CHECK: %[[VAR0:.+]] = VPUMI40XX.DPUVariant +//CHECK-NOT: taskLinkAttrName +//CHECK-SAME: -> [[VAR0_IDX:.+]] + +//CHECK: %[[VAR1:.+]] = VPUMI40XX.DPUVariant +//CHECK-SAME: taskLinkAttrName = #VPURegMapped.IndexType<[[VAR0_IDX]]> +//CHECK-SAME: -> [[VAR1_IDX:.+]] + +//CHECK: %[[VAR2:.+]] = VPUMI40XX.DPUVariant +//CHECK-NOT: taskLinkAttrName +//CHECK-SAME: -> [[VAR2_IDX:.+]] + +//CHECK: %[[VAR3:.+]] = VPUMI40XX.DPUVariant +//CHECK-SAME: taskLinkAttrName = #VPURegMapped.IndexType<[[VAR2_IDX]]> +//CHECK-SAME: -> [[VAR3_IDX:.+]] + +//CHECK: VPURegMapped.Enqueue +//CHECK-SAME: ([[DMA_ENQ0]] -> [[DMA_ENQ0]] : [[DMA_ENQ0_IDX]] -> [[DMA_ENQ0_IDX]]) diff --git a/tests/lit/NPU/dialect/VPUMI40XX/propagate_final_barrier_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/propagate_final_barrier_40XX+.mlir index 564539f0a2..52932a5669 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/propagate_final_barrier_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/propagate_final_barrier_40XX+.mlir @@ -16,13 +16,13 @@ module @TestConvolution { } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} 
net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/reorder_mapped_inference_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/reorder_mapped_inference_ops_40XX+.mlir index b974853e55..100419cd3b 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/reorder_mapped_inference_ops_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/reorder_mapped_inference_ops_40XX+.mlir @@ -14,7 +14,7 @@ module @test attributes {config.compilationMode = #config.compilation_mode } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_dma_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_dma_ops_40XX+.mlir new file mode 100644 index 0000000000..e15f8ba19e --- /dev/null +++ b/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_dma_ops_40XX+.mlir @@ -0,0 +1,382 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --vpu-arch=%arch% --split-enqueue-dma-ops %s | FileCheck %s +// REQUIRES: arch-NPU40XX + + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @DpuEnqueueDmaNoNeedSplit attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : 
tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 1 : ui8} <4, -1> -> !VPURegMapped.Index<0:0:0> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 2 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <3, -1> -> !VPURegMapped.Index<0:0:1> + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %enq_dma = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + + %19 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = 
#VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:1>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %20 = VPUMI40XX.DPUInvariant {clean_after = 3 : ui64, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 4 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%19 : !VPURegMapped.Index<0:0:0>) input(%7 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) outputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:1>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %21 = VPUMI40XX.DPUVariant taskLocation(%17 : !VPURegMapped.Index<0:0:0>) calls(%19 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %22 = VPUMI40XX.DPUVariant taskLocation(%18 : !VPURegMapped.Index<0:0:1>) previousTask(%21 : !VPURegMapped.Index<0:0:0>) calls(%20 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %25 = VPURegMapped.ViewTaskRange(%19 -> %20 : <0:0:0> -> <0:0:1>) -> 
memref<2x352xui8> + %26 = VPURegMapped.ViewTaskRange(%15 -> %16 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8, [@CMX_NN, 0]> + %35 = VPURegMapped.Enqueue (%enq_dma -> %enq_dma : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %37 = VPUMI40XX.MappedInference dmas((%enq_dma) : (!VPURegMapped.Index<0:0:0>)) invariants(%19 : !VPURegMapped.Index<0:0:0>) variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%35 : !VPURegMapped.Index<0:0:0>) dmaCount([[1]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(2) workItemCount(1) bootsrapWorkItemsCount(1) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +//CHECK: [[BAR0:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR1:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[DMA_ENQ:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} +//CHECK-SAME: updates([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:0> +//CHECK: VPURegMapped.Enqueue +//CHECK-SAME: ([[DMA_ENQ]] -> [[DMA_ENQ]] : <0:0:0> -> <0:0:0>) +//CHECK: dmas(([[DMA_ENQ]]) +//CHECK-SAME: workItemCount(1) +//CHECK-SAME: bootsrapWorkItemsCount(1) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @DpuEnqueueDmaNeedSplit attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + 
IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 1 : ui8} <4, -1> -> !VPURegMapped.Index<0:0:0> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 2 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <3, -1> -> !VPURegMapped.Index<0:0:1> + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %enq_dma = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : 
memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + + %19 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:1>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %20 = VPUMI40XX.DPUInvariant {clean_after = 3 : ui64, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 4 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%19 : !VPURegMapped.Index<0:0:0>) input(%7 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) outputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%14 : !VPURegMapped.Index<0:0:1>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %21 = VPUMI40XX.DPUVariant taskLocation(%17 : !VPURegMapped.Index<0:0:0>) calls(%19 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %22 = VPUMI40XX.DPUVariant taskLocation(%18 : !VPURegMapped.Index<0:0:1>) previousTask(%21 : !VPURegMapped.Index<0:0:0>) calls(%20 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, 
[@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %25 = VPURegMapped.ViewTaskRange(%19 -> %20 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8> + %26 = VPURegMapped.ViewTaskRange(%15 -> %16 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8, [@CMX_NN, 0]> + %35 = VPURegMapped.Enqueue (%enq_dma -> %enq_dma : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %37 = VPUMI40XX.MappedInference dmas((%enq_dma) : (!VPURegMapped.Index<0:0:0>)) invariants(%19 : !VPURegMapped.Index<0:0:0>) variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%35 : !VPURegMapped.Index<0:0:0>) dmaCount([[1]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(2) workItemCount(1) bootsrapWorkItemsCount(1) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +//CHECK: [[BAR0:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR1:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[DMA_ENQ0:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 0 : i64>, port = 0 : i64} +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:0> +//CHECK: [[DMA_ENQ1:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 1 : i64, endTask = 1 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: updates([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:1> +//CHECK: VPURegMapped.Enqueue +//CHECK-SAME: ([[DMA_ENQ0]] -> [[DMA_ENQ1]] : <0:0:0> -> <0:0:1>) +//CHECK: 
dmas(([[DMA_ENQ0]]) +//CHECK-SAME: workItemCount(1) +//CHECK-SAME: bootsrapWorkItemsCount(1) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @DpuEnqueueDmaNeedSplitMultipleTimes attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count 
= 1 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 4 : ui8, producer_count = 1 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <1, -1> -> !VPURegMapped.Index<0:0:1> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 4 : ui8}(%11 : !VPURegMapped.Index<0:0:1>) <2, -1> -> !VPURegMapped.Index<0:0:2> + + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %19 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %20 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %21 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %22 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %some_dma = VPUMI40XX.NNDMA {port = 1 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<1:0:0> + + %enq_dma = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 3 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + + %23 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 
0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %24 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%23 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %25 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%17 : !VPURegMapped.Index<0:0:2>) previousTask(%24 : !VPURegMapped.Index<0:0:1>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:2> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %26 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%18 : !VPURegMapped.Index<0:0:3>) previousTask(%25 : !VPURegMapped.Index<0:0:2>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:3> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + + %27 = VPUMI40XX.DPUVariant taskLocation(%19 : !VPURegMapped.Index<0:0:0>) 
calls(%23 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %28 = VPUMI40XX.DPUVariant taskLocation(%20 : !VPURegMapped.Index<0:0:1>) previousTask(%27 : !VPURegMapped.Index<0:0:0>) calls(%24 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %29 = VPUMI40XX.DPUVariant taskLocation(%21 : !VPURegMapped.Index<0:0:2>) previousTask(%28 : !VPURegMapped.Index<0:0:1>) calls(%25 : <0:0:2>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:2> + %30 = VPUMI40XX.DPUVariant taskLocation(%22 : !VPURegMapped.Index<0:0:3>) previousTask(%29 : !VPURegMapped.Index<0:0:2>) calls(%26 : <0:0:3>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:3> + + %31 = VPURegMapped.Enqueue (%enq_dma -> %enq_dma : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %32 = VPURegMapped.Enqueue previousTaskIdx(%31 : !VPURegMapped.Index<0:0:0>) (%some_dma -> %some_dma : <1:0:0> -> <1:0:0>) -> !VPURegMapped.Index<1:0:0> {taskType = #VPURegMapped.task_type} + + %33 = 
VPUMI40XX.MappedInference dmas((%enq_dma), (%some_dma) : (!VPURegMapped.Index<0:0:0>), (!VPURegMapped.Index<1:0:0>)) invariants(%23 : !VPURegMapped.Index<0:0:0>) variants(%27 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%31 : !VPURegMapped.Index<0:0:0>) dmaCount([[1],[1]]) invariantCount([4]) variantCount([4]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(3) workItemCount(2) bootsrapWorkItemsCount(2) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +//CHECK: [[BAR0:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR1:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR2:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[DMA:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {port = 1 : i64} +//CHECK-NOT: waits( +//CHECK-SAME: updates([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: -> !VPURegMapped.Index<1:0:0> +//CHECK: [[DMA_ENQ0:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 0 : i64>, port = 0 : i64} +//CHECK-SAME: waits([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:0> +//CHECK: [[DMA_ENQ1:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 1 : i64, endTask = 1 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-NOT: waits( +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:1> +//CHECK: [[DMA_ENQ2:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 2 : i64, endTask = 2 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ1]] : !VPURegMapped.Index<0:0:1>) +//CHECK-NOT: waits( +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:2> +//CHECK: [[DMA_ENQ3:%.+]] = 
VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 3 : i64, endTask = 3 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ2]] : !VPURegMapped.Index<0:0:2>) +//CHECK-NOT: waits( +//CHECK-SAME: updates([[BAR1]] : !VPURegMapped.Index<0:0:1>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:3> +//CHECK: VPURegMapped.Enqueue +//CHECK-SAME: ([[DMA_ENQ0]] -> [[DMA_ENQ3]] : <0:0:0> -> <0:0:3>) +//CHECK: dmas(([[DMA_ENQ0]]) +//CHECK-SAME: workItemCount(2) +//CHECK-SAME: bootsrapWorkItemsCount(2) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @DpuEnqueueDmaNeedSplitWithPreviousDma attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH,
[@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 1 : ui8} <0, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 4 : ui8, producer_count = 2 : ui8}(%10 : !VPURegMapped.Index<0:0:0>) <1, -1> -> !VPURegMapped.Index<0:0:1> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 4 : ui8}(%11 : !VPURegMapped.Index<0:0:1>) <2, -1> -> !VPURegMapped.Index<0:0:2> + + %15 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %18 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %19 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:0> + %20 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:1> + %21 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:2> + %22 = VPUMI40XX.DeclareTaskBuffer -> !VPURegMapped.Index<0:0:3> + + %some_dma = VPUMI40XX.NNDMA {port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + + %enq_dma0 = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%some_dma : !VPURegMapped.Index<0:0:0>) waits(%10 : 
!VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %enq_dma1 = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 2 : i64, endTask = 3 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%enq_dma0 : !VPURegMapped.Index<0:0:1>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + + %23 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %24 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%23 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %25 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%17 : !VPURegMapped.Index<0:0:2>) previousTask(%24 : 
!VPURegMapped.Index<0:0:1>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:2> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %26 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64} taskLocation(%18 : !VPURegMapped.Index<0:0:3>) previousTask(%25 : !VPURegMapped.Index<0:0:2>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%14 : !VPURegMapped.Index<0:0:2>) -> <0:0:3> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + + %27 = VPUMI40XX.DPUVariant taskLocation(%19 : !VPURegMapped.Index<0:0:0>) calls(%23 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %28 = VPUMI40XX.DPUVariant taskLocation(%20 : !VPURegMapped.Index<0:0:1>) previousTask(%27 : !VPURegMapped.Index<0:0:0>) calls(%24 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:1> + %29 = VPUMI40XX.DPUVariant taskLocation(%21 : !VPURegMapped.Index<0:0:2>) previousTask(%28 : !VPURegMapped.Index<0:0:1>) calls(%25 : <0:0:2>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], 
inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:2> + %30 = VPUMI40XX.DPUVariant taskLocation(%22 : !VPURegMapped.Index<0:0:3>) previousTask(%29 : !VPURegMapped.Index<0:0:2>) calls(%26 : <0:0:3>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], lastSecondaryTaskInExecutionGroup, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:3> + + %31 = VPURegMapped.Enqueue (%some_dma -> %enq_dma1 : <0:0:0> -> <0:0:2>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + + %32 = VPUMI40XX.MappedInference dmas((%some_dma) : (!VPURegMapped.Index<0:0:0>)) invariants(%23 : !VPURegMapped.Index<0:0:0>) variants(%27 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%31 : !VPURegMapped.Index<0:0:0>) dmaCount([[3]]) invariantCount([4]) variantCount([4]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(3) workItemCount(1) bootsrapWorkItemsCount(1) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +//CHECK: [[BAR0:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR1:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[BAR2:%.+]] = VPUMI40XX.ConfigureBarrier +//CHECK: [[DMA:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {port = 0 : i64} +//CHECK-NOT: waits( +//CHECK-SAME: updates([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:0> +//CHECK: [[DMA_ENQ0:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 0 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA]] : !VPURegMapped.Index<0:0:0>) +//CHECK-SAME: waits([[BAR0]] : 
!VPURegMapped.Index<0:0:0>) +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:1> +//CHECK: [[DMA_ENQ1:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 1 : i64, endTask = 1 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ0]] : !VPURegMapped.Index<0:0:1>) +//CHECK-NOT: waits( +//CHECK-SAME: updates([[BAR1]] : !VPURegMapped.Index<0:0:1>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:2> +//CHECK: [[DMA_ENQ2:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 2 : i64, endTask = 2 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ1]] : !VPURegMapped.Index<0:0:2>) +//CHECK-SAME: waits([[BAR0]] : !VPURegMapped.Index<0:0:0>) +//CHECK-NOT: updates( +//CHECK-SAME: -> !VPURegMapped.Index<0:0:3> +//CHECK: [[DMA_ENQ3:%.+]] = VPUMI40XX.NNDMA +//CHECK-SAME: {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 3 : i64, endTask = 3 : i64>, port = 0 : i64} +//CHECK-SAME: previousDMA([[DMA_ENQ2]] : !VPURegMapped.Index<0:0:3>) +//CHECK-NOT: waits( +//CHECK-SAME: updates([[BAR1]] : !VPURegMapped.Index<0:0:1>) +//CHECK-SAME: -> !VPURegMapped.Index<0:0:4> +//CHECK: VPURegMapped.Enqueue +//CHECK-SAME: ([[DMA]] -> [[DMA_ENQ3]] : <0:0:0> -> <0:0:4>) +//CHECK: dmas(([[DMA]]) +//CHECK-SAME: workItemCount(1) +//CHECK-SAME: bootsrapWorkItemsCount(1) diff --git a/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_ops_40XX.mlir b/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_ops_40XX.mlir index 64b1075a26..20766fabdc 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_ops_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/split_enqueue_ops_40XX.mlir @@ -9,7 +9,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = 
#config.compilation_mode} { +module @Convolution attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -17,13 +17,13 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -76,7 +76,7 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Convolution attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -84,13 +84,13 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN 
{config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -150,7 +150,7 @@ module @Convolution attributes {VPU.arch = #VPU.arch_kind, config.compi #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @ConvolutionMulipleGroupsInSingleEnq attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @ConvolutionMulipleGroupsInSingleEnq attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 1 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { module @DmaProfilingReservedMemory { @@ -158,13 +158,13 @@ module @ConvolutionMulipleGroupsInSingleEnq attributes {VPU.arch = #VPU.arch_kin } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } 
outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/ungroup_execution_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/ungroup_execution_40XX+.mlir index 5bdbdb40c0..35a394845d 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/ungroup_execution_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/ungroup_execution_40XX+.mlir @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -93,13 +93,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { 
DataInfo "input" : tensor<1x1000x1x1xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/unroll_enqueue_ops_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/unroll_enqueue_ops_40XX+.mlir index 04e63708de..ddd1f88ff2 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/unroll_enqueue_ops_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/unroll_enqueue_ops_40XX+.mlir @@ -17,13 +17,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/unroll_fetch_task_ops_40XX.mlir b/tests/lit/NPU/dialect/VPUMI40XX/unroll_fetch_task_ops_40XX.mlir index babcf56b11..a3e80073b4 100644 --- a/tests/lit/NPU/dialect/VPUMI40XX/unroll_fetch_task_ops_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUMI40XX/unroll_fetch_task_ops_40XX.mlir @@ -16,13 +16,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT 
IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x16x16xf16> } outputsInfo : { @@ -98,13 +98,13 @@ module @Convolution attributes {config.compilationMode = #config.compilation_mod } } IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1000x1x1xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUMI40XX/update_enqueue_dma_input_and_output_40XX+.mlir b/tests/lit/NPU/dialect/VPUMI40XX/update_enqueue_dma_input_and_output_40XX+.mlir new file mode 100644 index 0000000000..ee3c41461d --- /dev/null +++ b/tests/lit/NPU/dialect/VPUMI40XX/update_enqueue_dma_input_and_output_40XX+.mlir @@ -0,0 +1,228 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --update-enqueue-dma-input-and-output %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> + +module @UpdateEnqueueDMAOpsDPU attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x16x16x16xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x14x14xf16> + } + func.func @main(%arg0: memref<1x16x16x16xf16, @DDR>, %arg1: memref<1x16x14x14xf16, @DDR>) -> memref<1x16x14x14xf16, @DDR> { + %cst = const.Declare memref<1x1x1x4864xui8> = dense<1> : tensor<1x1x1x4864xui8> + %0 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x16x16xf16, @DDR> + %1 = VPURT.DeclareBuffer [0] <0> -> memref<1x16x14x14xf16, @DDR> + %2 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, [@CMX_NN, 0]> + %3 = VPURT.DeclareBuffer [0] <8704> -> memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 0]> + %4 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x14x14xf16, [@CMX_NN, 0]> + %5 = VPURT.DeclareBuffer [0] <16896> -> memref<1x1x1x4864xui8, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <512> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer 
[0] <8704> -> memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <16896> -> memref<16x1x1x4xsi32, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <17152> -> memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]> + + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + + %10 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8, wlmPage = 0 : i64} <4, -1> -> !VPURegMapped.Index<0:0:0> + %11 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8, wlmPage = 0 : i64}(%10 : !VPURegMapped.Index<0:0:0>) <0, -1> -> !VPURegMapped.Index<0:0:1> + %12 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 2 : ui8, wlmPage = 0 : i64}(%11 : !VPURegMapped.Index<0:0:1>) <1, -1> -> !VPURegMapped.Index<0:0:2> + %13 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, producer_count = 1 : ui8, wlmPage = 0 : i64}(%12 : !VPURegMapped.Index<0:0:2>) <2, -1> -> !VPURegMapped.Index<0:0:3> + %14 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, isFinalBarrier, producer_count = 1 : ui8, wlmPage = 0 : i64}(%13 : !VPURegMapped.Index<0:0:3>) <3, -1> -> !VPURegMapped.Index<0:0:4> + + %15 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:1> + %17 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:0> + %18 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:1> + %19 = VPUMI40XX.DPUInvariant {clean_after = 2 : ui64, is_permute_quantize, mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 3 : ui64, wlmPage = 0 : i64} taskLocation(%15 : !VPURegMapped.Index<0:0:0>) input(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) outputs(%3 : memref<1x16x16x16xf16, #NWCH, [@CMX_NN, 
0]>) waits(%11 : !VPURegMapped.Index<0:0:1>) updates(%12 : !VPURegMapped.Index<0:0:2>) -> <0:0:0> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %20 = VPUMI40XX.DPUInvariant {clean_after = 3 : ui64, is_superdense, kernel_padding = #VPU.Padding, kernel_size = [3, 3], kernel_strides = [1, 1], mpe_frequent_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, start_after = 4 : ui64, wlmPage = 0 : i64} taskLocation(%16 : !VPURegMapped.Index<0:0:1>) previousTask(%19 : !VPURegMapped.Index<0:0:0>) input(%7 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) outputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) waits(%12 : !VPURegMapped.Index<0:0:2>) updates(%13 : !VPURegMapped.Index<0:0:3>) -> <0:0:1> PPE : { + VPUMI40XX.PPETask {ppe = #VPU.PPEStub<>} + } + %21 = VPUMI40XX.DPUVariant taskLocation(%17 : !VPURegMapped.Index<0:0:0>) calls(%19 : <0:0:0>) weights(%6 : memref<1x16x16x16xf16, #NHWC, [@CMX_NN, 0]>) {end = [15, 15, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], wlmPage = 0, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0]} -> <0:0:0> + %22 = VPUMI40XX.DPUVariant taskLocation(%18 : !VPURegMapped.Index<0:0:1>) previousTask(%21 : !VPURegMapped.Index<0:0:0>) calls(%20 : <0:0:1>) weights(%9 : memref<16x16x3x3xf16, #NHWC, [@CMX_NN, 0]>) weight_table(%8 : memref<16x1x1x4xsi32, [@CMX_NN, 0]>) {end = [13, 13, 15], inEnd = [15, 15, 15], inStart = [0, 0, 0], wlmPage = 0, mpe_mode = #VPU.mpe_mode, nce_task_type = #VPUIP.nce_task_type, pad = #VPU.Padding, start = [0, 0, 0], taskLinkAttrName = #VPURegMapped.IndexType<<0:0:0>>} -> <0:0:1> + %23 = VPURT.DeclareBuffer <0> -> memref<1x1x1x1xi32, @DDR> + %24 = VPURT.DeclareBuffer <0> -> memref<1x1x1x1xi32, @DDR> + %25 = VPURegMapped.ViewTaskRange(%19 -> %20 : <0:0:0> -> <0:0:1>) -> memref<2x352xui8> + %26 = VPURegMapped.ViewTaskRange(%15 -> %16 : <0:0:0> -> 
<0:0:1>) -> memref<2x352xui8, [@CMX_NN, 0]> + %27 = VPURegMapped.ViewTaskRange(%21 -> %22 : <0:0:0> -> <0:0:1>) -> memref<2x224xui8> + %28 = VPURegMapped.ViewTaskRange(%17 -> %18 : <0:0:0> -> <0:0:1>) -> memref<2x224xui8, [@CMX_NN, 0]> + %29 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 0 : i64, wlmPage = -1 : i64} inputs(%25 : memref<2x352xui8>) outputs(%26 : memref<2x352xui8, [@CMX_NN, 0]>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %30 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 0 : i64, wlmPage = -1 : i64} inputs(%27 : memref<2x224xui8>) outputs(%28 : memref<2x224xui8, [@CMX_NN, 0]>) previousDMA(%29 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + %31 = VPUMI40XX.NNDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64, wlmPage = 0 : i64} inputs(%23 : memref<1x1x1x1xi32, @DDR>) outputs(%24 : memref<1x1x1x1xi32, @DDR>) previousDMA(%30 : !VPURegMapped.Index<0:0:1>) updates(%10 : !VPURegMapped.Index<0:0:0>) start_after(1) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + %32 = VPUMI40XX.NNDMA {port = 0 : i64, wlmPage = 0 : i64} inputs(%0 : memref<1x16x16x16xf16, @DDR>) outputs(%2 : memref<1x16x16x16xf16, [@CMX_NN, 0]>) previousDMA(%31 : !VPURegMapped.Index<0:0:2>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(2) clean_after(1) acceleration_mode() -> !VPURegMapped.Index<0:0:3> + + %enq_dma = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%32 : !VPURegMapped.Index<0:0:3>) waits(%10 : !VPURegMapped.Index<0:0:0>) updates(%11 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:4> + + %33 = VPUMI40XX.NNDMA {is_out_of_order, 
port = 0 : i64, wlmPage = 0 : i64} inputs(%cst : memref<1x1x1x4864xui8>) outputs(%5 : memref<1x1x1x4864xui8, [@CMX_NN, 0]>) previousDMA(%enq_dma : !VPURegMapped.Index<0:0:4>) updates(%12 : !VPURegMapped.Index<0:0:2>) start_after(3) clean_after(2) acceleration_mode() -> !VPURegMapped.Index<0:0:5> + %34 = VPUMI40XX.NNDMA {port = 0 : i64, wlmPage = 0 : i64} inputs(%4 : memref<1x16x14x14xf16, [@CMX_NN, 0]>) outputs(%1 : memref<1x16x14x14xf16, @DDR>) waits(%13 : !VPURegMapped.Index<0:0:3>) updates(%14 : !VPURegMapped.Index<0:0:4>) start_after(5) clean_after(4) acceleration_mode() -> !VPURegMapped.Index<0:1:0> + + %35 = VPURegMapped.Enqueue (%29 -> %29 : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %36 = VPUMI40XX.MappedInference dmas((%29, %34) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:1:0>)) invariants(%19 : !VPURegMapped.Index<0:0:0>) variants(%21 : !VPURegMapped.Index<0:0:0>) barriers(%10 : !VPURegMapped.Index<0:0:0>) workItemTasks(%35 : !VPURegMapped.Index<0:0:0>) dmaCount([[6, 1]]) invariantCount([2]) variantCount([2]) actKernelRangesCount([[0, 0]]) actKernelInvocationsCount([[0, 0]]) mediaCount(0) barrierCount(5) workItemCount(1) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x16x14x14xf16, @DDR> + } +} + +// CHECK: [[EQDMA_CST:%.+]] = const.Declare memref<1xui32> = dense<2080> : tensor<1xui32> +// CHECK: [[EQDMA_REG_BUF:%.+]] = VPURT.DeclareBuffer <788529152> -> memref<1xui32, @Register> + +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA + +// DPU Enqueue +// CHECK: [[EQDMA_0:%.+]] = VPUMI40XX.NNDMA { +// CHECK-SAME: dma_descriptor = #VPUIP.DMADescriptorAttr +// CHECK-SAME: enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64> +// CHECK-SAME: inputs([[EQDMA_CST]] : memref<1xui32>) outputs([[EQDMA_REG_BUF]] : memref<1xui32, @Register>) + +// DMA Enqueue +// CHECK: [[EQ_0:%.+]] = 
VPURegMapped.Enqueue +// CHECK-NOT: VPURegMapped.Enqueue +// CHECK: workItemTasks([[EQ_0]] : !VPURegMapped.Index<0:0:0>) + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +#NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> +module @UpdateEnqueueDMAOpsSHV attributes {config.compilationMode = #config.compilation_mode} { + IE.TileResource 1 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DmaProfilingReservedMemory { + IE.MemoryResource 512 bytes of @CMX_NN offset 0 + } + } + IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + IE.ExecutorResource 2 of @SHAVE_ACT + IE.ExecutorResource 1 of @DPU + } + IE.ExecutorResource 1 of @M2I + IE.ExecutorResource 1 of @DMA_NN + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x1000x1x1xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x1000x1x1xf16> + } + func.func @main(%arg0: memref<1x1000x1x1xf16, @DDR>, %arg1: memref<1x1000x1x1xf16, @DDR>) -> memref<1x1000x1x1xf16, @DDR> { + %cst = const.Declare memref<64xui32> = dense<[16842753, 0, 0, 0, 16908289, 0, 0, 0, 16908290, 0, 0, 0, 65793, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]> : tensor<64xui32> + %0 = VPURT.DeclareBuffer <788594688> -> memref<64xui32, @Register> + %1 = VPURT.DeclareBuffer [0] <0> -> memref<3x3844xf16, @DDR> + %2 = VPURT.DeclareBuffer [0] <0> -> memref<1x3x62x62xf16, @DDR> + %3 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %4 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %5 = VPURT.DeclareBuffer [0] <1473536> -> memref<16xui32, [@CMX_NN, 0]> + %6 = VPURT.DeclareBuffer [0] <0> -> memref<3844x3xf16, 
[@CMX_NN, 0]> + %7 = VPURT.DeclareBuffer [0] <23104> -> memref<3844x3xf16, [@CMX_NN, 0]> + %8 = VPURT.DeclareBuffer [0] <0> -> memref<1x3x62x62xf16, [@CMX_NN, 0]> + %9 = VPURT.DeclareBuffer [0] <0> -> memref<3x3844xf16, [@CMX_NN, 0]> + %10 = VPURT.DeclareBuffer [0] <0> -> memref<1x3x32x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]> + %11 = VPURT.DeclareBuffer [0] <23104> -> memref<1x3x32x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]> + %12 = VPURT.DeclareBuffer [0] <11904> -> memref<1x3x30x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]> + %13 = VPURT.DeclareBuffer [0] <35008> -> memref<1x3x30x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]> + + %buf0 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %buf1 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + + %14 = VPUMI40XX.DeclareKernelText kernel_path("softmax") -> !VPURegMapped.Index<0:0:0> + %15 = VPUMI40XX.DeclareKernelEntry kernel_path("softmax") -> !VPURegMapped.Index<0:0:0> + %16 = VPUMI40XX.DeclareKernelArgs kernel_path("softmax") -> !VPURegMapped.Index<0:0:0> + %17 = VPUMI40XX.DeclareKernelArgs kernel_path("softmax") -> !VPURegMapped.Index<0:0:1> + %18 = VPUMI40XX.KernelParams inputs(%10 : memref<1x3x32x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]>) outputs(%11 : memref<1x3x32x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]>) kernel_type("softmax") kernel_params(dense_resource<__elided__> : vector<136xui8>) -> !VPURegMapped.Index<0:0:0> + %19 = VPUMI40XX.KernelParams inputs(%12 : memref<1x3x30x62xf16, {order = #NHWC, strides = [11532, 1, 186, 3]}, [@CMX_NN, 0]>) outputs(%13 : memref<1x3x30x62xf16, {order = #NHWC, strides = [11532, 1, 186, + 3]}, [@CMX_NN, 0]>) kernel_type("softmax") kernel_params(dense_resource<__elided__> : vector<136xui8>) -> !VPURegMapped.Index<0:0:1> + %20 = VPUMI40XX.ConfigureBarrier {consumer_count = 1 : ui8, isStartBarrier, producer_count = 1 : 
ui8, wlmPage = 0 : i64} <0, -1> -> !VPURegMapped.Index<0:0:0> + %21 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 1 : ui8, wlmPage = 0 : i64}(%20 : !VPURegMapped.Index<0:0:0>) <1, -1> -> !VPURegMapped.Index<0:0:1> + %22 = VPUMI40XX.ConfigureBarrier {consumer_count = 2 : ui8, producer_count = 2 : ui8, wlmPage = 0 : i64}(%21 : !VPURegMapped.Index<0:0:1>) <2, -1> -> !VPURegMapped.Index<0:0:2> + %23 = VPUMI40XX.ConfigureBarrier {consumer_count = 0 : ui8, isFinalBarrier, producer_count = 1 : ui8, wlmPage = 0 : i64}(%22 : !VPURegMapped.Index<0:0:2>) <3, -1> -> !VPURegMapped.Index<0:0:3> + + %24 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:30> + %25 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:31> + %26 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:30> + %27 = VPUMI40XX.DeclareTaskBuffer {offset = 51200 : ui64} -> !VPURegMapped.Index<0:0:31> + + %28 = VPUMI40XX.ActKernelRange taskLocation(%26 : !VPURegMapped.Index<0:0:30>) kernel_text_index(%14 : !VPURegMapped.Index<0:0:0>) kernel_args_index(%16 : !VPURegMapped.Index<0:0:0>) kernel_entry_index(%15 : !VPURegMapped.Index<0:0:0>) kernelTaskType(@COMPUTE) -> !VPURegMapped.Index<0:0:0> + + %29 = VPUMI40XX.ActKernelRange taskLocation(%27 : !VPURegMapped.Index<0:0:31>) previousTask(%28 : !VPURegMapped.Index<0:0:0>) kernel_text_index(%14 : !VPURegMapped.Index<0:0:0>) kernel_args_index(%17 : !VPURegMapped.Index<0:0:1>) kernel_entry_index(%15 : !VPURegMapped.Index<0:0:0>) kernelTaskType(@COMPUTE) -> !VPURegMapped.Index<0:0:1> + + %30 = VPUMI40XX.ActKernelInvocation {wlmPage = 0 : i64} taskLocation(%24 : !VPURegMapped.Index<0:0:30>) range_index(%28 : <0:0:0>) kernel_params(%18 : <0:0:0>) waits(%21 : !VPURegMapped.Index<0:0:1>) updates(%22 : !VPURegMapped.Index<0:0:2>) tile(0) start_after(0) clean_after(0) -> !VPURegMapped.Index<0:0:0> + + %31 = VPUMI40XX.ActKernelInvocation {taskLinkAttrName = 
#VPURegMapped.IndexType<<0:0:0>>, lastSecondaryTaskInExecutionGroup, wlmPage = 0 : i64} taskLocation(%25 : !VPURegMapped.Index<0:0:31>) previousTask(%30 : !VPURegMapped.Index<0:0:0>) range_index(%29 : <0:0:1>) kernel_params(%19 : <0:0:1>) waits(%21 : !VPURegMapped.Index<0:0:1>) updates(%22 : !VPURegMapped.Index<0:0:2>) tile(0) start_after(0) clean_after(0) -> !VPURegMapped.Index<0:0:1> + + %32 = VPURegMapped.ViewTaskRange(%28 -> %29 : <0:0:0> -> <0:0:1>) -> memref<2x40xui8> + %33 = VPURegMapped.ViewTaskRange(%26 -> %27 : <0:0:30> -> <0:0:31>) -> memref<2x40xui8, [@CMX_NN, 0]> + %34 = VPURegMapped.ViewTaskRange(%30 -> %31 : <0:0:0> -> <0:0:1>) -> memref<2x96xui8> + %35 = VPURegMapped.ViewTaskRange(%24 -> %25 : <0:0:30> -> <0:0:31>) -> memref<2x96xui8, [@CMX_NN, 0]> + %36 = VPUMI40XX.NNDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64, wlmPage = -1 : i64} inputs(%cst : memref<64xui32>) outputs(%0 : memref<64xui32, @Register>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:0> + %37 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 0 : i64, taskLinkAttrName = #VPURegMapped.IndexType<<0:0:0>>, wlmPage = -1 : i64} inputs(%32 : memref<2x40xui8>) outputs(%33 : memref<2x40xui8, [@CMX_NN, 0]>) previousDMA(%36 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:1> + + %38 = VPUMI40XX.NNDMA {is_critical, is_out_of_order, port = 0 : i64, taskLinkAttrName = #VPURegMapped.IndexType<<0:0:1>>, wlmPage = -1 : i64} inputs(%34 : memref<2x96xui8>) outputs(%35 : memref<2x96xui8, [@CMX_NN, 0]>) previousDMA(%37 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:2> + + %39 = VPUMI40XX.NNDMA {dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64, taskLinkAttrName = #VPURegMapped.IndexType<<0:0:2>>, wlmPage = 0 : i64} inputs(%3 : memref<0x0x0x0xi32, @DDR>) outputs(%4 : memref<0x0x0x0xi32, @DDR>) previousDMA(%38 : 
!VPURegMapped.Index<0:0:2>) updates(%20 : !VPURegMapped.Index<0:0:0>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:3> + + %40 = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, is_out_of_order, port = 0 : i64, taskLinkAttrName = #VPURegMapped.IndexType<<0:0:3>>, wlmPage = 0 : i64} inputs(%1 : memref<3x3844xf16, @DDR>) outputs(%6 : memref<3844x3xf16, [@CMX_NN, 0]>) previousDMA(%39 : !VPURegMapped.Index<0:0:3>) waits(%20 : !VPURegMapped.Index<0:0:0>) updates(%21 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:4> + + %enq_dma = VPUMI40XX.NNDMA {enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64>, port = 0 : i64} inputs(%buf0 : memref<0x0x0x0xi32, @DDR>) outputs(%buf1 : memref<0x0x0x0xi32, @DDR>) previousDMA(%40 : !VPURegMapped.Index<0:0:4>) waits(%20 : !VPURegMapped.Index<0:0:0>) updates(%21 : !VPURegMapped.Index<0:0:1>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:0:5> + + %41 = VPUMI40XX.NNDMA {allow_different_in_out_shapes, dma_descriptor = #VPUIP.DMADescriptorAttr, port = 0 : i64, wlmPage = 0 : i64} inputs(%7 : memref<3844x3xf16, [@CMX_NN, 0]>) outputs(%9 : memref<3x3844xf16, [@CMX_NN, 0]>) waits(%22 : !VPURegMapped.Index<0:0:2>) start_after(0) clean_after(0) acceleration_mode() -> !VPURegMapped.Index<0:1:0> + + %42 = VPUMI40XX.NNDMA {port = 0 : i64, taskLinkAttrName = #VPURegMapped.IndexType<<0:1:0>>, wlmPage = 0 : i64} inputs(%8 : memref<1x3x62x62xf16, [@CMX_NN, 0]>) outputs(%2 : memref<1x3x62x62xf16, @DDR>) previousDMA(%41 : !VPURegMapped.Index<0:1:0>) waits(%22 : !VPURegMapped.Index<0:0:2>) updates(%23 : !VPURegMapped.Index<0:0:3>) start_after(0) clean_after(0) acceleration_mode() dma_transaction(#VPUMI40XX.NNDMATransaction, outputType = memref<1x3x62x62xf16, @DDR>>) -> !VPURegMapped.Index<0:1:1> + + %43 = VPUMI40XX.PlatformInfo -> 
<0:0:0> + %44 = VPUMI40XX.ActShaveRt kernel("nnActEntry") -> !VPURegMapped.Index<0:0:0> + %45 = VPUMI40XX.Bootstrap inputs(%20 : <0:0:0>) -> !VPURegMapped.Index<0:0:0> + %46 = VPUMI40XX.Bootstrap inputs(%21 : <0:0:1>) -> !VPURegMapped.Index<0:0:1> + %47 = VPUMI40XX.Bootstrap inputs(%22 : <0:0:2>) -> !VPURegMapped.Index<0:0:2> + %48 = VPUMI40XX.Bootstrap inputs(%23 : <0:0:3>) -> !VPURegMapped.Index<0:0:3> + %49 = VPURegMapped.Enqueue (%36 -> %36 : <0:0:0> -> <0:0:0>) -> !VPURegMapped.Index<0:0:0> {taskType = #VPURegMapped.task_type} + %50 = VPURegMapped.Enqueue previousTaskIdx(%49 : !VPURegMapped.Index<0:0:0>) (%41 -> %41 : <0:1:0> -> <0:1:0>) -> !VPURegMapped.Index<0:0:1> {taskType = #VPURegMapped.task_type} + + %52 = VPUMI40XX.MappedInference {workloadManagementBarrierProgrammingMode = #VPURegMapped.workload_management_barrier_programming_mode} dmas((%36, %41) : (!VPURegMapped.Index<0:0:0>, !VPURegMapped.Index<0:1:0>)) actKernelRanges((%28) : (!VPURegMapped.Index<0:0:0>)) actKernelInvocations((%30) : (!VPURegMapped.Index<0:0:0>)) barriers(%20 : !VPURegMapped.Index<0:0:0>) workItemTasks(%49 : !VPURegMapped.Index<0:0:0>) bootstrapBarriers(%45 : !VPURegMapped.Index<0:0:0>) actShaveRt(%44 : !VPURegMapped.Index<0:0:0>) dmaHwpBase(%5 : memref<16xui32, [@CMX_NN, 0]>) dmaCount([[6, 2]]) invariantCount([0]) variantCount([0]) actKernelRangesCount([[2, 0]]) actKernelInvocationsCount([[2, 0]]) mediaCount(0) barrierCount(4) workItemCount(2) bootstrapBarriersCount(4) bootsrapWorkItemsCount(2) -> !VPURegMapped.Index<0:0:0> + return %arg1 : memref<1x1000x1x1xf16, @DDR> + } +} + +// CHECK: [[EQDMA_CST:%.+]] = const.Declare memref<1xui32> = dense<2080> : tensor<1xui32> +// CHECK: [[EQDMA_REG_BUF:%.+]] = VPURT.DeclareBuffer <788578304> -> memref<1xui32, @Register> + +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA +// CHECK: VPUMI40XX.NNDMA + +// SHV Enqueue +// CHECK: [[EQDMA_0:%.+]] = VPUMI40XX.NNDMA { +// CHECK-SAME: 
dma_descriptor = #VPUIP.DMADescriptorAttr +// CHECK-SAME: enqueue_dma_attr = #VPUIP.EnqueueDMAAttr<, tile = 0 : i64, list = 0 : i64, startTask = 0 : i64, endTask = 1 : i64> +// CHECK-SAME: inputs([[EQDMA_CST]] : memref<1xui32>) outputs([[EQDMA_REG_BUF]] : memref<1xui32, @Register>) + +// DMA Enqueue +// CHECK: [[EQ_0:%.+]] = VPURegMapped.Enqueue +// CHECK: [[EQ_1:%.+]] = VPURegMapped.Enqueue +// CHECK-NOT: VPURegMapped.Enqueue +// CHECK: workItemTasks([[EQ_0]] : !VPURegMapped.Index<0:0:0>) diff --git a/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_37XX.mlir b/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_37XX.mlir index 8b19d0e7ef..f8b212b6e1 100644 --- a/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_37XX.mlir +++ b/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_37XX.mlir @@ -23,13 +23,13 @@ module @dumpsubgraph attributes {config.compilationMode = #config.compilation_mo IE.TileResource 2 of @NCE at 1.300000e+03 MHz { // CHECK: IE.TileResource {activity_factor = {{[0-9]+.[0-9]+}} : f64} 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { //CHECK: net.NetworkInfo {inferenceTiming = {{[0-9]+}} : i64} entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> diff --git 
a/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_40XX+.mlir b/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_40XX+.mlir index 41a76a10e7..bfefaaee8c 100644 --- a/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPURT/inference_execution_analysis_40XX+.mlir @@ -23,13 +23,13 @@ module @dumpsubgraph attributes {config.compilationMode = #config.compilation_mo IE.TileResource 4 of @NCE at 1.700000e+03 MHz { // CHECK: IE.TileResource {activity_factor = {{[0-9]+.[0-9]+}} : f64} 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { //CHECK: net.NetworkInfo {inferenceTiming = {{[0-9]+}} : i64} entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> diff --git a/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_40XX.mlir b/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_40XX.mlir index 5ec6e25c6d..0b12eef947 100644 --- a/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_40XX.mlir +++ b/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_40XX.mlir @@ -41,7 +41,7 @@ // DMA // | -module @NoInsertionNeeded attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module 
@NoInsertionNeeded attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 8 config.Option @VPU.MetadataMaxInvariantCount : 4 @@ -50,13 +50,13 @@ module @NoInsertionNeeded attributes {config.compilationMode = #config.compilati } IE.TileResource 1 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -187,7 +187,7 @@ module @NoInsertionNeeded attributes {config.compilationMode = #config.compilati // DMA // | -module @NoInsertionNeededMultiTile attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module @NoInsertionNeededMultiTile attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 8 config.Option @VPU.MetadataMaxInvariantCount : 4 @@ -196,13 +196,13 @@ module @NoInsertionNeededMultiTile attributes {config.compilationMode = #config. 
} IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 2 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -370,7 +370,7 @@ module @NoInsertionNeededMultiTile attributes {config.compilationMode = #config. #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @InsertBarriersWhereNeeded attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module @InsertBarriersWhereNeeded attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 8 config.Option @VPU.MetadataMaxInvariantCount : 4 @@ -379,13 +379,13 @@ module @InsertBarriersWhereNeeded attributes {config.compilationMode = #config.c } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I 
IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { @@ -541,7 +541,7 @@ module @InsertBarriersWhereNeeded attributes {config.compilationMode = #config.c #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @InsertBarriersWhereNeededMultiTile attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module @InsertBarriersWhereNeededMultiTile attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 12 config.Option @VPU.MetadataMaxInvariantCount : 6 @@ -550,13 +550,13 @@ module @InsertBarriersWhereNeededMultiTile attributes {config.compilationMode = } IE.TileResource 2 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} func.func @main(%arg0: memref<1x3x224x224xf16, @DDR>, %arg1: memref<1x64x112x112xf16, @DDR>) -> memref<1x64x112x112xf16, @DDR> { // barriers diff --git 
a/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_noWLM_40XX.mlir b/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_noWLM_40XX.mlir index b3f8dbf4c6..de9104c5f4 100644 --- a/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_noWLM_40XX.mlir +++ b/tests/lit/NPU/dialect/VPURT/insert_barrier_marking_end_of_descriptor_groups_noWLM_40XX.mlir @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true" --insert-barrier-to-mark-end-of-descriptor-group %s | FileCheck %s +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% workload-management-enable=false allow-custom-values=true" --insert-barrier-to-mark-end-of-descriptor-group %s | FileCheck %s // REQUIRES: arch-NPU40XX #NCHW = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -13,7 +13,7 @@ !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.14210041676
9193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 12 config.Option @VPU.MetadataMaxInvariantCount : 6 @@ -280,7 +280,7 @@ func.func @insertBarrierBetweenEvery3rdSetOfDPUtasks(%arg0: memref<1x3x224x224xf !qElemType = !quant.uniform:f16:0, 
{2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 12 config.Option 
@VPU.MetadataMaxInvariantCount : 6 @@ -556,7 +556,7 @@ func.func @barBetweenEvery3rdSetOfDPUtasksWithNoBarDeps(%arg0: memref<1x3x224x22 !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes 
{config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 2 config.Option @VPU.MetadataMaxInvariantCount : 2 @@ -678,7 +678,7 @@ func.func @insertBarrierBetweenConsecutiveDPUtasksWithSharedBarriers(%arg0: memr !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = 
!quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 2 config.Option @VPU.MetadataMaxInvariantCount : 2 @@ -801,7 +801,7 @@ func.func @insertBarrierBetweenConsecutiveDPUtasks(%arg0: memref<1x3x64x64xf16, !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.09
40972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 2 config.Option @VPU.MetadataMaxInvariantCount : 2 @@ -915,7 +915,7 @@ func.func @insertBarrierBetweenConsecutiveSWtasks(%arg0: memref<1x3x64x64xf16, @ !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.418440691129429
4E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 2 config.Option @VPU.MetadataMaxInvariantCount : 2 @@ -1026,7 +1026,7 @@ func.func @insertBarrierBetweenConsecutiveSWtasks2(%arg0: memref<1x3x64x64xf16, !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.1462623415969495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8
324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 2 config.Option @VPU.MetadataMaxInvariantCount : 2 @@ -1161,7 +1161,7 @@ func.func @noInsertBarrierBetweenConsecutiveSWtasksIfPathExists(%arg0: memref<1x !qElemType = !quant.uniform:f16:0, {2.3954496608944391E-5:127,1.8652968519315946E-5:127,5.714536651851624E-6:127,9.2288640540415852E-6:127,5.2774985005536418E-5:127,1.0355251041922983E-5:127,4.7608623354453737E-5:127,4.2622483621432085E-5:127,6.3378041184793303E-5:127,9.0411328893946842E-6:127,3.7636343888410434E-5:127,9.1462623415969495E-6:127,3.0472522645484744E-5:127,1.3648806181992955E-6:127,8.4283783679872047E-5:127,5.5778683639886814E-5:127,1.6640490434301182E-5:127,5.4847537063238186E-5:127,9.0531476839320868E-5:127,1.0873389056348425E-5:127,8.4944597379429132E-5:127,5.5928868571604333E-5:127,7.9477865864911412E-5:127,1.5408973994217519E-5:127,2.7033287709153543E-5:127,7.3740801473302161E-6:127,4.5475997324064961E-5:127,4.5415923351377953E-5:127,1.4605484609528789E-5:127,8.4554116556963585E-6:127,1.9478985643762302E-5:127,4.6332051434854825E-6:127,9.6568911094365155E-6:127,1.3648806594488189E-4:127,4.8584825410617617E-6:127,1.0588037686085138E-5:127,9.6493818628506396E-6:127,1.0663130151943897E-5:127,3.8086898683562992E-5:127,4.142100416769193E-5:127,7.5805844284418062E-6:127,5.2684874046505907E-5:127,9.14626234159
69495E-6:127,1.327634796382874E-5:127,6.1538275771253692E-6:127,2.9691561000553643E-5:127,5.6079053503321852E-5:127,1.4605484609528789E-5:127,2.8324878121924213E-5:127,6.1695969949557085E-5:127,5.0492174043430117E-5:127,2.3218590443528543E-5:127,1.5026002418337845E-5:127,4.4184406911294294E-5:127,4.0009265809547241E-5:127,2.0289984275036911E-5:127,2.0680465097502461E-5:127,5.5906340831846705E-6:127,1.6054769200602853E-5:127,1.0940972275621308E-5:127,2.3008331539124016E-5:127,3.4182090458907481E-5:127,9.4766691913754922E-6:127,8.9450145330954728E-5:127}> !qElemType3 = !quant.uniform -module attributes {config.compilationMode = #config.compilation_mode, VPUIP.wlm_status = #VPUIP.wlm_status} { +module attributes {config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.MetadataMaxVariantCount : 18 config.Option @VPU.MetadataMaxInvariantCount : 12 diff --git a/tests/lit/NPU/dialect/VPURT/intermediate_buffer_output_40XX+.mlir b/tests/lit/NPU/dialect/VPURT/intermediate_buffer_output_40XX+.mlir index 1d08cd46cf..1fdc1eb9f5 100644 --- a/tests/lit/NPU/dialect/VPURT/intermediate_buffer_output_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPURT/intermediate_buffer_output_40XX+.mlir @@ -17,16 +17,16 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #NWCH = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> -module @testsubgraph attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @testsubgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 4 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of 
@DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "result.1" : tensor<1x3x224x224xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPURT/wlm_insert_dummy_barriers_in_pages_40XX+.mlir b/tests/lit/NPU/dialect/VPURT/wlm_insert_dummy_barriers_in_pages_40XX+.mlir new file mode 100644 index 0000000000..775548167c --- /dev/null +++ b/tests/lit/NPU/dialect/VPURT/wlm_insert_dummy_barriers_in_pages_40XX+.mlir @@ -0,0 +1,110 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --wlm-insert-dummy-barriers-in-pages="num-barriers=4" %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +func.func @DmaGraph() -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> { + %bar0 = VPURT.DeclareVirtualBarrier {isStartBarrier, wlmPage = 0 : i64} -> !VPURT.Barrier + %bar1 = VPURT.DeclareVirtualBarrier {wlmPage = 1 : i64} -> !VPURT.Barrier + %bar2 = VPURT.DeclareVirtualBarrier {wlmPage = 1 : i64} -> !VPURT.Barrier + %bar3 = VPURT.DeclareVirtualBarrier {isFinalBarrier, wlmPage = 2 : i64} -> !VPURT.Barrier + + // dummy buffer + %cst0 = const.Declare memref<16x16x1x1xf16, #NHWC> = + dense<1.0> : tensor<16x16x1x1xf16>, [#const.Reorder<#NHWC>] + %buf0 = VPURT.DeclareBuffer [0] <32768> -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + + + // Simple subgraph with page split (barX(PageY)): + // + // _______ DMA0[0] + // | + // bar0(0) <- Page0 has only one barrier whereas page size = 2 + // | Insert dummy barrier parallel to bar0 + // Page0 DMA0[1] + // | + // _______ DMA0[2] + // | + // bar1(1) + // | + // Page1 DMA0[3] + // | + // 
bar2(1) + // | + // _______ DMA0[4] + // | + // Page2 bar3(2) + // _______ + + + VPURT.Task updates(%bar0: !VPURT.Barrier) wlmPage(0) + { + VPUIP.NNDMA {port = 0 : i64} inputs(%cst0: memref<16x16x1x1xf16, #NHWC>) outputs(%buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]>) -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + } + + VPURT.Task waits(%bar0: !VPURT.Barrier) wlmPage(0) + { + VPUIP.NNDMA {port = 0 : i64} inputs(%cst0: memref<16x16x1x1xf16, #NHWC>) outputs(%buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]>) -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + } + + VPURT.Task updates(%bar1: !VPURT.Barrier) wlmPage(0) + { + VPUIP.NNDMA {port = 0 : i64} inputs(%cst0: memref<16x16x1x1xf16, #NHWC>) outputs(%buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]>) -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + } + + VPURT.Task waits(%bar1: !VPURT.Barrier) updates(%bar2: !VPURT.Barrier) wlmPage(1) + { + VPUIP.NNDMA {port = 0 : i64} inputs(%cst0: memref<16x16x1x1xf16, #NHWC>) outputs(%buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]>) -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + } + + VPURT.Task waits(%bar2: !VPURT.Barrier) updates(%bar3: !VPURT.Barrier) wlmPage(1) + { + VPUIP.NNDMA {port = 0 : i64} inputs(%cst0: memref<16x16x1x1xf16, #NHWC>) outputs(%buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]>) -> memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + } + + return %buf0: memref<16x16x1x1xf16, #NHWC, [@CMX_NN, 0]> + + + // Simple subgraph with page split after inserting dummy barrier + // + // _______ DMA0[0] + // | \ + // bar0(0) barDummy(0) + // | / + // Page0 DMA0[1] + // | + // _______ DMA0[2] + // | + // bar1(1) + // | + // Page1 DMA0[3] + // | + // bar2(1) + // | + // _______ DMA0[4] + // | + // Page2 bar3(2) + // _______ + + + // CHECK: [[BAR0:%.+]] = VPURT.DeclareVirtualBarrier {isStartBarrier, wlmPage = 0 : i64} -> !VPURT.Barrier + // CHECK: [[BAR_DUMMY:%.+]] = VPURT.DeclareVirtualBarrier {wlmPage = 0 : i64} -> !VPURT.Barrier + // CHECK: [[BAR1:%.+]] = 
VPURT.DeclareVirtualBarrier {wlmPage = 1 : i64} -> !VPURT.Barrier + // CHECK: [[BAR2:%.+]] = VPURT.DeclareVirtualBarrier {wlmPage = 1 : i64} -> !VPURT.Barrier + // CHECK: [[BAR3:%.+]] = VPURT.DeclareVirtualBarrier {isFinalBarrier, wlmPage = 2 : i64} -> !VPURT.Barrier + + // CHECK: VPURT.Task updates([[BAR0]], [[BAR_DUMMY]] : !VPURT.Barrier, !VPURT.Barrier) wlmPage(0) { + + // CHECK: VPURT.Task waits([[BAR0]], [[BAR_DUMMY]] : !VPURT.Barrier, !VPURT.Barrier) wlmPage(0) { + + // CHECK: VPURT.Task updates([[BAR1]] : !VPURT.Barrier) wlmPage(0) { + + // CHECK: VPURT.Task waits([[BAR1]] : !VPURT.Barrier) updates([[BAR2]] : !VPURT.Barrier) wlmPage(1) { + + // CHECK: VPURT.Task waits([[BAR2]] : !VPURT.Barrier) updates([[BAR3]] : !VPURT.Barrier) wlmPage(1) { +} diff --git a/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module.mlir b/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module.mlir index b7221566a7..4a07614a45 100644 --- a/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module.mlir +++ b/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module.mlir @@ -69,7 +69,7 @@ module @MultipleNestedModules { } } -// CHECK-LABEL: module @Module2 { +// CHECK-LABEL: module @Module2 { module @Module2 { // CHECK: net.NetworkInfo entryPoint : @main_part3 inputsInfo : { // CHECK: DataInfo "in_0" : tensor<1x3x60x60xf16> @@ -116,14 +116,6 @@ module @ExistingNetInfoFailure { // ----- -module @NoFnFailure { - // expected-error@+1 {{Module must contain exactly one function to add NetworkInfoOp}} - module @Module0 { - } -} - -// ----- - module @MultipleFunctionFailure { // expected-error@+1 {{Module must contain exactly one function to add NetworkInfoOp}} module @Module0 { diff --git a/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module_tensor_semantics.mlir b/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module_tensor_semantics.mlir new file mode 100644 index 0000000000..9b8da95545 --- /dev/null +++ 
b/tests/lit/NPU/dialect/core/passes/add_netinfo_to_module_tensor_semantics.mlir @@ -0,0 +1,24 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --init-compiler=vpu-arch=%arch% --split-input-file --add-netinfo-to-module=has-tensor-semantics=true %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + + +// CHECK-LABEL: module @Module0 +module @Module0 { + func.func private @nested(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg0, %arg1: tensor, tensor + } + + // CHECK: net.NetworkInfo entryPoint : @nested inputsInfo : { + // CHECK: DataInfo "in_0" : tensor + // CHECK: DataInfo "in_1" : tensor + // CHECK: } outputsInfo : { + // CHECK: DataInfo "out_0" : tensor + // CHECK: DataInfo "out_1" : tensor + // CHECK: } + // CHECK: func.func private @nested +} diff --git a/tests/lit/NPU/dialect/core/passes/ws_fold_reinterpret_cast_into_const.mlir b/tests/lit/NPU/dialect/core/passes/ws_fold_reinterpret_cast_into_const.mlir new file mode 100644 index 0000000000..27ebd0a8ea --- /dev/null +++ b/tests/lit/NPU/dialect/core/passes/ws_fold_reinterpret_cast_into_const.mlir @@ -0,0 +1,114 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --ws-fold-reinterpret-cast-into-const %s | FileCheck %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +{-# + dialect_resources: { + builtin: { + ov_0: "0x1000000080120304" + } + } +#-} + +// CHECK: func.func @DenseResource() -> tensor<4xi8> +func.func @DenseResource() -> tensor<4xi8> { + %cst = const.Declare tensor<2xsi16> = dense_resource : tensor<2xsi16> + // 0x8012 -> 0x80 (-128), 0x12; 0x0304 -> 0x03, 0x04 + %cast = Core.ReinterpretCast(%cst) : tensor<2xsi16> -> tensor<4xi8> + return %cast : tensor<4xi8> + + // CHECK: [[CST:%.+]] = const.Declare tensor<4xi8> = dense<[-128, 18, 3, 4]> + // CHECK: return [[CST]] +} + +// ----- + +{-# + dialect_resources: { + builtin: { + ov_f32: "0x1000000043800304" + } + } +#-} + +// CHECK: func.func @DenseResource_f32() -> tensor<12xi8> +func.func @DenseResource_f32() -> tensor<12xi8> { + %cst = const.Declare tensor<3xf32> = dense_resource : tensor<1xf32>, [#const.PadWithZero<[1], [1]>] + // 0x43800304 -> 0x43 (67), 0x80 (-128), 0x03, 0x04 + %cast = Core.ReinterpretCast(%cst) : tensor<3xf32> -> tensor<12xi8> + return %cast : tensor<12xi8> + + // CHECK: [[CST:%.+]] = const.Declare tensor<12xi8> = dense<[0, 0, 0, 0, 67, -128, 3, 4, 0, 0, 0, 0]> + // CHECK: return [[CST]] +} + +// ----- + +// CHECK: func.func private @outlined() -> tensor<3xi8> +func.func private @outlined() -> tensor<3xi8> { + %cst = const.Declare tensor<3xsi8> = dense<[-2, 5, 3]> : tensor<3xsi8> + %cast = Core.ReinterpretCast(%cst) : tensor<3xsi8> -> tensor<3xi8> + return %cast : tensor<3xi8> + + // CHECK: [[CST:%.+]] = const.Declare tensor<3xi8> = dense<[-2, 5, 3]> + // CHECK: return [[CST]] +} + +// CHECK: func.func @SimpleOutlining([[IN:%.+]]: tensor<1x2x3x4xf32>) +// CHECK-SAME: -> (tensor<1x2x3x4xf32>, tensor<3xi8>, tensor<4xi8>) +func.func @SimpleOutlining(%arg0: tensor<1x2x3x4xf32>) -> (tensor<1x2x3x4xf32>, tensor<3xi8>, tensor<4xi8>) { + 
%out1 = func.call @outlined() : () -> tensor<3xi8> + + %cst = const.Declare tensor<4xui8> = dense<42> : tensor<2xui8>, [#const.PadWithZero<[0], [2]>] + %out2 = Core.ReinterpretCast(%cst) : tensor<4xui8> -> tensor<4xi8> + + return %arg0, %out1, %out2 : tensor<1x2x3x4xf32>, tensor<3xi8>, tensor<4xi8> + + // CHECK: [[OUT1:%.+]] = call @outlined() + // CHECK: [[OUT2:%.+]] = const.Declare tensor<4xi8> = dense<[42, 42, 0, 0]> + // CHECK: return [[IN]], [[OUT1]], [[OUT2]] +} + +// ----- + +// CHECK: func.func @Splat() -> tensor<4xi8> +func.func @Splat() -> tensor<4xi8> { + %cst = const.Declare tensor<4xsi8> = dense<42> : tensor<4xsi8> + %cast = Core.ReinterpretCast(%cst) : tensor<4xsi8> -> tensor<4xi8> + return %cast : tensor<4xi8> + + // CHECK: [[CST:%.+]] = const.Declare tensor<4xi8> = dense<42> + // CHECK: return [[CST]] +} + +// ----- + +{-# + dialect_resources: { + builtin: { + ov_0: "0x1000000080120304" + } + } +#-} + +func.func private @main_part1(%arg: tensor<2xsi16>) -> tensor<2xsi16> { + return %arg : tensor<2xsi16> +} + +// CHECK: func.func @MultiUserConst() -> (tensor<4xi8>, tensor<2xsi16>) +func.func @MultiUserConst() -> (tensor<4xi8>, tensor<2xsi16>) { + %cst = const.Declare tensor<2xsi16> = dense_resource : tensor<2xsi16> + // 0x8012 -> 0x80 (-128), 0x12; 0x0304 -> 0x03, 0x04 + %cast = Core.ReinterpretCast(%cst) : tensor<2xsi16> -> tensor<4xi8> + %call = func.call @main_part1(%cst) : (tensor<2xsi16>) -> tensor<2xsi16> + return %cast, %call : tensor<4xi8>, tensor<2xsi16> + + // CHECK: [[ORIG_CST:%.+]] = const.Declare tensor<2xsi16> = dense_resource + // CHECK: [[CST:%.+]] = const.Declare tensor<4xi8> = dense<[-128, 18, 3, 4]> + // CHECK: [[CALL:%.+]] = call @main_part1([[ORIG_CST]]) + // CHECK: return [[CST]], [[CALL]] +} diff --git a/tests/lit/NPU/large_memory/baseline_37XX.mlir b/tests/lit/NPU/large_memory/baseline_37XX.mlir index 83ec70d42d..ce6d1a9f79 100644 --- a/tests/lit/NPU/large_memory/baseline_37XX.mlir +++ 
b/tests/lit/NPU/large_memory/baseline_37XX.mlir @@ -2,20 +2,19 @@ // Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // - // RUN: vpux-translate --vpu-arch=%arch% --export-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Input" : tensor<1x1024xui8> } outputsInfo : { diff --git a/tests/lit/NPU/large_memory/baseline_40XX.mlir b/tests/lit/NPU/large_memory/baseline_40XX.mlir index 758d7faf7c..4f3633d5a1 100644 --- a/tests/lit/NPU/large_memory/baseline_40XX.mlir +++ b/tests/lit/NPU/large_memory/baseline_40XX.mlir @@ -5,16 +5,16 @@ // RUN: vpux-translate --vpu-arch=%arch% --export-ELF %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes 
of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Input" : tensor<1x1024xui8> } outputsInfo : { diff --git a/tests/lit/NPU/large_memory/leon_overflow_37XX.mlir b/tests/lit/NPU/large_memory/leon_overflow_37XX.mlir index 69a662d293..ce2073ff65 100644 --- a/tests/lit/NPU/large_memory/leon_overflow_37XX.mlir +++ b/tests/lit/NPU/large_memory/leon_overflow_37XX.mlir @@ -5,16 +5,16 @@ // RUN: vpux-translate --vpu-arch=%arch% --export-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main 
inputsInfo : { DataInfo "Input" : tensor<1x1024xui8> } outputsInfo : { diff --git a/tests/lit/NPU/large_memory/shave_overflow_37XX.mlir b/tests/lit/NPU/large_memory/shave_overflow_37XX.mlir index f22c45e604..231a424cc0 100644 --- a/tests/lit/NPU/large_memory/shave_overflow_37XX.mlir +++ b/tests/lit/NPU/large_memory/shave_overflow_37XX.mlir @@ -5,16 +5,16 @@ // RUN: vpux-translate --vpu-arch=%arch% --export-ELF %s | FileCheck %s // REQUIRES: arch-NPU37XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Input" : tensor<1x1024xui8> } outputsInfo : { diff --git a/tests/lit/NPU/large_memory/shave_overflow_40XX.mlir b/tests/lit/NPU/large_memory/shave_overflow_40XX.mlir index abce46258b..602bb6f895 100644 --- a/tests/lit/NPU/large_memory/shave_overflow_40XX.mlir +++ b/tests/lit/NPU/large_memory/shave_overflow_40XX.mlir @@ -5,16 +5,16 @@ // RUN: vpux-translate --vpu-arch=%arch% --export-ELF %s | FileCheck %s // REQUIRES: arch-NPU40XX -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @Test 
attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "Input" : tensor<1x1024xui8> } outputsInfo : { diff --git a/tests/lit/NPU/pipelines/IR/transpose_conv.bin b/tests/lit/NPU/pipelines/IR/transpose_conv.bin new file mode 100644 index 0000000000..7261afe9d8 --- /dev/null +++ b/tests/lit/NPU/pipelines/IR/transpose_conv.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:039491366f6c6c70846b8310576498aa484988957249024d0205b12edfaf2d41 +size 416 diff --git a/tests/lit/NPU/pipelines/IR/transpose_conv.xml b/tests/lit/NPU/pipelines/IR/transpose_conv.xml new file mode 100644 index 0000000000..82bf62ec87 --- /dev/null +++ b/tests/lit/NPU/pipelines/IR/transpose_conv.xml @@ -0,0 +1,126 @@ + + + + + + + + 1 + 3 + 16 + 20 + + + + + + + + 4 + + + + + + + 1 + 3 + 16 + 20 + + + 4 + + + + + 1 + 3 + 20 + 16 + + + + + + + + + + + 8 + 3 + 2 + 4 + + + + + + + + + + + + 8 + 3 + 2 + 4 + + + + + 8 + 3 + 2 + 4 + + + + + + + + 1 + 3 + 20 + 16 + + + 8 + 3 + 2 + 4 + + + + + 1 + 8 + 19 + 13 + + + + + + + 1 + 8 + 19 + 13 + + + + + + + + + + + + + + diff --git a/tests/lit/NPU/pipelines/default_hw_mode_37XX.mlir b/tests/lit/NPU/pipelines/default_hw_mode_37XX.mlir index 
7eaabd4695..497affad48 100644 --- a/tests/lit/NPU/pipelines/default_hw_mode_37XX.mlir +++ b/tests/lit/NPU/pipelines/default_hw_mode_37XX.mlir @@ -178,7 +178,7 @@ module @DynamicReshape { // CHECK-DAG: {{ }}module @SWKernelPrefetchingReservedMemory { // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN offset {{[0-9]+}} // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN_FragmentationAware - // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = {{.+}} : f64} + // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = {{.+}} : f64} // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @SHAVE_NN @@ -255,7 +255,7 @@ module @DynamicReshape { // CHECK-DAG: {{ }}module @SWKernelPrefetchingReservedMemory { // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN offset {{[0-9]+}} // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN_FragmentationAware - // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = {{.+}} : f64} + // CHECK-DAG: {{ }}IE.MemoryResource {{[0-9]+}} bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = {{.+}} : f64} // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @DPU // CHECK-DAG: {{ }}IE.ExecutorResource 2 of @SHAVE_ACT // CHECK-DAG: {{ }}IE.ExecutorResource 1 of @SHAVE_NN diff --git a/tests/lit/NPU/pipelines/dynamic_shape_static_shape_transformations_37XX_40XX.mlir b/tests/lit/NPU/pipelines/dynamic_shape_static_shape_transformations_37XX_40XX.mlir index a8ad57a4ac..d90bca4f6c 100644 --- a/tests/lit/NPU/pipelines/dynamic_shape_static_shape_transformations_37XX_40XX.mlir +++ b/tests/lit/NPU/pipelines/dynamic_shape_static_shape_transformations_37XX_40XX.mlir @@ -28,9 +28,9 @@ func.func @DynamicConvAddTranpose(%arg0: !BoundedInType) -> 
!BoundedTransposeTyp : !BoundedOutType -> !BoundedTransposeType return %transpose : !BoundedTransposeType - // CHECK-DAG: [[DIM_1:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> // CHECK-DAG: [[DIM_16:%.+]] = const.Declare tensor<1xsi64> = dense<16> : tensor<1xsi64> // CHECK-DAG: [[DIM_4:%.+]] = const.Declare tensor<1xsi64> = dense<4> : tensor<1xsi64> + // CHECK-DAG: [[DIM_1:%.+]] = const.Declare tensor<1xsi64> = dense<1> : tensor<1xsi64> // CHECK: [[DYN_EXPAND:%.+]] = IE.DynamicExpand([[IN]]) // CHECK-SAME: : tensor<1x512x4x?xf32, {bounds = #const.OpaqueI64Elements<[1, 512, 4, 320]> : tensor<4xsi64>, order = #NCHW}> -> tensor<1x512x4x320xf32> diff --git a/tests/lit/NPU/pipelines/host_compile_40XX.mlir b/tests/lit/NPU/pipelines/host_compile_40XX.mlir index 91c8e5f008..7ad551732a 100644 --- a/tests/lit/NPU/pipelines/host_compile_40XX.mlir +++ b/tests/lit/NPU/pipelines/host_compile_40XX.mlir @@ -15,6 +15,8 @@ module @CopyInputOutput { DataInfo "output" : tensor<1x3x60x60xf16> } + // CHECK: builtin.module @ReservedMemory + func.func private @main_part1(%arg0: tensor<1x3x60x60xf16>) -> tensor<1x3x60x60xf16> { %0 = VPU.Copy(%arg0) : tensor<1x3x60x60xf16> -> tensor<1x3x60x60xf16> return %0 : tensor<1x3x60x60xf16> @@ -24,10 +26,10 @@ module @CopyInputOutput { %0 = call @main_part1(%arg0) : (tensor<1x3x60x60xf16>) -> tensor<1x3x60x60xf16> return %0 : tensor<1x3x60x60xf16> } - // CHECK: module [[MODULE0:@.+]] attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { + // CHECK: module [[MODULE0:@.+]] attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { // CHECK: func.func private [[FUNC0:@.+]]([[_:%.+]]: memref<1x3x60x60xf16, @DDR>, [[_:%.+]]: memref<1x3x60x60xf16, @DDR>) // CHECK-SAME: -> memref<1x3x60x60xf16, @DDR> attributes {inliner_dispatch = #VPUIP.VPUIPInlinerDispatch} { - // CHECK-COUNT-1: 
VPURT.Task + // CHECK-COUNT-1: VPURT.Task // CHECK-NOT: VPU.Copy // CHECK: func.func @main([[ARG0:%.+]]: memref<1x3x60x60xf16>, [[ARG1:%.+]]: memref<1x3x60x60xf16>) -> memref<1x3x60x60xf16> { @@ -55,7 +57,9 @@ module @StaticEltwiseNHWC { DataInfo "output" : tensor<1x16x720x1000xf16> } - // CHECK: module [[MODULE0:@.+]] attributes {VPU.arch = #VPU.arch_kind, VPU.revisionID = #VPU.revision_id, config.compilationMode = #config.compilation_mode} { + // CHECK: builtin.module @ReservedMemory + + // CHECK: module [[MODULE0:@.+]] attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { // CHECK: func.func private [[FUNC0:@.+]]([[_:%.+]]: memref<1x90x1000x16xf16, @DDR>, [[_:%.+]]: memref<1x90x1000x16xf16, @DDR>, [[_:%.+]]: memref<1x90x1000x16xf16, @DDR>) -> memref<1x90x1000x16xf16, @DDR> attributes {inliner_dispatch = #VPUIP.VPUIPInlinerDispatch} { // CHECK-COUNT-25: VPURT.Task // CHECK-NOT: IE.Add diff --git a/tests/lit/NPU/pipelines/host_compile_40XX_dynamic.mlir b/tests/lit/NPU/pipelines/host_compile_40XX_dynamic.mlir new file mode 100644 index 0000000000..d229173466 --- /dev/null +++ b/tests/lit/NPU/pipelines/host_compile_40XX_dynamic.mlir @@ -0,0 +1,70 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: echo -e '{"": {"layerType": "VPU.NCE.Convolution","multiClusterStrategy": "SplitOverHeight","tilingStrategy": {"C": 1,"H": 8,"N": 1,"W": 1},"updatedVFTiling": "False","verticalFusion": "False"}}' > strategy_in.json +// RUN: env IE_NPU_READ_STRATEGY_JSON=1 vpux-opt --vpu-arch=%arch% --split-input-file --mlir-elide-elementsattrs-if-larger 8 --host-compile="enable-dynamic-shape-transformations=false scf-tiling=true scf-compute-ops-outlining=true use-memref-for-host-function-bufferization=true read-strategy-from-json=true" %s | FileCheck %s +// RUN: rm strategy_in.json +// REQUIRES: arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> +// CHECK-LABEL: @EltwiseNHWCDynamic +module @EltwiseNHWCDynamic { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input1" : tensor<1x16x?x1000xf16> + DataInfo "input2" : tensor<1x16x?x1000xf16> + } outputsInfo : { + DataInfo "output" : tensor<1x16x?x1000xf16> + } + + func.func @main(%arg0: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, %arg1: tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> { + %0 = IE.Add(%arg0, %arg1) {auto_broadcast = #IE.auto_broadcast_type} : + tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}>, + tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + -> tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + return %0 : tensor<1x16x?x1000xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 720, 1000]> : tensor<4xsi64>, order = #NHWC}> + } + + // CHECK: func.func @main_func0_static(%arg0: 
memref<1x90x1000x16xf16, @DDR>, %arg1: memref<1x90x1000x16xf16, @DDR>, %arg2: memref<1x90x1000x16xf16, @DDR>) + // CHECK-SAME: -> memref<1x90x1000x16xf16, @DDR> attributes {inliner_dispatch = #VPUIP.VPUIPInlinerDispatch} { + + // CHECK-COUNT-6: VPUIP.NCEClusterTask + // CHECK-NOT: IE.Add + + // CHECK: func.func @main([[ARG0:%.+]]: memref<1x?x1000x16xf16>, [[ARG1:%.+]]: memref<1x?x1000x16xf16>, [[ARG2:%.+]]: memref<1x?x1000x16xf16>) -> memref<1x?x1000x16xf16> { + // CHECK: [[C90:%.+]] = arith.constant 90 : index + // CHECK: [[C0:%.+]] = arith.constant 0 : index + // CHECK: [[C1000:%.+]] = arith.constant 1000 : index + // CHECK: [[SUB:%.+]] = arith.subi [[C1000]], [[C0]] : index + // CHECK: [[DIV:%.+]] = arith.divsi [[SUB]], [[C90]] : index + // CHECK: [[GROUP:%.+]] = async.create_group [[DIV]] : !async.group + // CHECK: scf.for [[ARG3:%.+]] = [[C0]] to [[C1000]] step [[C90]] { + // CHECK: [[MIN:%.+]] = affine.min #map([[ARG3]]) + // CHECK: [[CMP:%.+]] = arith.cmpi ne, [[MIN]], [[C90]] : index + // CHECK: [[IF:%.+]] = scf.if [[CMP]] -> (index) { + // CHECK: [[SUB1:%.+]] = arith.subi [[C90]], [[MIN]] : index + // CHECK: [[CMP1:%.+]] = arith.cmpi slt, [[ARG3]], [[SUB1]] : index + // CHECK: cf.assert [[CMP1]], "Not enough elements to backtrack in scf.for loop" + // CHECK: [[SUB2:%.+]] = arith.subi [[ARG3]], [[SUB1]] : index + // CHECK: scf.yield [[SUB2]] : index + // CHECK: } else { + // CHECK: scf.yield [[ARG3]] : index + // CHECK: } + + // CHECK: [[SUBVIEW:%.+]] = memref.subview [[ARG0]][0, [[IF]], 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + // CHECK: [[SUBVIEW_0:%.+]] = memref.subview [[ARG1]][0, [[IF]], 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + // CHECK: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW]] : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to 
memref<1x90x1000x16xf16> + // CHECK: [[CAST_0:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW_0]] : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + // CHECK: [[SUBVIEW_1:%.+]] = memref.subview [[ARG2]][0, [[IF]], 0, 0] [1, 90, 1000, 16] [1, 1, 1, 1] : memref<1x?x1000x16xf16> to memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> + // CHECK: [[CAST_1:%.+]] = builtin.unrealized_conversion_cast [[SUBVIEW_1]] : memref<1x90x1000x16xf16, strided<[?, 16000, 16, 1], offset: ?>> to memref<1x90x1000x16xf16> + // CHECK: [[TOKEN:%.+]], [[BODYRESULTS:%.+]] = async.execute -> !async.value> { + // CHECK: [[RESULT:%.+]] = Core.NestedCall @Module0::@main_func0_static([[CAST]], [[CAST_0]], [[CAST_1]]) : (memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>, memref<1x90x1000x16xf16>) -> memref<1x90x1000x16xf16> + // CHECK: async.yield [[RESULT]] : memref<1x90x1000x16xf16> + // CHECK: } + // CHECK: [[ADD_TO_GROUP_RES:%.+]] = async.add_to_group [[TOKEN]], [[GROUP]] : !async.token + // CHECK: [[AWAIT:%.+]] = async.await [[BODYRESULTS]] : !async.value> + // CHECK: } + // CHECK: async.await_all [[GROUP]] + // CHECK: return [[ARG2]] : memref<1x?x1000x16xf16> +} diff --git a/tests/lit/NPU/pipelines/import_IE_weights_separation.mlir b/tests/lit/NPU/pipelines/import_IE_weights_separation.mlir new file mode 100644 index 0000000000..502c480d35 --- /dev/null +++ b/tests/lit/NPU/pipelines/import_IE_weights_separation.mlir @@ -0,0 +1,37 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-translate --vpu-arch=%arch% --import-IE ./IR/transpose_conv.xml | FileCheck %s +// RUN: vpux-translate --vpu-arch=%arch% --import-IE --weights-separation-path=false ./IR/transpose_conv.xml | FileCheck --check-prefix=CHECK-DISABLED %s +// RUN: vpux-translate --vpu-arch=%arch% --import-IE --weights-separation-path=true ./IR/transpose_conv.xml | FileCheck --check-prefix=CHECK-ENABLED %s +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +//CHECK: module @Conv2dWithTransposeTest { +//CHECK: net.NetworkInfo entryPoint : @main inputsInfo : { +//CHECK: DataInfo "Parameter_10" : tensor<1x3x16x20xf32> +//CHECK: } outputsInfo : { +//CHECK: DataInfo "Convolution_17" friendlyName = "Result_18" : tensor<1x8x19x13xf32> + +//CHECK: func.func @main([[ARG0:%.+]]: tensor<1x3x16x20xf32>) -> tensor<1x8x19x13xf32> { +//CHECK: [[TRANSPOSE_PERM:%.+]] = const.Declare tensor<4xsi64> = dense<[0, 1, 3, 2]> : tensor<4xsi64> +//CHECK: [[TRANSPOSE:%.+]] = IE.Transpose([[ARG0]], [[TRANSPOSE_PERM]]) : tensor<1x3x16x20xf32>, tensor<4xsi64> -> tensor<1x3x20x16xf32> + + +// WS enabled: +//CHECK-ENABLED: [[WEIGHTS:%.+]] = const.Declare tensor<8x3x2x4xf16> = dense<{{.*}}> : tensor<8x3x2x4xf16> + +// This Convert operation was preserved by "weights-separation-path" option +//CHECK-ENABLED: [[CONVERT:%.+]] = IE.Convert([[WEIGHTS]]) {dstElemType = f32} : tensor<8x3x2x4xf16> -> tensor<8x3x2x4xf32> +//CHECK-ENABLED: [[CONV:%.+]] = IE.Convolution([[TRANSPOSE:%.+]], [[CONVERT]]) +//CHECK-ENABLED: return [[CONV]] : tensor<1x8x19x13xf32> + + +// WS disabled: +//CHECK-DISABLED: [[WEIGHTS:%.+]] = const.Declare tensor<8x3x2x4xf32> = dense<{{.*}}> : tensor<8x3x2x4xf32> + +// f16->f32 conversion was folded by nGraph passes +//CHECK-DISABLED-NOT: IE.Convert +//CHECK-DISABLED: [[CONV:%.+]] = IE.Convolution([[TRANSPOSE:%.+]], [[WEIGHTS]]) +//CHECK-DISABLED: return [[CONV]] : tensor<1x8x19x13xf32> diff --git 
a/tests/lit/NPU/pipelines/reinterpret_cast_pipeline_from_vpu.mlir b/tests/lit/NPU/pipelines/reinterpret_cast_pipeline_from_vpu.mlir new file mode 100644 index 0000000000..fb04e785ce --- /dev/null +++ b/tests/lit/NPU/pipelines/reinterpret_cast_pipeline_from_vpu.mlir @@ -0,0 +1,78 @@ +// +// Copyright (C) 2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch%" --default-hw-mode-vpu --lower-VPU-to-VPUIP --default-hw-mode-vpuip %s | FileCheck %s --strict-whitespace +// REQUIRES: arch-NPU37XX || arch-NPU40XX + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK: @InMain +module @InMain { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x512x7x7xf16> + DataInfo "out_ov_0_hash_12345_concat": tensor<2359296xi8> + } outputsInfo : { + DataInfo "output" : tensor<256x512x3x3xf16> + } + + func.func @main(%arg0: tensor<1x512x7x7xf16, {order = #NHWC}>, %arg1: tensor<2359296xi8>) + -> tensor<256x512x3x3xf16, {order = #NHWC}> { + %out = Core.ReinterpretCast(%arg1) : tensor<2359296xi8> -> tensor<256x512x3x3xf16, {order = #NHWC}> + return %out : tensor<256x512x3x3xf16, {order = #NHWC}> + } + + // CHECK: func.func @main + // CHECK-SAME: ({{%.+}}: memref<1x512x7x7xf16, #NHWC, @DDR>, {{%.+}}: memref<2359296xi8, @DDR>, {{%.+}}: memref<256x512x3x3xf16, #NHWC, @DDR>) + // CHECK-SAME: -> memref<256x512x3x3xf16, #NHWC, @DDR> + + // CHECK-DAG: [[BLOB:%.+]] = VPURT.DeclareBuffer [1] <0> -> memref<{{.+}}x512x3x3xf16, #NHWC, @DDR> + // CHECK-DAG: [[OUT:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<{{.+}}x512x3x3xf16, #NHWC, @DDR> + + // CHECK: VPUIP.NNDMA {{.*}} inputs([[BLOB]] + // CHECK-SAME: outputs([[OUT]] +} + +// ----- + +#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> + +// CHECK: @InMainWithSlice +module @InMainWithSlice { + net.NetworkInfo entryPoint : @main inputsInfo : { + DataInfo "input" : tensor<1x512x7x7xf16> + DataInfo 
"out_ov_0_hash_12345_concat": tensor<51021312xi8> + } outputsInfo : { + DataInfo "output" : tensor<256x512x3x3xf16> + } + + func.func private @dummy_call(%arg: tensor<256x512x3x3xf16, {order = #NHWC}>) + -> tensor<256x512x3x3xf16, {order = #NHWC}> { + return %arg : tensor<256x512x3x3xf16, {order = #NHWC}> + } + + func.func @main(%arg0: tensor<1x512x7x7xf16, {order = #NHWC}>, %arg1: tensor<51021312xi8>) + -> tensor<256x512x3x3xf16, {order = #NHWC}> { + %0 = VPU.Slice %arg1 [48662016] [2359296] : tensor<51021312xi8> to tensor<2359296xi8> + %out = Core.ReinterpretCast(%0) : tensor<2359296xi8> -> tensor<256x512x3x3xf16, {order = #NHWC}> + return %out : tensor<256x512x3x3xf16, {order = #NHWC}> + } + + // CHECK: func.func @main + // CHECK-SAME: ({{%.+}}: memref<1x512x7x7xf16, #NHWC, @DDR>, {{%.+}}: memref<51021312xi8, @DDR>, {{%.+}}: memref<256x512x3x3xf16, #NHWC, @DDR>) + // CHECK-SAME: -> memref<256x512x3x3xf16, #NHWC, @DDR> + + // Note: still i8 because there's a VPUIP.Copy after VPUIP.SubView + // CHECK-DAG: [[BLOB:%.+]] = VPURT.DeclareBuffer [1] <48662016> -> memref<{{.+}}xi8, @DDR> + // CHECK-DAG: [[OUT:%.+]] = VPURT.DeclareBuffer [0] <0> -> memref<{{.+}}x512x3x3xf16, #NHWC, @DDR> + + // CHECK-DAG: [[BLOB_DDR:%.+]] = VPURT.DeclareBuffer <0> -> memref<{{.+}}xi8, @DDR> + // CHECK-DAG: [[BLOB_F16:%.+]] = VPURT.DeclareBuffer <0> -> memref<{{.+}}x512x3x3xf16, #NHWC, @DDR> + + // CHECK: VPUIP.NNDMA {{.*}} inputs([[BLOB]] + // CHECK-SAME: outputs([[BLOB_DDR]] + + // CHECK: VPUIP.NNDMA {{.*}} inputs([[BLOB_F16]] + // CHECK-SAME: outputs([[OUT]] +} diff --git a/tests/lit/NPU/pipelines/ws_monolithic.mlir b/tests/lit/NPU/pipelines/ws_monolithic.mlir index 897973170e..5692761b48 100644 --- a/tests/lit/NPU/pipelines/ws_monolithic.mlir +++ b/tests/lit/NPU/pipelines/ws_monolithic.mlir @@ -11,7 +11,7 @@ {-# dialect_resources: { builtin: { - ov_1: "0x0000000400aa" + vpux_ow_1: "0x0000000400aa" } } #-} @@ -24,14 +24,14 @@ module @WeightsSeprationMode attributes {} { } func.func 
@main(%arg0: tensor<1x2x1x1xui8>) -> tensor<1x2x1x1xui8> { - %cst = const.Declare tensor<1x2x1x1xui8> = dense_resource : tensor<1x2x1x1xui8>, [#const.Add<1.0 : f32>] + %cst = const.Declare tensor<1x2x1x1xui8> = dense_resource : tensor<1x2x1x1xui8>, [#const.Add<1.0 : f32>] return %cst : tensor<1x2x1x1xui8> } // Note: We mainly want to check that #const.Add is mapped to a VPU.Add and don't care about any of the other functionality // the pipelines perform. // CHECK: func.func @wrapper_main([[ARG0:%.+]]: tensor<1x2x1x1xui8>) -> tensor<1x2x1x1xui8> { -// CHECK: [[CST:%.+]] = const.Declare tensor<1x2x1x1xui8> = dense_resource : tensor<1x2x1x1xui8> +// CHECK: [[CST:%.+]] = const.Declare tensor<1x2x1x1xui8> = dense_resource : tensor<1x2x1x1xui8> // CHECK: [[ADD:%.+]] = VPU.Add // CHECK: return } diff --git a/tests/lit/NPU/roundtrip/act-kernel_37XX.mlir b/tests/lit/NPU/roundtrip/act-kernel_37XX.mlir index 958044dcb0..2029c0a19b 100644 --- a/tests/lit/NPU/roundtrip/act-kernel_37XX.mlir +++ b/tests/lit/NPU/roundtrip/act-kernel_37XX.mlir @@ -9,11 +9,11 @@ // REQUIRES: arch-NPU37XX // -module @Test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} +module @Test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} IE.ExecutorResource 1 of @DMA_NN IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } diff --git a/tests/lit/NPU/utils/dump_statistics_of_task_ops_37XX.mlir 
b/tests/lit/NPU/utils/dump_statistics_of_task_ops_37XX.mlir index 38759135ae..d0f404faaf 100644 --- a/tests/lit/NPU/utils/dump_statistics_of_task_ops_37XX.mlir +++ b/tests/lit/NPU/utils/dump_statistics_of_task_ops_37XX.mlir @@ -10,7 +10,7 @@ !qtype = !quant.uniform -module @dual_tile attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @dual_tile attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { net.NetworkInfo entryPoint : @main inputsInfo : { @@ -19,9 +19,9 @@ module @dual_tile attributes {VPU.arch = #VPU.arch_kind, config.compila DataInfo "output_0" : tensor<2x16x16x16xf16> } - IE.MemoryResource 31457280 bytes of @DDR {VPU.bandwidth = 8, VPU.derateFactor = 6.000000e-01} + IE.MemoryResource 31457280 bytes of @DDR {config.bandwidth = 8, config.derateFactor = 6.000000e-01} IE.TileResource 1 of @NCE { - IE.MemoryResource 2097152 bytes of @CMX_NN {VPU.bandwidth = 32, VPU.derateFactor = 1.000000e+00} + IE.MemoryResource 2097152 bytes of @CMX_NN {config.bandwidth = 32, config.derateFactor = 1.000000e+00} IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN diff --git a/tests/lit/NPU/utils/dump_statistics_of_task_ops_prof_37XX.mlir b/tests/lit/NPU/utils/dump_statistics_of_task_ops_prof_37XX.mlir index 9fa5e68e9b..763b67493b 100644 --- a/tests/lit/NPU/utils/dump_statistics_of_task_ops_prof_37XX.mlir +++ b/tests/lit/NPU/utils/dump_statistics_of_task_ops_prof_37XX.mlir @@ -10,7 +10,7 @@ #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #loc0 = loc(unknown) #loc2 = loc("profiling_result") -module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { +module @age_gender attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] loc(#loc0) module @VPU.SW { func.func private 
@builtin_MemPermute(memref<*xf16, [@CMX_NN, 0]>, memref<*xf16, [@CMX_NN, 0]>, none) attributes {VPU.kernel_code = "reorder.cpp", VPU.kernel_entry = "reorder"} loc(#loc0) @@ -26,10 +26,10 @@ module @age_gender attributes {VPU.arch = #VPU.arch_kind, config.compil IE.ExecutorResource 1 of @DPU loc(#loc0) IE.ExecutorResource 2 of @SHAVE_ACT loc(#loc0) IE.ExecutorResource 1 of @SHAVE_NN loc(#loc0) - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} loc(#loc0) + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} loc(#loc0) } loc(#loc0) IE.ExecutorResource 2 of @DMA_NN loc(#loc0) - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} loc(#loc0) + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} loc(#loc0) net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "data" : tensor<1x3x62x62xf32> loc(#loc0) } outputsInfo : { diff --git a/tests/micro_benchmarks/src/logging_api.cpp b/tests/micro_benchmarks/src/logging_api.cpp index 6b0dcdfe20..0474e6c8e5 100644 --- a/tests/micro_benchmarks/src/logging_api.cpp +++ b/tests/micro_benchmarks/src/logging_api.cpp @@ -3,19 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/init.hpp" +#include "vpux/compiler/utils/attributes.hpp" + +#include #include #include #include #include -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/const/dialect.hpp" -#include "vpux/compiler/dialect/const/ops.hpp" -#include "vpux/compiler/dialect/const/utils/content.hpp" -#include "vpux/compiler/init.hpp" -#include "vpux/compiler/utils/attributes.hpp" -#include "vpux/utils/core/array_ref.hpp" +#include namespace llvm { template <> diff --git 
a/tests/unit/vpux_compiler/common/ppe_utils.hpp b/tests/unit/vpux_compiler/common/ppe_utils.hpp index 9dbe0a9568..f7a07ebc09 100644 --- a/tests/unit/vpux_compiler/common/ppe_utils.hpp +++ b/tests/unit/vpux_compiler/common/ppe_utils.hpp @@ -13,13 +13,17 @@ #include #include -#include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/interfaces/ppe_factory.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/init.hpp" using namespace vpux; @@ -237,14 +241,31 @@ class VPU_PpeUnitBase : public MLIR_PpeRegistry { nullptr); } - IE::ReduceMeanOp createReduceMean(mlir::Type inElemType, mlir::Type outElemType) { + IE::ReduceMeanOp createReduceMean(mlir::Type inElemType, mlir::Type outElemType, ArrayRef axes, + ArrayRef inputPadding) { mlir::OpBuilder builder(&_ctx); auto input = builder.create(mlir::UnknownLoc::get(&_ctx), ArrayRef{1, 16, 32, 32}, inElemType); const auto outType = mlir::RankedTensorType::get(ArrayRef{1, 16, 32, 32}, outElemType); + const auto axesAttr = getIntArrayAttr(&_ctx, axes); + const auto inputPaddingAttr = inputPadding.empty() ? 
nullptr : getIntArrayAttr(&_ctx, inputPadding); - return builder.create(_loc, outType, input.getResult(), nullptr, nullptr, nullptr, nullptr, - nullptr); + return builder.create(_loc, outType, input.getResult(), /*axes=*/nullptr, axesAttr, + /*keep_dims=*/nullptr, /*output_padding=*/nullptr, inputPaddingAttr); + } + + VPU::NCEReduceOp createNCEReduce(mlir::Type inElemType, mlir::Type outElemType, ArrayRef axes, + VPU::ReduceType reduceType, ArrayRef inputPadding) { + mlir::OpBuilder builder(&_ctx); + auto input = builder.create(mlir::UnknownLoc::get(&_ctx), + ArrayRef{1, 16, 32, 32}, inElemType); + const auto outType = mlir::RankedTensorType::get(ArrayRef{1, 16, 32, 32}, outElemType); + const auto axesAttr = getIntArrayAttr(&_ctx, axes); + const auto typeAttr = VPU::ReduceTypeAttr::get(&_ctx, reduceType); + const auto inputPaddingAttr = inputPadding.empty() ? nullptr : getIntArrayAttr(&_ctx, inputPadding); + return builder.create(_loc, outType, input.getResult(), axesAttr, /*ppe=*/nullptr, typeAttr, + /*multiClusterStrategy=*/nullptr, /*output_padding=*/nullptr, + inputPaddingAttr); } VPU::NCEInterpolateOp createNCEInterpolate(mlir::Type inElemType, mlir::Type weightsElemType, diff --git a/tests/unit/vpux_compiler/common/utils.hpp b/tests/unit/vpux_compiler/common/utils.hpp index 997729b976..6d68a8cae0 100644 --- a/tests/unit/vpux_compiler/common/utils.hpp +++ b/tests/unit/vpux_compiler/common/utils.hpp @@ -5,18 +5,17 @@ #pragma once -#include - -#include -#include - #include "vpux/compiler/NPU37XX/dialect/NPUReg37XX/ops.hpp" -#include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/ops.hpp" +#include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/dialect.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/dialect/VPURegMapped/utils.hpp" #include "vpux/compiler/init.hpp" #include "vpux/compiler/interfaces_registry.hpp" +#include +#include + +#include #include #include @@ -32,7 +31,7 @@ class MLIR_UnitBase : public testing::Test { class 
NPUSpecific_UnitTest : public MLIR_UnitBase { public: - NPUSpecific_UnitTest(vpux::VPU::ArchKind arch) { + NPUSpecific_UnitTest(vpux::config::ArchKind arch) { // We need to register hw-specific interfaces (e.g. NCEOpInterface) for VPU NCE ops auto interfacesRegistry = vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); @@ -45,7 +44,7 @@ class NPUSpecific_UnitTest : public MLIR_UnitBase { namespace vpux::VPU::arch37xx { class UnitTest : public NPUSpecific_UnitTest { public: - UnitTest(): NPUSpecific_UnitTest(vpux::VPU::ArchKind::NPU37XX) { + UnitTest(): NPUSpecific_UnitTest(vpux::config::ArchKind::NPU37XX) { } }; } // namespace vpux::VPU::arch37xx @@ -53,7 +52,7 @@ class UnitTest : public NPUSpecific_UnitTest { namespace vpux::VPU::arch40xx { class UnitTest : public NPUSpecific_UnitTest { public: - UnitTest(): NPUSpecific_UnitTest(vpux::VPU::ArchKind::NPU40XX) { + UnitTest(): NPUSpecific_UnitTest(vpux::config::ArchKind::NPU40XX) { } }; } // namespace vpux::VPU::arch40xx diff --git a/tests/unit/vpux_compiler/core/barrier_optimization.cpp b/tests/unit/vpux_compiler/core/barrier_optimization.cpp index 812aa72131..6c551a072a 100644 --- a/tests/unit/vpux_compiler/core/barrier_optimization.cpp +++ b/tests/unit/vpux_compiler/core/barrier_optimization.cpp @@ -699,7 +699,7 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig(mlir::MLIRContext* ctx constexpr llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false config.Option @VPU.ReduceSupported : false @@ -709,13 +709,13 @@ BarrierInfoMaps 
barriersWithFIFOdependenciesNPU40XXconfig(mlir::MLIRContext* ctx } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} module @VPU.SW { func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } @@ -832,7 +832,7 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig2(mlir::MLIRContext* ct constexpr llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false config.Option @VPU.ReduceSupported : false @@ -842,13 +842,13 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig2(mlir::MLIRContext* ct } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : 
i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} func.func @main(%arg0: memref<1x3x64x64xf16, @DDR>, %arg1: memref<1x3x64x64xf16, @DDR>) -> memref<1x3x64x64xf16, @DDR> { %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier @@ -930,7 +930,7 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig3(mlir::MLIRContext* ct constexpr llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false config.Option @VPU.ReduceSupported : false @@ -940,13 +940,13 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig3(mlir::MLIRContext* ct } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 
6.000000e-01 : f64} func.func @main(%arg0: memref<1x3x64x64xf16, @DDR>, %arg1: memref<1x3x64x64xf16, @DDR>) -> memref<1x3x64x64xf16, @DDR> { %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier @@ -1144,7 +1144,7 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig6( constexpr llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false config.Option @VPU.ReduceSupported : false @@ -1154,13 +1154,13 @@ BarrierInfoMaps barriersWithFIFOdependenciesNPU40XXconfig6( } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} func.func @main(%arg0: memref<1x3x64x64xf16, @DDR>, %arg1: memref<1x3x64x64xf16, @DDR>) -> memref<1x3x64x64xf16, @DDR> { %bar0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier @@ -1258,7 +1258,7 @@ SmallVector variableGraphSplitBlockSizeNPU40XXconfig( constexpr llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, 
config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.ReduceSupported : false config.Option @VPU.AutoPaddingODU : false @@ -1267,13 +1267,13 @@ SmallVector variableGraphSplitBlockSizeNPU40XXconfig( } IE.TileResource 6 of @NCE at 1.850000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} module @VPU.SW { func.func private @builtin_relu(%input : memref<*xf16>, %output : memref<*xf16>) attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } @@ -1785,7 +1785,7 @@ void parallelWaitBarriersIRconfig(mlir::MLIRContext* ctx, mlir::OwningOpRef (d0, d2, d3, d1)> - module attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode, VPU.revisionID = #VPU.revision_id} { + module attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode, config.revisionID = #config.revision_id} { config.PipelineOptions @Options { config.Option @VPU.ReduceSupported : false config.Option @VPU.AutoPaddingODU : false @@ -1794,13 +1794,13 @@ void parallelWaitBarriersIRconfig(mlir::MLIRContext* ctx, mlir::OwningOpRef, %output : memref<*xf16>) 
attributes {VPU.kernel_code = "activation_relu.cpp", VPU.kernel_entry = "activation_relu", VPU.task_type = @COMPUTE } diff --git a/tests/unit/vpux_compiler/core/locations_verifier_tests.cpp b/tests/unit/vpux_compiler/core/locations_verifier_tests.cpp index 62e010ea1a..c20e7eb275 100644 --- a/tests/unit/vpux_compiler/core/locations_verifier_tests.cpp +++ b/tests/unit/vpux_compiler/core/locations_verifier_tests.cpp @@ -4,6 +4,8 @@ // // +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/core/transforms/passes.hpp" #include "vpux/compiler/init.hpp" #include "vpux/compiler/utils/locations_verifier.hpp" diff --git a/tests/unit/vpux_compiler/core/reserved_memory_info_tests.cpp b/tests/unit/vpux_compiler/core/reserved_memory_info_tests.cpp index 68017ce3a5..4f408e1044 100644 --- a/tests/unit/vpux_compiler/core/reserved_memory_info_tests.cpp +++ b/tests/unit/vpux_compiler/core/reserved_memory_info_tests.cpp @@ -28,13 +28,13 @@ TEST_F(MLIR_ArgAllocationInfo, MultipleCallOps) { module @test { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { @@ -120,13 +120,13 @@ TEST_F(MLIR_ArgAllocationInfo, MultipleCallOpsWithMultipleUses) { module @test { IE.TileResource 2 of @NCE at 1.300000e+03 MHz { 
IE.MemoryResource 1784217 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1982464 bytes of @CMX_NN {VPU.bandwidth = 32 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1982464 bytes of @CMX_NN {config.bandwidth = 32 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @SHAVE_NN IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 67108864000 bytes of @DDR {VPU.bandwidth = 8 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 67108864000 bytes of @DDR {config.bandwidth = 8 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { diff --git a/tests/unit/vpux_compiler/dialect/IE/infer_output_shape.cpp b/tests/unit/vpux_compiler/dialect/IE/infer_output_shape.cpp index 367c531ff2..61aaeefbc8 100644 --- a/tests/unit/vpux_compiler/dialect/IE/infer_output_shape.cpp +++ b/tests/unit/vpux_compiler/dialect/IE/infer_output_shape.cpp @@ -4,13 +4,12 @@ // #include "vpux/compiler/utils/infer_output_shape.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/dialect/IE/utils/shape_infer.hpp" +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/init.hpp" -#include "vpux/utils/core/error.hpp" +#include #include #include #include diff --git a/tests/unit/vpux_compiler/dialect/IE/infer_return_type_components.cpp b/tests/unit/vpux_compiler/dialect/IE/infer_return_type_components.cpp index c68dfe4ebc..8b8d8ff11a 100644 --- a/tests/unit/vpux_compiler/dialect/IE/infer_return_type_components.cpp +++ b/tests/unit/vpux_compiler/dialect/IE/infer_return_type_components.cpp @@ -5,6 +5,12 @@ #include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/types.hpp" #include "vpux/compiler/utils/infer_output_shape.hpp" diff --git a/tests/unit/vpux_compiler/dialect/IE/type_inference.cpp b/tests/unit/vpux_compiler/dialect/IE/type_inference.cpp index d24c7428b8..20ba47f80f 100644 --- a/tests/unit/vpux_compiler/dialect/IE/type_inference.cpp +++ b/tests/unit/vpux_compiler/dialect/IE/type_inference.cpp @@ -4,10 +4,11 @@ // #include "common/utils.hpp" - #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include #include diff --git a/tests/unit/vpux_compiler/dialect/NPU37XX/ppe_factory_tests.cpp b/tests/unit/vpux_compiler/dialect/NPU37XX/ppe_factory_tests.cpp index 320f68f295..3479dd9124 100644 --- a/tests/unit/vpux_compiler/dialect/NPU37XX/ppe_factory_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/NPU37XX/ppe_factory_tests.cpp @@ -893,7 +893,7 @@ TEST_F(NPU37xxPpeIfcUnitTest, IntPPE_MaxPool_F16_U8_CLAMP) { } TEST_F(NPU37xxPpeIfcUnitTest, IntPPE_ReduceMean_U8_U8_NOOP) { - auto op = createReduceMean(getU8Type(), getU8Type()); + auto op = createReduceMean(getU8Type(), getU8Type(), {Dims4D::Act::C.ind()}, {}); ASSERT_NE(op, nullptr); auto ppeAttr = _ppeIfc->retrievePPEAttribute(op); ASSERT_NE(ppeAttr, nullptr); diff --git a/tests/unit/vpux_compiler/dialect/VPU/generate_tiling_utils_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/generate_tiling_utils_tests.cpp index 9854b6460f..bc31a6da37 100644 --- 
a/tests/unit/vpux_compiler/dialect/VPU/generate_tiling_utils_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/generate_tiling_utils_tests.cpp @@ -9,7 +9,7 @@ using namespace vpux; -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using MLIR_VPU_Generate_Tiling = MLIR_UnitBase; TEST_F(MLIR_VPU_Generate_Tiling, Calculate_Workload_Number) { diff --git a/tests/unit/vpux_compiler/dialect/VPU/get_explicit_distributed_attr.cpp b/tests/unit/vpux_compiler/dialect/VPU/get_explicit_distributed_attr.cpp index 119e8079cc..51e1fc144d 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/get_explicit_distributed_attr.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/get_explicit_distributed_attr.cpp @@ -347,7 +347,7 @@ TEST_F(MLIR_GetExplicitDistributionInfoAttrTest, NCEPermuteOp) { #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> !qElemType = !quant.uniform - module @test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = + module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { func.func @main(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x4x224x224x!qElemType, {order = #NHWC}> { diff --git a/tests/unit/vpux_compiler/dialect/VPU/layer_vpunn_cost_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/layer_vpunn_cost_tests.cpp index 8dd1a25587..ebc5636f5e 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/layer_vpunn_cost_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/layer_vpunn_cost_tests.cpp @@ -3,15 +3,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common/utils.hpp" +#include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/dialect/IE/utils/resources.hpp" #include "vpux/compiler/dialect/VPU/IR/ops.hpp" -#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/layer_vpunn_cost.hpp" 
#include "vpux/compiler/dialect/config/IR/attributes.hpp" - -#include "common/utils.hpp" -#include "vpux/compiler/core/cost_model_utils.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -20,7 +20,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using vpux::VPU::MultiClusterStrategy; using namespace vpux; @@ -28,7 +28,7 @@ using MLIR_VPU_LayerVPUNNCost = vpux::VPU::arch37xx::UnitTest; VPU::StrategyCost getSWVPUNNCost(std::shared_ptr vpunnLayer, mlir::ModuleOp module, VPU::MultiClusterStrategy mcStrategy) { - const auto archKind = VPU::getArch(module); + const auto archKind = config::getArch(module); const auto vpunnCostFunction = VPU::CostModelConfig::createLayerCostModel(archKind); auto tileOp = IE::getTileExecutor(module); @@ -44,7 +44,7 @@ VPU::StrategyCost getSWVPUNNCost(std::shared_ptr vpunnLayer, VPUNN::CyclesInterfaceType getHWVPUNNCost(VPUNN::DPULayer& vpunnLayer, mlir::ModuleOp module, VPU::MultiClusterStrategy mcStrategy) { - const auto archKind = VPU::getArch(module); + const auto archKind = config::getArch(module); const auto vpunnCostFunction = VPU::CostModelConfig::createLayerCostModel(archKind); auto tileOp = IE::getTileExecutor(module); @@ -73,7 +73,7 @@ VPUNN::CyclesInterfaceType getWeightsDMACost(VPU::NCEOpInterface nceOp, mlir::Mo return 0; } const auto weightsType = mlir::cast(weightsVal.getType()); - const auto archKind = VPU::getArch(module); + const auto archKind = config::getArch(module); const auto vpunnCostModel = VPU::CostModelConfig::createCostModel(archKind); const auto vpunnDevice = VPU::getVPUDeviceType(archKind); const auto numDMAPorts = IE::getAvailableExecutor(module, VPU::ExecutorKind::DMA_NN).getCount(); diff --git a/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_getter_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_getter_tests.cpp index 4f6472e1b7..49b08a6d38 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_getter_tests.cpp +++ 
b/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_getter_tests.cpp @@ -28,25 +28,25 @@ TEST_F(MLIR_MCStrategy_Getter, MCGetterList) { const auto numClusters = 2; SmallVector strategyNPU37XXSet; - auto mcGetter = VPU::createMCStrategyGetter(VPU::ArchKind::NPU37XX, numClusters); + auto mcGetter = VPU::createMCStrategyGetter(config::ArchKind::NPU37XX, numClusters); mcGetter->getMCStrategies(strategyNPU37XXSet); EXPECT_EQ(strategyNPU37XXSet.size(), 5); SmallVector strategyNPU37XX1TileSet; - mcGetter = VPU::createMCStrategyGetter(VPU::ArchKind::NPU37XX, 1); + mcGetter = VPU::createMCStrategyGetter(config::ArchKind::NPU37XX, 1); mcGetter->getMCStrategies(strategyNPU37XX1TileSet); EXPECT_EQ(strategyNPU37XX1TileSet.size(), 1); SmallVector strategyVPU40XX2TilesSet; - mcGetter = VPU::createMCStrategyGetter(VPU::ArchKind::NPU40XX, numClusters); + mcGetter = VPU::createMCStrategyGetter(config::ArchKind::NPU40XX, numClusters); mcGetter->getMCStrategies(strategyVPU40XX2TilesSet); EXPECT_EQ(strategyVPU40XX2TilesSet.size(), 6); SmallVector strategyVPU40XX6TilesSet; - mcGetter = VPU::createMCStrategyGetter(VPU::ArchKind::NPU40XX, 6); + mcGetter = VPU::createMCStrategyGetter(config::ArchKind::NPU40XX, 6); mcGetter->getMCStrategies(strategyVPU40XX6TilesSet); EXPECT_EQ(strategyVPU40XX6TilesSet.size(), 8); diff --git a/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_nothrow_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_nothrow_tests.cpp index face3761b9..69daa25c08 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_nothrow_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/mc_strategy_nothrow_tests.cpp @@ -6,6 +6,7 @@ #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" #include "vpux/compiler/dialect/VPU/utils/strategy_manager/strategy_manager.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include #include @@ -20,16 +21,16 @@ using 
MLIR_VPU_ClusteringStrategyNoThrow = vpux::VPU::arch40xx::UnitTest; TEST_F(MLIR_VPU_ClusteringStrategyNoThrow, SWLayer_ClusteringStrategy) { constexpr llvm::StringLiteral inputIR = R"( #loc0 = loc(unknown) - module @main attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @main attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} func.func @main(%softmax_in: tensor<1x8x4x76xf16>, %power_f16_in: tensor<1x16x256x256xf16>, %power_f16_pow: tensor<1x16x1x1xf16>, @@ -98,10 +99,10 @@ TEST_F(MLIR_VPU_ClusteringStrategyNoThrow, SWLayer_ClusteringStrategy) { bool enablePrefetchTiling = true; // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); auto siblingsOpsAnalysis = vpux::VPU::SiblingOpsAnalysis(func); - const auto arch = VPU::getArch(module.get()); + const auto arch = config::getArch(module.get()); auto layerCostModel = VPU::CostModelConfig::createLayerCostModel(arch); vpux::VPU::StrategyManager strategyManager(func, tileOp.getCount(), enablePrefetchTiling, VPU::MCOptimizationScope::SUBGRAPH, siblingsOpsAnalysis, layerCostModel, diff --git 
a/tests/unit/vpux_compiler/dialect/VPU/nce_workload_cost_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/nce_workload_cost_tests.cpp index 4541e37c62..c5aed163b1 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/nce_workload_cost_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/nce_workload_cost_tests.cpp @@ -4,18 +4,16 @@ // #include "vpux/compiler/core/tiling.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" #include "vpux/compiler/dialect/VPU/utils/nce_invariant.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/attributes.hpp" #include "vpux/compiler/dialect/VPUIP/interfaces/dpu_tiler.hpp" #include "vpux/compiler/dialect/VPUIP/transforms/factories/split_cost_getter.hpp" - #include "vpux/utils/logger/logger.hpp" #include #include - #include #include @@ -48,7 +46,7 @@ vpux::VPUIP::WorkloadCostParams buildWorkloadCost(const NceOpTensorShape& tensor costParams.kernelSize = {1, 1}; costParams.kernelStride = {1, 1}; costParams.nceTaskType = vpux::VPUIP::NCETaskType::CONV; - costParams.arch = vpux::VPU::ArchKind::NPU37XX; + costParams.arch = vpux::config::ArchKind::NPU37XX; costParams.numDPU = numDPU; return costParams; } @@ -60,7 +58,7 @@ TEST(MLIR_VPU_WorkloadCost, VPUNNCostInterface) { llvm::SmallVector mpeModeList{vpux::VPU::MPEMode::VECTOR_FP16, vpux::VPU::MPEMode::VECTOR, vpux::VPU::MPEMode::MATRIX}; - const auto costModel = vpux::VPU::CostModelConfig::createCostModel(vpux::VPU::ArchKind::NPU37XX); + const auto costModel = vpux::VPU::CostModelConfig::createCostModel(vpux::config::ArchKind::NPU37XX); llvm::SmallVector testTensorLists; for (int64_t h = initDimensionValue; h < maxDimensionValue; h *= testStep) { diff --git a/tests/unit/vpux_compiler/dialect/VPU/op_tiling_cache_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/op_tiling_cache_tests.cpp index 3324505f05..562c0f0ecd 100644 --- 
a/tests/unit/vpux_compiler/dialect/VPU/op_tiling_cache_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/op_tiling_cache_tests.cpp @@ -28,7 +28,7 @@ using MLIR_OpTilingCacheTest = vpux::VPU::arch37xx::UnitTest; llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> tensor<1x16x8x8xf16, {order = #NHWC}> { @@ -54,16 +54,16 @@ llvm::StringLiteral inputIR = R"( TEST_F(MLIR_OpTilingCacheTest, OutputTilingTest) { auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(VPU::ArchKind::NPU40XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); auto module = mlir::parseSourceString(inputIR, &ctx); ASSERT_TRUE(module.get() != nullptr); - module.get()->removeAttr("VPU.arch"); + module.get()->removeAttr("config.arch"); mlir::PassManager pm(module.get()->getName(), mlir::OpPassManager::Nesting::Implicit); - auto initCompilerOptions = VPU::InitCompilerOptions(VPU::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); + auto initCompilerOptions = VPU::InitCompilerOptions(config::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); VPU::buildInitCompilerPipeline(pm, initCompilerOptions, vpux::Logger::global()); ASSERT_TRUE(mlir::succeeded(pm.run(module.get()))); @@ -92,18 +92,18 @@ TEST_F(MLIR_OpTilingCacheTest, OutputTilingTest) { TEST_F(MLIR_OpTilingCacheTest, OpDPUCostTest) { auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(VPU::ArchKind::NPU40XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); // set cost model factory - 
VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); mlir::MLIRContext ctx(registry); auto module = mlir::parseSourceString(inputIR, &ctx); ASSERT_TRUE(module.get() != nullptr); - module.get()->removeAttr("VPU.arch"); + module.get()->removeAttr("config.arch"); mlir::PassManager pm(module.get()->getName(), mlir::OpPassManager::Nesting::Implicit); - auto initCompilerOptions = VPU::InitCompilerOptions(VPU::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); + auto initCompilerOptions = VPU::InitCompilerOptions(config::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); VPU::buildInitCompilerPipeline(pm, initCompilerOptions, vpux::Logger::global()); ASSERT_TRUE(mlir::succeeded(pm.run(module.get()))); @@ -133,10 +133,11 @@ TEST_F(MLIR_OpTilingCacheTest, OpDPUCostTest) { mlir::dyn_cast(nceOps[0].getOperation()).getMultiClusterStrategy().value(); const auto costParams = VPU::getWorkloadCostParam(mlir::dyn_cast(nceOps[0].getOperation()), - VPU::ArchKind::NPU40XX, numDPUs); - const auto vpunnStrategy = VPU::getVPULayerStrategy(strategy, numDPUs, numTiles, VPU::ArchKind::NPU40XX, 1, true); + config::ArchKind::NPU40XX, numDPUs); + const auto vpunnStrategy = + VPU::getVPULayerStrategy(strategy, numDPUs, numTiles, config::ArchKind::NPU40XX, 1, true); - auto layerCostModel = VPU::CostModelConfig::createLayerCostModel(VPU::ArchKind::NPU40XX); + auto layerCostModel = VPU::CostModelConfig::createLayerCostModel(config::ArchKind::NPU40XX); auto dpuCost1 = getDPUCostForNCEOp(nceOps[0], strategy, outputTiling, costParams, vpunnStrategy, layerCostModel, Logger::global()); diff --git a/tests/unit/vpux_compiler/dialect/VPU/operation_strategies_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/operation_strategies_tests.cpp index 464b4b23d6..7d0a4efa7e 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/operation_strategies_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/operation_strategies_tests.cpp @@ -17,7 
+17,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; using MLIR_VPU_OpStrategies = vpux::VPU::arch37xx::UnitTest; diff --git a/tests/unit/vpux_compiler/dialect/VPU/runtime_sparsity_stats_provider_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/runtime_sparsity_stats_provider_tests.cpp index 25ed93a15a..9ea2d9bb08 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/runtime_sparsity_stats_provider_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/runtime_sparsity_stats_provider_tests.cpp @@ -17,7 +17,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; using MLIR_VPU_RT_SPARSITY_STATS_PROVIDER = vpux::VPU::arch37xx::UnitTest; diff --git a/tests/unit/vpux_compiler/dialect/VPU/scf_tiling_interface_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/scf_tiling_interface_tests.cpp index 4b3c3202d4..08a657823d 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/scf_tiling_interface_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/scf_tiling_interface_tests.cpp @@ -72,17 +72,17 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesEltwise) { VPU::SCFTileInfo outputTile({1, 16, 256, 70}, builder); auto scfTilingInput = nceEltwiseOpModel.backInferSCFTileInfo(eltwise.getOperation(), builder, outputTile); - EXPECT_EQ(scfTilingInput.size(), 2); - auto inputShape1 = mlir::getConstantIntValues(scfTilingInput.front().shape); - auto inputShape2 = mlir::getConstantIntValues(scfTilingInput.back().shape); + EXPECT_EQ(scfTilingInput.tiles.size(), 2); + auto inputShape1 = mlir::getConstantIntValues(scfTilingInput.tiles.front().shape); + auto inputShape2 = mlir::getConstantIntValues(scfTilingInput.tiles.back().shape); EXPECT_TRUE(inputShape1.has_value() && inputShape2.has_value()); EXPECT_TRUE(llvm::equal(inputShape1.value(), inputShape2.value())); SmallVector expectedShape = {1, 16, 256, 70}; EXPECT_TRUE(llvm::equal(inputShape1.value(), expectedShape)); - auto inputOffset1 = 
mlir::getConstantIntValues(scfTilingInput.front().offsets); - auto inputOffset2 = mlir::getConstantIntValues(scfTilingInput.back().offsets); + auto inputOffset1 = mlir::getConstantIntValues(scfTilingInput.tiles.front().offsets); + auto inputOffset2 = mlir::getConstantIntValues(scfTilingInput.tiles.back().offsets); EXPECT_TRUE(inputOffset1.has_value() && inputOffset2.has_value()); EXPECT_TRUE(llvm::equal(inputOffset1.value(), inputOffset2.value())); @@ -142,9 +142,9 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesConv) { mlir::getAsIndexOpFoldResult(&ctx, axes)); auto scfTilingInput = nceConvOpModel.backInferSCFTileInfo(conv.getOperation(), builder, outputTile); - EXPECT_EQ(scfTilingInput.size(), 1); - auto inputShape = mlir::getConstantIntValues(scfTilingInput.front().shape); - auto inputOffset = mlir::getConstantIntValues(scfTilingInput.front().offsets); + EXPECT_EQ(scfTilingInput.tiles.size(), 1); + auto inputShape = mlir::getConstantIntValues(scfTilingInput.tiles.front().shape); + auto inputOffset = mlir::getConstantIntValues(scfTilingInput.tiles.front().offsets); EXPECT_TRUE(inputShape.has_value() && inputOffset.has_value()); SmallVector expectedInputOffset = {0, 0, 31, 0}; @@ -209,9 +209,9 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesCTileConv) { mlir::getAsIndexOpFoldResult(&ctx, axes)); auto scfTilingInput = nceConvOpModel.backInferSCFTileInfo(conv.getOperation(), builder, outputTile); - EXPECT_EQ(scfTilingInput.size(), 3); - auto inputShape = mlir::getConstantIntValues(scfTilingInput.front().shape); - auto inputOffset = mlir::getConstantIntValues(scfTilingInput.front().offsets); + EXPECT_EQ(scfTilingInput.tiles.size(), 3); + auto inputShape = mlir::getConstantIntValues(scfTilingInput.tiles.front().shape); + auto inputOffset = mlir::getConstantIntValues(scfTilingInput.tiles.front().offsets); EXPECT_TRUE(inputShape.has_value() && inputOffset.has_value()); SmallVector expectedInputOffset = {0, 0, 0, 0}; @@ -219,8 +219,8 @@ TEST_F(MLIR_SCFTilingTest, 
ComputeInputTilesCTileConv) { EXPECT_TRUE(llvm::equal(inputShape.value(), expectedInputShape)); EXPECT_TRUE(llvm::equal(inputOffset.value(), expectedInputOffset)); - auto filterShape = mlir::getConstantIntValues(scfTilingInput[1].shape); - auto filterOffset = mlir::getConstantIntValues(scfTilingInput[1].offsets); + auto filterShape = mlir::getConstantIntValues(scfTilingInput.tiles[1].shape); + auto filterOffset = mlir::getConstantIntValues(scfTilingInput.tiles[1].offsets); EXPECT_TRUE(filterShape.has_value() && filterOffset.has_value()); SmallVector expectedFilterOffset = {256, 0, 0, 0}; @@ -228,8 +228,8 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesCTileConv) { EXPECT_TRUE(llvm::equal(filterShape.value(), expectedFilterShape)); EXPECT_TRUE(llvm::equal(filterOffset.value(), expectedFilterOffset)); - auto wtShape = mlir::getConstantIntValues(scfTilingInput.back().shape); - auto wtOffset = mlir::getConstantIntValues(scfTilingInput.back().offsets); + auto wtShape = mlir::getConstantIntValues(scfTilingInput.tiles.back().shape); + auto wtOffset = mlir::getConstantIntValues(scfTilingInput.tiles.back().offsets); EXPECT_TRUE(wtShape.has_value() && wtOffset.has_value()); SmallVector expectedWtOffset = {256, 0, 0, 0}; @@ -289,9 +289,9 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesPooling) { mlir::getAsIndexOpFoldResult(&ctx, axes)); auto scfTilingInput = ncePoolOpModel.backInferSCFTileInfo(pooling.getOperation(), builder, outputTile); - EXPECT_EQ(scfTilingInput.size(), 1); - auto inputShape = mlir::getConstantIntValues(scfTilingInput.front().shape); - auto inputOffset = mlir::getConstantIntValues(scfTilingInput.front().offsets); + EXPECT_EQ(scfTilingInput.tiles.size(), 1); + auto inputShape = mlir::getConstantIntValues(scfTilingInput.tiles.front().shape); + auto inputOffset = mlir::getConstantIntValues(scfTilingInput.tiles.front().offsets); EXPECT_TRUE(inputShape.has_value() && inputOffset.has_value()); SmallVector expectedInputOffset = {0, 0, 99, 0}; @@ -364,9 +364,9 @@ 
TEST_F(MLIR_SCFTilingTest, ComputeInputTilesDWConv) { mlir::getAsIndexOpFoldResult(&ctx, axes)); auto scfTilingInput = nceDwConvOpModel.backInferSCFTileInfo(dwConv.getOperation(), builder, outputTile); - EXPECT_EQ(scfTilingInput.size(), 3); - auto inputShape = mlir::getConstantIntValues(scfTilingInput.front().shape); - auto inputOffset = mlir::getConstantIntValues(scfTilingInput.front().offsets); + EXPECT_EQ(scfTilingInput.tiles.size(), 3); + auto inputShape = mlir::getConstantIntValues(scfTilingInput.tiles.front().shape); + auto inputOffset = mlir::getConstantIntValues(scfTilingInput.tiles.front().offsets); EXPECT_TRUE(inputShape.has_value() && inputOffset.has_value()); SmallVector expectedInputOffset = {0, 16, 0, 0}; @@ -374,8 +374,8 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesDWConv) { EXPECT_TRUE(llvm::equal(inputShape.value(), expectedInputShape)); EXPECT_TRUE(llvm::equal(inputOffset.value(), expectedInputOffset)); - auto filterShape = mlir::getConstantIntValues(scfTilingInput[1].shape); - auto filterOffset = mlir::getConstantIntValues(scfTilingInput[1].offsets); + auto filterShape = mlir::getConstantIntValues(scfTilingInput.tiles[1].shape); + auto filterOffset = mlir::getConstantIntValues(scfTilingInput.tiles[1].offsets); EXPECT_TRUE(filterShape.has_value() && filterOffset.has_value()); SmallVector expectedFilterOffset = {16, 0, 0, 0}; @@ -383,8 +383,8 @@ TEST_F(MLIR_SCFTilingTest, ComputeInputTilesDWConv) { EXPECT_TRUE(llvm::equal(filterShape.value(), expectedFilterShape)); EXPECT_TRUE(llvm::equal(filterOffset.value(), expectedFilterOffset)); - auto wtShape = mlir::getConstantIntValues(scfTilingInput.back().shape); - auto wtOffset = mlir::getConstantIntValues(scfTilingInput.back().offsets); + auto wtShape = mlir::getConstantIntValues(scfTilingInput.tiles.back().shape); + auto wtOffset = mlir::getConstantIntValues(scfTilingInput.tiles.back().offsets); EXPECT_TRUE(wtShape.has_value() && wtOffset.has_value()); SmallVector expectedWtOffset = {16, 0, 0, 0}; 
diff --git a/tests/unit/vpux_compiler/dialect/VPU/se_attr_interface_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/se_attr_interface_tests.cpp index 84c0932f54..ec6d4d6e37 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/se_attr_interface_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/se_attr_interface_tests.cpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // -// - -#include #include #include +#include "vpux/compiler/dialect/IE/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/VPU/IR/dialect.hpp" #include "vpux/compiler/init.hpp" @@ -18,6 +16,8 @@ #include +#include + using namespace vpux; struct SeSizes { diff --git a/tests/unit/vpux_compiler/dialect/VPU/se_table_patch_test.cpp b/tests/unit/vpux_compiler/dialect/VPU/se_table_patch_test.cpp new file mode 100644 index 0000000000..3f06615f53 --- /dev/null +++ b/tests/unit/vpux_compiler/dialect/VPU/se_table_patch_test.cpp @@ -0,0 +1,210 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +// + +#include +#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" +#include "vpux/compiler/dialect/VPU/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" +#include "vpux/compiler/dialect/VPUIP/IR/types.hpp" +#include "vpux/compiler/dialect/VPUIP/transforms/passes/unroll_distributed_ops.hpp" +#include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/const/ops.hpp" +#include "vpux/compiler/init.hpp" +#include "vpux/compiler/utils/attributes.hpp" +#include "vpux/compiler/utils/types.hpp" + +#include +#include + +#include + +using namespace vpux; + +struct SETablePatchParams { + std::vector inputSETableValues; + std::vector seTableShape; + std::vector numTiles; + int64_t numClusters; + // NCE Input distribution parameters + std::vector dataShape; + std::vector> nceInputComputeShapes; + std::vector> nceInputComputeOffsets; + std::vector> nceInputMemoryShapes; + std::vector> nceInputMemoryOffsets; + // SE Table subview parameters + std::vector> seTableMemoryShapes; + std::vector> seTableMemoryOffsets; + std::vector> expectedPatchedValues; +}; + +class SETablePatchTests : public testing::TestWithParam {}; + +TEST_P(SETablePatchTests, patchSETableValue) { + auto registry = vpux::createDialectRegistry(); + mlir::MLIRContext ctx(registry); + ctx.loadDialect(); + ctx.loadDialect(); + ctx.loadDialect(); + + const auto params = GetParam(); + + mlir::OpBuilder builder(&ctx); + auto loc = mlir::UnknownLoc::get(&ctx); + + // Create SE table constant with NHWC layout + const auto seTableTensorType = mlir::RankedTensorType::get(params.seTableShape, builder.getIntegerType(32)); + const auto seTableContent = mlir::DenseElementsAttr::get(seTableTensorType, ArrayRef(params.inputSETableValues)); + const auto seTableConstant = Const::ContentAttr::get(seTableContent); + auto ddrMemSpaceAttr = vpux::IndexedSymbolAttr::get(&ctx, stringifyEnum(VPU::MemoryKind::DDR), 0); + + // Create SE table 
type with NHWC layout + const auto seTableType = + mlir::MemRefType::get(params.seTableShape, builder.getIntegerType(32), + mlir::AffineMapAttr::get(DimsOrder::NHWC.toAffineMap(&ctx)), ddrMemSpaceAttr); + + // Create a constant operation + auto constOp = builder.create(loc, seTableType, seTableConstant); + + // Create NCE Input distribution attribute + auto distributionModeAttr = VPU::DistributionModeAttr::get(&ctx, VPU::DistributionMode::OVERLAPPED); + auto numTilesAttr = getIntArrayAttr(&ctx, params.numTiles); + auto nceInputComputeShapesAttr = getIntArrayOfArray(&ctx, params.nceInputComputeShapes); + auto nceInputComputeOffsetsAttr = getIntArrayOfArray(&ctx, params.nceInputComputeOffsets); + auto nceInputMemoryShapesAttr = getIntArrayOfArray(&ctx, params.nceInputMemoryShapes); + auto nceInputMemoryOffsetsAttr = getIntArrayOfArray(&ctx, params.nceInputMemoryOffsets); + + auto nceInputDistributionAttr = VPU::DistributionInfoAttr::get( + &ctx, distributionModeAttr, numTilesAttr, /*kernelSize=*/nullptr, + /*pads=*/nullptr, /*kernelStrides=*/nullptr, + /*numClusters=*/getIntAttr(&ctx, params.numClusters), + /*alignment=*/nullptr, + /*uniformDistributedSegments=*/nullptr, nceInputComputeShapesAttr, nceInputComputeOffsetsAttr, + nceInputMemoryShapesAttr, nceInputMemoryOffsetsAttr, + /*equalMemoryAndComputeView=*/nullptr); + + // Create distributed buffer type for NCE Input + auto memSpaceAttr = vpux::IndexedSymbolAttr::get(&ctx, stringifyEnum(VPU::MemoryKind::CMX_NN), 0); + auto distributedType = VPUIP::DistributedBufferType::get( + &ctx, params.dataShape, builder.getF16Type(), mlir::AffineMapAttr::get(DimsOrder::NHWC.toAffineMap(&ctx)), + memSpaceAttr, nceInputDistributionAttr); + + // Loop through all clusters to test patchSETableValue + for (int64_t clusterId = 0; clusterId < params.numClusters; ++clusterId) { + auto targetMemoryShape = params.seTableMemoryShapes[clusterId]; + auto targetMemoryOffset = params.seTableMemoryOffsets[clusterId]; + auto subviewOp = 
builder.createOrFold(loc, constOp, targetMemoryOffset, targetMemoryShape); + + auto subviewConstOp = subviewOp.getDefiningOp(); + + // Call patchSETableValue function with the subview result + auto patchedValue = VPUIP::patchSETableValue(loc, subviewConstOp, distributedType, clusterId, builder); + + // Extract the patched SE table values + auto patchedConstOp = mlir::cast(patchedValue.getDefiningOp()); + auto patchedContent = patchedConstOp.getContent(); + auto patchedValues = to_small_vector(patchedContent.getValues()); + + // Verify the patched values match the expected values for this cluster + EXPECT_EQ(patchedValues.size(), params.expectedPatchedValues[clusterId].size()) + << "Cluster " << clusterId << " has incorrect number of patched values"; + for (size_t i = 0; i < patchedValues.size(); ++i) { + EXPECT_EQ(patchedValues[i], params.expectedPatchedValues[clusterId][i]) + << "Cluster " << clusterId << " has incorrect patched value at index " << i; + } + } +} + +// clang-format off + +// Test parameters for patchSETableValue +std::vector seTablePatchParams = { + // Test case 1: H-tiling case + { + /*inputSETableValues=*/{ + 0x0000, 0x0000, 0x0400, 0x0800, 0x0C00, 0x0C00, + 0x0000, 0x0000, 0x0400, 0x0800, 0x0C00, 0x0C00, + 0x1000, 0x1000, 0x1400, 0x1800, 0x1C00, 0x1C00, + 0x2000, 0x2000, 0x2400, 0x2800, 0x2C00, 0x2C00, + 0x2001, 0x2001, 0x2401, 0x2801, 0x2C01, 0x2C01, + 0x2001, 0x2001, 0x2401, 0x2801, 0x2C01, 0x2C01 + }, + /*seTableShape=*/{1, 1, 6, 6}, + /*numTiles=*/{1, 1, 3, 1}, + /*numClusters=*/3, + /*dataShape=*/{1, 16, 4, 4}, + /*nceInputComputeShapes=*/{{1, 16, 1, 4}, {1, 16, 2, 4}, {1, 16, 1, 4}}, + /*nceInputComputeOffsets=*/{{0, 0, 0, 0}, {0, 0, 1, 0}, {0, 0, 3, 0}}, + /*nceInputMemoryShapes=*/{{1, 16, 3, 4}, {1, 16, 3, 4}, {1, 16, 2, 4}}, + /*nceInputMemoryOffsets=*/{{0, 0, 0, 0}, {0, 0, 1, 0}, {0, 0, 2, 0}}, + /*seTableMemoryShapes=*/{{1, 1, 4, 6}, {1, 1, 3, 6}, {1, 1, 3, 6}}, + /*seTableMemoryOffsets=*/{{0, 0, 0, 0}, {0, 0, 2, 0}, {0, 0, 3, 0}}, + 
/*expectedPatchedValues=*/{ + // Expected patched values for cluster 0 + {0x0000, 0x0000, 0x0400, 0x0800, 0x0C00, 0x0C00, + 0x0000, 0x0000, 0x0400, 0x0800, 0x0C00, 0x0C00, + 0x1000, 0x1000, 0x1400, 0x1800, 0x1C00, 0x1C00, + 0x2000, 0x2000, 0x2400, 0x2800, 0x2C00, 0x2C00}, + // Expected patched values for cluster 1 + {0x0001, 0x0001, 0x0401, 0x0801, 0x0C01, 0x0C01, + 0x1001, 0x1001, 0x1401, 0x1801, 0x1C01, 0x1C01, + 0x2001, 0x2001, 0x2401, 0x2801, 0x2C01, 0x2C01}, + // Expected patched values for cluster 2 + {0x0002, 0x0002, 0x0402, 0x0802, 0x0C02, 0x0C02, + 0x1002, 0x1002, 0x1402, 0x1802, 0x1C02, 0x1C02, + 0x1002, 0x1002, 0x1402, 0x1802, 0x1C02, 0x1C02} + } + }, + + // Test case 2: W-tiling case + { + /*inputSETableValues=*/{ + 0x0000, 0x0000, 0x0400, 0x0800, 0x0801, 0x0801, + 0x0000, 0x0000, 0x0400, 0x0800, 0x0801, 0x0801, + 0x1000, 0x1000, 0x1400, 0x1800, 0x1801, 0x1801, + 0x2000, 0x2000, 0x2400, 0x2800, 0x2801, 0x2801, + 0x3000, 0x3000, 0x3400, 0x3800, 0x3801, 0x3801, + 0x3000, 0x3000, 0x3400, 0x3800, 0x3801, 0x3801 + }, + /*seTableShape=*/{1, 1, 6, 6}, + /*numTiles=*/{1, 1, 1, 3}, + /*numClusters=*/3, + /*dataShape=*/{1, 16, 4, 4}, + /*nceInputComputeShapes=*/{{1, 16, 4, 1}, {1, 16, 4, 2}, {1, 16, 4, 1}}, + /*nceInputComputeOffsets=*/{{0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 3}}, + /*nceInputMemoryShapes=*/{{1, 16, 4, 3}, {1, 16, 4, 3}, {1, 16, 4, 2}}, + /*nceInputMemoryOffsets=*/{{0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 2}}, + /*seTableMemoryShapes=*/{{1, 1, 6, 4}, {1, 1, 6, 3}, {1, 1, 6, 3}}, + /*seTableMemoryOffsets=*/{{0, 0, 0, 0}, {0, 0, 0, 2}, {0, 0, 0, 3}}, + /*expectedPatchedValues=*/{ + // Expected patched values for cluster 0 + {0x0000, 0x0000, 0x0400, 0x0800, + 0x0000, 0x0000, 0x0400, 0x0800, + 0x1000, 0x1000, 0x1400, 0x1800, + 0x2000, 0x2000, 0x2400, 0x2800, + 0x3000, 0x3000, 0x3400, 0x3800, + 0x3000, 0x3000, 0x3400, 0x3800}, + // Expected patched values for cluster 1 + {0x0001, 0x0401, 0x0801, + 0x0001, 0x0401, 0x0801, + 0x1001, 0x1401, 0x1801, + 0x2001, 
0x2401, 0x2801, + 0x3001, 0x3401, 0x3801, + 0x3001, 0x3401, 0x3801}, + // Expected patched values for cluster 2 + {0x0002, 0x0402, 0x0402, + 0x0002, 0x0402, 0x0402, + 0x1002, 0x1402, 0x1402, + 0x2002, 0x2402, 0x2402, + 0x3002, 0x3402, 0x3402, + 0x3002, 0x3402, 0x3402}, + } + } +}; + +// clang-format on + +INSTANTIATE_TEST_SUITE_P(unit, SETablePatchTests, testing::ValuesIn(seTablePatchParams)); diff --git a/tests/unit/vpux_compiler/dialect/VPU/sparse_op_interface_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/sparse_op_interface_tests.cpp index 3dd722eae0..274c3a732f 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/sparse_op_interface_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/sparse_op_interface_tests.cpp @@ -19,7 +19,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; void testSparsitySupport(llvm::StringLiteral inputIR, ArchKind arch, bool supportInputSparsity, bool supportOutputSparsity, bool supportWeightSparsity) { diff --git a/tests/unit/vpux_compiler/dialect/VPU/state_provider_interface_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/state_provider_interface_tests.cpp index 9e338177f1..48d9745600 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/state_provider_interface_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/state_provider_interface_tests.cpp @@ -26,7 +26,7 @@ #include "llvm/Bitcode/BitcodeReader.h" -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; std::mt19937 gen(1); @@ -309,7 +309,7 @@ TEST_F(StateProviderInterfaceTests, StateProviderNCEPermute_tests) { #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> #loc0 = loc(unknown) - module @main attributes {VPU.arch = #VPU.arch_kind} { + module @main attributes {config.arch = #config.arch_kind} { func.func @main(%arg0: tensor<1x3x224x224xf16>) -> tensor<1x32x112x112x!qElemType0, {order = #NHWC}> { %cst = const.Declare tensor<32x1x1x32x!qElemType1, {order = #NHWC}> = dense<1.0> : tensor<32x1x1x32xf16>, [#const.CastElemType, 
#const.CastElemType, #const.Reorder<#NHWC>] %cst_0 = const.Declare tensor<32x1x1x4xsi32> = dense<0> : tensor<32x1x1x4xsi32> @@ -337,7 +337,7 @@ TEST_F(StateProviderInterfaceTests, StateProviderNCEPermute_tests) { auto func = module.get().lookupSymbol("main"); ASSERT_TRUE(func != nullptr); - module.get()->removeAttr("VPU.arch"); + module.get()->removeAttr("config.arch"); mlir::PassManager pm(module.get()->getName(), mlir::OpPassManager::Nesting::Implicit); auto initCompilerOptions = VPU::InitCompilerOptions(ArchKind::NPU37XX, config::CompilationMode::DefaultHW); diff --git a/tests/unit/vpux_compiler/dialect/VPU/tiling_utils_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/tiling_utils_tests.cpp index 76a2548cb6..bb5dc685c1 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/tiling_utils_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/tiling_utils_tests.cpp @@ -4,7 +4,9 @@ // #include "vpux/compiler/core/tiling.hpp" +#include "vpux/compiler/dialect/VPU/IR/ops.hpp" #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" +#include "vpux/compiler/dialect/VPU/utils/sibling_ops_analysis.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" #include @@ -12,7 +14,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; using MLIR_VPU_doesTopKLayerFitIntoCMX = MLIR_UnitBase; @@ -227,7 +229,7 @@ TEST_F(MLIR_VPU_isMultiClusterCompatibleForTiling, isSplitOverHeightCompatibleFo })"; auto registry = vpux::createDialectRegistry(); - const auto arch = VPU::ArchKind::NPU40XX; + const auto arch = config::ArchKind::NPU40XX; auto interfacesRegistry = vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); @@ -294,7 +296,7 @@ TEST_F(MLIR_VPU_isMultiClusterCompatibleForTiling, isSplitOverKernelCompatibleFo })"; auto registry = vpux::createDialectRegistry(); - const auto arch = VPU::ArchKind::NPU40XX; + const auto arch = config::ArchKind::NPU40XX; auto interfacesRegistry = 
vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); diff --git a/tests/unit/vpux_compiler/dialect/VPU/vf_config.cpp b/tests/unit/vpux_compiler/dialect/VPU/vf_config.cpp index adc8a53fb3..b33031129b 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/vf_config.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/vf_config.cpp @@ -17,7 +17,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; using MLIR_VPU_VFConfig = vpux::VPU::arch37xx::UnitTest; diff --git a/tests/unit/vpux_compiler/dialect/VPU/vf_container.cpp b/tests/unit/vpux_compiler/dialect/VPU/vf_container.cpp index 0bbf39f039..aa4d807005 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/vf_container.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/vf_container.cpp @@ -19,7 +19,7 @@ #include -using vpux::VPU::ArchKind; +using vpux::config::ArchKind; using namespace vpux; using MLIR_VPU_VFPipelineContainer = vpux::VPU::arch40xx::UnitTest; @@ -67,7 +67,7 @@ TEST_F(MLIR_VPU_VFPipelineContainer, VF_ContainerCost) { auto container = VPU::VFPipelineContainer(); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); auto layerCost = std::make_unique(func); auto operationStorage = std::make_unique(); diff --git a/tests/unit/vpux_compiler/dialect/VPU/vpunn_cost_model_analysis_test.cpp b/tests/unit/vpux_compiler/dialect/VPU/vpunn_cost_model_analysis_test.cpp index aa3b9914e4..1f3efcf139 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/vpunn_cost_model_analysis_test.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/vpunn_cost_model_analysis_test.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/cost_model.hpp" #include "vpux/compiler/dialect/VPU/utils/cost_model/factories/cost_model_config.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" #include 
"vpux/compiler/interfaces_registry.hpp" #include "vpux/compiler/utils/passes.hpp" @@ -59,13 +60,41 @@ class CheckNoCachePass : public mlir::PassWrapper { +public: + ::llvm::StringRef getName() const override { + return "CheckSharedCostModelPass"; + } + void safeRunOnFunc() final { + auto func = getOperation(); + auto module = func->getParentOfType(); + const auto arch = config::getArch(module); + + const auto maybeCostModelAnalysis = getCachedParentAnalysis(module); + auto costModel = VPU::CostModelAnalysis::getOrCreateCostModel(maybeCostModelAnalysis, arch, _log); + + const auto maybeLayerCostModelAnalysis = getCachedParentAnalysis(module); + auto layerCostModel = + VPU::LayerCostModelAnalysis::getOrCreateLayerCostModel(maybeLayerCostModelAnalysis, arch, _log) + ->get_cost_model_shared(); + + VPUX_THROW_UNLESS(costModel != nullptr, "CostModelAnalysis must have a valid VPUCostModel instance"); + VPUX_THROW_UNLESS(layerCostModel != nullptr, "LayerCostModelAnalysis must have a valid VPUCostModel instance"); + VPUX_THROW_UNLESS(costModel == layerCostModel, + "CostModelAnalysis and LayerCostModelAnalysis must share the same VPUCostModel instance"); + } +}; + } // namespace CostModelAnalysisTests using MLIR_CostModelAnalysisTest = MLIR_UnitBase; const static llvm::StringLiteral inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { func.func @main(%arg0: tensor<1x128x32x32xf16, {order = #NHWC}>) -> tensor<1x64x32x32xf16, {order = #NHWC}> { %cst = const.Declare tensor<64x1x1x4xsi32> = dense<10> : tensor<64x1x1x4xsi32> %cst_0 = const.Declare tensor<64x128x1x1xf16, {order = #NHWC}> = dense<1.000000e+00> : tensor<64x128x1x1xf16>, [#const.Reorder<#NHWC>] @@ -80,10 +109,10 @@ const static llvm::StringLiteral inputIR = R"( TEST_F(MLIR_CostModelAnalysisTest, CostModelAnalysisBehavior) { auto registry = vpux::createDialectRegistry(); - const auto 
arch = VPU::ArchKind::NPU40XX; + const auto arch = config::ArchKind::NPU40XX; auto interfacesRegistry = vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); mlir::MLIRContext ctx(registry); auto module = mlir::parseSourceString(inputIR, &ctx); diff --git a/tests/unit/vpux_compiler/dialect/VPU/vpunn_pre_split_cost_tests.cpp b/tests/unit/vpux_compiler/dialect/VPU/vpunn_pre_split_cost_tests.cpp index aae12d5b54..2724b4fbc7 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/vpunn_pre_split_cost_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/vpunn_pre_split_cost_tests.cpp @@ -58,11 +58,11 @@ const static llvm::StringLiteral inputIRClustering = R"( TEST_F(MLIR_PreSplitCostTest, SamePreSplitCostForSOK) { auto registry = vpux::createDialectRegistry(); - const auto arch = VPU::ArchKind::NPU40XX; + const auto arch = config::ArchKind::NPU40XX; auto interfacesRegistry = vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); mlir::MLIRContext ctx(registry); auto module = mlir::parseSourceString(inputIRSOK, &ctx); @@ -113,11 +113,11 @@ TEST_F(MLIR_PreSplitCostTest, SamePreSplitCostForSOK) { TEST_F(MLIR_PreSplitCostTest, SamePreSplitCostForClustering) { auto registry = vpux::createDialectRegistry(); - const auto arch = VPU::ArchKind::NPU40XX; + const auto arch = config::ArchKind::NPU40XX; auto interfacesRegistry = vpux::createInterfacesRegistry(arch); interfacesRegistry->registerInterfaces(registry); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); mlir::MLIRContext ctx(registry); auto module = mlir::parseSourceString(inputIRClustering, &ctx); 
diff --git a/tests/unit/vpux_compiler/dialect/VPU/weights_separation_utils.cpp b/tests/unit/vpux_compiler/dialect/VPU/weights_separation_utils.cpp index 759a99deba..4e18a3da83 100644 --- a/tests/unit/vpux_compiler/dialect/VPU/weights_separation_utils.cpp +++ b/tests/unit/vpux_compiler/dialect/VPU/weights_separation_utils.cpp @@ -5,6 +5,7 @@ #include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" #include "vpux/compiler/dialect/VPU/utils/weights_separation.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/dialect/core/dialect.hpp" @@ -67,33 +68,33 @@ constexpr llvm::StringLiteral INPUT_IR = R"( {-# dialect_resources: { builtin: { - ov1: "0x10000000ABABABABCDCDCDCD", - ov2: "0x10000000ABABABABCDCDCDCD", - ov3: "0x10000000ABABABABCDCDCDCD", - ov4: "0x10000000ABABABABCDCDCDCD" + vpux_ow_1: "0x10000000ABABABABCDCDCDCD", + vpux_ow_2: "0x10000000ABABABABCDCDCDCD", + vpux_ow_3: "0x10000000ABABABABCDCDCDCD", + vpux_ow_4: "0x10000000ABABABABCDCDCDCD" } } #-} module @main { func.func @main(%arg0: tensor<2x2xf16>) -> tensor<2x2xf16> { - %ov1_0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %ov1_0 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>] - %ov1_1 = const.Declare tensor<102x2xf16> = dense_resource : tensor<2x2xf16>, + %ov1_1 = const.Declare tensor<102x2xf16> = dense_resource : tensor<2x2xf16>, [#const.PadWithZero<[0, 0], [100, 0]>] // ov1 = 2 * 2 * f16 + 102 * 2 * f16 - %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<5.0>] // ov2 = 2 * 2 * f16 - %ov3_1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %ov3_1 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<5.0>] - %ov3_2 = const.Declare tensor<52x2xf16> = dense_resource : 
tensor<2x2xf16>, + %ov3_2 = const.Declare tensor<52x2xf16> = dense_resource : tensor<2x2xf16>, [#const.PadWithZero<[0, 0], [50, 0]>] // ov3 = 2 * 2 * f16 + 52 * 2 * f16 - %ov4 = const.Declare tensor<2x2xf64> = dense_resource : tensor<2x2xf16>, + %ov4 = const.Declare tensor<2x2xf64> = dense_resource : tensor<2x2xf16>, [#const.CastElemType] // ov4 = 2 * 2 * f64 @@ -164,10 +165,10 @@ TEST_F(MLIR_VPU_WeightsSeparationUtils_SplitInitAlgo, PartialSlicing) { const auto resourceName = getResourceName(actual.front().getContentAttr().getBaseContent()).str(); std::vector expected; // Note: ov2 and ov4 are assumed to be stored together - if (resourceName == "ov2" || resourceName == "ov4") { + if (resourceName == "vpux_ow_2" || resourceName == "vpux_ow_4") { expected = extractSpecificSplits(module.get(), [&](const VPU::TransformationsSplit& x) { const auto xName = getResourceName(x.getContentAttr().getBaseContent()).str(); - return xName == "ov2" || xName == "ov4"; + return xName == "vpux_ow_2" || xName == "vpux_ow_4"; }); } else { expected = extractSpecificSplits(module.get(), [&](const VPU::TransformationsSplit& x) { @@ -183,25 +184,25 @@ constexpr llvm::StringLiteral INPUT_IR_SUBVIEWS = R"( {-# dialect_resources: { builtin: { - ov1: "0x10000000ABABABABCDCDCDCD", - ov2: "0x10000000ABABABABCDCDCDCD" + vpux_ow_1: "0x10000000ABABABABCDCDCDCD", + vpux_ow_2: "0x10000000ABABABABCDCDCDCD" } } #-} module @main { func.func @main(%arg0: tensor<2x2xf16>) -> tensor<2x2xf16> { - %ov1_0 = const.Declare tensor<1x2xf16> = dense_resource : tensor<2x2xf16>, + %ov1_0 = const.Declare tensor<1x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.SubView<[0, 0], [1, 2]>] - %ov1_1 = const.Declare tensor<1x2xf16> = dense_resource : tensor<2x2xf16>, + %ov1_1 = const.Declare tensor<1x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.SubView<[1, 0], [1, 2]>] - %ov1_2 = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, + %ov1_2 = const.Declare 
tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.SubView<[0, 0], [2, 1]>] - %ov1_3 = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, + %ov1_3 = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.SubView<[0, 1], [2, 1]>] // ov1 = 2 * 2 * f16 (subviews do not count) - %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<5.0>] // ov2 = 2 * 2 * f16 @@ -262,8 +263,8 @@ constexpr llvm::StringLiteral INPUT_IR_REORDERS = R"( {-# dialect_resources: { builtin: { - ov1: "0x10000000ABABABABCDCDCDCD", - ov2: "0x10000000ABABABABCDCDCDCD" + vpux_ow_1: "0x10000000ABABABABCDCDCDCD", + vpux_ow_2: "0x10000000ABABABABCDCDCDCD" } } #-} @@ -272,15 +273,15 @@ constexpr llvm::StringLiteral INPUT_IR_REORDERS = R"( module @main { func.func @main(%arg0: tensor<2x2xf16>) -> tensor<2x2xf16> { - %ov1_0 = const.Declare tensor<1x2xf16, {order = #CN}> = dense_resource : tensor<2x2xf16>, + %ov1_0 = const.Declare tensor<1x2xf16, {order = #CN}> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.Reorder<#CN>, #const.SubView<[0, 0], [1, 2]>] - %ov1_1 = const.Declare tensor<1x2xf16, {order = #CN}> = dense_resource : tensor<2x2xf16>, + %ov1_1 = const.Declare tensor<1x2xf16, {order = #CN}> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.Reorder<#CN>, #const.SubView<[1, 0], [1, 2]>] - %ov1_2 = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, + %ov1_2 = const.Declare tensor<2x1xf16> = dense_resource : tensor<2x2xf16>, [#const.Add<1.0>, #const.PadWithZero<[0, 0], [0, 1]>, #const.SubView<[0, 0], [2, 1]>] // ov1 with reorder = 2 * 2 * f16 + 2 * 3 * f16 (subviews do not count) - %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, + %ov2 = const.Declare tensor<2x2xf16> = dense_resource : tensor<2x2xf16>, [#const.Rescale<5.0>] // ov2 = 2 * 2 * f16 diff --git 
a/tests/unit/vpux_compiler/dialect/VPUIP/layer_info_interface_tests.cpp b/tests/unit/vpux_compiler/dialect/VPUIP/layer_info_interface_tests.cpp index 21a94c649c..1e5e5918ac 100644 --- a/tests/unit/vpux_compiler/dialect/VPUIP/layer_info_interface_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPUIP/layer_info_interface_tests.cpp @@ -65,7 +65,7 @@ TEST_F(MLIR_VPUIP_LayerInfo, AsyncLayerOpInterface) { mlir::PassManager pm(module.get()->getName(), mlir::OpPassManager::Nesting::Implicit); auto initCompilerOptions = - vpux::VPU::InitCompilerOptions(vpux::VPU::ArchKind::NPU37XX, vpux::config::CompilationMode::ReferenceSW); + vpux::VPU::InitCompilerOptions(vpux::config::ArchKind::NPU37XX, vpux::config::CompilationMode::ReferenceSW); vpux::VPU::buildInitCompilerPipeline(pm, initCompilerOptions, vpux::Logger::global()); diff --git a/tests/unit/vpux_compiler/dialect/VPUMI40XX/op_ranges.cpp b/tests/unit/vpux_compiler/dialect/VPUMI40XX/op_ranges.cpp index c9a8e41078..5ebb4589e2 100644 --- a/tests/unit/vpux_compiler/dialect/VPUMI40XX/op_ranges.cpp +++ b/tests/unit/vpux_compiler/dialect/VPUMI40XX/op_ranges.cpp @@ -123,16 +123,16 @@ class MLIR_TaskRangeTest : public MLIR_UnitBase { TEST_F(MLIR_TaskRangeTest, Empty) { constexpr std::string_view inputIR = R"( - module @EmptyOpRanges attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @EmptyOpRanges attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 
2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : tensor<1x2x3x4xf16> } outputsInfo : { @@ -162,16 +162,16 @@ TEST_F(MLIR_TaskRangeTest, Empty) { TEST_F(MLIR_TaskRangeTest, DMA) { constexpr std::string_view inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @MultiOpRanges attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @MultiOpRanges attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : tensor<1x2x3x4xf16> } outputsInfo : { @@ -210,16 +210,16 @@ TEST_F(MLIR_TaskRangeTest, DMA) { TEST_F(MLIR_TaskRangeTest, Shave) { constexpr std::string_view inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @MultiOpRanges attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @MultiOpRanges attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of 
@NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : tensor<1x2x3x4xf16> } outputsInfo : { @@ -287,16 +287,16 @@ TEST_F(MLIR_TaskRangeTest, Shave) { TEST_F(MLIR_TaskRangeTest, DPU) { constexpr std::string_view inputIR = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @MultiOpRanges attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @MultiOpRanges attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 2 of @DMA_NN - IE.MemoryResource 2306867200 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input_0" : 
tensor<1x2x3x4xf16> } outputsInfo : { diff --git a/tests/unit/vpux_compiler/dialect/VPURT/barrier_pages_split.cpp b/tests/unit/vpux_compiler/dialect/VPURT/barrier_pages_split.cpp index 3833b39955..b3465ff5ea 100644 --- a/tests/unit/vpux_compiler/dialect/VPURT/barrier_pages_split.cpp +++ b/tests/unit/vpux_compiler/dialect/VPURT/barrier_pages_split.cpp @@ -155,10 +155,10 @@ TEST_F(BarrierPagesSplitTests, CheckSplitForGraphSimple) { * | | * b3 | * | | - * ------ t5 | - * \ / - * b4 - * Page2 | + * ------ t5 | <- t5 does not update any barrier from next page but since + * / there is no such task in the graph, that works on Page1/Page2 + * b4 boundary, page split legalization, which requires at least one boundary task, + * Page2 | will create t5->b4 dependency * t6 * | * b5 @@ -174,7 +174,7 @@ std::tuple graphWithLongDepOnTaskUpdat {4}, // task 2 {2}, // task 3 {3}, // task 4 - {4}, // task 5 + {}, // task 5 {5} // task 6 }; @@ -1257,8 +1257,8 @@ TEST_F(BarrierPagesSplitTests, LegalizeBoundaryTaskDepsWithMultipleUpdateBars) { * | t4 * | \ * \ \ - * ------ t5 | - * | | + * ------ t5 /| + * / | * Page2 b5 b4 * | | * | t6 @@ -1278,15 +1278,15 @@ std::tuple graphWithMultipleBoundaryTa BarrierInfoMaps barrierMapsConfig; barrierMapsConfig.taskUpdateBarriers = { - {0}, // task 0 - {1}, // task 1 - {2}, // task 2 - {3}, // task 3 - {4}, // task 4 - {5}, // task 5 - {6}, // task 6 - {6}, // task 7 - {7} // task 8 + {0}, // task 0 + {1}, // task 1 + {2}, // task 2 + {3}, // task 3 + {4, 5}, // task 4 + {}, // task 5 + {6}, // task 6 + {6}, // task 7 + {7} // task 8 }; barrierMapsConfig.taskWaitBarriers = { @@ -1318,8 +1318,8 @@ std::tuple graphWithMultipleBoundaryTa {1}, // task 1 {2}, // task 2 {2, 3}, // task 3 - {4}, // task 4 - {4, 5}, // task 5 + {4, 5}, // task 4 + {4}, // task 5 {6}, // task 6 {6}, // task 7 {7} // task 8 @@ -2165,6 +2165,119 @@ TEST_F(BarrierPagesSplitTests, LegalizeForBarrierDmaWhereOneOfWaitBarriersWouldD 
EXPECT_EQ(expectedBarrierMapsConfig.barrierConsumerMap, testResult.barrierConsumerMap); } +/** + * HW FIFO (DMA): t0 t2 t3 t4 t5 + * HW FIFO (DPU): t1 + * + * ------ t0 + * | + * b0 + * / \ + * Page0 t1 t2 + * | | + * b1 | + * | | + * ------ t3 | <- both t2 and t3 are boundary tasks but t3 has no update barrier + * / + * b2 <- Create t3->b2 dep + * | Slot for inserting barrier DMA prepared between b2 and b3 + * Page1 t4 + * | + * b3 + * | + * ------ t5 + * | + * Page2 b4 + * ------ + */ +// Create a tuple with BarrierInfoMaps, pageSize and expectedBarrierMapsConfig +std::tuple graphToLegalizeForBarrierDmaWhereStartTaskHasNoUpdateBar() { + BarrierInfoMaps barrierMapsConfig; + + barrierMapsConfig.taskUpdateBarriers = { + {0}, // task 0 + {1}, // task 1 + {2}, // task 2 + {}, // task 3 + {3}, // task 4 + {4} // task 5 + }; + + barrierMapsConfig.taskWaitBarriers = { + {}, // task 0 + {0}, // task 1 + {0}, // task 2 + {1}, // task 3 + {2}, // task 4 + {3} // task 5 + }; + + fillProducersAndConsumers(barrierMapsConfig); + + const VPURT::TaskQueueType dmaType{VPU::ExecutorKind::DMA_NN, 0}; + const VPURT::TaskQueueType dpuType{VPU::ExecutorKind::DPU, 0}; + + barrierMapsConfig.taskQueueTypeMap[dmaType] = {0, 2, 3, 4, 5}; + barrierMapsConfig.taskQueueTypeMap[dpuType] = {1}; + + size_t pageSize = 2; + + BarrierInfoMaps expectedBarrierMapsConfig; + + expectedBarrierMapsConfig.taskUpdateBarriers = { + {0}, // task 0 + {1}, // task 1 + {2}, // task 2 + {2}, // task 3 + {3}, // task 4 + {4} // task 5 + }; + + expectedBarrierMapsConfig.taskWaitBarriers = { + {}, // task 0 + {0}, // task 1 + {0}, // task 2 + {1}, // task 3 + {2}, // task 4 + {3} // task 5 + }; + fillProducersAndConsumers(expectedBarrierMapsConfig); + + return std::make_tuple(barrierMapsConfig, pageSize, expectedBarrierMapsConfig); +} + +TEST_F(BarrierPagesSplitTests, LegalizeForBarrierDmaWhereStartTaskHasNoUpdateBar) { + auto [barrierMapsConfig, pageSize, expectedBarrierMapsConfig] = + 
graphToLegalizeForBarrierDmaWhereStartTaskHasNoUpdateBar(); + + BarrierInfoTest barrierInfoTest(barrierMapsConfig); + VPURT::BarrierPagesSplitHandler barrierPagesSplitHandlerTest(barrierInfoTest, barrierMapsConfig.taskQueueTypeMap, + pageSize, /*_barrierFifoDepth = */ 1); + + EXPECT_NO_THROW(barrierPagesSplitHandlerTest.verifyTaskBarrierPagesAreValid()); + EXPECT_NO_THROW(barrierPagesSplitHandlerTest.verifyNoCyclicDeps()); + + EXPECT_TRUE(barrierPagesSplitHandlerTest.areBoundaryTasksFromNeighborPagesDependent()); + + barrierPagesSplitHandlerTest.legalizeForDmaProgrammingBarriers(); + + auto barProgDmaPosPage1 = barrierPagesSplitHandlerTest.getDmaProgrammingBarrierPosition(1); + + ASSERT_TRUE(barProgDmaPosPage1.valid); + ASSERT_EQ(barProgDmaPosPage1.waitBars.size(), 1); + EXPECT_EQ(barProgDmaPosPage1.waitBars[0], 2); + ASSERT_EQ(barProgDmaPosPage1.updateBars.size(), 1); + EXPECT_EQ(barProgDmaPosPage1.updateBars[0], 3); + EXPECT_EQ(barProgDmaPosPage1.insertAfter, 3); + + auto testResult = barrierPagesSplitHandlerTest.getBarrierMaps(); + + EXPECT_EQ(expectedBarrierMapsConfig.taskUpdateBarriers, testResult.taskUpdateBarriers); + EXPECT_EQ(expectedBarrierMapsConfig.taskWaitBarriers, testResult.taskWaitBarriers); + EXPECT_EQ(expectedBarrierMapsConfig.barrierProducerMap, testResult.barrierProducerMap); + EXPECT_EQ(expectedBarrierMapsConfig.barrierConsumerMap, testResult.barrierConsumerMap); +} + /** * HW FIFO (DMA0): t0 t5 t6 t9 * HW FIFO (DMA1): t1 t8 diff --git a/tests/unit/vpux_compiler/dialect/VPURT/inference_execution_analysis_tests.cpp b/tests/unit/vpux_compiler/dialect/VPURT/inference_execution_analysis_tests.cpp index 59fb187603..bc4eb53c49 100644 --- a/tests/unit/vpux_compiler/dialect/VPURT/inference_execution_analysis_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPURT/inference_execution_analysis_tests.cpp @@ -44,19 +44,19 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateWith1ActShaveEngineOn1Cl // ACT C0_1: [----------------] constexpr 
StringLiteral inputIR = R"( - module @test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false } IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 1 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -101,7 +101,7 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateWith1ActShaveEngineOn1Cl ASSERT_TRUE(funcOp != nullptr); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); CycleCostInfo cycleCostInfo(funcOp); VPURT::InferenceExecutionSimulator infSim(log, funcOp, cycleCostInfo); @@ -139,19 +139,19 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateWith2ActShaveEngineOn1Cl // ACT C0_1: [----------------] constexpr StringLiteral inputIR = R"( - module @test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine 
: false } IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @M2I IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -196,7 +196,7 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateWith2ActShaveEngineOn1Cl ASSERT_TRUE(funcOp != nullptr); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU40XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU40XX); CycleCostInfo cycleCostInfo(funcOp); VPURT::InferenceExecutionSimulator infSim(log, funcOp, cycleCostInfo); @@ -238,18 +238,18 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateOnMultiQueueIR) { // ACT C1_0: [----------------] // ACT C1_1: [----------------] constexpr StringLiteral inputIR = R"( - module @test attributes {VPU.arch = #VPU.arch_kind, config.compilationMode = #config.compilation_mode} { + module @test attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.PipelineOptions @Options { config.Option @VPU.UseDedicatedFifoPerShaveEngine : false } IE.TileResource 6 of @NCE at 1.700000e+03 MHz { IE.MemoryResource 1327104 bytes of @CMX_NN_FragmentationAware - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, 
config.derateFactor = 1.000000e+00 : f64} IE.ExecutorResource 2 of @SHAVE_ACT IE.ExecutorResource 1 of @DPU } IE.ExecutorResource 1 of @DMA_NN - IE.MemoryResource 524288000 bytes of @DDR {VPU.bandwidth = 64 : i64, VPU.derateFactor = 6.000000e-01 : f64} + IE.MemoryResource 524288000 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] @@ -375,7 +375,7 @@ TEST_F(MLIR_InferenceExecutionAnalysis, CheckCycleUpdateOnMultiQueueIR) { ASSERT_TRUE(funcOp != nullptr); // set cost model factory - VPU::CostModelConfig::setFactory(VPU::ArchKind::NPU37XX); + VPU::CostModelConfig::setFactory(config::ArchKind::NPU37XX); CycleCostInfo cycleCostInfo(funcOp); VPURT::InferenceExecutionSimulator infSim(log, funcOp, cycleCostInfo); diff --git a/tests/unit/vpux_compiler/dialect/VPURegMapped/mapped_register_tests.cpp b/tests/unit/vpux_compiler/dialect/VPURegMapped/mapped_register_tests.cpp index dca234eb58..165d82193d 100644 --- a/tests/unit/vpux_compiler/dialect/VPURegMapped/mapped_register_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/VPURegMapped/mapped_register_tests.cpp @@ -63,7 +63,7 @@ class MLIR_VPURegMapped_RegisterMapped : public MLIR_UnitBase { mlir::PassManager pm(module->getName(), mlir::OpPassManager::Nesting::Implicit); auto initCompilerOptions = - VPU::InitCompilerOptions(vpux::VPU::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); + VPU::InitCompilerOptions(vpux::config::ArchKind::NPU40XX, config::CompilationMode::DefaultHW); VPU::buildInitCompilerPipeline(pm, initCompilerOptions, log); diff --git a/tests/unit/vpux_compiler/dialect/const/constant_folding_in_background_tests.cpp b/tests/unit/vpux_compiler/dialect/const/constant_folding_in_background_tests.cpp index 4ed355a866..47de8341cb 100644 --- a/tests/unit/vpux_compiler/dialect/const/constant_folding_in_background_tests.cpp +++ 
b/tests/unit/vpux_compiler/dialect/const/constant_folding_in_background_tests.cpp @@ -22,6 +22,7 @@ #include using namespace vpux; +#include using namespace std::chrono_literals; namespace { diff --git a/tests/unit/vpux_compiler/dialect/const/content_tests.cpp b/tests/unit/vpux_compiler/dialect/const/content_tests.cpp index 8d3052b2ea..28c4c88388 100644 --- a/tests/unit/vpux_compiler/dialect/const/content_tests.cpp +++ b/tests/unit/vpux_compiler/dialect/const/content_tests.cpp @@ -2147,8 +2147,8 @@ TEST_F(MLIR_ConstContentAttrTest, BitPackIsLast) { EXPECT_NO_THROW(std::ignore = contentAttrSetup.clone().transpose(DimsOrder::NHWC)); // Inserting another transformation that has the LAST position requirement - EXPECT_ANY_THROW( - std::ignore = contentAttrSetup.clone().swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX))); + EXPECT_ANY_THROW(std::ignore = contentAttrSetup.clone().swizzleConstant( + 5, static_cast(config::ArchKind::NPU37XX))); const auto quantType = mlir::quant::UniformQuantizedType::get(mlir::quant::QuantizationFlags::Signed, getSInt8Type(&ctx), @@ -2461,7 +2461,7 @@ TEST_F(MLIR_ConstContentAttrTest, PositionRequirement) { auto contentAttrSetup1 = baseContentAttrSetup.rescale(10.0); // Inserting a transformation that has the LAST position requirement - auto contentAttrSetup2 = contentAttrSetup1.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup2 = contentAttrSetup1.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); // Inserting a transformation that has the PREFERRED_LAST position requirement auto contentAttrSetup3 = contentAttrSetup2.sparsify(false); @@ -2613,7 +2613,8 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_I1) { Const::ContentSetup baseContentAttrSetup(baseType); auto contentAttrSetup = baseContentAttrSetup.castElemType(mlir::IntegerType::get(&ctx, 1)); - auto contentAttrSetup1 = contentAttrSetup.clone().swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup1 = 
+ contentAttrSetup.clone().swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); auto contentAttr1 = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup1)); @@ -2621,7 +2622,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_I1) { EXPECT_EQ(content.getType(), contentAttr1.getType()); EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2651,7 +2652,8 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_I4) { Const::ContentSetup baseContentAttrSetup(baseType); auto contentAttrSetup = baseContentAttrSetup.castElemType(mlir::IntegerType::get(&ctx, 4)); - auto contentAttrSetup1 = contentAttrSetup.clone().swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup1 = + contentAttrSetup.clone().swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr1 = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup1)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); @@ -2660,7 +2662,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_I4) { EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2689,14 +2691,14 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_U8) { 
const auto baseAttr = Const::createConstContent(baseType, ArrayRef(vals)); Const::ContentSetup baseContentAttrSetup(baseType); - auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); const auto content = contentAttr.fold(); EXPECT_EQ(content.getType(), contentAttr.getType()); EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2723,14 +2725,14 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_FP32) { const auto baseAttr = Const::createConstContent(baseType, ArrayRef(vals)); Const::ContentSetup baseContentAttrSetup(baseType); - auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); const auto content = contentAttr.fold(); EXPECT_EQ(content.getType(), contentAttr.getType()); EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2763,7 +2765,7 @@ TEST_F(MLIR_ConstContentAttrTest, 
SwizzleConstant_SubBytes_Splat_I1) { const auto contentType = Const::inferFinalType(baseType, contentAttrSetup.getTransformations()); ASSERT_NE(contentType, nullptr); - auto contentAttrSetup1 = contentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup1 = contentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr1 = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup1)); const auto content = contentAttr1.fold(); @@ -2771,7 +2773,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_Splat_I1) { EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr1.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2805,7 +2807,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_Splat_I4) { const auto contentType = Const::inferFinalType(baseType, contentAttrSetup.getTransformations()); ASSERT_NE(contentType, nullptr); - auto contentAttrSetup1 = contentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup1 = contentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr1 = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup1)); const auto content = contentAttr1.fold(); @@ -2813,7 +2815,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_SubBytes_Splat_I4) { EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr1.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2842,7 +2844,7 @@ 
TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_Splat_U8) { Const::ContentSetup baseContentAttrSetup(baseType); - auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); const auto content = contentAttr.fold(); @@ -2850,7 +2852,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_Splat_U8) { EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); @@ -2880,7 +2882,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_Splat_FP32) { Const::ContentSetup baseContentAttrSetup(baseType); - auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(VPU::ArchKind::NPU37XX)); + auto contentAttrSetup = baseContentAttrSetup.swizzleConstant(5, static_cast(config::ArchKind::NPU37XX)); auto contentAttr = Const::ContentAttr::get(baseAttr, std::move(contentAttrSetup)); const auto content = contentAttr.fold(); @@ -2888,7 +2890,7 @@ TEST_F(MLIR_ConstContentAttrTest, SwizzleConstant_Splat_FP32) { EXPECT_FALSE(content.isSplat()); EXPECT_EQ(contentAttr.isSplat(), content.isSplat()); - VPU::ArchKind archKind = static_cast(VPU::ArchKind::NPU37XX); + config::ArchKind archKind = static_cast(config::ArchKind::NPU37XX); const auto contentType = contentAttr.getType(); auto acheAlignSize = static_cast( alignSizeForSwizzling(contentType.getTotalAllocSize().count(), getSizeAlignmentForSwizzling(archKind))); diff --git a/tests/unit/vpux_compiler/dialect/core/bounded_type.cpp 
b/tests/unit/vpux_compiler/dialect/core/bounded_type.cpp index 6c23d65d7d..748f4df778 100644 --- a/tests/unit/vpux_compiler/dialect/core/bounded_type.cpp +++ b/tests/unit/vpux_compiler/dialect/core/bounded_type.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/dialect.hpp" #include diff --git a/tests/unit/vpux_compiler/dialect/core/dynamic_dims_mask_type.cpp b/tests/unit/vpux_compiler/dialect/core/dynamic_dims_mask_type.cpp index fd15021f3b..d3e3559ab4 100644 --- a/tests/unit/vpux_compiler/dialect/core/dynamic_dims_mask_type.cpp +++ b/tests/unit/vpux_compiler/dialect/core/dynamic_dims_mask_type.cpp @@ -7,6 +7,7 @@ #include "vpux/compiler/dialect/IE/utils/dynamic_shape_utils.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" +#include "vpux/compiler/dialect/core/IR/tensor_attr.hpp" #include "vpux/compiler/dialect/core/dialect.hpp" #include diff --git a/tests/unit/vpux_compiler/frontend/auto_batch_compiler_detection.cpp b/tests/unit/vpux_compiler/frontend/auto_batch_compiler_detection.cpp index 251629a3b8..52d4e895ee 100644 --- a/tests/unit/vpux_compiler/frontend/auto_batch_compiler_detection.cpp +++ b/tests/unit/vpux_compiler/frontend/auto_batch_compiler_detection.cpp @@ -4,6 +4,7 @@ // #include +#include #include #include #include diff --git a/tests/unit/vpux_compiler/pipelines/setup_params_according_to_opt_level.cpp b/tests/unit/vpux_compiler/pipelines/setup_params_according_to_opt_level.cpp new file mode 100644 index 0000000000..6f2bfef7e2 --- /dev/null +++ b/tests/unit/vpux_compiler/pipelines/setup_params_according_to_opt_level.cpp @@ -0,0 +1,46 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/NPU40XX/pipeline_options.hpp" +#include "vpux/compiler/pipelines/options_setup.hpp" + +#include + +using namespace vpux; + +// Local test helper that mimics DefaultHWSetup40XX::setupOptionsImpl logic +class TestDefaultHWSetup40XX : public OptionsSetupBase { +public: + static void setupOptionsImpl(DefaultHWOptions40XX& options, const intel_npu::Config& config) { + if (config.get()) { + overwriteIfUnset(options.optimizationLevel, 3); + } + setupParamsAccordingToOptimizationLevel(options.optimizationLevel, options, options.workloadManagementEnable); + } +}; + +class OptionsSetupTurboTest : public ::testing::Test { +public: + void SetUp() override { + _optionDesc = std::make_shared(); + _optionDesc->add(); + _config = intel_npu::Config(_optionDesc); + _config->update({{std::string(intel_npu::TURBO::key()), "YES"}}); + } + + std::shared_ptr _optionDesc; + std::optional _config; +}; + +TEST_F(OptionsSetupTurboTest, UserSetEnableReduceNumTilesForSmallModelsPassIsNotOverriddenByTurbo) { + DefaultHWOptions40XX options; + options.enableReduceNumTilesForSmallModelsPass = false; // user sets explicitly + + ASSERT_TRUE(_config.has_value()); + TestDefaultHWSetup40XX::setupOptionsImpl(options, _config.value()); + + // Should remain as set by user, not overridden by TURBO logic + EXPECT_FALSE(options.enableReduceNumTilesForSmallModelsPass); +} diff --git a/tests/unit/vpux_compiler/utils/compilation_mode_params_parser_tests.cpp b/tests/unit/vpux_compiler/utils/compilation_mode_params_parser_tests.cpp index 62dfaf6b01..48d8825efb 100644 --- a/tests/unit/vpux_compiler/utils/compilation_mode_params_parser_tests.cpp +++ b/tests/unit/vpux_compiler/utils/compilation_mode_params_parser_tests.cpp @@ -16,7 +16,7 @@ class CompilationModeParamsParserTest : public MLIR_UnitBase {}; TEST_F(CompilationModeParamsParserTest, EmptyOptions) { std::string params = ""; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, 
/*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); } @@ -24,14 +24,14 @@ TEST_F(CompilationModeParamsParserTest, EmptyOptions) { TEST_F(CompilationModeParamsParserTest, PublicOptions) { { std::string params = "optimization-level=100"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_EQ(result->optimizationLevel.getValue(), 100); } { std::string params = "optimization-level=200 performance-hint-override=randomEntry"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_EQ(result->optimizationLevel.getValue(), 200); @@ -42,14 +42,14 @@ TEST_F(CompilationModeParamsParserTest, PublicOptions) { TEST_F(CompilationModeParamsParserTest, PrivateOptions) { { std::string params = "schedule-trace-file-name=randomFileName"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_NE(result->scheduleTraceFile.getValue(), "randomFileName"); } { std::string params = "schedule-trace-file-name=randomFileName weights-sparsity-threshold=1234567890"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_NE(result->scheduleTraceFile.getValue(), "randomFileName"); @@ -60,7 +60,7 
@@ TEST_F(CompilationModeParamsParserTest, PrivateOptions) { TEST_F(CompilationModeParamsParserTest, MixedOptions) { { std::string params = "optimization-level=100 schedule-trace-file-name=randomFileName"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_EQ(result->optimizationLevel.getValue(), 100); @@ -69,7 +69,7 @@ TEST_F(CompilationModeParamsParserTest, MixedOptions) { { std::string params = "optimization-level=200 schedule-trace-file-name=randomFileName " "performance-hint-override=randomEntry weights-sparsity-threshold=1234567890"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); EXPECT_EQ(result->optimizationLevel.getValue(), 200); @@ -83,14 +83,14 @@ TEST_F(CompilationModeParamsParserTest, InvalidOptions) { { // Note: correct option would be `optimization-level` std::string params = "my-optimization-level=100"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result == nullptr); } { // Note: correct option would be `optimization-level` std::string params = "performance-hint-override=randomEntry my-optimization-level=200"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result == nullptr); } @@ -99,13 +99,13 @@ TEST_F(CompilationModeParamsParserTest, InvalidOptions) { TEST_F(CompilationModeParamsParserTest, 
ValuesWithSpaces) { { std::string params = "function-outlining=\"repeating-blocks='min-ops-in-block=2 max-num-iterations=10'\""; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); } { std::string params = "debatcher-settings={debatching-inlining-method=naive max-batch-number-disable-limit=-1}"; - auto result = parseOnlyPublic(params, VPU::ArchKind::NPU40XX, /*warnForPrivate=*/false, + auto result = parseOnlyPublic(params, config::ArchKind::NPU40XX, /*warnForPrivate=*/false, /*logLevel=*/LogLevel::None); ASSERT_TRUE(result != nullptr); } diff --git a/tests/unit/vpux_compiler/utils/dma_transfer_reduction.cpp b/tests/unit/vpux_compiler/utils/dma_transfer_reduction.cpp index 5a565cd77b..899ecc6f8d 100644 --- a/tests/unit/vpux_compiler/utils/dma_transfer_reduction.cpp +++ b/tests/unit/vpux_compiler/utils/dma_transfer_reduction.cpp @@ -8,6 +8,7 @@ #include #include "common/utils.hpp" +#include "vpux/compiler/dialect/core/IR/memref_attr.hpp" #include "vpux/compiler/utils/dma_transaction_utils.hpp" #include "vpux/utils/core/range.hpp" diff --git a/tests/unit/vpux_compiler/utils/function_outlining_splitter_batching_tests.cpp b/tests/unit/vpux_compiler/utils/function_outlining_splitter_batching_tests.cpp index 9a95ade5ae..2e30106c47 100644 --- a/tests/unit/vpux_compiler/utils/function_outlining_splitter_batching_tests.cpp +++ b/tests/unit/vpux_compiler/utils/function_outlining_splitter_batching_tests.cpp @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include 
"vpux/compiler/init.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" -#include "common/utils.hpp" - #include #include diff --git a/tests/unit/vpux_compiler/utils/function_outlining_splitter_naive_tests.cpp b/tests/unit/vpux_compiler/utils/function_outlining_splitter_naive_tests.cpp index 77311d0cc1..da6516c12e 100644 --- a/tests/unit/vpux_compiler/utils/function_outlining_splitter_naive_tests.cpp +++ b/tests/unit/vpux_compiler/utils/function_outlining_splitter_naive_tests.cpp @@ -3,14 +3,18 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/init.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" -#include "common/utils.hpp" - #include #include diff --git a/tests/unit/vpux_compiler/utils/function_outlining_splitter_options_test.cpp b/tests/unit/vpux_compiler/utils/function_outlining_splitter_options_test.cpp index 3f6da98c06..909aac27e7 100644 --- a/tests/unit/vpux_compiler/utils/function_outlining_splitter_options_test.cpp +++ b/tests/unit/vpux_compiler/utils/function_outlining_splitter_options_test.cpp @@ -3,11 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "vpux/compiler/dialect/IE/IR/ops.hpp" -#include "vpux/compiler/init.hpp" -#include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" - #include "common/utils.hpp" +#include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" #include #include diff --git 
a/tests/unit/vpux_compiler/utils/function_outlining_splitter_repeating_blocks_tests.cpp b/tests/unit/vpux_compiler/utils/function_outlining_splitter_repeating_blocks_tests.cpp index d06287681c..baba9a9630 100644 --- a/tests/unit/vpux_compiler/utils/function_outlining_splitter_repeating_blocks_tests.cpp +++ b/tests/unit/vpux_compiler/utils/function_outlining_splitter_repeating_blocks_tests.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common/utils.hpp" #include "vpux/compiler/dialect/IE/IR/dialect.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" #include "vpux/compiler/dialect/const/ops.hpp" #include "vpux/compiler/init.hpp" #include "vpux/compiler/utils/IE/function_outlining_splitter.hpp" -#include "common/utils.hpp" - #include #include diff --git a/tests/unit/vpux_compiler/utils/get_distribution_from_op.cpp b/tests/unit/vpux_compiler/utils/get_distribution_from_op.cpp index a7d742d77c..013da20daa 100644 --- a/tests/unit/vpux_compiler/utils/get_distribution_from_op.cpp +++ b/tests/unit/vpux_compiler/utils/get_distribution_from_op.cpp @@ -340,7 +340,7 @@ TEST_P(GetDistributedTypeFromSOKOpTests, SplitOverChannelsDistribution) { const vpux::VPU::DistributionMode expectedDistribution = params.expectedDistribution; auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU37XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU37XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); @@ -459,31 +459,31 @@ std::vector verticalFusionWrappingParams = { #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> module @test { IE.TileResource 3 of @NCE at 6.000000e+02 MHz - func.func @main(%arg0: tensor<1x2048x1x1xf16, {order = #NHWC}>) + func.func @main(%arg0: tensor<1x2048x1x1xf16, {order = #NHWC}>) -> (tensor<1x12288x1x1xf16, {order = #NHWC}>, tensor<1x12288x1x1xf16, {order = #NHWC}>) { - %cst = 
const.Declare tensor<12288x2048x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : + %cst = const.Declare tensor<12288x2048x1x1x!qElemType, {order = #NHWC}> = dense<1.000000e+00> : tensor<12288x2048x1x1xf16, {order = #NHWC}>, [#const.CastElemType, #const.CastElemType] %cst_0 = const.Declare tensor<12288x1x1x4xsi32> = dense<10> : tensor<12288x1x1x4xsi32> - %0 = VPU.VerticalFusion (%arg0 as %arg1: tensor<1x2048x1x1xf16, {order = #NHWC}>, - %cst as %arg2: tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, - %cst_0 as %arg3: tensor<12288x1x1x4xsi32>) + %0 = VPU.VerticalFusion (%arg0 as %arg1: tensor<1x2048x1x1xf16, {order = #NHWC}>, + %cst as %arg2: tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, + %cst_0 as %arg3: tensor<12288x1x1x4xsi32>) attributes {tilingStrategy = [1, 8, 1, 1]} -> tensor<1x12288x1x1xf16, {order = #NHWC}> { %1 = VPU.NCE.Convolution(%arg1, %arg2, %arg3) { - multiClusterStrategy = #VPU.multi_cluster_strategy, - pad = #VPU.Padding, - ppe = #VPU.PPEStub<>, - rawFilterShape = [12288, 2048, 1, 1], strides = [1, 1]} : - tensor<1x2048x1x1xf16, {order = #NHWC}>, tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, - tensor<12288x1x1x4xsi32> -> tensor<1x12288x1x1xf16, {order = #NHWC}> - VPU.Yield %1 + multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + rawFilterShape = [12288, 2048, 1, 1], strides = [1, 1]} : + tensor<1x2048x1x1xf16, {order = #NHWC}>, tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, + tensor<12288x1x1x4xsi32> -> tensor<1x12288x1x1xf16, {order = #NHWC}> + VPU.Yield %1 } %2 = VPU.NCE.Convolution(%arg0, %cst, %cst_0) { - multiClusterStrategy = #VPU.multi_cluster_strategy, - pad = #VPU.Padding, - ppe = #VPU.PPEStub<>, - rawFilterShape = [12288, 2048, 1, 1], strides = [1, 1], tilingStrategy = [1, 8, 1, 1]} : - tensor<1x2048x1x1xf16, {order = #NHWC}>, tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, - tensor<12288x1x1x4xsi32> -> tensor<1x12288x1x1xf16, {order = #NHWC}> + 
multiClusterStrategy = #VPU.multi_cluster_strategy, + pad = #VPU.Padding, + ppe = #VPU.PPEStub<>, + rawFilterShape = [12288, 2048, 1, 1], strides = [1, 1], tilingStrategy = [1, 8, 1, 1]} : + tensor<1x2048x1x1xf16, {order = #NHWC}>, tensor<12288x2048x1x1x!qElemType, {order = #NHWC}>, + tensor<12288x1x1x4xsi32> -> tensor<1x12288x1x1xf16, {order = #NHWC}> return %0, %2 : tensor<1x12288x1x1xf16, {order = #NHWC}>, tensor<1x12288x1x1xf16, {order = #NHWC}> } })", false, VPU::DistributionMode::SEGMENTED | VPU::DistributionMode::DUPLICATED}}; diff --git a/tests/unit/vpux_compiler/utils/get_overlap_neighbour_ops_tests.cpp b/tests/unit/vpux_compiler/utils/get_overlap_neighbour_ops_tests.cpp index 9f458ad5e4..c05915d25f 100644 --- a/tests/unit/vpux_compiler/utils/get_overlap_neighbour_ops_tests.cpp +++ b/tests/unit/vpux_compiler/utils/get_overlap_neighbour_ops_tests.cpp @@ -106,7 +106,7 @@ TEST_P(GetOverlapSiblingsTests, GetOps) { const auto inputIR = GetParam(); auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU40XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); @@ -136,7 +136,7 @@ TEST_P(GetActivationOverlapTests, GetParams) { const auto inputTileShape = params.tileShape; auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU40XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); @@ -197,7 +197,7 @@ TEST_P(GetOutputOverlapTests, GetParams) { const auto outputTileShape = params.tileShape; auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU40XX); + auto interfacesRegistry = 
vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); @@ -253,7 +253,7 @@ TEST_P(GetOutputOverlapTests, GetParams) { llvm::StringLiteral twoConvConsumers = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x144x28x27xf16, {order = #NHWC}>) -> (tensor<1x144x28x27xf16, {order = #NHWC}>, tensor<1x144x28x27xf16, {order = #NHWC}>) { @@ -299,7 +299,7 @@ llvm::StringLiteral twoConvConsumers = R"( llvm::StringLiteral nceInterpAndConvConsumers = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x144x28x27xf16, {order = #NHWC}>) -> (tensor<1x144x28x27xf16, {order = #NHWC}>, tensor<1x144x56x54xf16, {order = #NHWC}>) { @@ -365,7 +365,7 @@ llvm::StringLiteral nceInterpAndConvConsumers = R"( llvm::StringLiteral threeClusteredConsumers = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x144x28x27xf16, {order = #NHWC}>) -> (tensor<1x144x28x27xf16, {order = #NHWC}>, tensor<1x144x28x27xf16, {order = #NHWC}>, tensor<1x144x28x27xf16, {order = #NHWC}>) { @@ -413,7 +413,7 @@ llvm::StringLiteral threeClusteredConsumers = R"( llvm::StringLiteral oneConsumerNotClustered = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 
MHz func.func @main(%arg0: tensor<1x144x28x27xf16, {order = #NHWC}>) -> (tensor<1x144x28x27xf16, {order = #NHWC}>, tensor<1x144x28x27xf16, {order = #NHWC}>) { @@ -508,7 +508,7 @@ llvm::StringLiteral quantizeCastDirectConsumer = R"( !qElemType1 = !quant.uniform !qElemType2 = !quant.uniform !qElemType3 = !quant.uniform - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 2 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x32x28x27x!qElemType, {order = #NHWC}>) -> (tensor<1x32x14x14x!qElemType2, {order = #NHWC}>, tensor<1x32x28x27x!qElemType3, {order = #NHWC}>) { @@ -563,7 +563,7 @@ llvm::StringLiteral multipleConsumersOfQuantizeCast = R"( !qElemType1 = !quant.uniform !qElemType2 = !quant.uniform !qElemType3 = !quant.uniform - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 2 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x32x28x27x!qElemType, {order = #NHWC}>) -> (tensor<1x32x14x14x!qElemType2, {order = #NHWC}>, tensor<1x32x28x27x!qElemType3, {order = #NHWC}>, tensor<1x32x28x27x!qElemType3, {order = #NHWC}>) { @@ -667,7 +667,7 @@ INSTANTIATE_TEST_SUITE_P(QuantizeCastDirectConsumersTiled, GetOutputOverlapTests llvm::StringLiteral eltwiseResidualBlock = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x4x4xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -719,7 +719,7 @@ llvm::StringLiteral eltwiseResidualBlock = R"( llvm::StringLiteral eltwiseWithParentsInDiffSubgraphs = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes 
{config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -808,7 +808,7 @@ INSTANTIATE_TEST_SUITE_P(EltwiseConsumersTiled, GetOutputOverlapTests, testing:: llvm::StringLiteral eltwiseInPlaceSubgraph = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x4x4xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -874,7 +874,7 @@ llvm::StringLiteral eltwiseInPlaceSubgraph = R"( llvm::StringLiteral eltwiseInPlaceWithParentsInDiffSubgraphs = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -984,9 +984,9 @@ INSTANTIATE_TEST_SUITE_P(InPlaceEltwiseConsumersTiled, GetOutputOverlapTests, llvm::StringLiteral concatSubgraph = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> 
(tensor<1x16x4x4xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -1048,9 +1048,9 @@ llvm::StringLiteral concatSubgraph = R"( llvm::StringLiteral notSOHCompatibleConcatSubgraph = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x4x4xf16, {order = #NHWC}>, tensor<1x16x16x8xf16, {order = #NHWC}>) { @@ -1120,9 +1120,9 @@ llvm::StringLiteral notSOHCompatibleConcatSubgraph = R"( llvm::StringLiteral concatWithParentsInDiffSubgraphs = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>) { @@ -1243,9 +1243,9 @@ INSTANTIATE_TEST_SUITE_P(ConcatConsumersTiled, GetOutputOverlapTests, llvm::StringLiteral mixedSubgraph0 = R"( #NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)> - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : 
i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @main(%arg0: tensor<1x16x8x8xf16, {order = #NHWC}>, %arg1: tensor<1x32x8x8xf16, {order = #NHWC}>) -> (tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x16x8x8xf16, {order = #NHWC}>, tensor<1x32x8x8xf16, {order = #NHWC}>) { @@ -1330,9 +1330,9 @@ llvm::StringLiteral mixedSubgraph1 = R"( !qElemType1 = !quant.uniform !qElemType2 = !quant.uniform !qElemType3 = !quant.uniform - module @test attributes {VPU.arch = #VPU.arch_kind} { + module @test attributes {config.arch = #config.arch_kind} { IE.TileResource 4 of @NCE at 6.000000e+02 MHz { - IE.MemoryResource 1474560 bytes of @CMX_NN {VPU.bandwidth = 64 : i64, VPU.derateFactor = 1.000000e+00 : f64} + IE.MemoryResource 1474560 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} } func.func @main(%arg0: tensor<1x16x8x8x!qElemType, {order = #NHWC}>, %arg1: tensor<1x16x8x8x!qElemType, {order = #NHWC}>) -> (tensor<1x16x8x8x!qElemType3, {order = #NHWC}>, tensor<1x16x8x8x!qElemType2, {order = #NHWC}>, tensor<1x16x8x8x!qElemType3, {order = #NHWC}>) { diff --git a/tests/unit/vpux_compiler/utils/llvm_transitive_clone_tests.cpp b/tests/unit/vpux_compiler/utils/llvm_transitive_clone_tests.cpp index 2c4511fdf6..c5bbb549d1 100644 --- a/tests/unit/vpux_compiler/utils/llvm_transitive_clone_tests.cpp +++ b/tests/unit/vpux_compiler/utils/llvm_transitive_clone_tests.cpp @@ -33,7 +33,7 @@ TEST_P(LLVMTransitiveCloneTests, CloneFunctions) { const llvm::StringLiteral entry = params.entry; const llvm::StringLiteral swModuleName = params.swModule; auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU40XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU40XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext 
ctx(registry); diff --git a/tests/unit/vpux_compiler/utils/overlapped_parameters_tests.cpp b/tests/unit/vpux_compiler/utils/overlapped_parameters_tests.cpp index 1b55afd55f..65ea90cc46 100644 --- a/tests/unit/vpux_compiler/utils/overlapped_parameters_tests.cpp +++ b/tests/unit/vpux_compiler/utils/overlapped_parameters_tests.cpp @@ -41,7 +41,7 @@ TEST_P(GetOverlapDistributionParamsTests, GetMemoryViewFromProducerConsumers) { const auto expectedMemoryOffsets = params.memoryOffsets; auto registry = vpux::createDialectRegistry(); - auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::VPU::ArchKind::NPU37XX); + auto interfacesRegistry = vpux::createInterfacesRegistry(vpux::config::ArchKind::NPU37XX); interfacesRegistry->registerInterfaces(registry); mlir::MLIRContext ctx(registry); diff --git a/tests/unit/vpux_compiler/utils/pipeline_options_utils.cpp b/tests/unit/vpux_compiler/utils/pipeline_options_utils.cpp new file mode 100644 index 0000000000..19a001a2ab --- /dev/null +++ b/tests/unit/vpux_compiler/utils/pipeline_options_utils.cpp @@ -0,0 +1,111 @@ +// +// Copyright (C) 2025 Intel Corporation. 
+// SPDX-License-Identifier: Apache-2.0 +// + +#include "vpux/compiler/dialect/VPU/utils/setup_pipeline_options_utils.hpp" +#include "vpux/compiler/dialect/core/transforms/passes.hpp" +#include "vpux/compiler/init.hpp" +#include "vpux/compiler/utils/analysis.hpp" +#include "vpux/utils/core/string_ref.hpp" + +#include "common/utils.hpp" + +#include +#include + +using namespace vpux; +using MLIR_GetAttributeFromOption = MLIR_UnitBase; + +using vpux::VPU::getAttributeFromOption; +namespace { + +struct ManyOptionsPassOptions { + bool boolOption = false; + int64_t intOption = 42; + std::string strOption = "hello"; + double doubleOption = 3.14; +}; + +class ManyOptionsPass : public vpux::ModulePass { +public: + ManyOptionsPass(): vpux::ModulePass(::mlir::TypeID::get()) { + } + ManyOptionsPass(const ManyOptionsPass& other): vpux::ModulePass(other) { + } + ManyOptionsPass(const ManyOptionsPassOptions& options): vpux::ModulePass(::mlir::TypeID::get()) { + boolOption = options.boolOption; + intOption = options.intOption; + strOption = options.strOption; + doubleOption = options.doubleOption; + } + + ::llvm::StringRef getName() const override { + return "ManyOptionsPass"; + } + void safeRunOnModule() override final { + } + std::unique_ptr<::mlir::Pass> clonePass() const override { + return std::make_unique(*static_cast(this)); + } + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ManyOptionsPass) + + mlir::Pass::Option boolOption{*this, "", ::llvm::cl::desc(""), ::llvm::cl::init(false)}; + mlir::Pass::Option intOption{*this, "", ::llvm::cl::desc(""), ::llvm::cl::init(42)}; + mlir::Pass::Option strOption{*this, "", ::llvm::cl::desc(""), ::llvm::cl::init("hello")}; + mlir::Pass::Option doubleOption{*this, "", ::llvm::cl::desc(""), ::llvm::cl::init(3.14)}; +}; + +} // namespace + +TEST(MLIR_GetAttributeFromOption, BoolOption) { + auto registry = vpux::createDialectRegistry(); + mlir::MLIRContext ctx(registry); + + ManyOptionsPassOptions options; + options.boolOption = true; + auto 
pass = std::make_unique(options); + auto attr = getAttributeFromOption(&ctx, pass->boolOption); + auto boolAttr = mlir::dyn_cast(attr); + ASSERT_TRUE(boolAttr != nullptr); + EXPECT_TRUE(boolAttr.getValue()); +} + +TEST(MLIR_GetAttributeFromOption, Int64Option) { + auto registry = vpux::createDialectRegistry(); + mlir::MLIRContext ctx(registry); + + ManyOptionsPassOptions options; + options.intOption = 73; + auto pass = std::make_unique(options); + auto attr = getAttributeFromOption(&ctx, pass->intOption); + auto intAttr = mlir::dyn_cast(attr); + ASSERT_TRUE(intAttr != nullptr); + EXPECT_EQ(intAttr.getValue().getSExtValue(), 73); +} + +TEST(MLIR_GetAttributeFromOption, StringOption) { + auto registry = vpux::createDialectRegistry(); + mlir::MLIRContext ctx(registry); + + ManyOptionsPassOptions options; + options.strOption = "bye"; + auto pass = std::make_unique(options); + auto attr = getAttributeFromOption(&ctx, pass->strOption); + auto strAttr = mlir::dyn_cast(attr); + ASSERT_TRUE(strAttr != nullptr); + EXPECT_EQ(strAttr.getValue(), "bye"); +} + +TEST(MLIR_GetAttributeFromOption, DoubleOption) { + auto registry = vpux::createDialectRegistry(); + mlir::MLIRContext ctx(registry); + + ManyOptionsPassOptions options; + options.doubleOption = 2.71828; + auto pass = std::make_unique(options); + auto attr = getAttributeFromOption(&ctx, pass->doubleOption); + auto floatAttr = mlir::dyn_cast(attr); + ASSERT_TRUE(floatAttr != nullptr); + EXPECT_DOUBLE_EQ(floatAttr.getValueAsDouble(), 2.71828); +} diff --git a/tests/unit/vpux_compiler/utils/quantization.cpp b/tests/unit/vpux_compiler/utils/quantization.cpp new file mode 100644 index 0000000000..9fa2a09060 --- /dev/null +++ b/tests/unit/vpux_compiler/utils/quantization.cpp @@ -0,0 +1,100 @@ +// +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// + +#include "vpux/compiler/utils/quantization.hpp" +#include "common/utils.hpp" +#include "vpux/compiler/core/attributes/shape.hpp" + +#include 
+#include + +using namespace vpux; + +using MLIR_QuantizationUtilsTest = MLIR_UnitBase; + +void checkScalesAndZps(mlir::Type tiledType, ArrayRef expectedScales, ArrayRef expectedZps) { + auto perAxisQuant = mlir::dyn_cast(tiledType); + EXPECT_NE(perAxisQuant, nullptr); + + const auto scales = perAxisQuant.getScales(); + EXPECT_EQ(scales, expectedScales); + + const auto zps = perAxisQuant.getZeroPoints(); + EXPECT_EQ(zps, expectedZps); +} + +TEST_F(MLIR_QuantizationUtilsTest, TileScalesAndZp) { + mlir::MLIRContext ctx(registry); + ctx.loadDialect(); + + constexpr int64_t axisSize = 32; + SmallVector scales(axisSize, 0.01); + SmallVector zeroPoints(axisSize, 1); + + for (auto idx : irange(axisSize)) { + scales[idx] *= idx; // scales = 0.01 0.02 0.03 etc. + zeroPoints[idx] *= idx; // zp = 1 2 3 4 5 etc. + } + + const auto quantType = mlir::quant::UniformQuantizedPerAxisType::get( + 0, getUInt8Type(&ctx), mlir::Float32Type::get(&ctx), scales, zeroPoints, 0, 0, 255); + + { + // Test case 0: tile contiguous section on quant axis + const SmallVector expectedScales = {0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24}; + const SmallVector expectedZPs = {15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + + const auto shape = Shape({10, 2, 3, 1}); + const auto offsets = Shape({15, 0, 0, 0}); + auto tiledTypeContiguous = tileScalesAndZP(quantType, shape, offsets); + checkScalesAndZps(tiledTypeContiguous, expectedScales, expectedZPs); + + const auto strides = Shape({1, 1, 1, 1}); + tiledTypeContiguous = tileScalesAndZP(quantType, shape, offsets, strides); + checkScalesAndZps(tiledTypeContiguous, expectedScales, expectedZPs); + } + + { + // Test case 1: tile strided section on quant axis + const SmallVector expectedScalesOdd = {0.15, 0.18, 0.21, 0.24, 0.27, 0.3}; + const SmallVector expectedZPsOdd = {15, 18, 21, 24, 27, 30}; + + const auto shape = Shape({6, 2, 3, 1}); + const auto offsets = Shape({15, 0, 0, 0}); + const auto stridesOdd = Shape({3, 1, 1, 1}); + auto 
tiledTypeStridedOdd = tileScalesAndZP(quantType, shape, offsets, stridesOdd); + checkScalesAndZps(tiledTypeStridedOdd, expectedScalesOdd, expectedZPsOdd); + + const SmallVector expectedScalesEven = {0.15, 0.17, 0.19, 0.21, 0.23, 0.25}; + const SmallVector expectedZPsEven = {15, 17, 19, 21, 23, 25}; + + const auto stridesEven = Shape({2, 1, 1, 1}); + auto tiledTypeStridedEven = tileScalesAndZP(quantType, shape, offsets, stridesEven); + checkScalesAndZps(tiledTypeStridedEven, expectedScalesEven, expectedZPsEven); + } + + { + // Test case 2: stride axis is not quantization axis + const SmallVector expectedScales = {0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24}; + const SmallVector expectedZPs = {15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + + const auto shape = Shape({10, 2, 3, 1}); + const auto offsets = Shape({15, 0, 2, 0}); + const auto strides = Shape({1, 1, 2, 1}); + auto tiledTypeContiguous = tileScalesAndZP(quantType, shape, offsets, strides); + checkScalesAndZps(tiledTypeContiguous, expectedScales, expectedZPs); + } + + { + // Test case 3: slice axis is not quantization axis + const auto shape = Shape({32, 2, 3, 1}); + const auto offsets = Shape({0, 0, 2, 0}); + const auto strides = Shape({1, 1, 2, 1}); + auto noTilingTypeOnQuantAxisType = tileScalesAndZP(quantType, shape, offsets, strides); + checkScalesAndZps(noTilingTypeOnQuantAxisType, scales, zeroPoints); + } +} diff --git a/tests/unit/vpux_compiler/utils/rewriter_tests.cpp b/tests/unit/vpux_compiler/utils/rewriter_tests.cpp index 8a03b4095a..383932d966 100644 --- a/tests/unit/vpux_compiler/utils/rewriter_tests.cpp +++ b/tests/unit/vpux_compiler/utils/rewriter_tests.cpp @@ -5,6 +5,7 @@ #include "common/utils.hpp" #include "vpux/compiler/core/attributes/dims_order.hpp" +#include "vpux/compiler/dialect/VPU/IR/types.hpp" #include "vpux/compiler/dialect/VPUIP/IR/dialect.hpp" #include "vpux/compiler/dialect/const/dialect.hpp" #include "vpux/compiler/utils/attributes.hpp" diff --git 
a/tests/unit/vpux_compiler/utils/swizzling_test.cpp b/tests/unit/vpux_compiler/utils/swizzling_test.cpp index d11151542d..1dcb98aa5e 100644 --- a/tests/unit/vpux_compiler/utils/swizzling_test.cpp +++ b/tests/unit/vpux_compiler/utils/swizzling_test.cpp @@ -34,7 +34,7 @@ bool compareBuffers(const SmallVector& buf1, const MutableArrayRef& buf2) } template -bool swizzlingTest(uint32_t elements, uint32_t swizzleKey, VPU::ArchKind archKind) { +bool swizzlingTest(uint32_t elements, uint32_t swizzleKey, config::ArchKind archKind) { BufferTransform::BufferSwizzleTransform bufferTransform{swizzleKey, archKind}; const auto swizzlePatternStride{bufferTransform.getSwizzlePatternStride()}; @@ -82,7 +82,7 @@ TEST_P(SwizzlingTest_VPUX37XX, swizzlingTest_VPUX37XX) { const auto elements = std::get<1>(params); bool result = false; - EXPECT_TRUE(result = swizzlingTest(elements, swizzlingKey, VPU::ArchKind::NPU37XX)); + EXPECT_TRUE(result = swizzlingTest(elements, swizzlingKey, config::ArchKind::NPU37XX)); } TEST_P(SwizzlingTest_VPUX40XX, swizzlingTest_VPUX40XX) { @@ -91,7 +91,7 @@ TEST_P(SwizzlingTest_VPUX40XX, swizzlingTest_VPUX40XX) { const auto elements = std::get<1>(params); bool result = false; - EXPECT_TRUE(result = swizzlingTest(elements, swizzlingKey, VPU::ArchKind::NPU40XX)); + EXPECT_TRUE(result = swizzlingTest(elements, swizzlingKey, config::ArchKind::NPU40XX)); } INSTANTIATE_TEST_SUITE_P(testAligned_VPUX40XX_Key0, SwizzlingTest_VPUX40XX, Combine(Values(0), Values(1024))); diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index 9591cb1bbc..df9adf8412 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -1,6 +1,6 @@ # # Copyright (C) 2022-2025 Intel Corporation. 
-# SPDX-License-Identifier: Apache 2.0 +# SPDX-License-Identifier: Apache-2.0 # # @@ -47,15 +47,15 @@ endif() # flatbuffers # -if(CMAKE_SOURCE_DIR STREQUAL OpenVINO_SOURCE_DIR AND - ENABLE_OV_TF_LITE_FRONTEND AND (NOT ENABLE_SYSTEM_FLATBUFFERS OR NOT Flatbuffers_FOUND)) +if(TARGET flatbuffers OR TARGET flatc) # we are building NPU plugin via -DOPENVINO_EXTRA_MODULES # and flatbuffers is already built as part of OpenVINO in case of # building in a single tree + message(WARNING "Flatbuffers target present. Possible version mismatch.") else() - set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "" FORCE) - set(FLATBUFFERS_INSTALL OFF CACHE BOOL "" FORCE) - set(FLATBUFFERS_BUILD_FLATC ON CACHE BOOL "" FORCE) + set(FLATBUFFERS_BUILD_TESTS OFF) + set(FLATBUFFERS_INSTALL OFF) + set(FLATBUFFERS_BUILD_FLATC ON) add_subdirectory(flatbuffers EXCLUDE_FROM_ALL) @@ -69,14 +69,11 @@ else() target_compile_options(flatc PRIVATE -Wno-unused-but-set-variable) endif() endif() - - vpux_add_native_tool(flatc "${CMAKE_CURRENT_SOURCE_DIR}/flatbuffers" - CMAKE_ARGS - "FLATBUFFERS_BUILD_TESTS:BOOL=OFF" - "FLATBUFFERS_INSTALL:BOOL=OFF" - "FLATBUFFERS_BUILD_FLATC:BOOL=ON" - ) endif() +set(flatc_TARGET flatc) +set(flatc_COMMAND $) +set(flatc_TARGET "${flatc_TARGET}" PARENT_SCOPE) +set(flatc_COMMAND "${flatc_COMMAND}" PARENT_SCOPE) # # npu_elf @@ -94,12 +91,12 @@ else() # Legacy no-monorepo scenario add_subdirectory(elf/vpux_elf EXCLUDE_FROM_ALL) target_include_directories(npu_elf PRIVATE - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/details" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/details") + "${PROJECT_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/details" + "${PROJECT_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/details") 
target_include_directories(vpux_elf PRIVATE - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/details" - "${IE_MAIN_VPUX_PLUGIN_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/details") + "${PROJECT_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU37XX/dialect/NPUReg37XX/firmware_headers/details" + "${PROJECT_SOURCE_DIR}/src/vpux_compiler/include/vpux/compiler/NPU40XX/dialect/NPUReg40XX/firmware_headers/details") endif() # @@ -110,4 +107,6 @@ set(VPUNN_BUILD_SHARED_LIB OFF CACHE BOOL "" FORCE) set(VPUNN_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) set(VPUNN_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(VPUNN_OPT_LEGACY_ZTILING ON CACHE BOOL "" FORCE) +set(VPUNN_OPT_LEGACY_DMA_TH_4 ON CACHE BOOL "" FORCE) +set(VPUNN_OPT_LEGACY_DMA_TH_5 OFF CACHE BOOL "" FORCE) add_subdirectory(vpucostmodel EXCLUDE_FROM_ALL) diff --git a/thirdparty/elf b/thirdparty/elf index 7e8651735b..4b0a4a06ae 160000 --- a/thirdparty/elf +++ b/thirdparty/elf @@ -1 +1 @@ -Subproject commit 7e8651735be77a877d2bfa04c7355136836def0f +Subproject commit 4b0a4a06ae09c0c3a973f8f18761c549ec2309eb diff --git a/tools/npureg-tblgen/CMakeLists.txt b/tools/npureg-tblgen/CMakeLists.txt index 34c5318df8..8c5b04b0c3 100644 --- a/tools/npureg-tblgen/CMakeLists.txt +++ b/tools/npureg-tblgen/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_executable(${TARGET_NAME} main.cpp) add_dependencies(${TARGET_NAME} MLIRVPUXVPUAttrIncGen + MLIRVPUXconfigAttrIncGen MLIRSupport) target_compile_features(${TARGET_NAME} PRIVATE cxx_std_17) enable_warnings_as_errors(${TARGET_NAME}) diff --git a/tools/npureg-tblgen/main.cpp b/tools/npureg-tblgen/main.cpp index f37819c7a7..4630262bf2 100644 --- a/tools/npureg-tblgen/main.cpp +++ b/tools/npureg-tblgen/main.cpp @@ -20,6 +20,8 @@ // clang-format off // because header file should be in the first +#include +#include #include #include // clang-format on @@ -33,11 +35,11 @@ static 
llvm::cl::opt Action(llvm::cl::desc("Actions to perform"), llvm::cl::values(clEnumValN(Generate, "generate", "")), llvm::cl::init(Generate)); -static llvm::cl::opt Platform(llvm::cl::desc("Specify the platform type"), - llvm::cl::values(clEnumValN(vpux::VPU::ArchKind::NPU40XX, "NPU40XX", - "LNL platform") - // clang-format off - ), llvm::cl::init(vpux::VPU::ArchKind::NPU40XX)); +static llvm::cl::opt Platform(llvm::cl::desc("Specify the platform type"), + llvm::cl::values(clEnumValN(vpux::config::ArchKind::NPU40XX, + "NPU40XX", "LNL platform") + // clang-format off + ), llvm::cl::init(vpux::config::ArchKind::NPU40XX)); // clang-format on static std::map platformTypeMap{ @@ -418,7 +420,7 @@ bool RegGenMain(llvm::raw_ostream& stream, llvm::RecordKeeper& records) { return false; }; - const auto platformTypeName = platformTypeMap[vpux::VPU::stringifyArchKind(Platform).str()]; + const auto platformTypeName = platformTypeMap[vpux::config::stringifyArchKind(Platform).str()]; switch (Action) { case Generate: diff --git a/tools/side-load-strategy-generator/generate_mc_sideloader.py b/tools/side-load-strategy-generator/generate_mc_sideloader.py index b618482e48..c5b05a0cdc 100644 --- a/tools/side-load-strategy-generator/generate_mc_sideloader.py +++ b/tools/side-load-strategy-generator/generate_mc_sideloader.py @@ -63,7 +63,24 @@ def loadFromFile(filename: str, entryId: int): #include "vpux/compiler/dialect/VPU/utils/manual_strategy_utils.hpp" #include "vpux/compiler/core/type_interfaces.hpp" -#include "vpux/compiler/dialect/IE/IR/ops.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/activation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/arithmetic.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/bitwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/comparison.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/control_flow.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/convolution.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/data_movement.hpp" +#include 
"vpux/compiler/dialect/IE/IR/ops/data_type.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/eltwise.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/image.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/logical.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/normalization.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/pooling.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/recurrent.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/reduce.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/resources.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/shape_manipulation.hpp" +#include "vpux/compiler/dialect/IE/IR/ops/specialized.hpp" #include "vpux/compiler/dialect/VPU/IR/ops_interfaces.hpp" #include diff --git a/tools/vpux-opt/mlir_passes_extractor.py b/tools/vpux-opt/mlir_passes_extractor.py index b7b13e4363..881c518831 100755 --- a/tools/vpux-opt/mlir_passes_extractor.py +++ b/tools/vpux-opt/mlir_passes_extractor.py @@ -1,8 +1,8 @@ +#!/usr/bin/env python # # Copyright (C) 2025 Intel Corporation. 
# SPDX-License-Identifier: Apache-2.0 # -#!/usr/bin/env python import argparse import os diff --git a/tools/vpux-opt/vpux-opt.cpp b/tools/vpux-opt/vpux-opt.cpp index d9dbd56473..4fe9020d51 100644 --- a/tools/vpux-opt/vpux-opt.cpp +++ b/tools/vpux-opt/vpux-opt.cpp @@ -5,6 +5,7 @@ #include "vpux/compiler/NPU37XX/dialect/VPURT/transforms/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/ELF/passes.hpp" +#include "vpux/compiler/NPU40XX/dialect/NPUReg40XX/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/VPU/transforms/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/VPUIP/transforms/passes.hpp" #include "vpux/compiler/NPU40XX/dialect/VPURT/transforms/passes.hpp" @@ -33,6 +34,7 @@ #include "vpux/utils/core/error.hpp" +#include #include #include #include @@ -78,6 +80,7 @@ int main(int argc, char* argv[]) { vpux::ELF::registerPasses(); vpux::VPUMI37XX::registerPasses(); vpux::VPUMI40XX::registerPasses(); + vpux::NPUReg40XX::registerPasses(); vpux::VPUASM::registerPasses(); vpux::VPUIPDPU::registerPasses(); vpux::ShaveCodeGen::registerPasses(); @@ -89,6 +92,7 @@ int main(int argc, char* argv[]) { mlir::memref::registerResolveShapedTypeResultDims(); mlir::registerLinalgPasses(); mlir::memref::registerExpandStridedMetadataPass(); + mlir::registerArithToLLVMConversionPass(); return mlir::asMainReturnCode( mlir::MlirOptMain(argc, argv, "NPU Optimizer Testing Tool", registry, hwSpecificRegistration)); diff --git a/tools/vpux-translate/vpux-translate.cpp b/tools/vpux-translate/vpux-translate.cpp index 3807dcec96..7043e6425c 100644 --- a/tools/vpux-translate/vpux-translate.cpp +++ b/tools/vpux-translate/vpux-translate.cpp @@ -7,8 +7,8 @@ #include "vpux/compiler/act_kernels/shave_binary_resources.h" #include "vpux/compiler/dialect/ELFNPU37XX/export.hpp" #include "vpux/compiler/dialect/ELFNPU37XX/import.hpp" -#include "vpux/compiler/dialect/VPU/IR/attributes.hpp" #include "vpux/compiler/dialect/config/IR/attributes.hpp" +#include "vpux/compiler/dialect/config/IR/utils.hpp" 
#include "vpux/compiler/frontend/IE.hpp" #include "vpux/compiler/init.hpp" #include "vpux/compiler/interfaces_registry.hpp" @@ -76,6 +76,10 @@ llvm::cl::opt dynamicShapeToStatic{ "like this: tensor<1x?x3xf32, {bounds = [1, 18, 3], ..}>."), llvm::cl::init(false)}; +llvm::cl::opt enableWeightsSeparationPath{ + "weights-separation-path", + llvm::cl::desc("Disables constants folding for more \"Const->Convert->{Op}\" patterns"), llvm::cl::init(false)}; + enum class NetworkIOType { INPUT, OUTPUT }; // @@ -140,8 +144,15 @@ mlir::OwningOpRef importIE(llvm::SourceMgr& sourceMgr, mlir::MLI // constants in MLIR protects the code from use-after-free errors. constexpr bool useSharedConstants = false; - module = IE::importNetwork(ctx, model, IE::buildOVParams(model), IE::buildOVResults(model), useSharedConstants, - rootTiming, vpuxProfiling, enableDummyOpReplacement, dynamicShapeToStatic); + IE::ImportNetworkConfig importCfg; + importCfg.sharedConstants = useSharedConstants; + importCfg.enableProfiling = vpuxProfiling; + importCfg.stubLayers = enableDummyOpReplacement; + importCfg.dynamicShapeToStatic = dynamicShapeToStatic; + importCfg.enableWeightsSeparationPath = enableWeightsSeparationPath; + + module = IE::importNetwork(ctx, model, IE::buildOVParams(model), IE::buildOVResults(model), rootTiming, + importCfg); } catch (const std::exception& ex) { printTo(llvm::errs(), "Failed to translate IE IR {0} to MLIR : {1}", netFileName, ex.what()); return nullptr; @@ -193,16 +204,16 @@ mlir::LogicalResult exportELF(mlir::ModuleOp module, llvm::raw_ostream& output) mlir::DefaultTimingManager tm; - auto arch = VPU::getArch(module.getOperation()); + auto arch = config::getArch(module.getOperation()); - if (arch == VPU::ArchKind::NPU37XX) { + if (arch == config::ArchKind::NPU37XX) { const auto buf = ELFNPU37XX::exportToELF(module); output.write(reinterpret_cast(buf.data()), buf.size()); - } else if (arch >= VPU::ArchKind::NPU40XX) { + } else if (arch >= config::ArchKind::NPU40XX) { const 
auto buf = ELF::exportToELF(module); output.write(reinterpret_cast(buf.data()), buf.size()); } else { - VPUX_THROW("ELF Flow not supported for ARCH {0}", VPU::stringifyArchKind(arch)); + VPUX_THROW("ELF Flow not supported for ARCH {0}", config::stringifyArchKind(arch)); } return mlir::success(); diff --git a/validation/linux_npu_driver_config.json b/validation/linux_npu_driver_config.json index 8cc8c5926a..bde6b9558d 100644 --- a/validation/linux_npu_driver_config.json +++ b/validation/linux_npu_driver_config.json @@ -1,3 +1,3 @@ { - "intel": "d287bf6f72a56e843208be20d8207a4bf05bc69e" + "intel": "d7726ce164bd1f41c16e491415bcc2080a46a420" } diff --git a/validation/openvino_config.json b/validation/openvino_config.json index f76427118a..23f3e0d05d 100644 --- a/validation/openvino_config.json +++ b/validation/openvino_config.json @@ -1,3 +1,3 @@ { - "openvinotoolkit": "dd611339928e5637b1ea43d9557a88ac1b938060" + "openvinotoolkit": "d97acfdce00ea5229e4c2d0ab03256ce0dff0a68" }