Pass Failed: AffineToPipeline

Hi there, I’m trying to use the CIRCT project as an HLS tool. I’ve written a Gaussian filter function in C:

#include <stdint.h>

#define COLS            20
#define ROWS            20
#define KERN_SIZE       3
#define KERN_RAD        (KERN_SIZE >> 1)
#define PIXELS          (COLS * ROWS)
#define MAX(x, y)       (((x) > (y)) ? (x) : (y))
#define MIN(x, y)       (((x) < (y)) ? (x) : (y))
void gauss(uint8_t input[PIXELS], uint8_t output[PIXELS])
{
    // Gauss kernel
    int16_t kernel[KERN_SIZE][KERN_SIZE] = {
        {1, 2, 1},
        {2, 4, 2},
        {1, 2, 1}
    };
    for (int32_t y = 0; y < ROWS; y++) {
        for (int32_t x = 0; x < COLS; x++) {
            // temp var to store the result
            int16_t sum = 0;
            for (int32_t i = -KERN_RAD; i <= KERN_RAD; i++) {
                for (int32_t j = -KERN_RAD; j <= KERN_RAD; j++) {
                    int32_t yi = y + i;
                    int32_t xj = x + j;
                    if (xj >= 0 && xj < COLS && yi >= 0 && yi < ROWS)
                        sum +=
                            (int16_t)(input[yi * COLS + xj] * kernel[i + KERN_RAD][j + KERN_RAD]);
                }
            }
            // Normalize and store
            output[y * COLS + x] = (uint8_t)(MAX(MIN(255, sum >> 4), 0));
        }
    }
}

Then I used Polygeist to translate it into MLIR.
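The frontend invocation was along these lines (a sketch; cgeist is Polygeist’s C frontend driver, older versions call it mlir-clang, and the exact flags vary by version):

cgeist gauss.c -function=gauss -S -o gauss.mlir

The resulting MLIR: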

#set = affine_set<(d0, d1, d2, d3) : (-d0 - d1 + 19 >= 0, d0 + d1 >= 0, d2 + d3 >= 0, -d2 - d3 + 19 >= 0)>
module {
  func.func @gauss(%arg0: memref<400xi8>, %arg1: memref<400xi8>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %true = arith.constant true
    %c4_i32 = arith.constant 4 : i32
    %c255_i32 = arith.constant 255 : i32
    %c0_i16 = arith.constant 0 : i16
    %c0_i32 = arith.constant 0 : i32
    %c4_i16 = arith.constant 4 : i16
    %c2_i16 = arith.constant 2 : i16
    %c1_i16 = arith.constant 1 : i16
    %alloca = memref.alloca() : memref<3x3xi16>
    affine.store %c1_i16, %alloca[0, 0] : memref<3x3xi16>
    affine.store %c2_i16, %alloca[0, 1] : memref<3x3xi16>
    affine.store %c1_i16, %alloca[0, 2] : memref<3x3xi16>
    affine.store %c2_i16, %alloca[1, 0] : memref<3x3xi16>
    affine.store %c4_i16, %alloca[1, 1] : memref<3x3xi16>
    affine.store %c2_i16, %alloca[1, 2] : memref<3x3xi16>
    affine.store %c1_i16, %alloca[2, 0] : memref<3x3xi16>
    affine.store %c2_i16, %alloca[2, 1] : memref<3x3xi16>
    affine.store %c1_i16, %alloca[2, 2] : memref<3x3xi16>
    affine.for %arg2 = 0 to 20 {
      affine.for %arg3 = 0 to 20 {
        %0 = affine.for %arg4 = -1 to 2 iter_args(%arg5 = %c0_i16) -> (i16) {
          %7 = affine.for %arg6 = -1 to 2 iter_args(%arg7 = %arg5) -> (i16) {
            %8 = affine.if #set(%arg2, %arg4, %arg3, %arg6) -> i16 {
              %9 = affine.load %arg0[%arg4 * 20 + %arg3 + %arg6 + %arg2 * 20] : memref<400xi8>
              %10 = arith.extsi %9 : i8 to i32
              %11 = affine.load %alloca[%arg4 + 1, %arg6 + 1] : memref<3x3xi16>
              %12 = arith.extsi %11 : i16 to i32
              %13 = arith.muli %10, %12 : i32
              %14 = arith.trunci %13 : i32 to i16
              %15 = arith.addi %arg7, %14 : i16
              affine.yield %15 : i16
            } else {
              affine.yield %arg7 : i16
            }
            affine.yield %8 : i16
          }
          affine.yield %7 : i16
        }
        %1 = arith.extsi %0 : i16 to i32
        %2 = arith.shrsi %1, %c4_i32 : i32
        %3 = arith.cmpi sgt, %2, %c255_i32 : i32
        %4 = scf.if %3 -> (i1) {
          scf.yield %true : i1
        } else {
          %7 = arith.cmpi sgt, %2, %c0_i32 : i32
          scf.yield %7 : i1
        }
        %5 = scf.if %4 -> (i32) {
          %7 = arith.select %3, %c255_i32, %2 : i32
          scf.yield %7 : i32
        } else {
          scf.yield %c0_i32 : i32
        }
        %6 = arith.trunci %5 : i32 to i8
        affine.store %6, %arg1[%arg3 + %arg2 * 20] : memref<400xi8>
      }
    }
    return
  }
}

As a first step, I’m trying the --convert-affine-to-pipeline pass, which I’ve successfully tested with other simple loop functions. But with this program I get this error backtrace:

circt-opt: /home/bi/Desktop/circt/llvm/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp:643: mlir::DependenceResult mlir::checkMemrefAccessDependence(const mlir::MemRefAccess &, const mlir::MemRefAccess &, unsigned int, mlir::FlatAffineValueConstraints *, SmallVector<mlir::DependenceComponent, 2> *, bool): Assertion `loopDepth <= numCommonLoops + 1' failed.
PLEASE submit a bug report to https://github.com/llvm/circt and include the crash backtrace.
Stack dump:
0.	Program arguments: /home/bi/Desktop/circt/build/bin/circt-opt gauss.mlir -convert-affine-to-pipeline
 #0 0x0000560a65fea36d llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:567:11
 #1 0x0000560a65fea7fb PrintStackTraceSignalHandler(void*) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:641:1
 #2 0x0000560a65fe8b76 llvm::sys::RunSignalHandlers() /home/bi/Desktop/circt/llvm/llvm/lib/Support/Signals.cpp:104:5
 #3 0x0000560a65feaf25 SignalHandler(int) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:412:1
 #4 0x00007fbecfe3e420 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x14420)
 #5 0x00007fbecf8d100b raise /build/glibc-SzIz7B/glibc-2.31/signal/../sysdeps/unix/sysv/linux/raise.c:51:1
 #6 0x00007fbecf8b0859 abort /build/glibc-SzIz7B/glibc-2.31/stdlib/abort.c:81:7
 #7 0x00007fbecf8b0729 get_sysdep_segment_value /build/glibc-SzIz7B/glibc-2.31/intl/loadmsgcat.c:509:8
 #8 0x00007fbecf8b0729 _nl_load_domain /build/glibc-SzIz7B/glibc-2.31/intl/loadmsgcat.c:970:34
 #9 0x00007fbecf8c1fd6 (/lib/x86_64-linux-gnu/libc.so.6+0x33fd6)
#10 0x0000560a67d850b8 mlir::checkMemrefAccessDependence(mlir::MemRefAccess const&, mlir::MemRefAccess const&, unsigned int, mlir::FlatAffineValueConstraints*, llvm::SmallVector<mlir::DependenceComponent, 2u>*, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp:644:17
#11 0x0000560a67b528de checkMemrefDependence(llvm::SmallVectorImpl<mlir::Operation*>&, unsigned int, llvm::DenseMap<mlir::Operation*, llvm::SmallVector<circt::analysis::MemoryDependence, 1u>, llvm::DenseMapInfo<mlir::Operation*, void>, llvm::detail::DenseMapPair<mlir::Operation*, llvm::SmallVector<circt::analysis::MemoryDependence, 1u>>>&) /home/bi/Desktop/circt/lib/Analysis/DependenceAnalysis.cpp:46:33
#12 0x0000560a67b526ef circt::analysis::MemoryDependenceAnalysis::MemoryDependenceAnalysis(mlir::Operation*) /home/bi/Desktop/circt/lib/Analysis/DependenceAnalysis.cpp:142:65
#13 0x0000560a65ffa004 mlir::detail::AnalysisModel<circt::analysis::MemoryDependenceAnalysis>::AnalysisModel<mlir::func::FuncOp&>(mlir::func::FuncOp&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/AnalysisManager.h:120:48
#14 0x0000560a65ff9f7e std::_MakeUniq<mlir::detail::AnalysisModel<circt::analysis::MemoryDependenceAnalysis>>::__single_object std::make_unique<mlir::detail::AnalysisModel<circt::analysis::MemoryDependenceAnalysis>, mlir::func::FuncOp&>(mlir::func::FuncOp&) /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/unique_ptr.h:962:34
#15 0x0000560a65ff7147 auto mlir::detail::AnalysisMap::constructAnalysis<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp, (void*)0>(mlir::AnalysisManager&, mlir::func::FuncOp) /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/AnalysisManager.h:233:12
#16 0x0000560a65ff6c36 circt::analysis::MemoryDependenceAnalysis& mlir::detail::AnalysisMap::getAnalysisImpl<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp>(mlir::PassInstrumentor*, mlir::func::FuncOp, mlir::AnalysisManager&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/AnalysisManager.h:211:27
#17 0x0000560a65ff6b15 std::enable_if<std::is_constructible<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp>::value || std::is_constructible<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp, mlir::AnalysisManager&>::value, circt::analysis::MemoryDependenceAnalysis&>::type mlir::detail::AnalysisMap::getAnalysis<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp>(mlir::PassInstrumentor*, mlir::AnalysisManager&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/AnalysisManager.h:167:5
#18 0x0000560a65ff69a4 circt::analysis::MemoryDependenceAnalysis& mlir::AnalysisManager::getAnalysis<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp>() /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/AnalysisManager.h:326:5
#19 0x0000560a65ff6932 circt::analysis::MemoryDependenceAnalysis& mlir::Pass::getAnalysis<circt::analysis::MemoryDependenceAnalysis, mlir::func::FuncOp>() /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/Pass.h:225:5
#20 0x0000560a65ff64b5 circt::analysis::MemoryDependenceAnalysis& mlir::OperationPass<mlir::func::FuncOp>::getAnalysis<circt::analysis::MemoryDependenceAnalysis>() /home/bi/Desktop/circt/llvm/mlir/include/mlir/Pass/Pass.h:373:5
#21 0x0000560a65ff0d4f (anonymous namespace)::AffineToPipeline::runOnOperation() /home/bi/Desktop/circt/lib/Conversion/AffineToPipeline/AffineToPipeline.cpp:68:29
#22 0x0000560a6888545a mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:471:21
#23 0x0000560a68885a74 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:534:16
#24 0x0000560a6888b058 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14::operator()(mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo&) const /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:754:36
#25 0x0000560a6888acc9 mlir::LogicalResult mlir::failableParallelForEach<__gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&>(mlir::MLIRContext*, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/IR/Threading.h:46:18
#26 0x0000560a68886d53 mlir::LogicalResult mlir::failableParallelForEach<std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&>(mlir::MLIRContext*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/IR/Threading.h:92:10
#27 0x0000560a688865f7 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:764:14
#28 0x0000560a68885727 mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:655:5
#29 0x0000560a6888544b mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:468:5
#30 0x0000560a68885a74 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:534:16
#31 0x0000560a6888749c mlir::PassManager::runPasses(mlir::Operation*, mlir::AnalysisManager) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:843:10
#32 0x0000560a688873b2 mlir::PassManager::run(mlir::Operation*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:823:60
#33 0x0000560a67a36c6f performActions(llvm::raw_ostream&, bool, bool, std::shared_ptr<llvm::SourceMgr> const&, mlir::MLIRContext*, llvm::function_ref<mlir::LogicalResult (mlir::PassManager&)>, bool, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:90:17
#34 0x0000560a67a368ff processBuffer(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, bool, bool, bool, bool, bool, bool, llvm::function_ref<mlir::LogicalResult (mlir::PassManager&)>, mlir::DialectRegistry&, llvm::ThreadPool*) /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:138:12
#35 0x0000560a67a366b8 mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::function_ref<mlir::LogicalResult (mlir::PassManager&)>, mlir::DialectRegistry&, bool, bool, bool, bool, bool, bool, bool)::$_0::operator()(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) const /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:180:12
#36 0x0000560a67a365ad mlir::LogicalResult llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>::callback_fn<mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::function_ref<mlir::LogicalResult (mlir::PassManager&)>, mlir::DialectRegistry&, bool, bool, bool, bool, bool, bool, bool)::$_0>(long, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) /home/bi/Desktop/circt/llvm/llvm/include/llvm/ADT/STLFunctionalExtras.h:45:12
#37 0x0000560a67a798d9 llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>::operator()(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) const /home/bi/Desktop/circt/llvm/llvm/include/llvm/ADT/STLFunctionalExtras.h:68:12
#38 0x0000560a67a78eb5 mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>, llvm::raw_ostream&, bool, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Support/ToolUtilities.cpp:28:12
#39 0x0000560a67a355d3 mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::function_ref<mlir::LogicalResult (mlir::PassManager&)>, mlir::DialectRegistry&, bool, bool, bool, bool, bool, bool, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:185:10
#40 0x0000560a67a3576f mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, mlir::PassPipelineCLParser const&, mlir::DialectRegistry&, bool, bool, bool, bool, bool, bool, bool, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:208:10
#41 0x0000560a67a363b1 mlir::MlirOptMain(int, char**, llvm::StringRef, mlir::DialectRegistry&, bool) /home/bi/Desktop/circt/llvm/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:305:14
#42 0x0000560a65ea774b main /home/bi/Desktop/circt/tools/circt-opt/circt-opt.cpp:68:23
#43 0x00007fbecf8b2083 __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:342:3
#44 0x0000560a65ea75ae _start (/home/bi/Desktop/circt/build/bin/circt-opt+0x18915ae)
[1]    69702 abort (core dumped)  /home/bi/Desktop/circt/build/bin/circt-opt gauss.mlir 

I don’t really understand how this is happening. Could anyone tell me what causes this problem?

I’ve also tried first using mlir-opt with --lower-affine and --convert-scf-to-cf to convert everything into dialects that CIRCT supports.
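Concretely, that is:

mlir-opt gauss.mlir --lower-affine --convert-scf-to-cf

The result: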

module {
  func.func @gauss(%arg0: memref<400xi8>, %arg1: memref<400xi8>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %true = arith.constant true
    %c4_i32 = arith.constant 4 : i32
    %c255_i32 = arith.constant 255 : i32
    %c0_i16 = arith.constant 0 : i16
    %c0_i32 = arith.constant 0 : i32
    %c4_i16 = arith.constant 4 : i16
    %c2_i16 = arith.constant 2 : i16
    %c1_i16 = arith.constant 1 : i16
    %alloca = memref.alloca() : memref<3x3xi16>
    %c0 = arith.constant 0 : index
    %c0_0 = arith.constant 0 : index
    memref.store %c1_i16, %alloca[%c0, %c0_0] : memref<3x3xi16>
    %c0_1 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    memref.store %c2_i16, %alloca[%c0_1, %c1] : memref<3x3xi16>
    %c0_2 = arith.constant 0 : index
    %c2 = arith.constant 2 : index
    memref.store %c1_i16, %alloca[%c0_2, %c2] : memref<3x3xi16>
    %c1_3 = arith.constant 1 : index
    %c0_4 = arith.constant 0 : index
    memref.store %c2_i16, %alloca[%c1_3, %c0_4] : memref<3x3xi16>
    %c1_5 = arith.constant 1 : index
    %c1_6 = arith.constant 1 : index
    memref.store %c4_i16, %alloca[%c1_5, %c1_6] : memref<3x3xi16>
    %c1_7 = arith.constant 1 : index
    %c2_8 = arith.constant 2 : index
    memref.store %c2_i16, %alloca[%c1_7, %c2_8] : memref<3x3xi16>
    %c2_9 = arith.constant 2 : index
    %c0_10 = arith.constant 0 : index
    memref.store %c1_i16, %alloca[%c2_9, %c0_10] : memref<3x3xi16>
    %c2_11 = arith.constant 2 : index
    %c1_12 = arith.constant 1 : index
    memref.store %c2_i16, %alloca[%c2_11, %c1_12] : memref<3x3xi16>
    %c2_13 = arith.constant 2 : index
    %c2_14 = arith.constant 2 : index
    memref.store %c1_i16, %alloca[%c2_13, %c2_14] : memref<3x3xi16>
    %c0_15 = arith.constant 0 : index
    %c20 = arith.constant 20 : index
    %c1_16 = arith.constant 1 : index
    cf.br ^bb1(%c0_15 : index)
  ^bb1(%0: index):  // 2 preds: ^bb0, ^bb23
    %1 = arith.cmpi slt, %0, %c20 : index
    cf.cond_br %1, ^bb2, ^bb24
  ^bb2:  // pred: ^bb1
    %c0_17 = arith.constant 0 : index
    %c20_18 = arith.constant 20 : index
    %c1_19 = arith.constant 1 : index
    cf.br ^bb3(%c0_17 : index)
  ^bb3(%2: index):  // 2 preds: ^bb2, ^bb22
    %3 = arith.cmpi slt, %2, %c20_18 : index
    cf.cond_br %3, ^bb4, ^bb23
  ^bb4:  // pred: ^bb3
    %c-1 = arith.constant -1 : index
    %c2_20 = arith.constant 2 : index
    %c1_21 = arith.constant 1 : index
    cf.br ^bb5(%c-1, %c0_i16 : index, i16)
  ^bb5(%4: index, %5: i16):  // 2 preds: ^bb4, ^bb13
    %6 = arith.cmpi slt, %4, %c2_20 : index
    cf.cond_br %6, ^bb6, ^bb14
  ^bb6:  // pred: ^bb5
    %c-1_22 = arith.constant -1 : index
    %c2_23 = arith.constant 2 : index
    %c1_24 = arith.constant 1 : index
    cf.br ^bb7(%c-1_22, %5 : index, i16)
  ^bb7(%7: index, %8: i16):  // 2 preds: ^bb6, ^bb12
    %9 = arith.cmpi slt, %7, %c2_23 : index
    cf.cond_br %9, ^bb8, ^bb13
  ^bb8:  // pred: ^bb7
    %c0_25 = arith.constant 0 : index
    %c-1_26 = arith.constant -1 : index
    %10 = arith.muli %0, %c-1_26 : index
    %c-1_27 = arith.constant -1 : index
    %11 = arith.muli %4, %c-1_27 : index
    %12 = arith.addi %10, %11 : index
    %c19 = arith.constant 19 : index
    %13 = arith.addi %12, %c19 : index
    %14 = arith.cmpi sge, %13, %c0_25 : index
    %15 = arith.addi %0, %4 : index
    %16 = arith.cmpi sge, %15, %c0_25 : index
    %17 = arith.andi %14, %16 : i1
    %18 = arith.addi %2, %7 : index
    %19 = arith.cmpi sge, %18, %c0_25 : index
    %20 = arith.andi %17, %19 : i1
    %c-1_28 = arith.constant -1 : index
    %21 = arith.muli %2, %c-1_28 : index
    %c-1_29 = arith.constant -1 : index
    %22 = arith.muli %7, %c-1_29 : index
    %23 = arith.addi %21, %22 : index
    %c19_30 = arith.constant 19 : index
    %24 = arith.addi %23, %c19_30 : index
    %25 = arith.cmpi sge, %24, %c0_25 : index
    %26 = arith.andi %20, %25 : i1
    cf.cond_br %26, ^bb9, ^bb10
  ^bb9:  // pred: ^bb8
    %c20_31 = arith.constant 20 : index
    %27 = arith.muli %4, %c20_31 : index
    %28 = arith.addi %27, %2 : index
    %29 = arith.addi %28, %7 : index
    %c20_32 = arith.constant 20 : index
    %30 = arith.muli %0, %c20_32 : index
    %31 = arith.addi %29, %30 : index
    %32 = memref.load %arg0[%31] : memref<400xi8>
    %33 = arith.extsi %32 : i8 to i32
    %c1_33 = arith.constant 1 : index
    %34 = arith.addi %4, %c1_33 : index
    %c1_34 = arith.constant 1 : index
    %35 = arith.addi %7, %c1_34 : index
    %36 = memref.load %alloca[%34, %35] : memref<3x3xi16>
    %37 = arith.extsi %36 : i16 to i32
    %38 = arith.muli %33, %37 : i32
    %39 = arith.trunci %38 : i32 to i16
    %40 = arith.addi %8, %39 : i16
    cf.br ^bb11(%40 : i16)
  ^bb10:  // pred: ^bb8
    cf.br ^bb11(%8 : i16)
  ^bb11(%41: i16):  // 2 preds: ^bb9, ^bb10
    cf.br ^bb12
  ^bb12:  // pred: ^bb11
    %42 = arith.addi %7, %c1_24 : index
    cf.br ^bb7(%42, %41 : index, i16)
  ^bb13:  // pred: ^bb7
    %43 = arith.addi %4, %c1_21 : index
    cf.br ^bb5(%43, %8 : index, i16)
  ^bb14:  // pred: ^bb5
    %44 = arith.extsi %5 : i16 to i32
    %45 = arith.shrsi %44, %c4_i32 : i32
    %46 = arith.cmpi sgt, %45, %c255_i32 : i32
    cf.cond_br %46, ^bb15, ^bb16
  ^bb15:  // pred: ^bb14
    cf.br ^bb17(%true : i1)
  ^bb16:  // pred: ^bb14
    %47 = arith.cmpi sgt, %45, %c0_i32 : i32
    cf.br ^bb17(%47 : i1)
  ^bb17(%48: i1):  // 2 preds: ^bb15, ^bb16
    cf.br ^bb18
  ^bb18:  // pred: ^bb17
    cf.cond_br %48, ^bb19, ^bb20
  ^bb19:  // pred: ^bb18
    %49 = arith.select %46, %c255_i32, %45 : i32
    cf.br ^bb21(%49 : i32)
  ^bb20:  // pred: ^bb18
    cf.br ^bb21(%c0_i32 : i32)
  ^bb21(%50: i32):  // 2 preds: ^bb19, ^bb20
    cf.br ^bb22
  ^bb22:  // pred: ^bb21
    %51 = arith.trunci %50 : i32 to i8
    %c20_35 = arith.constant 20 : index
    %52 = arith.muli %0, %c20_35 : index
    %53 = arith.addi %2, %52 : index
    memref.store %51, %arg1[%53] : memref<400xi8>
    %54 = arith.addi %2, %c1_19 : index
    cf.br ^bb3(%54 : index)
  ^bb23:  // pred: ^bb3
    %55 = arith.addi %0, %c1_16 : index
    cf.br ^bb1(%55 : index)
  ^bb24:  // pred: ^bb1
    return
  }
}

Then I tried the --lower-std-to-handshake pass to lower this further.
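The invocation is along these lines (gauss_cf.mlir is a hypothetical name for a file holding the cf-level IR above):

circt-opt gauss_cf.mlir --lower-std-to-handshake

But now it causes another error: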

<stdin>:11:15: error: memref's must be both statically sized and unidimensional.
    %alloca = memref.alloca() : memref<3x3xi16>

Could anyone tell me why only 1-D memrefs are supported in the project?

I’ve also tried using a 1-D array instead, with the same steps as above. That produced another error, because the handshake.br ops derived from cf.br cannot be lowered.

And another question: doesn’t the --convert-affine-to-pipeline pass support nested loops? I’ve got the following code, generated by Polygeist:

module {
  func.func @matmul(%arg0: memref<9xi32>, %arg1: memref<9xi32>, %arg2: memref<9xi32>) {
    affine.for %arg3 = 0 to 3 {
      affine.for %arg4 = 0 to 3 {
        affine.for %arg5 = 0 to 3 {
          %0 = affine.load %arg0[%arg5 + %arg3 * 3] : memref<9xi32>
          %1 = affine.load %arg1[%arg5 + %arg4 * 3] : memref<9xi32>
          %2 = arith.muli %0, %1 : i32
          %3 = affine.load %arg2[%arg4 + %arg3 * 3] : memref<9xi32>
          %4 = arith.addi %3, %2 : i32
          affine.store %4, %arg2[%arg4 + %arg3 * 3] : memref<9xi32>
        }
      }
    }
    return
  }
}

After running the pass I get this:

module {
  func.func @matmul(%arg0: memref<9xi32>, %arg1: memref<9xi32>, %arg2: memref<9xi32>) {
    affine.for %arg3 = 0 to 3 {
      affine.for %arg4 = 0 to 3 {
        affine.for %arg5 = 0 to 3 {
          %c3 = arith.constant 3 : index
          %0 = arith.muli %arg3, %c3 : index
          %1 = arith.addi %arg5, %0 : index
          %2 = memref.load %arg0[%1] : memref<9xi32>
          %c3_0 = arith.constant 3 : index
          %3 = arith.muli %arg4, %c3_0 : index
          %4 = arith.addi %arg5, %3 : index
          %5 = memref.load %arg1[%4] : memref<9xi32>
          %6 = arith.muli %2, %5 : i32
          %c3_1 = arith.constant 3 : index
          %7 = arith.muli %arg3, %c3_1 : index
          %8 = arith.addi %arg4, %7 : index
          %9 = memref.load %arg2[%8] : memref<9xi32>
          %10 = arith.addi %9, %6 : i32
          %c3_2 = arith.constant 3 : index
          %11 = arith.muli %arg3, %c3_2 : index
          %12 = arith.addi %arg4, %11 : index
          memref.store %10, %arg2[%12] : memref<9xi32>
        }
      }
    }
    return
  }
}

The loops aren’t converted at all, though the index arithmetic inside has already been partially lowered.

I think the pipeline pass does not support nested loops ([AffineToStaticLogic] Support iteratively lowering nested loops. · Issue #2659 · llvm/circt · GitHub). It might be possible to do affine → scf → Calyx → (Calyx native compiler) → Verilog if you need something now, but of course the downside is that you don’t get any pipelining.

@mortbopet and @mikeurbach might have more suggestions

Yeah, that’s what I thought. Though it seems scf.for isn’t supported for lowering into Calyx. :joy:

Oh, weird! If there is no issue about this already, can you open one in the CIRCT repo?

There are a few things you’re trying, which are at different levels of completeness.

-convert-affine-to-pipeline

This is probably the least fleshed out pass. I added it with very basic support, and others have recently been improving it.

This pass supports affine, but does not yet support nested loops: circt/AffineToPipeline.cpp at 535def1265e7ed406a1dc2cbfbc484d540f6e4e4 · llvm/circt · GitHub. We should give a nice error message stating that, rather than crashing the compiler in an obscure way.

We actually have a couple of open issues about ways to handle nested loops.

-lower-scf-to-calyx

This pass is more mature, and got a lot of work from @mortbopet. The pass that converts pipeline to Calyx also uses a lot of the same infrastructure.

This pass does support scf loops, but you need to convert scf.for into scf.while first. There is a pass in upstream MLIR to do this, called -scf-for-to-while. There are some examples in this test: circt/convert_controlflow.mlir at main · llvm/circt · GitHub
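For example, for your matmul, chaining the two looks like:

mlir-opt matmul.mlir -lower-affine -scf-for-to-while | circt-opt -lower-scf-to-calyx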

-lower-std-to-handshake

This pass is probably the most mature, and has seen a lot of love from multiple people.

This pass supports cf, including nested loops. It does require memrefs to be one dimensional, which is closer to how they look in hardware. There is a -flatten-memref pass to convert them automatically.
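For example (a sketch; input.mlir stands in for your lowered IR):

circt-opt input.mlir -flatten-memref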


If you just want to see something work end-to-end for HLS, I’d recommend the handshake (dynamically scheduled) flow. If you’re interested in statically timed designs, we can always use more use cases to drive the implementation of the AffineToPipeline work.

For the end-to-end dynamically scheduled flow, instead of running the passes individually, I would recommend using the hlstool binary. There are many other passes to run besides -lower-std-to-handshake. hlstool will consume MLIR in the affine, scf, or cf dialect, and take them through the whole flow to System Verilog.

I tried it on your gauss example, and it seems something about the IR caused -flatten-memref to fall over, so we’ll have to improve that pass. For the simple matmul, it works:

hlstool -dynamic-hw matmul.mlir

The -dynamic-hw flag means to use the handshake (dynamically scheduled) flow, with lowering through the HW dialect. There are a number of options and knobs to tune in the generated hardware, which can be controlled through the hlstool CLI.

We don’t have support in hlstool for running through the -convert-affine-to-pipeline or -lower-scf-to-calyx flows, but those would be good to add, so you have a one-stop shop for trying different HLS flows.

Thanks for pushing on this and let us know if you have any other questions. A lot of this work is driven by specific use cases, so finding out where your use case is not met will help us know what to improve.

Oh, thanks for the detailed explanation! I’ll look into those ;)

I’m wondering why there are so many differences between the Vivado 2019.1 implementation results for the two System Verilog files: the one from the flow affine -> scf -> calyx -> calyx_native -> hw/sv/comb -> verilog (first image) and the one from the one-step hlstool (second image), in the case of matmul.
(two screenshots: Vivado utilization reports, one per flow)
The first one seems incorrect, since only IOs are used. I mean, how could that be possible… though it may be because I used circt-opt wrongly. The second seems more reasonable to me, but it still uses a lot of IOs, so there must be something that could be optimized. I’m using the xc7z020clg484-1, and these are just some thoughts; I haven’t dug deeply into the code yet.

I’ve also tried the gauss function with hlstool. There is always this error:

hlstool: /home/bi/Desktop/circt/llvm/mlir/lib/IR/PatternMatch.cpp:276: virtual void mlir::RewriterBase::eraseOp(mlir::Operation *): Assertion `op->use_empty() && "expected 'op' to have no uses"' failed.
PLEASE submit a bug report to https://github.com/llvm/circt and include the crash backtrace.
Stack dump:
0.	Program arguments: /home/bi/Desktop/circt/build/bin/hlstool gauss.mlir -dynamic-hw -debug-only=dialect-conversion -mlir-print-ir-after-all
 #0 0x0000561c0e76fb1d llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:567:11
 #1 0x0000561c0e76ffab PrintStackTraceSignalHandler(void*) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:641:1
 #2 0x0000561c0e76e326 llvm::sys::RunSignalHandlers() /home/bi/Desktop/circt/llvm/llvm/lib/Support/Signals.cpp:104:5
 #3 0x0000561c0e7706d5 SignalHandler(int) /home/bi/Desktop/circt/llvm/llvm/lib/Support/Unix/Signals.inc:412:1
 #4 0x00007f193c584420 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x14420)
 #5 0x00007f193c01700b raise /build/glibc-SzIz7B/glibc-2.31/signal/../sysdeps/unix/sysv/linux/raise.c:51:1
 #6 0x00007f193bff6859 abort /build/glibc-SzIz7B/glibc-2.31/stdlib/abort.c:81:7
 #7 0x00007f193bff6729 get_sysdep_segment_value /build/glibc-SzIz7B/glibc-2.31/intl/loadmsgcat.c:509:8
 #8 0x00007f193bff6729 _nl_load_domain /build/glibc-SzIz7B/glibc-2.31/intl/loadmsgcat.c:970:34
 #9 0x00007f193c007fd6 (/lib/x86_64-linux-gnu/libc.so.6+0x33fd6)
#10 0x0000561c0f406a34 mlir::RewriterBase::eraseOp(mlir::Operation*) /home/bi/Desktop/circt/llvm/mlir/lib/IR/PatternMatch.cpp:0:3
#11 0x0000561c0edc485a (anonymous namespace)::EliminateCBranchIntoMuxPattern::matchAndRewrite(circt::handshake::MuxOp, mlir::PatternRewriter&) const /home/bi/Desktop/circt/lib/Dialect/Handshake/HandshakeOps.cpp:349:12
#12 0x0000561c0edd84cb mlir::detail::OpOrInterfaceRewritePatternBase<circt::handshake::MuxOp>::matchAndRewrite(mlir::Operation*, mlir::PatternRewriter&) const /home/bi/Desktop/circt/llvm/mlir/include/mlir/IR/PatternMatch.h:330:12
#13 0x0000561c0faa2810 mlir::PatternApplicator::matchAndRewrite(mlir::Operation*, mlir::PatternRewriter&, llvm::function_ref<bool (mlir::Pattern const&)>, llvm::function_ref<void (mlir::Pattern const&)>, llvm::function_ref<mlir::LogicalResult (mlir::Pattern const&)>) /home/bi/Desktop/circt/llvm/mlir/lib/Rewrite/PatternApplicator.cpp:200:25
#14 0x0000561c0fa2ed53 (anonymous namespace)::GreedyPatternRewriteDriver::simplify(llvm::MutableArrayRef<mlir::Region>) /home/bi/Desktop/circt/llvm/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp:293:19
#15 0x0000561c0fa2e1ee mlir::applyPatternsAndFoldGreedily(llvm::MutableArrayRef<mlir::Region>, mlir::FrozenRewritePatternSet const&, mlir::GreedyRewriteConfig) /home/bi/Desktop/circt/llvm/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp:453:8
#16 0x0000561c0f57c005 mlir::applyPatternsAndFoldGreedily(mlir::Operation*, mlir::FrozenRewritePatternSet const&, mlir::GreedyRewriteConfig) /home/bi/Desktop/circt/llvm/mlir/include/mlir/Transforms/GreedyPatternRewriteDriver.h:86:10
#17 0x0000561c0f579cbc (anonymous namespace)::Canonicalizer::runOnOperation() /home/bi/Desktop/circt/llvm/mlir/lib/Transforms/Canonicalizer.cpp:61:9
#18 0x0000561c0fb0b99a mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:471:21
#19 0x0000561c0fb0bfb4 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:534:16
#20 0x0000561c0fb11598 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14::operator()(mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo&) const /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:754:36
#21 0x0000561c0fb11209 mlir::LogicalResult mlir::failableParallelForEach<__gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&>(mlir::MLIRContext*, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/IR/Threading.h:46:18
#22 0x0000561c0fb0d293 mlir::LogicalResult mlir::failableParallelForEach<std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&>(mlir::MLIRContext*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_14&) /home/bi/Desktop/circt/llvm/mlir/include/mlir/IR/Threading.h:92:10
#23 0x0000561c0fb0cb37 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:764:14
#24 0x0000561c0fb0bc67 mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:655:5
#25 0x0000561c0fb0b98b mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:468:5
#26 0x0000561c0fb0bfb4 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:534:16
#27 0x0000561c0fb0d9dc mlir::PassManager::runPasses(mlir::Operation*, mlir::AnalysisManager) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:843:10
#28 0x0000561c0fb0d8f2 mlir::PassManager::run(mlir::Operation*) /home/bi/Desktop/circt/llvm/mlir/lib/Pass/Pass.cpp:823:60
#29 0x0000561c0e645107 doHLSFlowDynamic(mlir::PassManager&, mlir::ModuleOp, std::optional<std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile>>>&) /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:385:17
#30 0x0000561c0e644bce processBuffer(mlir::MLIRContext&, mlir::TimingScope&, llvm::SourceMgr&, std::optional<std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile>>>&) /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:427:16
#31 0x0000561c0e644826 processInputSplit(mlir::MLIRContext&, mlir::TimingScope&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, std::optional<std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile>>>&) /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:449:12
#32 0x0000561c0e64469d processInput(mlir::MLIRContext&, mlir::TimingScope&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, std::optional<std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile>>>&) /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:465:12
#33 0x0000561c0e644554 executeHlstool(mlir::MLIRContext&) /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:500:14
#34 0x0000561c0e644356 main /home/bi/Desktop/circt/tools/hlstool/hlstool.cpp:556:17
#35 0x00007f193bff8083 __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:342:3
#36 0x0000561c0e64413e _start (/home/bi/Desktop/circt/build/bin/hlstool+0xe1d13e)
[1]    115161 abort (core dumped)  /home/bi/Desktop/circt/build/bin/hlstool gauss.mlir -dynamic-hw   2>&1 | 
       115162 done                 tee log.txt

All the functions I’ve tested get stuck here. I’ve checked the debug info, and it seems a few passes already run successfully, namely the following (sketched as a single circt-opt command after the list):
--lower-affine
--convert-scf-to-cf
--flatten-memref
--handshake-legalize-memrefs
--convert-scf-to-cf
--lower-std-to-handshake
--handshake-lower-extmem-to-hw
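Strung together as a single command, that sequence would be roughly (a sketch; hlstool runs these passes internally, so the exact options may differ):

circt-opt gauss.mlir --lower-affine --convert-scf-to-cf --flatten-memref --handshake-legalize-memrefs --convert-scf-to-cf --lower-std-to-handshake --handshake-lower-extmem-to-hw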
Then the error shows up. Sadly, I guess there is currently no way for me to HLS the gauss function at all, and the same goes for the other functions that hit this error.

The first flow, going through Calyx native and then back to CIRCT, is something we’ve added support for but haven’t thoroughly tested through synthesis on FPGA. Does the System Verilog simulate correctly? It’s possible the flow succeeds but the produced System Verilog somehow isn’t connected or functional, and it all optimizes away in synthesis.

All of this is work in progress, so it is normal to find situations our passes don’t support. When you encounter an error like this, please file an issue on GitHub - llvm/circt: Circuit IR Compilers and Tools with steps to reproduce the error. Like I said before, we are adding support as needed, so your examples are very helpful to find places support is missing.

When using the Calyx native compiler, you need to pass the --synthesis flag so it can generate synthesizable Verilog. By default, it assumes you’re going to run it through a simulator and therefore generates designs that might be optimized away.
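If you’re invoking the compiler via cargo, that is roughly (a sketch; exact flags may differ by version):

cargo run -- matmul.futil -b verilog --synthesis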

Can you open an issue in the Calyx repository with the input file so I can reproduce the error locally?

I don’t know about Vivado, but if you run what should be internal IP through Quartus it will assign all of the module ports to random device pins. So you not only end up with an obnoxious number of pins used (and in most non-trivial IP you run out of pins) but your design gets stretched out across the device. In Quartus, you’ve got to assign each and every port wire as a virtual pin (we call this a Virtual Pin Project – VPP). In Vivado, I think it’s called an out-of-context build.

If you’re already generating native calyx, then we provide a way to push the design through Vivado and get resource numbers without a whole lot of configuration. Again, happy to chat more if you need help making it work.

I tried with this flag too; the generated file is the same as before. I’ll open an issue about this.

Good idea! I’ll try this later.

Sounds fair, but IIRC OOC flow is set by default.

I haven’t tried to simulate the generated System Verilog files. But yeah, I’ll open an issue on GitHub about this later.

I’ve tried this, but I got an error saying:

⠇ synth-verilog.run_vivado[fud] ERROR: Utilization implementation file /tmp/tmpgyftljvf/out/FutilBuild.runs/impl_1/main_utilization_placed.rpt is missing
[fud] ERROR: Timing file /tmp/tmpgyftljvf/out/FutilBuild.runs/impl_1/main_timing_summary_routed.rpt is missing
{
  "uram": -1,
  "cell_lut1": 6,
  "cell_lut2": 324,
  "cell_lut3": 103,
  "cell_lut4": 75,
  "cell_lut5": 38,
  "cell_lut6": 257,
  "cell_fdre": 300
}

The original MLIR program is:

module {
  func.func @matmul(%arg0: memref<9xi32>, %arg1: memref<9xi32>, %arg2: memref<9xi32>) {
    affine.for %arg3 = 0 to 3 {
      affine.for %arg4 = 0 to 3 {
        affine.for %arg5 = 0 to 3 {
          %0 = affine.load %arg0[%arg5 + %arg3 * 3] : memref<9xi32>
          %1 = affine.load %arg1[%arg5 + %arg4 * 3] : memref<9xi32>
          %2 = arith.muli %0, %1 : i32
          %3 = affine.load %arg2[%arg4 + %arg3 * 3] : memref<9xi32>
          %4 = arith.addi %3, %2 : i32
          affine.store %4, %arg2[%arg4 + %arg3 * 3] : memref<9xi32>
        }
      }
    }
    return
  }
}

Then I used this command to generate the following native Calyx:

mlir-opt matmul.mlir -lower-affine -scf-for-to-while | circt-opt -lower-scf-to-calyx | circt-translate -export-calyx -o matmul.futil

import "primitives/core.futil";
import "primitives/binary_operators.futil";
component matmul<"toplevel"=1>(ext_mem0_read_data: 32, ext_mem0_done: 1, ext_mem1_read_data: 32, ext_mem1_done: 1, ext_mem2_read_data: 32, ext_mem2_done: 1, @clk clk: 1, @reset reset: 1, @go go: 1) -> (ext_mem0_write_data: 32, ext_mem0_addr0: 4, ext_mem0_write_en: 1, ext_mem1_write_data: 32, ext_mem1_addr0: 4, ext_mem1_write_en: 1, ext_mem2_write_data: 32, ext_mem2_addr0: 4, ext_mem2_write_en: 1, @done done: 1) {
  cells {
    std_slice_3 = std_slice(32, 4);
    std_slice_2 = std_slice(32, 4);
    std_slice_1 = std_slice(32, 4);
    std_slice_0 = std_slice(32, 4);
    std_add_7 = std_add(32);
    muli_4_reg = std_reg(32);
    std_mult_pipe_4 = std_mult_pipe(32);
    std_add_6 = std_add(32);
    load_0_reg = std_reg(32);
    std_add_5 = std_add(32);
    muli_3_reg = std_reg(32);
    std_mult_pipe_3 = std_mult_pipe(32);
    muli_2_reg = std_reg(32);
    std_mult_pipe_2 = std_mult_pipe(32);
    std_add_4 = std_add(32);
    muli_1_reg = std_reg(32);
    std_mult_pipe_1 = std_mult_pipe(32);
    std_add_3 = std_add(32);
    muli_0_reg = std_reg(32);
    std_mult_pipe_0 = std_mult_pipe(32);
    std_add_2 = std_add(32);
    std_slt_2 = std_slt(32);
    std_add_1 = std_add(32);
    std_slt_1 = std_slt(32);
    std_add_0 = std_add(32);
    std_slt_0 = std_slt(32);
    while_2_arg0_reg = std_reg(32);
    while_1_arg0_reg = std_reg(32);
    while_0_arg0_reg = std_reg(32);
  }
  wires {
    group assign_while_0_init_0 {
      while_0_arg0_reg.in = 32'd0;
      while_0_arg0_reg.write_en = 1'd1;
      assign_while_0_init_0[done] = while_0_arg0_reg.done;
    }
    group assign_while_1_init_0 {
      while_1_arg0_reg.in = 32'd0;
      while_1_arg0_reg.write_en = 1'd1;
      assign_while_1_init_0[done] = while_1_arg0_reg.done;
    }
    group assign_while_2_init_0 {
      while_2_arg0_reg.in = 32'd0;
      while_2_arg0_reg.write_en = 1'd1;
      assign_while_2_init_0[done] = while_2_arg0_reg.done;
    }
    comb group bb0_0 {
      std_slt_0.left = while_2_arg0_reg.out;
      std_slt_0.right = 32'd3;
    }
    comb group bb0_2 {
      std_slt_1.left = while_1_arg0_reg.out;
      std_slt_1.right = 32'd3;
    }
    comb group bb0_4 {
      std_slt_2.left = while_0_arg0_reg.out;
      std_slt_2.right = 32'd3;
    }
    group bb0_6 {
      std_mult_pipe_0.left = while_2_arg0_reg.out;
      std_mult_pipe_0.right = 32'd3;
      muli_0_reg.in = std_mult_pipe_0.out;
      muli_0_reg.write_en = std_mult_pipe_0.done;
      std_mult_pipe_0.go = 1'd1;
      bb0_6[done] = muli_0_reg.done;
    }
    group bb0_9 {
      std_mult_pipe_1.left = while_1_arg0_reg.out;
      std_mult_pipe_1.right = 32'd3;
      muli_1_reg.in = std_mult_pipe_1.out;
      muli_1_reg.write_en = std_mult_pipe_1.done;
      std_mult_pipe_1.go = 1'd1;
      bb0_9[done] = muli_1_reg.done;
    }
    group bb0_12 {
      std_slice_3.in = std_add_3.out;
      std_slice_2.in = std_add_4.out;
      std_mult_pipe_2.left = ext_mem0_read_data;
      std_mult_pipe_2.right = ext_mem1_read_data;
      muli_2_reg.in = std_mult_pipe_2.out;
      muli_2_reg.write_en = std_mult_pipe_2.done;
      std_mult_pipe_2.go = 1'd1;
      ext_mem0_addr0 = std_slice_3.out;
      std_add_3.left = while_0_arg0_reg.out;
      std_add_3.right = std_mult_pipe_0.out;
      ext_mem1_addr0 = std_slice_2.out;
      std_add_4.left = while_0_arg0_reg.out;
      std_add_4.right = std_mult_pipe_1.out;
      bb0_12[done] = muli_2_reg.done;
    }
    group bb0_13 {
      std_mult_pipe_3.left = while_2_arg0_reg.out;
      std_mult_pipe_3.right = 32'd3;
      muli_3_reg.in = std_mult_pipe_3.out;
      muli_3_reg.write_en = std_mult_pipe_3.done;
      std_mult_pipe_3.go = 1'd1;
      bb0_13[done] = muli_3_reg.done;
    }
    group bb0_15 {
      std_slice_1.in = std_add_5.out;
      ext_mem2_addr0 = std_slice_1.out;
      load_0_reg.in = ext_mem2_read_data;
      load_0_reg.write_en = 1'd1;
      std_add_5.left = while_1_arg0_reg.out;
      std_add_5.right = std_mult_pipe_3.out;
      bb0_15[done] = load_0_reg.done;
    }
    group bb0_17 {
      std_mult_pipe_4.left = while_2_arg0_reg.out;
      std_mult_pipe_4.right = 32'd3;
      muli_4_reg.in = std_mult_pipe_4.out;
      muli_4_reg.write_en = std_mult_pipe_4.done;
      std_mult_pipe_4.go = 1'd1;
      bb0_17[done] = muli_4_reg.done;
    }
    group bb0_19 {
      std_slice_0.in = std_add_7.out;
      ext_mem2_addr0 = std_slice_0.out;
      ext_mem2_write_data = std_add_6.out;
      ext_mem2_write_en = 1'd1;
      std_add_7.left = while_1_arg0_reg.out;
      std_add_7.right = std_mult_pipe_4.out;
      std_add_6.left = load_0_reg.out;
      std_add_6.right = std_mult_pipe_2.out;
      bb0_19[done] = ext_mem2_done;
    }
    group assign_while_0_latch {
      while_0_arg0_reg.in = std_add_2.out;
      while_0_arg0_reg.write_en = 1'd1;
      std_add_2.left = while_0_arg0_reg.out;
      std_add_2.right = 32'd1;
      assign_while_0_latch[done] = while_0_arg0_reg.done;
    }
    group assign_while_1_latch {
      while_1_arg0_reg.in = std_add_1.out;
      while_1_arg0_reg.write_en = 1'd1;
      std_add_1.left = while_1_arg0_reg.out;
      std_add_1.right = 32'd1;
      assign_while_1_latch[done] = while_1_arg0_reg.done;
    }
    group assign_while_2_latch {
      while_2_arg0_reg.in = std_add_0.out;
      while_2_arg0_reg.write_en = 1'd1;
      std_add_0.left = while_2_arg0_reg.out;
      std_add_0.right = 32'd1;
      assign_while_2_latch[done] = while_2_arg0_reg.done;
    }
  }
  control {
    seq {
      par {
        assign_while_2_init_0;
      }
      while std_slt_0.out with bb0_0 {
        seq {
          par {
            assign_while_1_init_0;
          }
          while std_slt_1.out with bb0_2 {
            seq {
              par {
                assign_while_0_init_0;
              }
              while std_slt_2.out with bb0_4 {
                seq {
                  seq {
                    bb0_6;
                    bb0_9;
                    bb0_12;
                    bb0_13;
                    bb0_15;
                    bb0_17;
                    bb0_19;
                  }
                  assign_while_0_latch;
                }
              }
              assign_while_1_latch;
            }
          }
          assign_while_2_latch;
        }
      }
    }
  }
}

Then I used this command to try RTL synthesis with Xilinx Vivado:

fud e --to resource-estimate matmul.futil

From here the error mentioned above shows up, for other programs as well. But if I use --to synth-files it works perfectly fine. I’ve followed the instructions in the link you gave, so I should have all the dependencies installed and the Vivado setup done. I’ve looked in the impl_1 folder: there is no main_utilization_placed.rpt, but there is a matmul_utilization_placed.rpt. So I changed the top module name to main, and then the command works. So it can only work as Attributes - Calyx Documentation describes?

Ah, yes! That seems like a bug. Specifically, the fud toolchain (and the simulation toolchain) assumes that the name of the top-level component is “main”. Can you open an issue about this in the Calyx repository? Also, if you got the numbers from the synthesis flow, let us know if you need anything else!

Hi, me again. I’ve opened an issue on GitHub ;) Now I’m trying the flow from the link you gave me by hand:

I’m still testing the matmul function. First, I run cargo run in the calyx folder, and the .sv file is generated.

For the second step, I first ran the externalize pass.
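If I understand the Calyx CLI correctly, that is something like (a sketch; -p selects the pass to run, and exact usage may differ by version):

cargo run -- matmul.futil -p externalize

Afterwards there is one @external attribute in the code: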

extern "/home/bi/Desktop/calyx/primitives/binary_operators.sv" {
  comb primitive std_fp_add<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_fp_sub<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  primitive std_fp_mult_pipe<"state_share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](@clk clk: 1, @reset reset: 1, @write_together @static(3) @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out: WIDTH, @done done: 1);
  primitive std_fp_div_pipe<"state_share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](@clk clk: 1, @reset reset: 1, @write_together @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out_remainder: WIDTH, @stable out_quotient: WIDTH, @done done: 1);
  comb primitive std_fp_gt<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_fp_sadd<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_fp_ssub<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  primitive std_fp_smult_pipe<"state_share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](@clk clk: 1, @reset reset: 1, @write_together @static(3) @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out: WIDTH, @done done: 1);
  primitive std_fp_sdiv_pipe<"state_share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](@clk clk: 1, @reset reset: 1, @write_together @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out_remainder: WIDTH, @stable out_quotient: WIDTH, @done done: 1);
  comb primitive std_fp_sgt<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_fp_slt<"share"=1>[WIDTH, INT_WIDTH, FRAC_WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  primitive std_mult_pipe<"state_share"=1>[WIDTH](@clk clk: 1, @reset reset: 1, @write_together @static(3) @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out: WIDTH, @done done: 1);
  primitive std_div_pipe<"state_share"=1>[WIDTH](@clk clk: 1, @reset reset: 1, @write_together @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out_quotient: WIDTH, @stable out_remainder: WIDTH, @done done: 1);
  comb primitive std_sadd<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_ssub<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  primitive std_smult_pipe<"state_share"=1>[WIDTH](@clk clk: 1, @reset reset: 1, @write_together @static(3) @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (@stable out: WIDTH, @done done: 1);
  primitive std_sdiv_pipe[WIDTH](@clk clk: 1, @reset reset: 1, @write_together @go go: 1, @write_together left: WIDTH, @write_together right: WIDTH) -> (out_quotient: WIDTH, out_remainder: WIDTH, @done done: 1);
  comb primitive std_sgt<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_slt<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_seq<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_sneq<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_sge<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_sle<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_slsh<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_srsh<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
}
extern "/home/bi/Desktop/calyx/primitives/core.sv" {
  comb primitive std_const<"share"=1>[WIDTH, VALUE]() -> (out: WIDTH);
  comb primitive std_wire<"share"=1>[WIDTH](in: WIDTH) -> (out: WIDTH);
  comb primitive std_slice<"share"=1>[IN_WIDTH, OUT_WIDTH](in: IN_WIDTH) -> (out: OUT_WIDTH);
  comb primitive std_pad<"share"=1>[IN_WIDTH, OUT_WIDTH](in: IN_WIDTH) -> (out: OUT_WIDTH);
  comb primitive std_cat<"share"=1>[LEFT_WIDTH, RIGHT_WIDTH, OUT_WIDTH](left: LEFT_WIDTH, right: RIGHT_WIDTH) -> (out: OUT_WIDTH);
  comb primitive std_not<"share"=1>[WIDTH](in: WIDTH) -> (out: WIDTH);
  comb primitive std_and<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_or<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_xor<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_add<"share"=1>[WIDTH](@data left: WIDTH, @data right: WIDTH) -> (out: WIDTH);
  comb primitive std_sub<"share"=1>[WIDTH](@data left: WIDTH, @data right: WIDTH) -> (out: WIDTH);
  comb primitive std_gt<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_lt<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_eq<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_neq<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_ge<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_le<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: 1);
  comb primitive std_lsh<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_rsh<"share"=1>[WIDTH](left: WIDTH, right: WIDTH) -> (out: WIDTH);
  comb primitive std_mux<"share"=1>[WIDTH](cond: 1, tru: WIDTH, fal: WIDTH) -> (out: WIDTH);
  primitive std_reg<"state_share"=1>[WIDTH](@write_together in: WIDTH, @write_together @static @go write_en: 1, @clk clk: 1, @reset reset: 1) -> (@stable out: WIDTH, @done done: 1);
  primitive std_mem_d1[WIDTH, SIZE, IDX_SIZE](@read_together addr0: IDX_SIZE, @write_together write_data: WIDTH, @write_together @static @go write_en: 1, @clk clk: 1, @reset reset: 1) -> (@read_together read_data: WIDTH, @done done: 1);
  primitive std_mem_d2[WIDTH, D0_SIZE, D1_SIZE, D0_IDX_SIZE, D1_IDX_SIZE](@read_together @write_together(2) addr0: D0_IDX_SIZE, @read_together @write_together(2) addr1: D1_IDX_SIZE, @write_together write_data: WIDTH, @write_together @static @go write_en: 1, @clk clk: 1, @reset reset: 1) -> (@read_together read_data: WIDTH, @done done: 1);
  primitive std_mem_d3[WIDTH, D0_SIZE, D1_SIZE, D2_SIZE, D0_IDX_SIZE, D1_IDX_SIZE, D2_IDX_SIZE](@read_together @write_together(2) addr0: D0_IDX_SIZE, @read_together @write_together(2) addr1: D1_IDX_SIZE, @read_together @write_together(2) addr2: D2_IDX_SIZE, @write_together write_data: WIDTH, @write_together @static @go write_en: 1, @clk clk: 1, @reset reset: 1) -> (@read_together read_data: WIDTH, @done done: 1);
  primitive std_mem_d4[WIDTH, D0_SIZE, D1_SIZE, D2_SIZE, D3_SIZE, D0_IDX_SIZE, D1_IDX_SIZE, D2_IDX_SIZE, D3_IDX_SIZE](@read_together @write_together(2) addr0: D0_IDX_SIZE, @read_together @write_together(2) addr1: D1_IDX_SIZE, @read_together @write_together(2) addr2: D2_IDX_SIZE, @read_together @write_together(2) addr3: D3_IDX_SIZE, @write_together write_data: WIDTH, @write_together @static @go write_en: 1, @clk clk: 1) -> (@read_together read_data: WIDTH, @done done: 1);
}
component main<"toplevel"=1>(@external ext_mem0_read_data: 32, ext_mem0_done: 1, ext_mem1_read_data: 32, ext_mem1_done: 1, ext_mem2_read_data: 32, ext_mem2_done: 1, @clk clk: 1, @reset reset: 1, @go go: 1) -> (ext_mem0_write_data: 32, ext_mem0_addr0: 4, ext_mem0_write_en: 1, ext_mem1_write_data: 32, ext_mem1_addr0: 4, ext_mem1_write_en: 1, ext_mem2_write_data: 32, ext_mem2_addr0: 4, ext_mem2_write_en: 1, @done done: 1) {
  cells {
    std_slice_3 = std_slice(32, 4);
    std_slice_2 = std_slice(32, 4);
    std_slice_1 = std_slice(32, 4);
    std_slice_0 = std_slice(32, 4);
    std_add_7 = std_add(32);
    muli_4_reg = std_reg(32);
    std_mult_pipe_4 = std_mult_pipe(32);
    std_add_6 = std_add(32);
    load_0_reg = std_reg(32);
    std_add_5 = std_add(32);
    muli_3_reg = std_reg(32);
    std_mult_pipe_3 = std_mult_pipe(32);
    muli_2_reg = std_reg(32);
    std_mult_pipe_2 = std_mult_pipe(32);
    std_add_4 = std_add(32);
    muli_1_reg = std_reg(32);
    std_mult_pipe_1 = std_mult_pipe(32);
    std_add_3 = std_add(32);
    muli_0_reg = std_reg(32);
    std_mult_pipe_0 = std_mult_pipe(32);
    std_add_2 = std_add(32);
    std_slt_2 = std_slt(32);
    std_add_1 = std_add(32);
    std_slt_1 = std_slt(32);
    std_add_0 = std_add(32);
    std_slt_0 = std_slt(32);
    while_2_arg0_reg = std_reg(32);
    while_1_arg0_reg = std_reg(32);
    while_0_arg0_reg = std_reg(32);
  }
  wires {
    group assign_while_0_init_0 {
      while_0_arg0_reg.in = 32'd0;
      while_0_arg0_reg.write_en = 1'd1;
      assign_while_0_init_0[done] = while_0_arg0_reg.done;
    }
    group assign_while_1_init_0 {
      while_1_arg0_reg.in = 32'd0;
      while_1_arg0_reg.write_en = 1'd1;
      assign_while_1_init_0[done] = while_1_arg0_reg.done;
    }
    group assign_while_2_init_0 {
      while_2_arg0_reg.in = 32'd0;
      while_2_arg0_reg.write_en = 1'd1;
      assign_while_2_init_0[done] = while_2_arg0_reg.done;
    }
    group bb0_6 {
      std_mult_pipe_0.left = while_2_arg0_reg.out;
      std_mult_pipe_0.right = 32'd3;
      muli_0_reg.in = std_mult_pipe_0.out;
      muli_0_reg.write_en = std_mult_pipe_0.done;
      std_mult_pipe_0.go = 1'd1;
      bb0_6[done] = muli_0_reg.done;
    }
    group bb0_9 {
      std_mult_pipe_1.left = while_1_arg0_reg.out;
      std_mult_pipe_1.right = 32'd3;
      muli_1_reg.in = std_mult_pipe_1.out;
      muli_1_reg.write_en = std_mult_pipe_1.done;
      std_mult_pipe_1.go = 1'd1;
      bb0_9[done] = muli_1_reg.done;
    }
    group bb0_12 {
      std_slice_3.in = std_add_3.out;
      std_slice_2.in = std_add_4.out;
      std_mult_pipe_2.left = ext_mem0_read_data;
      std_mult_pipe_2.right = ext_mem1_read_data;
      muli_2_reg.in = std_mult_pipe_2.out;
      muli_2_reg.write_en = std_mult_pipe_2.done;
      std_mult_pipe_2.go = 1'd1;
      ext_mem0_addr0 = std_slice_3.out;
      std_add_3.left = while_0_arg0_reg.out;
      std_add_3.right = std_mult_pipe_0.out;
      ext_mem1_addr0 = std_slice_2.out;
      std_add_4.left = while_0_arg0_reg.out;
      std_add_4.right = std_mult_pipe_1.out;
      bb0_12[done] = muli_2_reg.done;
    }
    group bb0_13 {
      std_mult_pipe_3.left = while_2_arg0_reg.out;
      std_mult_pipe_3.right = 32'd3;
      muli_3_reg.in = std_mult_pipe_3.out;
      muli_3_reg.write_en = std_mult_pipe_3.done;
      std_mult_pipe_3.go = 1'd1;
      bb0_13[done] = muli_3_reg.done;
    }
    group bb0_15 {
      std_slice_1.in = std_add_5.out;
      ext_mem2_addr0 = std_slice_1.out;
      load_0_reg.in = ext_mem2_read_data;
      load_0_reg.write_en = 1'd1;
      std_add_5.left = while_1_arg0_reg.out;
      std_add_5.right = std_mult_pipe_3.out;
      bb0_15[done] = load_0_reg.done;
    }
    group bb0_17 {
      std_mult_pipe_4.left = while_2_arg0_reg.out;
      std_mult_pipe_4.right = 32'd3;
      muli_4_reg.in = std_mult_pipe_4.out;
      muli_4_reg.write_en = std_mult_pipe_4.done;
      std_mult_pipe_4.go = 1'd1;
      bb0_17[done] = muli_4_reg.done;
    }
    group bb0_19 {
      std_slice_0.in = std_add_7.out;
      ext_mem2_addr0 = std_slice_0.out;
      ext_mem2_write_data = std_add_6.out;
      ext_mem2_write_en = 1'd1;
      std_add_7.left = while_1_arg0_reg.out;
      std_add_7.right = std_mult_pipe_4.out;
      std_add_6.left = load_0_reg.out;
      std_add_6.right = std_mult_pipe_2.out;
      bb0_19[done] = ext_mem2_done;
    }
    group assign_while_0_latch {
      while_0_arg0_reg.in = std_add_2.out;
      while_0_arg0_reg.write_en = 1'd1;
      std_add_2.left = while_0_arg0_reg.out;
      std_add_2.right = 32'd1;
      assign_while_0_latch[done] = while_0_arg0_reg.done;
    }
    group assign_while_1_latch {
      while_1_arg0_reg.in = std_add_1.out;
      while_1_arg0_reg.write_en = 1'd1;
      std_add_1.left = while_1_arg0_reg.out;
      std_add_1.right = 32'd1;
      assign_while_1_latch[done] = while_1_arg0_reg.done;
    }
    group assign_while_2_latch {
      while_2_arg0_reg.in = std_add_0.out;
      while_2_arg0_reg.write_en = 1'd1;
      std_add_0.left = while_2_arg0_reg.out;
      std_add_0.right = 32'd1;
      assign_while_2_latch[done] = while_2_arg0_reg.done;
    }
    comb group bb0_0 {
      std_slt_0.left = while_2_arg0_reg.out;
      std_slt_0.right = 32'd3;
    }
    comb group bb0_2 {
      std_slt_1.left = while_1_arg0_reg.out;
      std_slt_1.right = 32'd3;
    }
    comb group bb0_4 {
      std_slt_2.left = while_0_arg0_reg.out;
      std_slt_2.right = 32'd3;
    }
  }

  control {
    seq {
      par {
        assign_while_2_init_0;
      }
      while std_slt_0.out with bb0_0 {
        seq {
          par {
            assign_while_1_init_0;
          }
          while std_slt_1.out with bb0_2 {
            seq {
              par {
                assign_while_0_init_0;
              }
              while std_slt_2.out with bb0_4 {
                seq {
                  seq {
                    bb0_6;
                    bb0_9;
                    bb0_12;
                    bb0_13;
                    bb0_15;
                    bb0_17;
                    bb0_19;
                  }
                  assign_while_0_latch;
                }
              }
              assign_while_1_latch;
            }
          }
          assign_while_2_latch;
        }
      }
    }
  }
}
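The backend invocation is presumably along these lines (a sketch, mirroring the cargo run usage above):

cargo run -- matmul.futil -b xilinx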

But I always get this error when I try the -b xilinx flag:

Error: Program has no memories marked with attribute @external. Please make sure that at least one memory is marked as @external.

The third step runs and generates an XML file like the following:

<?xml version="1.0" encoding="UTF-8"?>
<root versionMajor="1" versionMinor="6">
  <kernel name="Toplevel" language="ip_c" vlnv="capra.cs.cornell.edu:kernel:Toplevel:1.0"
          preferredWorkGroupSizeMultiple="0" workGroupSize="1" interrupt="false"
          hwControlProtocol="ap_ctrl_hs">
    <ports>
      <port name="s_axi_control" mode="slave" range="0x1000" dataWidth="32" portType="addressable" base="0x0"/>
    </ports>
    <args>
      <arg name="timeout" addressQualifier="0" id="0" port="s_axi_control" size="0x4" offset="0x010" type="uint" hostOffset="0x0" hostSize="0x4"/>
    </args>
  </kernel>
</root>

This looks pretty much like a bug to me. Is this something wrong with my usage or with the compiler?