[core] Fix calling convention differences between libstdc++ and libc++ (

NVIDIA#2188) * [core] Fix calling convention differences between libstdc++ and libc++ for small structs. In libc++, the compiler uses a "zero-cost" model of passing small struct values. In libstdc++, this was never fixed in order to maintain backwards compatibility with older compilers. So small structs are passed on the stack via references. See https://tinyurl.com/bdhrkx9x for more information. These patches fix nvq++ to adapt the calling convention to the header files in use at compile time. * Fix typo in comment. * Update tests to expect optional width, align info.
schweitzpgi · Sep 5, 2024 · 5b030df · 5b030df
1 parent bdea842
commit 5b030df
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 53 deletions.
diff --git a/docker/build/assets.Dockerfile b/docker/build/assets.Dockerfile
@@ -291,7 +291,7 @@ RUN cd /cuda-quantum && source scripts/configure_build.sh && \
         --param nvqpp_site_config=build/test/lit.site.cfg.py ${filtered} && \
     # FIXME: Some tests are still failing when building against libc++
     # tracked in https://github.com/NVIDIA/cuda-quantum/issues/1712
-    filtered=" --filter-out Kernel/(inline-qpu-func|signature-2)" && \
+    filtered=" --filter-out Kernel/inline-qpu-func" && \
     if [ ! -x "$(command -v nvcc)" ]; then \
         filtered+="|TargetConfig/check_compile"; \
     fi && \

diff --git a/include/cudaq/Frontend/nvqpp/ASTBridge.h b/include/cudaq/Frontend/nvqpp/ASTBridge.h
@@ -401,6 +401,7 @@ class QuakeBridgeVisitor
 
   bool TraverseRecordType(clang::RecordType *t);
   bool interceptRecordDecl(clang::RecordDecl *x);
+  std::pair<std::uint64_t, unsigned> getWidthAndAlignment(clang::RecordDecl *x);
   bool VisitRecordDecl(clang::RecordDecl *x);
 
   // Type declarations to be converted to high-level Quake and CC types are

diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp
@@ -252,7 +252,8 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
           return false;
         members.push_back(popType());
       }
-      return pushType(cc::StructType::get(ctx, members));
+      auto [width, align] = getWidthAndAlignment(x);
+      return pushType(cc::StructType::get(ctx, members, width, align));
     }
     if (name.equals("tuple")) {
       auto *cts = cast<clang::ClassTemplateSpecializationDecl>(x);
@@ -265,9 +266,14 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
           return false;
         members.push_back(popType());
       }
-      if (tuplesAreReversed)
+      auto [width, align] = getWidthAndAlignment(x);
+      if (tuplesAreReversed) {
         std::reverse(members.begin(), members.end());
-      return pushType(cc::StructType::get(ctx, members));
+        // Resets are for libstdc++ calling convention compatibility.
+        width = 0;
+        align = 0;
+      }
+      return pushType(cc::StructType::get(ctx, members, width, align));
     }
     if (ignoredClass(x))
       return true;

diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp
@@ -221,6 +221,17 @@ bool QuakeBridgeVisitor::TraverseRecordType(clang::RecordType *t) {
   return true;
 }
 
+std::pair<std::uint64_t, unsigned>
+QuakeBridgeVisitor::getWidthAndAlignment(clang::RecordDecl *x) {
+  auto *defn = x->getDefinition();
+  assert(defn && "struct must be defined here");
+  auto *ty = defn->getTypeForDecl();
+  if (ty->isDependentType())
+    return {0, 0};
+  auto ti = getContext()->getTypeInfo(ty);
+  return {ti.Width, llvm::PowerOf2Ceil(ti.Align) / 8};
+}
+
 bool QuakeBridgeVisitor::VisitRecordDecl(clang::RecordDecl *x) {
   assert(!x->isLambda() && "expected lambda to be handled in traverse");
   // Note that we're generating a Type on the type stack.
@@ -232,15 +243,7 @@ bool QuakeBridgeVisitor::VisitRecordDecl(clang::RecordDecl *x) {
     return pushType(cc::StructType::get(ctx, name, /*isOpaque=*/true));
   SmallVector<Type> fieldTys =
       lastTypes(std::distance(x->field_begin(), x->field_end()));
-  auto [width, alignInBytes] = [&]() -> std::pair<std::uint64_t, unsigned> {
-    auto *defn = x->getDefinition();
-    assert(defn && "struct must be defined here");
-    auto *ty = defn->getTypeForDecl();
-    if (ty->isDependentType())
-      return {0, 0};
-    auto ti = getContext()->getTypeInfo(ty);
-    return {ti.Width, llvm::PowerOf2Ceil(ti.Align) / 8};
-  }();
+  auto [width, alignInBytes] = getWidthAndAlignment(x);
   if (name.empty())
     return pushType(cc::StructType::get(ctx, fieldTys, width, alignInBytes));
   return pushType(

diff --git a/targettests/Kernel/signature-2.cpp b/targettests/Kernel/signature-2.cpp
@@ -149,6 +149,13 @@ struct T2 {
   }
 };
 
+struct T3 {
+  void operator()(std::pair<long, double> tup) __qpu__ {
+    show5(std::get<0>(tup));
+    show7(std::get<1>(tup));
+  }
+};
+
 int main() {
   VectorOfStruct vsData = {{1, 1.0f, 95.0}, {2, 18.4f, 86.945}};
   VS{}(vsData);
@@ -164,6 +171,9 @@ int main() {
 
   std::tuple<long, double> t2{2098, 99.5};
   T2{}(t2);
+
+  std::pair<long, double> t3{34061, 1999.2};
+  T3{}(t3);
 
   return 0;
 }
@@ -175,3 +185,4 @@ int main() {
 // CHECK: 1.200 2.400 4.800
 // CHECK: 234 89238 3.140 Z
 // CHECK: 2098 99.5
+// CHECK: 34061 1999.2
diff --git a/test/AST-Quake/tuple-0.cpp b/test/AST-Quake/tuple-0.cpp
@@ -19,9 +19,9 @@ struct ArithmeticTupleQernel {
 
 // clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__ArithmeticTupleQernel(
-// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP:.*, .*, .*, .*]]}>)
-// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
+// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP:.*, .*, .*, .*]]}{{.*}}>)
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
 // CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<1>
 // CHECK:           %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !cc.stdvec<!quake.measure>
 // CHECK:           return
@@ -37,9 +37,9 @@ struct ArithmeticPairQernel {
 
 // clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__ArithmeticPairQernel(
-// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{f32, i32}>)
-// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{f32, i32}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{f32, i32}>>
+// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{f32, i32} [64,4]>)
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{f32, i32} [64,4]>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{f32, i32} [64,4]>>
 // CHECK:           %[[VAL_2:.*]] = quake.alloca !quake.veq<1>
 // CHECK:           %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !cc.stdvec<!quake.measure>
 // CHECK:           return
@@ -56,12 +56,12 @@ struct ArithmeticTupleQernelWithUse {
 
 // clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__ArithmeticTupleQernelWithUse(
-// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP]]}>)
+// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP]]}{{.*}}>)
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
-// CHECK:           %[[VAL_4:.*]] = cc.c{{.*}} %[[VAL_3]]{{.*}} : (!cc.ptr<!cc.struct<{[[TUP]]}>>) -> !cc.ptr<i32>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
+// CHECK:           %[[VAL_4:.*]] = cc.c{{.*}} %[[VAL_3]]{{.*}} : (!cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>) -> !cc.ptr<i32>
 // CHECK:           %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr<i32>
 // CHECK:           %[[VAL_6:.*]] = cc.cast signed %[[VAL_5]] : (i32) -> i64
 // CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<?>[%[[VAL_6]] : i64]
@@ -94,12 +94,12 @@ struct ArithmeticTupleQernelWithUse0 {
 
 // clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__ArithmeticTupleQernelWithUse0(
-// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP]]}>)
+// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{[[TUP]]}{{.*}}>)
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
-// CHECK:           %[[VAL_4:.*]] = cc.c{{.*}} %[[VAL_3]]{{.*}} : (!cc.ptr<!cc.struct<{[[TUP]]}>>) -> !cc.ptr<i32>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
+// CHECK:           %[[VAL_4:.*]] = cc.c{{.*}} %[[VAL_3]]{{.*}} : (!cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>) -> !cc.ptr<i32>
 // CHECK:           %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr<i32>
 // CHECK:           %[[VAL_6:.*]] = cc.cast signed %[[VAL_5]] : (i32) -> i64
 // CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<?>[%[[VAL_6]] : i64]
@@ -131,10 +131,10 @@ struct ArithmeticPairQernelWithUse {
 
 // clang-format off
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__ArithmeticPairQernelWithUse(
-// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{f32, i32}>)
-// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{f32, i32}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{f32, i32}>>
-// CHECK:           %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr<!cc.struct<{f32, i32}>>) -> !cc.ptr<i32>
+// CHECK-SAME:      %[[VAL_0:.*]]: !cc.struct<{f32, i32} [64,4]>)
+// CHECK:           %[[VAL_1:.*]] = cc.alloca !cc.struct<{f32, i32} [64,4]>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<!cc.struct<{f32, i32} [64,4]>>
+// CHECK:           %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr<!cc.struct<{f32, i32} [64,4]>>) -> !cc.ptr<i32>
 // CHECK:           %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr<i32>
 // CHECK:           %[[VAL_4:.*]] = cc.cast signed %[[VAL_3]] : (i32) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_4]] : i64]

diff --git a/test/AST-Quake/tuple-1.cpp b/test/AST-Quake/tuple-1.cpp
@@ -30,29 +30,29 @@ int main1() {
 }
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_prepQubit
-// CHECK-SAME:        (%[[VAL_0:.*]]: !cc.struct<{i32, f64}>, %[[VAL_1:.*]]: !quake.ref)
-// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.struct<{i32, f64}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<!cc.struct<{i32, f64}>>
+// CHECK-SAME:        (%[[VAL_0:.*]]: !cc.struct<{i32, f64} [128,8]>, %[[VAL_1:.*]]: !quake.ref)
+// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.struct<{i32, f64} [128,8]>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<!cc.struct<{i32, f64} [128,8]>>
 // CHECK:           return
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_RzArcTan2
-// CHECK-SAME:        (%[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.struct<{i32, f64}>) attributes
+// CHECK-SAME:        (%[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.struct<{i32, f64} [128,8]>) attributes
 // CHECK:           %[[VAL_2:.*]] = cc.alloca i1
 // CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{i32, f64}>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.struct<{i32, f64}>>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{i32, f64} [128,8]>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.struct<{i32, f64} [128,8]>>
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_6:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_7:.*]] = cc.load %[[VAL_2]] : !cc.ptr<i1>
 // CHECK:           cc.if(%[[VAL_7]]) {
 // CHECK:             quake.x %[[VAL_6]] : (!quake.ref) -> ()
 // CHECK:           }
-// CHECK:           %[[VAL_8:.*]] = cc.alloca !cc.struct<{i32, f64}>
-// CHECK:           %[[VAL_9:.*]] = cc.load %[[VAL_3]] : !cc.ptr<!cc.struct<{i32, f64}>>
-// CHECK:           cc.store %[[VAL_9]], %[[VAL_8]] : !cc.ptr<!cc.struct<{i32, f64}>>
-// CHECK:           %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr<!cc.struct<{i32, f64}>>
-// CHECK:           call @__nvqpp__mlirgen__function_prepQubit{{.*}}(%[[VAL_10]], %[[VAL_6]]) : (!cc.struct<{i32, f64}>, !quake.ref) -> ()
+// CHECK:           %[[VAL_8:.*]] = cc.alloca !cc.struct<{i32, f64} [128,8]>
+// CHECK:           %[[VAL_9:.*]] = cc.load %[[VAL_3]] : !cc.ptr<!cc.struct<{i32, f64} [128,8]>>
+// CHECK:           cc.store %[[VAL_9]], %[[VAL_8]] : !cc.ptr<!cc.struct<{i32, f64} [128,8]>>
+// CHECK:           %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr<!cc.struct<{i32, f64} [128,8]>>
+// CHECK:           call @__nvqpp__mlirgen__function_prepQubit{{.*}}(%[[VAL_10]], %[[VAL_6]]) : (!cc.struct<{i32, f64} [128,8]>, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
 
@@ -75,29 +75,29 @@ int main2() {
 }
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_prepQubit2
-// CHECK-SAME:        (%[[VAL_0:.*]]: !cc.struct<{[[TUP:.*, .*, .*]]}>, %[[VAL_1:.*]]: !quake.ref)
-// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
+// CHECK-SAME:        (%[[VAL_0:.*]]: !cc.struct<{[[TUP:.*, .*, .*]]}{{.*}}>, %[[VAL_1:.*]]: !quake.ref)
+// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
 // CHECK:           return
 // CHECK:         }
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_RzArcTan22
-// CHECK-SAME:        (%[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.struct<{[[TUP]]}>) attributes
+// CHECK-SAME:        (%[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.struct<{[[TUP]]}{{.*}}>) attributes
 // CHECK:           %[[VAL_2:.*]] = cc.alloca i1
 // CHECK:           cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<i1>
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
+// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_6:.*]] = quake.alloca !quake.ref
 // CHECK:           %[[VAL_7:.*]] = cc.load %[[VAL_2]] : !cc.ptr<i1>
 // CHECK:           cc.if(%[[VAL_7]]) {
 // CHECK:             quake.x %[[VAL_6]] : (!quake.ref) -> ()
 // CHECK:           }
-// CHECK:           %[[VAL_8:.*]] = cc.alloca !cc.struct<{[[TUP]]}>
-// CHECK:           %[[VAL_9:.*]] = cc.load %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
-// CHECK:           cc.store %[[VAL_9]], %[[VAL_8]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
-// CHECK:           %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr<!cc.struct<{[[TUP]]}>>
-// CHECK:           call @__nvqpp__mlirgen__function_prepQubit2{{.*}}(%[[VAL_10]], %[[VAL_6]]) : (!cc.struct<{[[TUP]]}>, !quake.ref) -> ()
+// CHECK:           %[[VAL_8:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}>
+// CHECK:           %[[VAL_9:.*]] = cc.load %[[VAL_3]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
+// CHECK:           cc.store %[[VAL_9]], %[[VAL_8]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
+// CHECK:           %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr<!cc.struct<{[[TUP]]}{{.*}}>>
+// CHECK:           call @__nvqpp__mlirgen__function_prepQubit2{{.*}}(%[[VAL_10]], %[[VAL_6]]) : (!cc.struct<{[[TUP]]}{{.*}}>, !quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }