From 0f4a43717ede10f9f2b6bec268a130bd3370ec5f Mon Sep 17 00:00:00 2001 From: Christoffer Lerno Date: Sat, 15 Jan 2022 01:37:43 +0100 Subject: [PATCH] Fixes to the aarch64 ABI. --- src/compiler/c_abi_internal.h | 1 + src/compiler/llvm_codegen_c_abi.c | 5 + src/compiler/llvm_codegen_c_abi_aarch64.c | 24 +++- src/compiler/llvm_codegen_c_abi_x86.c | 10 +- test/test_suite/abi/aarch64_hfa_args.c3t | 80 +------------- test/test_suite/abi/literal_load_aarch64.c3t | 8 +- test/test_suite/abi/vec2_aarch64.c3t | 103 +++++++----------- test/test_suite/functions/splat_aarch64.c3t | 90 ++++++++------- .../struct/struct_as_value_aarch64.c3t | 5 +- 9 files changed, 135 insertions(+), 191 deletions(-) diff --git a/src/compiler/c_abi_internal.h b/src/compiler/c_abi_internal.h index b21f8db61..e3ea16ffd 100644 --- a/src/compiler/c_abi_internal.h +++ b/src/compiler/c_abi_internal.h @@ -22,6 +22,7 @@ ABIArgInfo *abi_arg_new_direct_int_ext_by_reg(Type *int_to_extend, bool by_reg); ABIArgInfo *abi_arg_new_direct_coerce_bits(BitSize bits); ABIArgInfo *abi_arg_new_direct_coerce_type(Type *type); ABIArgInfo *abi_arg_new_direct_coerce_array_type(Type *type, int8_t elements); +ABIArgInfo *abi_arg_new_direct_coerce_to_struct_with_elements(Type *type, int8_t elements); ABIArgInfo *abi_arg_new_direct_coerce(AbiType type); ABIArgInfo *abi_arg_new_expand_coerce(AbiType target_type, unsigned offset); ABIArgInfo *abi_arg_new_expand_coerce_pair(AbiType first_element, unsigned initial_offset, AbiType second_element, unsigned padding, bool is_packed); diff --git a/src/compiler/llvm_codegen_c_abi.c b/src/compiler/llvm_codegen_c_abi.c index 718c25163..69a96fa8c 100644 --- a/src/compiler/llvm_codegen_c_abi.c +++ b/src/compiler/llvm_codegen_c_abi.c @@ -189,6 +189,11 @@ ABIArgInfo *abi_arg_new_direct_coerce_type(Type *type) return info; } +ABIArgInfo *abi_arg_new_direct_coerce_to_struct_with_elements(Type *type, int8_t elements) +{ + TODO +} + ABIArgInfo *abi_arg_new_direct_coerce_array_type(Type *type, 
int8_t elements) { assert(elements > 0); diff --git a/src/compiler/llvm_codegen_c_abi_aarch64.c b/src/compiler/llvm_codegen_c_abi_aarch64.c index 366f00b29..cb5d77c03 100644 --- a/src/compiler/llvm_codegen_c_abi_aarch64.c +++ b/src/compiler/llvm_codegen_c_abi_aarch64.c @@ -48,7 +48,11 @@ ABIArgInfo *aarch64_classify_argument_type(Type *type) if (type_is_homogenous_aggregate(type, &base, &members)) { assert(members < 128); - return abi_arg_new_direct_coerce_array_type(base, (int8_t)members); + if (members > 1) + { + return abi_arg_new_direct_coerce_type(type_get_array(base, members)); + } + return abi_arg_new_direct_coerce_type(base); } // Aggregates <= in registers @@ -71,8 +75,12 @@ ABIArgInfo *aarch64_classify_argument_type(Type *type) // We use a pair of i64 for 16-byte aggregate with 8-byte alignment. // For aggregates with 16-byte alignment, we use i128. assert(alignment == 8 || alignment == 16); - assert(size / alignment < 128); - return abi_arg_new_direct_coerce_array_type(alignment == 8 ? type_ulong : type_u128, (int8_t)(size / alignment)); + + if (alignment == 16) return abi_arg_new_direct_coerce_type(type_u128); + ArraySize m = size / alignment; + if (m > 1) return abi_arg_new_direct_coerce_type(type_get_array(type_ulong, m)); + return abi_arg_new_direct_coerce_type(type_ulong); + } return abi_arg_new_indirect_not_by_val(type); @@ -122,7 +130,15 @@ ABIArgInfo *aarch64_classify_return_type(Type *type, bool variadic) // Aggregates <= in registers if (size <= 16) { - // For RenderScript <= 16 needs to be coerced. + // For RenderScript <= 16 needs to be coerced to ints + // this case is ignored here but needs to be added + // in case it is to be supported. + + if (size <= 8 && !platform_target.big_endian) + { + return abi_arg_new_direct_coerce_type(type_int_unsigned_by_bitsize(size * 8)); + } + unsigned alignment = type_abi_alignment(type); // Align to multiple of 8. 
unsigned aligned_size = aligned_offset(size, 8); diff --git a/src/compiler/llvm_codegen_c_abi_x86.c b/src/compiler/llvm_codegen_c_abi_x86.c index e459f7168..985768e5e 100644 --- a/src/compiler/llvm_codegen_c_abi_x86.c +++ b/src/compiler/llvm_codegen_c_abi_x86.c @@ -491,7 +491,15 @@ static inline ABIArgInfo *x86_classify_aggregate(CallABI call, Regs *regs, Type // but we do not generate this struct immediately here. unsigned size_in_regs = (size + 3) / 4; assert(size_in_regs < 8); - ABIArgInfo *info = abi_arg_new_direct_coerce_array_type(type_uint, (int8_t)size_in_regs); + ABIArgInfo *info; + if (size_in_regs > 1) + { + info = abi_arg_new_direct_coerce_to_struct_with_elements(type_uint, (int8_t)size_in_regs); + } + else + { + info = abi_arg_new_direct_coerce_type(type_uint); + } // Not in reg on MCU if (!platform_target.x86.is_mcu_api) info->attributes.by_reg = true; return info; diff --git a/test/test_suite/abi/aarch64_hfa_args.c3t b/test/test_suite/abi/aarch64_hfa_args.c3t index 49e080dc6..2eec5e75c 100644 --- a/test/test_suite/abi/aarch64_hfa_args.c3t +++ b/test/test_suite/abi/aarch64_hfa_args.c3t @@ -1,4 +1,3 @@ -// #skipped // #target: aarch64-darwin module test; define Int8x16 = ichar[<16>]; @@ -15,12 +14,10 @@ struct MixedHFAv3 Int8x16 b; } -// CHECK: define{{.*}} %struct.HFAv3 @test([4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}) fn HFAv3 test(HFAv3 a0, HFAv3 a1, HFAv3 a2) { return a2; } -// CHECK: define{{.*}} %struct.MixedHFAv3 @test_mixed([4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}) fn MixedHFAv3 test_mixed(MixedHFAv3 a0, MixedHFAv3 a1, MixedHFAv3 a2) { return a2; } @@ -31,78 +28,5 @@ fn MixedHFAv3 test_mixed(MixedHFAv3 a0, MixedHFAv3 a1, MixedHFAv3 a2) { %HFAv3 = type { [4 x <3 x float>] } %MixedHFAv3 = type { [3 x <3 x float>], <16 x i8> } -// CHECK: define{{.*}} %struct.HFAv3 @test([4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}) - -define %HFAv3 
@test.test(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7, <4 x float> %8, <4 x float> %9, <4 x float> %10, <4 x float> %11) #0 { -entry: - %a0 = alloca %HFAv3, align 16 - %a1 = alloca %HFAv3, align 16 - %a2 = alloca %HFAv3, align 16 - %coerce = bitcast %HFAv3* %a0 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %12 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 0 - store <4 x float> %0, <4 x float>* %12, align 16 - %13 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 1 - store <4 x float> %1, <4 x float>* %13, align 16 - %14 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 2 - store <4 x float> %2, <4 x float>* %14, align 16 - %15 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 3 - store <4 x float> %3, <4 x float>* %15, align 16 - %coerce1 = bitcast %HFAv3* %a1 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %16 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 0 - store <4 x float> %4, <4 x float>* %16, align 16 - %17 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 1 - store <4 x float> %5, <4 x float>* %17, align 16 - %18 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 2 - store <4 x float> %6, <4 x float>* 
%18, align 16 - %19 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 3 - store <4 x float> %7, <4 x float>* %19, align 16 - %coerce2 = bitcast %HFAv3* %a2 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %20 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 0 - store <4 x float> %8, <4 x float>* %20, align 16 - %21 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 1 - store <4 x float> %9, <4 x float>* %21, align 16 - %22 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 2 - store <4 x float> %10, <4 x float>* %22, align 16 - %23 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 3 - store <4 x float> %11, <4 x float>* %23, align 16 - %24 = load %HFAv3, %HFAv3* %a2, align 16 - ret %HFAv3 %24 -} - -; Function Attrs: nounwind -define %MixedHFAv3 @test.test_mixed(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7, <4 x float> %8, <4 x float> %9, <4 x float> %10, <4 x float> %11) #0 { -entry: - %a0 = alloca %MixedHFAv3, align 16 - %a1 = alloca %MixedHFAv3, align 16 - %a2 = alloca %MixedHFAv3, align 16 - %coerce = bitcast %MixedHFAv3* %a0 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %12 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 0 - store <4 x float> %0, <4 x float>* %12, align 16 - %13 = getelementptr inbounds { <4 x float>, 
<4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 1 - store <4 x float> %1, <4 x float>* %13, align 16 - %14 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 2 - store <4 x float> %2, <4 x float>* %14, align 16 - %15 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce, i32 0, i32 3 - store <4 x float> %3, <4 x float>* %15, align 16 - %coerce1 = bitcast %MixedHFAv3* %a1 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %16 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 0 - store <4 x float> %4, <4 x float>* %16, align 16 - %17 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 1 - store <4 x float> %5, <4 x float>* %17, align 16 - %18 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 2 - store <4 x float> %6, <4 x float>* %18, align 16 - %19 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce1, i32 0, i32 3 - store <4 x float> %7, <4 x float>* %19, align 16 - %coerce2 = bitcast %MixedHFAv3* %a2 to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* - %20 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 0 - store <4 x float> %8, <4 x float>* %20, align 16 - %21 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 
x float>, <4 x float> }* %coerce2, i32 0, i32 1 - store <4 x float> %9, <4 x float>* %21, align 16 - %22 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 2 - store <4 x float> %10, <4 x float>* %22, align 16 - %23 = getelementptr inbounds { <4 x float>, <4 x float>, <4 x float>, <4 x float> }, { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* %coerce2, i32 0, i32 3 - store <4 x float> %11, <4 x float>* %23, align 16 - %24 = load %MixedHFAv3, %MixedHFAv3* %a2, align 16 - ret %MixedHFAv3 %24 -} - +define %HFAv3 @test.test([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2) +define %MixedHFAv3 @test.test_mixed([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2) #0 { diff --git a/test/test_suite/abi/literal_load_aarch64.c3t b/test/test_suite/abi/literal_load_aarch64.c3t index eced81adc..a633c84a1 100644 --- a/test/test_suite/abi/literal_load_aarch64.c3t +++ b/test/test_suite/abi/literal_load_aarch64.c3t @@ -18,8 +18,10 @@ fn Test creator() // #expect: literal_load.ll -declare void @blorg(i64) +declare void @blorg(i64) #0 +define i32 @literal_load.creator() #0 { +entry: %literal = alloca %Test, align 4 %literal1 = alloca %Test, align 4 %0 = bitcast %Test* %literal to i32* @@ -32,5 +34,5 @@ declare void @blorg(i64) store i32 0, i32* %4, align 4 %5 = getelementptr inbounds %Test, %Test* %literal1, i32 0, i32 0 %6 = load i32, i32* %5, align 4 - %7 = zext i32 %6 to i64 - ret i64 %7 \ No newline at end of file + ret i32 %6 +} \ No newline at end of file diff --git a/test/test_suite/abi/vec2_aarch64.c3t b/test/test_suite/abi/vec2_aarch64.c3t index 24f350614..20a1a25a6 100644 --- a/test/test_suite/abi/vec2_aarch64.c3t +++ b/test/test_suite/abi/vec2_aarch64.c3t @@ -15,7 +15,7 @@ extern fn Vector2 vector2_subtract_value(Vector2 v, float sub) { return Vector2 // #expect: abi.ll %Vector2 = type { float, float } -define %Vector2 
@vector2_zero() +define %Vector2 @vector2_zero() #0 { entry: %literal = alloca %Vector2, align 4 %0 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 @@ -26,7 +26,7 @@ entry: ret %Vector2 %2 } -define %Vector2 @vector2_one() +define %Vector2 @vector2_one() #0 { entry: %literal = alloca %Vector2, align 4 %0 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 @@ -37,83 +37,64 @@ entry: ret %Vector2 %2 } - -define %Vector2 @vector2_add(float %0, float %1, float %2, float %3) +define %Vector2 @vector2_add([2 x float] %0, [2 x float] %1) #0 { entry: %v1 = alloca %Vector2, align 4 %v2 = alloca %Vector2, align 4 %literal = alloca %Vector2, align 4 - %coerce = bitcast %Vector2* %v1 to { float, float }* - %4 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 0 - store float %0, float* %4, align 4 - %5 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 1 - store float %1, float* %5, align 4 - %coerce1 = bitcast %Vector2* %v2 to { float, float }* - %6 = getelementptr inbounds { float, float }, { float, float }* %coerce1, i32 0, i32 0 - store float %2, float* %6, align 4 - %7 = getelementptr inbounds { float, float }, { float, float }* %coerce1, i32 0, i32 1 - store float %3, float* %7, align 4 - %8 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 - store float 0.000000e+00, float* %8, align 4 - %9 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 - store float 0.000000e+00, float* %9, align 4 - %10 = load %Vector2, %Vector2* %literal, align 4 - ret %Vector2 %10 + %2 = bitcast %Vector2* %v1 to [2 x float]* + store [2 x float] %0, [2 x float]* %2, align 4 + %3 = bitcast %Vector2* %v2 to [2 x float]* + store [2 x float] %1, [2 x float]* %3, align 4 + %4 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 + store float 0.000000e+00, float* %4, align 4 + %5 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 + store 
float 0.000000e+00, float* %5, align 4 + %6 = load %Vector2, %Vector2* %literal, align 4 + ret %Vector2 %6 } -define %Vector2 @vector2_add_value(float %0, float %1, float %2) +define %Vector2 @vector2_add_value([2 x float] %0, float %1) #0 { entry: %v = alloca %Vector2, align 4 %literal = alloca %Vector2, align 4 - %coerce = bitcast %Vector2* %v to { float, float }* - %3 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 0 - store float %0, float* %3, align 4 - %4 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 1 - store float %1, float* %4, align 4 - %5 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 - store float 0.000000e+00, float* %5, align 4 - %6 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 - store float 0.000000e+00, float* %6, align 4 - %7 = load %Vector2, %Vector2* %literal, align 4 - ret %Vector2 %7 + %2 = bitcast %Vector2* %v to [2 x float]* + store [2 x float] %0, [2 x float]* %2, align 4 + %3 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 + store float 0.000000e+00, float* %3, align 4 + %4 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 + store float 0.000000e+00, float* %4, align 4 + %5 = load %Vector2, %Vector2* %literal, align 4 + ret %Vector2 %5 } -define %Vector2 @vector2_subtract(float %0, float %1, float %2, float %3) +define %Vector2 @vector2_subtract([2 x float] %0, [2 x float] %1) #0 { entry: %v1 = alloca %Vector2, align 4 %v2 = alloca %Vector2, align 4 %literal = alloca %Vector2, align 4 - %coerce = bitcast %Vector2* %v1 to { float, float }* - %4 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 0 - store float %0, float* %4, align 4 - %5 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 1 - store float %1, float* %5, align 4 - %coerce1 = bitcast %Vector2* %v2 to { float, float }* - %6 = getelementptr inbounds { float, float }, { float, 
float }* %coerce1, i32 0, i32 0 - store float %2, float* %6, align 4 - %7 = getelementptr inbounds { float, float }, { float, float }* %coerce1, i32 0, i32 1 - store float %3, float* %7, align 4 - %8 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 - store float 0.000000e+00, float* %8, align 4 - %9 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 - store float 0.000000e+00, float* %9, align 4 - %10 = load %Vector2, %Vector2* %literal, align 4 - ret %Vector2 %10 + %2 = bitcast %Vector2* %v1 to [2 x float]* + store [2 x float] %0, [2 x float]* %2, align 4 + %3 = bitcast %Vector2* %v2 to [2 x float]* + store [2 x float] %1, [2 x float]* %3, align 4 + %4 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 + store float 0.000000e+00, float* %4, align 4 + %5 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 + store float 0.000000e+00, float* %5, align 4 + %6 = load %Vector2, %Vector2* %literal, align 4 + ret %Vector2 %6 } -define %Vector2 @vector2_subtract_value(float %0, float %1, float %2) +define %Vector2 @vector2_subtract_value([2 x float] %0, float %1) #0 { entry: %v = alloca %Vector2, align 4 %literal = alloca %Vector2, align 4 - %coerce = bitcast %Vector2* %v to { float, float }* - %3 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 0 - store float %0, float* %3, align 4 - %4 = getelementptr inbounds { float, float }, { float, float }* %coerce, i32 0, i32 1 - store float %1, float* %4, align 4 - %5 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 - store float 0.000000e+00, float* %5, align 4 - %6 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 - store float 0.000000e+00, float* %6, align 4 - %7 = load %Vector2, %Vector2* %literal, align 4 - ret %Vector2 %7 + %2 = bitcast %Vector2* %v to [2 x float]* + store [2 x float] %0, [2 x float]* %2, align 4 + %3 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 0 + 
store float 0.000000e+00, float* %3, align 4 + %4 = getelementptr inbounds %Vector2, %Vector2* %literal, i32 0, i32 1 + store float 0.000000e+00, float* %4, align 4 + %5 = load %Vector2, %Vector2* %literal, align 4 + ret %Vector2 %5 } diff --git a/test/test_suite/functions/splat_aarch64.c3t b/test/test_suite/functions/splat_aarch64.c3t index 9262b0504..b069960f9 100644 --- a/test/test_suite/functions/splat_aarch64.c3t +++ b/test/test_suite/functions/splat_aarch64.c3t @@ -15,13 +15,23 @@ fn void test() // #expect: splat.ll -%vararg = alloca %"int[]", align 8 +%"int[]" = type { i32*, i64 } + +declare i32 @sum_us([2 x i64]) #0 + +define void @splat.test() #0 { +entry: + %vararg = alloca %"int[]", align 8 %varargslots = alloca [3 x i32], align 4 + %tempcoerce = alloca [2 x i64], align 8 %x = alloca [3 x i32], align 4 %z = alloca %"int[]", align 8 %vararg1 = alloca %"int[]", align 8 - %vararg2 = alloca %"int[]", align 8 + %tempcoerce2 = alloca [2 x i64], align 8 %vararg3 = alloca %"int[]", align 8 + %tempcoerce4 = alloca [2 x i64], align 8 + %vararg5 = alloca %"int[]", align 8 + %tempcoerce6 = alloca [2 x i64], align 8 %0 = getelementptr inbounds [3 x i32], [3 x i32]* %varargslots, i64 0, i64 0 store i32 1, i32* %0, align 4 %1 = getelementptr inbounds [3 x i32], [3 x i32]* %varargslots, i64 0, i64 1 @@ -33,42 +43,40 @@ fn void test() %4 = getelementptr inbounds %"int[]", %"int[]"* %vararg, i32 0, i32 0 %5 = bitcast [3 x i32]* %varargslots to i32* store i32* %5, i32** %4, align 8 - %6 = bitcast %"int[]"* %vararg to { i64, i64 }* - %7 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %6, i32 0, i32 0 - %8 = load i64, i64* %7, align 8 - %9 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %6, i32 0, i32 1 - %10 = load i64, i64* %9, align 8 - %11 = call i32 @sum_us(i64 %8, i64 %10) - %12 = bitcast [3 x i32]* %x to i8* - call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %12, i8* align 4 bitcast ([3 x i32]* @.__const to i8*), i32 12, i1 false) - %13 = bitcast [3 x i32]* 
%x to i32* - %14 = insertvalue %"int[]" undef, i32* %13, 0 - %15 = insertvalue %"int[]" %14, i64 3, 1 - store %"int[]" %15, %"int[]"* %z, align 8 - %16 = getelementptr inbounds %"int[]", %"int[]"* %vararg1, i32 0, i32 1 - %17 = getelementptr inbounds %"int[]", %"int[]"* %vararg1, i32 0, i32 0 - store i64 3, i64* %16, align 8 - %18 = bitcast [3 x i32]* %x to i32* - store i32* %18, i32** %17, align 8 - %19 = bitcast %"int[]"* %vararg1 to { i64, i64 }* - %20 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %19, i32 0, i32 0 - %21 = load i64, i64* %20, align 8 - %22 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %19, i32 0, i32 1 - %23 = load i64, i64* %22, align 8 - %24 = call i32 @sum_us(i64 %21, i64 %23) - %25 = getelementptr inbounds %"int[]", %"int[]"* %vararg2, i32 0, i32 1 - %26 = getelementptr inbounds %"int[]", %"int[]"* %vararg2, i32 0, i32 0 - %27 = bitcast %"int[]"* %z to { i64, i64 }* - %28 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %27, i32 0, i32 0 - %29 = load i64, i64* %28, align 8 - %30 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %27, i32 0, i32 1 - %31 = load i64, i64* %30, align 8 - %32 = call i32 @sum_us(i64 %29, i64 %31) - %33 = getelementptr inbounds %"int[]", %"int[]"* %vararg3, i32 0, i32 1 - store i64 0, i64* %33, align 8 - %34 = bitcast %"int[]"* %vararg3 to { i64, i64 }* - %35 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %34, i32 0, i32 0 - %36 = load i64, i64* %35, align 8 - %37 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %34, i32 0, i32 1 - %38 = load i64, i64* %37, align 8 - %39 = call i32 @sum_us(i64 %36, i64 %38) + %6 = bitcast [2 x i64]* %tempcoerce to i8* + %7 = bitcast %"int[]"* %vararg to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %6, i8* align 8 %7, i32 16, i1 false) + %8 = load [2 x i64], [2 x i64]* %tempcoerce, align 8 + %9 = call i32 @sum_us([2 x i64] %8) + %10 = bitcast [3 x i32]* %x to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %10, i8* align 4 bitcast ([3 x 
i32]* @.__const to i8*), i32 12, i1 false) + %11 = bitcast [3 x i32]* %x to i32* + %12 = insertvalue %"int[]" undef, i32* %11, 0 + %13 = insertvalue %"int[]" %12, i64 3, 1 + store %"int[]" %13, %"int[]"* %z, align 8 + %14 = getelementptr inbounds %"int[]", %"int[]"* %vararg1, i32 0, i32 1 + %15 = getelementptr inbounds %"int[]", %"int[]"* %vararg1, i32 0, i32 0 + store i64 3, i64* %14, align 8 + %16 = bitcast [3 x i32]* %x to i32* + store i32* %16, i32** %15, align 8 + %17 = bitcast [2 x i64]* %tempcoerce2 to i8* + %18 = bitcast %"int[]"* %vararg1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %17, i8* align 8 %18, i32 16, i1 false) + %19 = load [2 x i64], [2 x i64]* %tempcoerce2, align 8 + %20 = call i32 @sum_us([2 x i64] %19) + %21 = getelementptr inbounds %"int[]", %"int[]"* %vararg3, i32 0, i32 1 + %22 = getelementptr inbounds %"int[]", %"int[]"* %vararg3, i32 0, i32 0 + %23 = bitcast [2 x i64]* %tempcoerce4 to i8* + %24 = bitcast %"int[]"* %z to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %23, i8* align 8 %24, i32 16, i1 false) + %25 = load [2 x i64], [2 x i64]* %tempcoerce4, align 8 + %26 = call i32 @sum_us([2 x i64] %25) + %27 = getelementptr inbounds %"int[]", %"int[]"* %vararg5, i32 0, i32 1 + store i64 0, i64* %27, align 8 + %28 = bitcast [2 x i64]* %tempcoerce6 to i8* + %29 = bitcast %"int[]"* %vararg5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %28, i8* align 8 %29, i32 16, i1 false) + %30 = load [2 x i64], [2 x i64]* %tempcoerce6, align 8 + %31 = call i32 @sum_us([2 x i64] %30) + ret void +} diff --git a/test/test_suite/struct/struct_as_value_aarch64.c3t b/test/test_suite/struct/struct_as_value_aarch64.c3t index e3c6ab4af..22c83ccb3 100644 --- a/test/test_suite/struct/struct_as_value_aarch64.c3t +++ b/test/test_suite/struct/struct_as_value_aarch64.c3t @@ -15,7 +15,7 @@ fn Event test(int x) // #expect: test.ll -define i64 @test.test(i32 %0) #0 { +define i32 @test.test(i32 %0) #0 { entry: %foo = alloca %Event, align 4 
%bar = alloca %Event, align 4 @@ -40,6 +40,5 @@ cond.phi: ; preds = %cond.rhs, %cond.lhs store %Event %val, %Event* %taddr, align 4 %5 = getelementptr inbounds %Event, %Event* %taddr, i32 0, i32 0 %6 = load i32, i32* %5, align 4 - %7 = zext i32 %6 to i64 - ret i64 %7 + ret i32 %6 }