Dev (#2545)

* Optimize vector load / store. Fixes to alignment. Support typedef with `@simd` and `@align` #2543. Update vector ABI #2542
* Fix alignment issue with indirect arguments.
2026-02-27 12:01:16 +00:00 · 2025-10-25 12:31:06 +02:00
parent f37e7460aa
commit 423152202f
72 changed files with 2403 additions and 1718 deletions
--- a/test/test_suite/abi/aarch64_hfa_args.c3t
+++ b/test/test_suite/abi/aarch64_hfa_args.c3t
@@ -1,32 +1,34 @@
 // #target: macos-aarch64
 module test;
-alias Int8x16 = ichar[<16>];
-alias Float32x3 = float[<3>];
+typedef Int8x16 = ichar[<16>] @simd;
+typedef Float32x4 = float[<4>] @simd;

-struct HFAv3
+struct HFAv4
 {
-    Float32x3[4] arr;
+    Float32x4[4] arr;
 }

-struct MixedHFAv3
+struct MixedHFAv4
 {
-    Float32x3[3] arr;
+    Float32x4[3] arr;
    Int8x16 b;
 }

-fn HFAv3 test(HFAv3 a0, HFAv3 a1, HFAv3 a2) {
-  return a2;
+fn HFAv4 test(HFAv4 a0, HFAv4 a1, HFAv4 a2)
+{
+	return a2;
 }

-fn MixedHFAv3 test_mixed(MixedHFAv3 a0, MixedHFAv3 a1, MixedHFAv3 a2) {
-  return a2;
+fn MixedHFAv4 test_mixed(MixedHFAv4 a0, MixedHFAv4 a1, MixedHFAv4 a2)
+{
+	return a2;
 }

 /* #expect: test.ll

+%.introspect = type { i8, i64, ptr, i64, i64, i64, [0 x i64] }
+%HFAv4 = type { [4 x <4 x float>] }
+%MixedHFAv4 = type { [3 x <4 x float>], <16 x i8> }

-%HFAv3 = type { [4 x <3 x float>] }
-%MixedHFAv3 = type { [3 x <3 x float>], <16 x i8> }
-
-define %HFAv3 @test.test([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2)
-define %MixedHFAv3 @test.test_mixed([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2) #0 {
+define %HFAv4 @test.test([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2) #0 {
+define %MixedHFAv4 @test.test_mixed([4 x <4 x float>] %0, [4 x <4 x float>] %1, [4 x <4 x float>] %2) #0 {
--- a/test/test_suite/abi/aarch64_hfa_args_no.c3t
+++ b/test/test_suite/abi/aarch64_hfa_args_no.c3t
@@ -0,0 +1,77 @@
+// #target: macos-aarch64
+module test;
+alias Int8x16 = ichar[<16>];
+alias Float32x3 = float[<3>];
+
+struct HFAv3
+{
+    Float32x3[4] arr;
+}
+
+struct HFAv3arr
+{
+    float[3][4] arr;
+}
+
+struct MixedHFAv3
+{
+    Float32x3[3] arr;
+    Int8x16 b;
+}
+
+struct MixedHFAv3arr
+{
+    float[<3>][3] arr;
+    ichar[16] b;
+}
+
+fn HFAv3 test(HFAv3 a0, HFAv3 a1, HFAv3 a2)
+{
+	return a2;
+}
+
+fn HFAv3arr test_arr(HFAv3arr a0, HFAv3arr a1, HFAv3arr a2)
+{
+	return a2;
+}
+
+fn MixedHFAv3 test_mixed(MixedHFAv3 a0, MixedHFAv3 a1, MixedHFAv3 a2)
+{
+	return a2;
+}
+
+fn MixedHFAv3arr test_mixed_arr(MixedHFAv3arr a0, MixedHFAv3arr a1, MixedHFAv3arr a2)
+{
+	return a2;
+}
+
+fn int main()
+{
+	MixedHFAv3 a;
+	MixedHFAv3arr b;
+	Int8x16 c;
+	Float32x3 d;
+	return 0;
+}
+
+/* #expect: test.ll
+
+
+%HFAv3 = type { [4 x [3 x float]] }
+%HFAv3arr = type { [4 x [3 x float]] }
+%MixedHFAv3 = type { [3 x [3 x float]], [16 x i8] }
+%MixedHFAv3arr = type { [3 x [3 x float]], [16 x i8] }
+
+define void @test.test(ptr noalias sret(%HFAv3) align 4 %0, ptr align 4 %1, ptr align 4 %2, ptr align 4 %3) #0 {
+define void @test.test_arr(ptr noalias sret(%HFAv3arr) align 4 %0, ptr align 4 %1, ptr align 4 %2, ptr align 4 %3)
+define void @test.test_mixed(ptr noalias sret(%MixedHFAv3) align 4 %0, ptr align 4 %1, ptr align 4 %2, ptr align 4 %3)
+define void @test.test_mixed_arr(ptr noalias sret(%MixedHFAv3arr) align 4 %0, ptr align 4 %1, ptr align 4 %2, ptr align 4 %3)
+
+  %a = alloca %MixedHFAv3, align 4
+  %b = alloca %MixedHFAv3arr, align 4
+  %c = alloca <16 x i8>, align 16
+  %d = alloca <3 x float>, align 16
+  call void @llvm.memset.p0.i64(ptr align 4 %a, i8 0, i64 52, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %b, i8 0, i64 52, i1 false)
+  store <16 x i8> zeroinitializer, ptr %c, align 16
+  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr %d, align 16
--- a/test/test_suite/abi/darwin64_avx.c3t
+++ b/test/test_suite/abi/darwin64_avx.c3t
@@ -2,7 +2,7 @@
 // #opt: --x86cpu=avx1
 module test;

-alias Mm256 = float[<8>];
+typedef Mm256 = float[<8>] @simd;
 struct St256
 {
    Mm256 m;
@@ -20,7 +20,7 @@ fn void f39() { f38(x38); f37(x37); }

 // CHECK: declare void @func40(%struct.t128* byval(%struct.t128) align 16)

-alias Mm128 = float[<4>];
+typedef Mm128 = float[<4>] @simd;
 struct Two128
 {
 	Mm128 m;
@@ -32,24 +32,25 @@ fn void func41(Two128 s)
 {
    func40(s);
 }
-
 struct Atwo128
 {
 	Mm128[2] array;
 }

-struct Sa {
-  Atwo128 x;
+struct Sa
+{
+	Atwo128 x;
 }

 extern fn void func42(Sa s);
-fn void func43(Sa s) {
-  func42(s);
+fn void func43(Sa s)
+{
+	func42(s);
 }


-alias Vec46 = float[<2>];
-extern fn void f46(Vec46,Vec46,Vec46,Vec46,Vec46,Vec46,Vec46,Vec46,Vec46,Vec46);
+typedef Vec46 = float[<2>] @simd;
+extern fn void f46(Vec46 a, Vec46 b, Vec46 c, Vec46 d, Vec46 e, Vec46 f, Vec46 g, Vec46 h, Vec46 i, Vec46 j);
 fn void test46() { Vec46 x = {1,2}; f46(x,x,x,x,x,x,x,x,x,x); }

 struct Vec47 { uint a; }
@@ -62,20 +63,23 @@ fn void test49(double d, double e) { test49_helper(d, e); }
 struct Complex { double i; double c; }
 extern fn void test52_helper(int, ...);
 Mm256 x52;
-fn void test52() {
-  test52_helper(0, x52, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
+fn void test52()
+{
+	test52_helper(0, x52, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
 }

 extern fn void test54_helper(Mm256, ...);
 Mm256 x54;
-fn void test54() {
-  test54_helper(x54, x54, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
-  test54_helper(x54, x54, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
+fn void test54()
+{
+	test54_helper(x54, x54, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
+	test54_helper(x54, x54, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 0, 1.0 });
 }

-alias Mm512 = float[<16>];
-struct St512 {
-  Mm512 m;
+typedef Mm512 = float[<16>] @simd;
+struct St512
+{
+	Mm512 m;
 }

 St512 x55;
@@ -86,27 +90,32 @@ extern fn void f55(St512 x);
 extern fn void f56(Mm512 x);
 fn void f57() { f55(x55); f56(x56); }

-struct Two256 {
-  Mm256 m;
-  Mm256 n;
+struct Two256
+{
+	Mm256 m;
+	Mm256 n;
 }

 extern fn void f58(Two256 s);
-fn void f59(Two256 s) {
-  f58(s);
+fn void f59(Two256 s)
+{
+	f58(s);
 }

-struct Atwo256 {
-  Mm256[2] array;
+struct Atwo256
+{
+	Mm256[2] array;
 }

-struct SAtwo256 {
-  Atwo256 x;
+struct SAtwo256
+{
+	Atwo256 x;
 }

 extern fn void f60(SAtwo256 s);
-fn void f61(SAtwo256 s) {
-  f60(s);
+fn void f61(SAtwo256 s)
+{
+	f60(s);
 }


--- a/test/test_suite/abi/darwin64_avx512.c3t
+++ b/test/test_suite/abi/darwin64_avx512.c3t
@@ -2,10 +2,11 @@
 // #opt: --x86cpu=avx512
 module test;

-alias Mm256 = float[<8>];
-alias Mm512 = float[<16>];
-struct St512 {
-  Mm512 m;
+typedef Mm256 = float[<8>] @simd;
+typedef Mm512 = float[<16>] @simd;
+struct St512
+{
+	Mm512 m;
 }

 St512 x55;
@@ -16,27 +17,32 @@ extern fn void f55(St512 x);
 extern fn void f56(Mm512 x);
 fn void f57() { f55(x55); f56(x56); }

-struct Two256 {
-  Mm256 m;
-  Mm256 n;
+struct Two256
+{
+	Mm256 m;
+	Mm256 n;
 }

 extern fn void f58(Two256 s);
-fn void f59(Two256 s) {
-  f58(s);
+fn void f59(Two256 s)
+{
+	f58(s);
 }

-struct Atwo256 {
-  Mm256[2] array;
+struct Atwo256
+{
+	Mm256[2] array;
 }

-struct SAtwo256 {
-  Atwo256 x;
+struct SAtwo256
+{
+	Atwo256 x;
 }

 extern fn void f60(SAtwo256 s);
-fn void f61(SAtwo256 s) {
-  f60(s);
+fn void f61(SAtwo256 s)
+{
+	f60(s);
 }

 struct Complex { double i; double c; }
@@ -44,15 +50,17 @@ struct Complex { double i; double c; }
 // AVX512: @f62_helper(i32 0, <16 x float> {{%[a-zA-Z0-9]+}}, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double {{%[a-zA-Z0-9]+}}, double {{%[a-zA-Z0-9]+}})
 extern fn void f62_helper(int, ...);
 Mm512 x62;
-fn void f62() {
-  f62_helper(0, x62, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
+fn void f62()
+{
+	f62_helper(0, x62, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
 }

 extern fn void f64_helper(Mm512, ...);
 Mm512 x64;
-fn void f64() {
-  f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
-  f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
+fn void f64()
+{
+	f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
+	f64_helper(x64, x64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, (Complex) { 1.0, 1.0 });
 }


--- a/test/test_suite/abi/darwin64_sret.c3t
+++ b/test/test_suite/abi/darwin64_sret.c3t
@@ -12,8 +12,9 @@ fn SimdDouble4x4 ident(SimdDouble4x4 x) {

 /* #expect: foo.ll

-define void @foo.ident(ptr noalias sret(%SimdDouble4x4) align 32 %0, ptr byval(%SimdDouble4x4) align 32 %1) #0 {
+
+define void @foo.ident(ptr noalias sret(%SimdDouble4x4) align 8 %0, ptr byval(%SimdDouble4x4) align 8 %1) #0 {
 entry:
-  call void @llvm.memcpy.p0.p0.i32(ptr align 32 %0, ptr align 32 %1, i32 128, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %0, ptr align 8 %1, i32 128, i1 false)
  ret void
 }
--- a/test/test_suite/abi/darwin64_sse.c3t
+++ b/test/test_suite/abi/darwin64_sse.c3t
@@ -2,9 +2,10 @@
 // #opt: --x86cpu=sse4
 module test;

-alias Mm256 = float[<8>];
-struct St256 {
-  Mm256 m;
+typedef Mm256 = float[<8>] @simd;
+struct St256
+{
+	Mm256 m;
 }

 St256 x38;
@@ -19,28 +20,33 @@ fn void f39() { f38(x38); f37(x37); }

 // CHECK: declare void @func40(%struct.t128* byval(%struct.t128) align 16)

-alias Mm128 = float[<4>];
-struct Two128 {
-  Mm128 m;
-  Mm128 n;
+typedef Mm128 = float[<4>] @simd;
+struct Two128
+{
+	Mm128 m;
+	Mm128 n;
 }

 extern fn void func40(Two128 s);
-fn void func41(Two128 s) {
-  func40(s);
+fn void func41(Two128 s)
+{
+	func40(s);
 }

-struct Atwo128 {
-  Mm128[2] array;
+struct Atwo128
+{
+	Mm128[2] array;
 }

-struct Sa {
-  Atwo128 x;
+struct Sa
+{
+	Atwo128 x;
 }

 extern fn void func42(Sa s);
-fn void func43(Sa s) {
-  func42(s);
+fn void func43(Sa s)
+{
+	func42(s);
 }


--- a/test/test_suite/abi/darwinx64_2.c3t
+++ b/test/test_suite/abi/darwinx64_2.c3t
@@ -11,29 +11,28 @@ fn void f12_1(St12 a0) {}

 struct St13_0 { long[3] f0; }
 struct St13_1 { long[2] f0; }
-fn St13_0 f13(int a, int b, int c, int d,
-                 St13_1 e, int f) { while (1) {} }
+fn St13_0 f13(int a, int b, int c, int d, St13_1 e, int f) { while (1) {} }

 fn void f14(int a, int b, int c, int d, int e, int f, ichar x) {}

 fn void f15(int a, int b, int c, int d, int e, int f, void *x) {}

-fn void f16(float a, float b, float c, float d, float e, float f, float g, float h,
-         float x) {}
+fn void f16(float a, float b, float c, float d, float e, float f, float g, float h, float x) {}

 struct Fl18_s0 { int f0; }
 fn void fl18(int a, Fl18_s0 f18_arg1) { while (1) {} }

-struct St20 @align(32) {
-  int x;
-  int y;
+struct St20 @align(32)
+{
+	int x;
+	int y;
 }
 fn void f20(St20 x) {}

 struct StringRef
 {
-  int x;
-  char* ptr;
+	int x;
+	char* ptr;
 }
 fn char *f21(StringRef s) { return s.x+s.ptr; }

@@ -43,105 +42,114 @@ fn void f22(St22s x, St22s y) { }



-struct St23S {
-  short f0;
-  uint f1;
-  int f2;
+struct St23S
+{
+	short f0;
+	uint f1;
+	int f2;
 }


-fn void f23(int a, St23S b) {
-}
+fn void f23(int a, St23S b)  {}

 struct St24s { int a; int b; }

 fn St23S f24(St23S *x, St24s *p2)
 {
-  return *x;
-
+	return *x;
 }

-fn float[<4>] f25(float[<4>] x) {
-  return x+x;
+typedef Float4v = float[<4>] @simd;
+fn Float4v f25(Float4v x)
+{
+	return x+x;
 }

-struct Foo26 {
-  int *x;
-  float *y;
+struct Foo26
+{
+	int *x;
+	float *y;
 }

-fn Foo26 f26(Foo26 *p) {
-  return *p;
+fn Foo26 f26(Foo26 *p)
+{
+	return *p;
 }


-struct V4f32wrapper {
-  float[<4>] v;
+struct V4f32wrapper
+{
+	Float4v v;
 }

-fn V4f32wrapper f27(V4f32wrapper x) {
-  return x;
+fn V4f32wrapper f27(V4f32wrapper x)
+{
+	return x;
 }

 // PR22563 - We should unwrap simple structs and arrays to pass
 // and return them in the appropriate vector registers if possible.

-alias V8f32 = float[<8>];
-struct V8f32wrapper {
-  V8f32 v;
+typedef V8f32 = float[<8>] @simd;
+struct V8f32wrapper
+{
+	V8f32 v;
 }

-fn V8f32wrapper f27a(V8f32wrapper x) {
-  return x;
+fn V8f32wrapper f27a(V8f32wrapper x)
+{
+	return x;
 }

-struct V8f32wrapper_wrapper {
-  V8f32[1] v;
+struct V8f32wrapper_wrapper
+{
+	V8f32[1] v;
 }

-fn V8f32wrapper_wrapper f27b(V8f32wrapper_wrapper x) {
-  return x;
+fn V8f32wrapper_wrapper f27b(V8f32wrapper_wrapper x)
+{
+	return x;
 }

-struct F28c {
-  double x;
-  int y;
+struct F28c
+{
+	double x;
+	int y;
 }
-fn void f28(F28c c) {
+fn void f28(F28c c)
+{
 }

 struct Inner
 {
-    double x;
-    int y;
+	double x;
+	int y;
 }
 struct F29a
 {
-  Inner[1] c;
+	Inner[1] c;
 }

-fn void f29a(F29a a) {
-}
+fn void f29a(F29a a) {}

-struct St0 {
-    char[8] f0; char f2; char f3; char f4; }
-fn void f30(St0 p_4) {
-}
+struct St0  { char[8] f0; char f2; char f3; char f4; }
+fn void f30(St0 p_4) {}

 struct F31foo { float a, b, c; }
-fn float f31(F31foo x) {
-  return x.c;
+fn float f31(F31foo x)
+{
+	return x.c;
 }

-alias V1i64 = ulong[<1>];
+typedef V1i64 = ulong[<1>] @simd;

 fn V1i64 f34(V1i64 arg) { return arg; }


-alias V1i64_2 = uint[<2>];
+typedef V1i64_2 = uint[<2>] @simd;
 fn V1i64_2 f35(V1i64_2 arg) { return arg+arg; }

-alias V2i32 = float[<2>];
+typedef V2i32 = float[<2>] @simd;
 fn V2i32 f36(V2i32 arg) { return arg; }


--- a/test/test_suite/abi/merge_union_bool_avx512.c3t
+++ b/test/test_suite/abi/merge_union_bool_avx512.c3t
@@ -2,23 +2,25 @@
 // #opt: --x86cpu=avx512
 module abi;

+typedef Bool64v = bool[<64>] @simd;
+
 union Mask64
 {
-    bool[<64>] m;
-    ulong ul;
+	Bool64v m;
+	ulong ul;
 }

 fn Mask64 make_mask(ulong n)
 {
-    Mask64 mask;
-    mask.ul = n;
-    return mask;
+	Mask64 mask;
+	mask.ul = n;
+	return mask;
 }

 fn int main()
 {
-    make_mask(20);
-    return 0;
+	make_mask(20);
+	return 0;
 }

 /* #expect: abi.ll
--- a/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-1.c3t
+++ b/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-1.c3t
@@ -32,8 +32,9 @@ fn float128 f_fp_scalar_3(float128 x) { return x; }
 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  char a, b, c, d;
+struct Tiny
+{
+	char a, b, c, d;
 }

 fn void f_agg_tiny(Tiny x) {
@@ -45,68 +46,80 @@ fn Tiny f_agg_tiny_ret() {
  return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v4i8(char[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+typedef Char4 = char[<4>] @simd;
+fn void f_vec_tiny_v4i8(Char4 x)
+{
+    x[0] = x[1];
+    x[2] = x[3];
 }

-fn char[<4>] f_vec_tiny_v4i8_ret() {
-  return {1, 2, 3, 4};
+fn Char4 f_vec_tiny_v4i8_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v1i32(int[<1>] x) {
-  x[0] = 114;
+typedef Int1 = int[<1>] @simd;
+fn void f_vec_tiny_v1i32(Int1 x)
+{
+	x[0] = 114;
 }

-fn int[<1>] f_vec_tiny_v1i32_ret() {
-  return {1};
+fn Int1 f_vec_tiny_v1i32_ret()
+{
+	return {1};
 }

-struct Small {
-  int a;
-   int* b;
+struct Small
+{
+    int a;
+	int* b;
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
+fn Small f_agg_small_ret()
+{
+	return {1, null};
 }

+typedef Char8 = char[<8>] @simd;

-fn void f_vec_small_v8i8(char[<8>] x) {
-  x[0] = x[7];
+fn void f_vec_small_v8i8(Char8 x)
+{
+	x[0] = x[7];
 }

-fn char[<8>] f_vec_small_v8i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
+fn Char8 f_vec_small_v8i8_ret() => {1, 2, 3, 4, 5, 6, 7, 8};
+
+typedef Long1 = long[<1>] @simd;
+fn void f_vec_small_v1i64(Long1 x)
+{
+	x[0] = 114;
 }

-fn void f_vec_small_v1i64(long[<1>] x) {
-  x[0] = 114;
-}
-
-fn long[<1>] f_vec_small_v1i64_ret() {
-  return {1};
-}
+fn Long1 f_vec_small_v1i64_ret() => {1};

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  long a;
+struct Small_aligned
+{
+	long a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x)
+{
+	return {10};
 }

 // Aggregates greater > 2*xlen will be passed and returned indirectly
@@ -124,13 +137,14 @@ fn Large f_agg_large_ret(int i, ichar j) {
  return {1, 2, 3, 4};
 }

-fn void f_vec_large_v16i8(char[<16>] x) {
-  x[0] = x[7];
+typedef CharV16 = char[<16>] @simd;
+
+fn void f_vec_large_v16i8(CharV16 x)
+{
+	x[0] = x[7];
 }

-fn char[<16>] f_vec_large_v16i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
-}
+fn CharV16 f_vec_large_v16i8_ret() => {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).
--- a/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-2.c3t
+++ b/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-2.c3t
@@ -33,137 +33,156 @@ fn float128 f_fp_scalar_3(float128 x) { return x; }
 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  char a, b, c, d;
+struct Tiny { char a, b, c, d; }
+
+fn void f_agg_tiny(Tiny x)
+{
+	x.a += x.b;
+	x.c += x.d;
 }

-fn void f_agg_tiny(Tiny x) {
-  x.a += x.b;
-  x.c += x.d;
+fn Tiny f_agg_tiny_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn Tiny f_agg_tiny_ret() {
-  return {1, 2, 3, 4};
+typedef Char4 = char[<4>] @simd;
+
+fn void f_vec_tiny_v4i8(Char4 x)
+{
+	x[0] = x[1];
+	x[2] = x[3];
 }

-fn void f_vec_tiny_v4i8(char[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+fn Char4 f_vec_tiny_v4i8_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn char[<4>] f_vec_tiny_v4i8_ret() {
-  return {1, 2, 3, 4};
+typedef Int1 = int[<1>] @simd;
+
+fn void f_vec_tiny_v1i32(Int1 x)
+{
+	x[0] = 114;
 }

-fn void f_vec_tiny_v1i32(int[<1>] x) {
-  x[0] = 114;
+fn Int1 f_vec_tiny_v1i32_ret()
+{
+	return {1};
 }

-fn int[<1>] f_vec_tiny_v1i32_ret() {
-  return {1};
+struct Small
+{
+	int a;
+	int* b;
 }

-struct Small {
-  int a;
-   int* b;
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn Small f_agg_small_ret() { return {1, null}; }
+
+typedef Char8 = char[<8>] @simd;
+fn void f_vec_small_v8i8(Char8 x)
+{
+	x[0] = x[7];
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
+fn Char8 f_vec_small_v8i8_ret()
+{
+	return {1, 2, 3, 4, 5, 6, 7, 8};
 }

-
-fn void f_vec_small_v8i8(char[<8>] x) {
-  x[0] = x[7];
+typedef Long1 = long[<1>] @simd;
+fn void f_vec_small_v1i64(Long1 x)
+{
+	x[0] = 114;
 }

-fn char[<8>] f_vec_small_v8i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
-}
-
-fn void f_vec_small_v1i64(long[<1>] x) {
-  x[0] = 114;
-}
-
-fn long[<1>] f_vec_small_v1i64_ret() {
-  return {1};
+fn Long1 f_vec_small_v1i64_ret()
+{
+	return {1};
 }

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  long a;
+struct Small_aligned
+{
+	long a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x)
+{
+	return {10};
 }

 // Aggregates greater > 2*xlen will be passed and returned indirectly
-struct Large {
-  int a, b, c, d;
+struct Large
+{
+	int a, b, c, d;
 }

-fn void f_agg_large(Large x) {
-  x.a = x.b + x.c + x.d;
+fn void f_agg_large(Large x)
+{
+	x.a = x.b + x.c + x.d;
 }

 // The address where the struct should be written to will be the first
 // argument
-fn Large f_agg_large_ret(int i, ichar j) {
-  return {1, 2, 3, 4};
+fn Large f_agg_large_ret(int i, ichar j)
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_large_v16i8(char[<16>] x) {
-  x[0] = x[7];
+typedef Char16v = char[<16>] @simd;
+fn void f_vec_large_v16i8(Char16v x)
+{
+	x[0] = x[7];
 }

-fn char[<16>] f_vec_large_v16i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
+fn Char16v f_vec_large_v16i8_ret()
+{
+	return {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
 }

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

-fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c,
-                     Large d, char e, ichar f, char g, ichar h) {
-  return g + h;
+fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c, Large d, char e, ichar f, char g, ichar h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_2(int a, long b, long c, float128 d,
-                              char e, ichar f, char g)
+fn Large f_scalar_stack_2(int a, long b, long c, float128 d, char e, ichar f, char g)
 {
-  return {a, e, f, g};
+	return {a, e, f, g};
 }

-fn float128 f_scalar_stack_4(int a, long b, long c, float128 d,
-                             char e, ichar f, char g) {
-  return d;
+fn float128 f_scalar_stack_4(int a, long b, long c, float128 d, char e, ichar f, char g)
+{
+	return d;
 }

 // Aggregates and >=XLen scalars passed on the stack should be lowered just as
 // they would be if passed via registers.

-fn void f_scalar_stack_5(double a, long b, double c, long d, int e,
-                      long f, float g, double h, float128 i) {}
+fn void f_scalar_stack_5(double a, long b, double c, long d, int e, long f, float g, double h, float128 i) {}

-fn void f_agg_stack(double a, long b, double c, long d, Tiny e,
-                 Small f, Small_aligned g, Large h) {}
+fn void f_agg_stack(double a, long b, double c, long d, Tiny e, Small f, Small_aligned g, Large h) {}

 // Ensure that ABI lowering happens as expected for vararg calls. For RV32
 // with the base integer calling convention there will be no observable
@@ -171,10 +190,11 @@ fn void f_agg_stack(double a, long b, double c, long d, Tiny e,

 extern fn int f_va_callee(int, ...);

-fn void f_va_caller() {
-  f_va_callee(1, 2, 3, 4.0, 5.0, (Tiny){6, 7, 8, 9},
-              (Small){10, null}, (Small_aligned){11},
-              (Large){12, 13, 14, 15});
+fn void f_va_caller()
+{
+	f_va_callee(1, 2, 3, 4.0, 5.0, (Tiny){6, 7, 8, 9},
+	          (Small){10, null}, (Small_aligned){11},
+	          (Large){12, 13, 14, 15});
 }


--- a/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-3.c3t
+++ b/test/test_suite/abi/riscv32-ilp32-ilp32f-ilp32d-abi-3.c3t
@@ -8,162 +8,187 @@ fn void f_void() {}
 // Scalar arguments and return values smaller than the word size are extended
 // according to the sign of their type, up to 32 bits

-fn bool f_scalar_0(bool x) { return x; }
+fn bool f_scalar_0(bool x) => x;

-fn ichar f_scalar_1(ichar x) { return x; }
+fn ichar f_scalar_1(ichar x) => x;

-fn char f_scalar_2(char x) { return x; }
+fn char f_scalar_2(char x) => x;

-fn int f_scalar_3(int x) { return x; }
+fn int f_scalar_3(int x) => x;

-fn long f_scalar_4(long x) { return x; }
+fn long f_scalar_4(long x) => x;

-fn int128 f_scalar_5(int128 x) { return x; }
+fn int128 f_scalar_5(int128 x) => x;

-fn float f_fp_scalar_1(float x) { return x; }
+fn float f_fp_scalar_1(float x) => x;

-fn double f_fp_scalar_2(double x) { return x; }
+fn double f_fp_scalar_2(double x) => x;

 // Scalars larger than 2*xlen are passed/returned indirect. However, the
 // RISC-V LLVM backend can handle this fine, so the function doesn't need to
 // be modified.

-fn float128 f_fp_scalar_3(float128 x) { return x; }
+fn float128 f_fp_scalar_3(float128 x)  => x;

 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  char a, b, c, d;
+struct Tiny
+{
+	char a, b, c, d;
 }

-fn void f_agg_tiny(Tiny x) {
-  x.a += x.b;
-  x.c += x.d;
+fn void f_agg_tiny(Tiny x)
+{
+	x.a += x.b;
+	x.c += x.d;
 }

-fn Tiny f_agg_tiny_ret() {
-  return {1, 2, 3, 4};
+fn Tiny f_agg_tiny_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v4i8(char[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+typedef Char4v = char[<4>] @simd;
+typedef Char8v = char[<8>] @simd;
+typedef Char16v = char[<16>] @simd;
+typedef Int1v = int[<1>] @simd;
+typedef Long1v = long[<1>] @simd;
+
+fn void f_vec_tiny_v4i8(Char4v x)
+{
+	x[0] = x[1];
+	x[2] = x[3];
 }

-fn char[<4>] f_vec_tiny_v4i8_ret() {
-  return {1, 2, 3, 4};
+fn Char4v f_vec_tiny_v4i8_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v1i32(int[<1>] x) {
-  x[0] = 114;
+fn void f_vec_tiny_v1i32(Int1v x)
+{
+	x[0] = 114;
 }

-fn int[<1>] f_vec_tiny_v1i32_ret() {
-  return {1};
+fn Int1v f_vec_tiny_v1i32_ret()
+{
+	return {1};
 }

-struct Small {
-  int a;
-   int* b;
+struct Small
+{
+	int a;
+	int* b;
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
+fn Small f_agg_small_ret()
+{
+	return {1, null};
 }


-fn void f_vec_small_v8i8(char[<8>] x) {
-  x[0] = x[7];
+fn void f_vec_small_v8i8(Char8v x)
+{
+	x[0] = x[7];
 }

-fn char[<8>] f_vec_small_v8i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
+fn Char8v f_vec_small_v8i8_ret()
+{
+	return {1, 2, 3, 4, 5, 6, 7, 8};
 }

-fn void f_vec_small_v1i64(long[<1>] x) {
-  x[0] = 114;
+fn void f_vec_small_v1i64(Long1v x)
+{
+	x[0] = 114;
 }

-fn long[<1>] f_vec_small_v1i64_ret() {
-  return {1};
+fn Long1v f_vec_small_v1i64_ret()
+{
+	return {1};
 }

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  long a;
+struct Small_aligned
+{
+	long a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x)
+{
+	return {10};
 }

 // Aggregates greater > 2*xlen will be passed and returned indirectly
-struct Large {
-  int a, b, c, d;
+struct Large
+{
+	int a, b, c, d;
 }

-fn void f_agg_large(Large x) {
-  x.a = x.b + x.c + x.d;
+fn void f_agg_large(Large x)
+{
+	x.a = x.b + x.c + x.d;
 }

 // The address where the struct should be written to will be the first
 // argument
-fn Large f_agg_large_ret(int i, ichar j) {
-  return {1, 2, 3, 4};
+fn Large f_agg_large_ret(int i, ichar j)
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_large_v16i8(char[<16>] x) {
-  x[0] = x[7];
+fn void f_vec_large_v16i8(Char16v x)
+{
+	x[0] = x[7];
 }

-fn char[<16>] f_vec_large_v16i8_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
+fn Char16v f_vec_large_v16i8_ret()
+{
+	return {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
 }

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

-fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c,
-                     Large d, char e, ichar f, char g, ichar h) {
-  return g + h;
+fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c, Large d, char e, ichar f, char g, ichar h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_2(int a, long b, long c, float128 d,
-                              char e, ichar f, char g)
+fn Large f_scalar_stack_2(int a, long b, long c, float128 d, char e, ichar f, char g)
 {
-  return {a, e, f, g};
+	return {a, e, f, g};
 }

-fn float128 f_scalar_stack_4(int a, long b, long c, float128 d,
-                             char e, ichar f, char g) {
-  return d;
+fn float128 f_scalar_stack_4(int a, long b, long c, float128 d, char e, ichar f, char g)
+{
+	return d;
 }

 // Aggregates and >=XLen scalars passed on the stack should be lowered just as
 // they would be if passed via registers.

-fn void f_scalar_stack_5(double a, long b, double c, long d, int e,
-                      long f, float g, double h, float128 i) {}
+fn void f_scalar_stack_5(double a, long b, double c, long d, int e, long f, float g, double h, float128 i) {}

-fn void f_agg_stack(double a, long b, double c, long d, Tiny e,
-                 Small f, Small_aligned g, Large h) {}
+fn void f_agg_stack(double a, long b, double c, long d, Tiny e, Small f, Small_aligned g, Large h) {}

 // Ensure that ABI lowering happens as expected for vararg calls. For RV32
 // with the base integer calling convention there will be no observable
@@ -171,10 +196,11 @@ fn void f_agg_stack(double a, long b, double c, long d, Tiny e,

 extern fn int f_va_callee(int, ...);

-fn void f_va_caller() {
-  f_va_callee(1, 2, 3, 4.0, 5.0, (Tiny){6, 7, 8, 9},
-              (Small){10, null}, (Small_aligned){11},
-              (Large){12, 13, 14, 15});
+fn void f_va_caller()
+{
+	f_va_callee(1, 2, 3, 4.0, 5.0, (Tiny){6, 7, 8, 9},
+			  (Small){10, null}, (Small_aligned){11},
+			  (Large){12, 13, 14, 15});
 }


--- a/test/test_suite/abi/riscv64-lp64-abi.c3t
+++ b/test/test_suite/abi/riscv64-lp64-abi.c3t
@@ -1,20 +1,21 @@
 // #target: linux-riscv64
 module test;

-struct Large {
-  long a, b, c, d;
+struct Large
+{
+	long a, b, c, d;
 }

-alias V32i8 = char[<32>];
+typedef V32i8 = char[<32>] @simd;

-fn int f_scalar_stack_1(int a, int128 b, float c, float128 d, V32i8 e,
-                     char f, char g, char h) {
-  return g + h;
+fn int f_scalar_stack_1(int a, int128 b, float c, float128 d, V32i8 e, char f, char g, char h)
+{
+	return g + h;
 }

-fn Large f_scalar_stack_2(double a, int128 b, float128 c, V32i8 d,
-                              char e, ichar f, char g) {
-  return (Large) {(long)(a), e, f, g};
+fn Large f_scalar_stack_2(double a, int128 b, float128 c, V32i8 d, char e, ichar f, char g)
+{
+	return {(long)(a), e, f, g};
 }

 /* #expect: test.ll
--- a/test/test_suite/abi/riscv64-lp64-lp64f-abi-1.c3t
+++ b/test/test_suite/abi/riscv64-lp64-lp64f-abi-1.c3t
@@ -6,19 +6,20 @@ struct Large {
 }
 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).
+typedef Char32V = char[<32>] @simd;

-fn int f_scalar_stack_1(int a, int128 b, double c, float128 d, char[<32>] e,
-                     char f, ichar g, char h) {
-  return g + h;
+fn int f_scalar_stack_1(int a, int128 b, double c, float128 d, Char32V e, char f, ichar g, char h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_2(double a, int128 b, float128 c, char[<32>] d,
-                              char e, ichar f, char g) {
-  return {(long)a, e, f, g};
+fn Large f_scalar_stack_2(double a, int128 b, float128 c, Char32V d, char e, ichar f, char g)
+{
+	return {(long)a, e, f, g};
 }

 /* #expect: test.ll
--- a/test/test_suite/abi/riscv64-lp64-lp64f-abi-2.c3t
+++ b/test/test_suite/abi/riscv64-lp64-lp64f-abi-2.c3t
@@ -8,18 +8,19 @@ struct Large {
 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

-fn int f_scalar_stack_1(int a, int128 b, double c, float128 d, char[<32>] e,
-                     char f, ichar g, char h) {
-  return g + h;
+typedef Char32V = char[<32>] @simd;
+fn int f_scalar_stack_1(int a, int128 b, double c, float128 d, Char32V e, char f, ichar g, char h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_2(double a, int128 b, float128 c, char[<32>] d,
-                              char e, ichar f, char g) {
-  return {(long)a, e, f, g};
+fn Large f_scalar_stack_2(double a, int128 b, float128 c, Char32V d, char e, ichar f, char g)
+{
+	return {(long)a, e, f, g};
 }

 /* #expect: test.ll
--- a/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-1.c3t
+++ b/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-1.c3t
@@ -15,125 +15,132 @@ fn float128 f_fp_scalar_3(float128 x) { return x; }
 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  ushort a, b, c, d;
+struct Tiny
+{
+	ushort a, b, c, d;
 }

-fn void f_agg_tiny(Tiny x) {
-  x.a += x.b;
-  x.c += x.d;
+fn void f_agg_tiny(Tiny x)
+{
+	x.a += x.b;
+	x.c += x.d;
 }

-fn Tiny f_agg_tiny_ret() {
-  return {1, 2, 3, 4};
+fn Tiny f_agg_tiny_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v4i16(short[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+typedef Short4 = short[<4>] @simd;
+fn void f_vec_tiny_v4i16(Short4 x)
+{
+	x[0] = x[1];
+	x[2] = x[3];
 }

-fn short[<4>] f_vec_tiny_v4i16_ret() {
-  return {1, 2, 3, 4};
+fn Short4 f_vec_tiny_v4i16_ret() => {1, 2, 3, 4};
+
+typedef Long1 = long[<1>] @simd;
+
+fn void f_vec_tiny_v1i64(Long1 x)
+{
+	x[0] = 114;
 }

-fn void f_vec_tiny_v1i64(long[<1>] x) {
-  x[0] = 114;
+fn Long1 f_vec_tiny_v1i64_ret() => {1};
+
+
+struct Small
+{
+	long a;
+	long *b;
 }

-fn long[<1>] f_vec_tiny_v1i64_ret() {
-  return {1};
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-struct Small {
-  long a;
-  long *b;
+fn Small f_agg_small_ret() => {1, null};
+
+typedef Short8 = short[<8>] @simd;
+fn void f_vec_small_v8i16(Short8 x)
+{
+	x[0] = x[7];
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn Short8 f_vec_small_v8i16_ret() => {1, 2, 3, 4, 5, 6, 7, 8};
+
+typedef Int128_1 = int128[<1>] @simd;
+fn void f_vec_small_v1i128(Int128_1 x)
+{
+	x[0] = 114;
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
-}
-
-fn void f_vec_small_v8i16(short[<8>] x) {
-  x[0] = x[7];
-}
-
-fn short[<8>] f_vec_small_v8i16_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
-}
-
-fn void f_vec_small_v1i128(int128[<1>] x) {
-  x[0] = 114;
-}
-
-fn int128[<1>] f_vec_small_v1i128_ret() {
-  return {1};
-}
+fn Int128_1 f_vec_small_v1i128_ret() => {1};

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  int128 a;
+struct Small_aligned
+{
+	int128 a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
-}
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) => {10};

 // Aggregates greater > 2*xlen will be passed and returned indirectly
-struct Large {
-  long a, b, c, d;
+struct Large
+{
+	long a, b, c, d;
 }

-fn void f_agg_large(Large x) {
-  x.a = x.b + x.c + x.d;
+fn void f_agg_large(Large x)
+{
+	x.a = x.b + x.c + x.d;
 }

 // The address where the struct should be written to will be the first
 // argument
-fn Large f_agg_large_ret(int i, ichar j) {
-  return {1, 2, 3, 4};
+fn Large f_agg_large_ret(int i, ichar j) => {1, 2, 3, 4};
+
+typedef Char32V = char[<32>] @simd;
+
+fn void f_vec_large_v32i8(Char32V x)
+{
+	x[0] = x[7];
 }

-fn void f_vec_large_v32i8(char[<32>] x) {
-  x[0] = x[7];
-}
-
-fn char[<32>] f_vec_large_v32i8_ret() {
-  return { [1] = 1, [31] = 31 };
-}
+fn Char32V f_vec_large_v32i8_ret() => { [1] = 1, [31] = 31 };

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

 fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c,
-                     Large d, char e, ichar f, char g, ichar h) {
-  return g + h;
+                     Large d, char e, ichar f, char g, ichar h)
+{
+	return g + h;
 }

-fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, char[<32>] e,
-                     char f, ichar g, char h) {
-  return g + h;
+fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, Char32V e, char f, ichar g, char h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,
-                              char e, ichar f, char g) {
-  return {a, e, f, g};
+fn Large f_scalar_stack_3(uint a, int128 b, float128 c, Char32V d, char e, ichar f, char g)
+{
+	return {a, e, f, g};
 }

 // Ensure that ABI lowering happens as expected for vararg calls.
@@ -143,20 +150,21 @@ fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,

 extern fn int f_va_callee(int, ...);

-fn void f_va_caller() {
+fn void f_va_caller()
+{
 	float128 fq;
-  f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
-              (Small){10, null}, (Small_aligned){11},
-              (Large){12, 13, 14, 15});
-  f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
+	f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
+	           (Small){10, null}, (Small_aligned){11},
+	           (Large){12, 13, 14, 15});
+	f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
 }


--- a/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-2.c3t
+++ b/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-2.c3t
@@ -16,115 +16,143 @@ fn float128 f_fp_scalar_3(float128 x) { return x; }
 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  ushort a, b, c, d;
+struct Tiny
+{
+	ushort a, b, c, d;
 }

-fn void f_agg_tiny(Tiny x) {
-  x.a += x.b;
-  x.c += x.d;
+fn void f_agg_tiny(Tiny x)
+{
+	x.a += x.b;
+	x.c += x.d;
 }

-fn Tiny f_agg_tiny_ret() {
-  return {1, 2, 3, 4};
+fn Tiny f_agg_tiny_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v4i16(short[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+typedef Short4 = short[<4>] @simd;
+fn void f_vec_tiny_v4i16(Short4 x)
+{
+	x[0] = x[1];
+	x[2] = x[3];
 }

-fn short[<4>] f_vec_tiny_v4i16_ret() {
-  return {1, 2, 3, 4};
+fn Short4 f_vec_tiny_v4i16_ret()
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v1i64(long[<1>] x) {
-  x[0] = 114;
+typedef Long1 = long[<1>] @simd;
+fn void f_vec_tiny_v1i64(Long1 x)
+{
+	x[0] = 114;
 }

-fn long[<1>] f_vec_tiny_v1i64_ret() {
-  return {1};
+fn Long1 f_vec_tiny_v1i64_ret()
+{
+	return {1};
 }

-struct Small {
-  long a;
-  long *b;
+struct Small
+{
+	long a;
+	long *b;
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
+fn Small f_agg_small_ret()
+{
+	return {1, null};
 }

-fn void f_vec_small_v8i16(short[<8>] x) {
-  x[0] = x[7];
+typedef Short8 = short[<8>] @simd;
+fn void f_vec_small_v8i16(Short8 x)
+{
+	x[0] = x[7];
 }

-fn short[<8>] f_vec_small_v8i16_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
+fn Short8 f_vec_small_v8i16_ret()
+{
+	return {1, 2, 3, 4, 5, 6, 7, 8};
 }

-fn void f_vec_small_v1i128(int128[<1>] x) {
-  x[0] = 114;
+typedef Int128_1 = int128[<1>] @simd;
+
+fn void f_vec_small_v1i128(Int128_1 x)
+{
+	x[0] = 114;
 }

-fn int128[<1>] f_vec_small_v1i128_ret() {
-  return {1};
+fn Int128_1 f_vec_small_v1i128_ret()
+{
+	return {1};
 }

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  int128 a;
+struct Small_aligned
+{
+	int128 a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x)
+{
+	return {10};
 }

 // Aggregates greater > 2*xlen will be passed and returned indirectly
-struct Large {
-  long a, b, c, d;
+struct Large
+{
+	long a, b, c, d;
 }

-fn void f_agg_large(Large x) {
-  x.a = x.b + x.c + x.d;
+fn void f_agg_large(Large x)
+{
+	x.a = x.b + x.c + x.d;
 }

 // The address where the struct should be written to will be the first
 // argument
-fn Large f_agg_large_ret(int i, ichar j) {
-  return {1, 2, 3, 4};
+fn Large f_agg_large_ret(int i, ichar j)
+{
+	return {1, 2, 3, 4};
 }

-fn void f_vec_large_v32i8(char[<32>] x) {
-  x[0] = x[7];
+typedef Char32V = char[<32>] @simd;
+fn void f_vec_large_v32i8(Char32V x)
+{
+	x[0] = x[7];
 }

-fn char[<32>] f_vec_large_v32i8_ret() {
-  return { [1] = 1, [31] = 31 };
+fn Char32V f_vec_large_v32i8_ret()
+{
+	return { [1] = 1, [31] = 31 };
 }

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

-fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c,
-                     Large d, char e, ichar f, char g, ichar h) {
-  return g + h;
+fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c, Large d, char e, ichar f, char g, ichar h)
+{
+	return g + h;
 }

-fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, char[<32>] e,
-                     char f, ichar g, char h) {
+fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, Char32V e, char f, ichar g, char h)
+{
  return g + h;
 }

@@ -132,9 +160,9 @@ fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, char[<32>] e,
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,
-                              char e, ichar f, char g) {
-  return {a, e, f, g};
+fn Large f_scalar_stack_3(uint a, int128 b, float128 c, Char32V d, char e, ichar f, char g)
+{
+	return {a, e, f, g};
 }

 // Ensure that ABI lowering happens as expected for vararg calls.
@@ -145,19 +173,19 @@ fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,
 extern fn int f_va_callee(int, ...);

 fn void f_va_caller() {
 	float128 fq;
-  f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
-              (Small){10, null}, (Small_aligned){11},
-              (Large){12, 13, 14, 15});
-  f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
+	f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
+	            (Small){10, null}, (Small_aligned){11},
+	            (Large){12, 13, 14, 15});
+	f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
 }


--- a/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-3.c3t
+++ b/test/test_suite/abi/riscv64-lp64-lp64f-lp64d-abi-3.c3t
@@ -16,125 +16,143 @@ fn float128 f_fp_scalar_3(float128 x) { return x; }
 // Aggregates <= 2*xlen may be passed in registers, so will be coerced to
 // integer arguments. The rules for return are the same.

-struct Tiny {
-  ushort a, b, c, d;
+struct Tiny
+{
+	ushort a, b, c, d;
 }

-fn void f_agg_tiny(Tiny x) {
-  x.a += x.b;
-  x.c += x.d;
+fn void f_agg_tiny(Tiny x)
+{
+	x.a += x.b;
+	x.c += x.d;
 }

 fn Tiny f_agg_tiny_ret() {
  return {1, 2, 3, 4};
 }

-fn void f_vec_tiny_v4i16(short[<4>] x) {
-  x[0] = x[1];
-  x[2] = x[3];
+typedef Short4 = short[<4>] @simd;
+typedef Short8 = short[<8>] @simd;
+typedef Long1 = long[<1>] @simd;
+typedef Int128_1 = int128[<1>] @simd;
+typedef Char32v = char[<32>] @simd;
+
+fn void f_vec_tiny_v4i16(Short4 x)
+{
+	x[0] = x[1];
+	x[2] = x[3];
 }

-fn short[<4>] f_vec_tiny_v4i16_ret() {
-  return {1, 2, 3, 4};
+fn Short4 f_vec_tiny_v4i16_ret() => {1, 2, 3, 4};
+
+fn void f_vec_tiny_v1i64(Long1 x)
+{
+	x[0] = 114;
 }

-fn void f_vec_tiny_v1i64(long[<1>] x) {
-  x[0] = 114;
+fn Long1 f_vec_tiny_v1i64_ret()
+{
+	return {1};
 }

-fn long[<1>] f_vec_tiny_v1i64_ret() {
-  return {1};
+struct Small
+{
+	long a;
+	long *b;
 }

-struct Small {
-  long a;
-  long *b;
+fn void f_agg_small(Small x)
+{
+	x.a += *x.b;
+	x.b = &x.a;
 }

-fn void f_agg_small(Small x) {
-  x.a += *x.b;
-  x.b = &x.a;
+fn Small f_agg_small_ret()
+{
+	return {1, null};
 }

-fn Small f_agg_small_ret() {
-  return {1, null};
+fn void f_vec_small_v8i16(Short8 x)
+{
+	x[0] = x[7];
 }

-fn void f_vec_small_v8i16(short[<8>] x) {
-  x[0] = x[7];
+fn Short8 f_vec_small_v8i16_ret() => {1, 2, 3, 4, 5, 6, 7, 8};
+
+fn void f_vec_small_v1i128(Int128_1 x)
+{
+	x[0] = 114;
 }

-fn short[<8>] f_vec_small_v8i16_ret() {
-  return {1, 2, 3, 4, 5, 6, 7, 8};
-}
-
-fn void f_vec_small_v1i128(int128[<1>] x) {
-  x[0] = 114;
-}
-
-fn int128[<1>] f_vec_small_v1i128_ret() {
-  return {1};
+fn Int128_1 f_vec_small_v1i128_ret()
+{
+	return {1};
 }

 // Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
 // single 2*xlen-sized argument, to ensure that alignment can be maintained if
 // passed on the stack.

-struct Small_aligned {
-  int128 a;
+struct Small_aligned
+{
+	int128 a;
 }

-fn void f_agg_small_aligned(Small_aligned x) {
-  x.a += x.a;
+fn void f_agg_small_aligned(Small_aligned x)
+{
+	x.a += x.a;
 }

-fn Small_aligned f_agg_small_aligned_ret(Small_aligned x) {
-  return {10};
+fn Small_aligned f_agg_small_aligned_ret(Small_aligned x)
+{
+	return {10};
 }

 // Aggregates greater > 2*xlen will be passed and returned indirectly
-struct Large {
-  long a, b, c, d;
+struct Large
+{
+	long a, b, c, d;
 }

-fn void f_agg_large(Large x) {
-  x.a = x.b + x.c + x.d;
+fn void f_agg_large(Large x)
+{
+	x.a = x.b + x.c + x.d;
 }

 // The address where the struct should be written to will be the first
 // argument
-fn Large f_agg_large_ret(int i, ichar j) {
-  return {1, 2, 3, 4};
+fn Large f_agg_large_ret(int i, ichar j) => {1, 2, 3, 4};
+
+fn void f_vec_large_v32i8(Char32v x)
+{
+	x[0] = x[7];
 }

-fn void f_vec_large_v32i8(char[<32>] x) {
-  x[0] = x[7];
-}
-
-fn char[<32>] f_vec_large_v32i8_ret() {
-  return { [1] = 1, [31] = 31 };
+fn Char32v f_vec_large_v32i8_ret()
+{
+	return { [1] = 1, [31] = 31 };
 }

 // Scalars passed on the stack should not have signext/zeroext attributes
 // (they are anyext).

-fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c,
-                     Large d, char e, ichar f, char g, ichar h) {
-  return g + h;
+fn int f_scalar_stack_1(Tiny a, Small b, Small_aligned c, Large d, char e, ichar f, char g, ichar h)
+{
+	return g + h;
 }

-fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, char[<32>] e,
-                     char f, ichar g, char h) {
-  return g + h;
+fn int f_scalar_stack_2(int a, int128 b, long c, float128 d, Char32v e, char f, ichar g, char h)
+{
+	return g + h;
 }

 // Ensure that scalars passed on the stack are still determined correctly in
 // the presence of large return values that consume a register due to the need
 // to pass a pointer.

-fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,
-                              char e, ichar f, char g) {
-  return {a, e, f, g};
+fn Large f_scalar_stack_3(uint a, int128 b, float128 c, Char32v d, char e, ichar f, char g)
+{
+	return {a, e, f, g};
 }

 // Ensure that ABI lowering happens as expected for vararg calls.
@@ -144,20 +162,21 @@ fn Large f_scalar_stack_3(uint a, int128 b, float128 c, char[<32>] d,

 extern fn int f_va_callee(int, ...);

-fn void f_va_caller() {
+fn void f_va_caller()
+{
 	float128 fq;
-  f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
-              (Small){10, null}, (Small_aligned){11},
-              (Large){12, 13, 14, 15});
-  f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
-  f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
+	f_va_callee(1, 2, 3L, 4.0f, 5.0, (Tiny){6, 7, 8, 9},
+	            (Small){10, null}, (Small_aligned){11},
+	            (Large){12, 13, 14, 15});
+	f_va_callee(1, 2, 3, 4, fq, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small_aligned){5}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, (Small){5,null}, 6, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, fq, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small_aligned){6}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, (Small){6, null}, 7, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, fq, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small_aligned){7}, 8, 9);
+	f_va_callee(1, 2, 3, 4, 5, 6, (Small){7, null}, 8, 9);
 }


--- a/test/test_suite/abi/sysv_abi_avx.c3t
+++ b/test/test_suite/abi/sysv_abi_avx.c3t
@@ -16,16 +16,18 @@ fn void callit()
 	take_stringref(s);
 }

-extern fn float[<8>] get_m256();
-extern fn void take_m256(float[<8>] x);
-extern fn float[<16>] get_m512();
-extern fn void take_m512(float[<16>] x);
+typedef Mm256 = float[<8>] @simd;
+typedef Mm512 = float[<16>] @simd;
+extern fn Mm256 get_m256();
+extern fn void take_m256(Mm256 x);
+extern fn Mm512 get_m512();
+extern fn void take_m512(Mm512 x);

 fn void use_vectors()
 {
-  float[<8>] v1 = get_m256();
+  Mm256 v1 = get_m256();
  take_m256(v1);
-  float[<16>] v2 = get_m512();
+  Mm512 v2 = get_m512();
  take_m512(v2);
 }

--- a/test/test_suite/abi/sysv_abi_noavx.c3t
+++ b/test/test_suite/abi/sysv_abi_noavx.c3t
@@ -16,16 +16,18 @@ fn void callit()
 	take_stringref(s);
 }

-extern fn float[<8>] get_m256();
-extern fn void take_m256(float[<8>] x);
-extern fn float[<16>] get_m512();
-extern fn void take_m512(float[<16>] x);
+typedef Mv256 = float[<8>] @simd;
+typedef Mv512 = float[<16>] @simd;
+extern fn Mv256 get_m256();
+extern fn void take_m256(Mv256 x);
+extern fn Mv512 get_m512();
+extern fn void take_m512(Mv512 x);

 fn void use_vectors()
 {
-  float[<8>] v1 = get_m256();
+  Mv256 v1 = get_m256();
  take_m256(v1);
-  float[<16>] v2 = get_m512();
+  Mv512 v2 = get_m512();
  take_m512(v2);
 }

--- a/test/test_suite/abi/sysv_vec_array_indirect.c3t
+++ b/test/test_suite/abi/sysv_vec_array_indirect.c3t
@@ -0,0 +1,32 @@
+// #target: linux-x64
+module test;
+import std;
+fn void test(double[<3>] x)
+{
+	double[<3>]* y = &x;
+}
+
+fn int main()
+{
+	test({ 1, 2, 3 });
+	return 0;
+}
+/* #expect: test.ll
+
+define void @test.test(ptr byval([3 x double]) align 8 %0) #0 {
+entry:
+  %x = alloca <3 x double>, align 32
+  %y = alloca ptr, align 8
+  %1 = load <3 x double>, ptr %0, align 8
+  %expandvec = shufflevector <3 x double> %1, <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  store <4 x double> %expandvec, ptr %x, align 32
+  store ptr %x, ptr %y, align 8
+  ret void
+}
+
+entry:
+  %indirectarg = alloca [3 x double], align 8
+  store [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], ptr %indirectarg, align 8
+  call void @test.test(ptr byval([3 x double]) align 8 %indirectarg)
+  ret i32 0
+}
--- a/test/test_suite/abi/vec_update_align.c3t
+++ b/test/test_suite/abi/vec_update_align.c3t
@@ -0,0 +1,60 @@
+// #target: macos-x64
+// #opt: --x86cpu=avx512
+module test;
+alias Int8x16 = ichar[<16>];
+alias Float32x3 = float[<3>];
+
+struct HFAv3
+{
+    Float32x3[4] arr;
+}
+
+struct HFAv3arr
+{
+    float[3][4] arr;
+}
+
+struct MixedHFAv3
+{
+    Float32x3[3] arr;
+    Int8x16 b;
+}
+
+struct MixedHFAv3arr
+{
+    float[<3>][3] arr;
+    ichar[16] b;
+}
+
+fn int main()
+{
+	MixedHFAv3 a;
+	MixedHFAv3arr b;
+	b.arr[1].x++;
+	float x1 = b.arr[0].y;
+	return 0;
+}
+
+/* #expect: test.ll
+
+%MixedHFAv3 = type { [3 x [3 x float]], [16 x i8] }
+%MixedHFAv3arr = type { [3 x [3 x float]], [16 x i8] }
+
+define i32 @main() #0 {
+entry:
+  %a = alloca %MixedHFAv3, align 4
+  %b = alloca %MixedHFAv3arr, align 4
+  %x1 = alloca float, align 4
+  call void @llvm.memset.p0.i64(ptr align 4 %a, i8 0, i64 52, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %b, i8 0, i64 52, i1 false)
+  %ptradd = getelementptr inbounds i8, ptr %b, i64 16
+  %0 = load <3 x float>, ptr %ptradd, align 4
+  %1 = extractelement <3 x float> %0, i64 0
+  %fincdec = fadd float %1, 1.000000e+00
+  %2 = insertelement <3 x float> %0, float %fincdec, i64 0
+  store <3 x float> %2, ptr %ptradd, align 4
+  %3 = load <3 x float>, ptr %b, align 4
+  %4 = extractelement <3 x float> %3, i64 1
+  store float %4, ptr %x1, align 4
+  ret i32 0
+}
--- a/test/test_suite/arrays/index_from_back.c3t
+++ b/test/test_suite/arrays/index_from_back.c3t
@@ -2,21 +2,21 @@

 module test;

-fn void test(int[10] x, int[<10>] y)
+typedef Int16 = int[<16>] @simd;
+fn void test(int[16] x, Int16 y)
 {
-    int a = x[4];
-    int b = x[^2];
-    int c = y[4];
-    int d = y[^2];
-    int j = 3;
-    int e = y[^j];
-    int f = x[^j];
+	int a = x[4];
+	int b = x[^2];
+	int c = y[4];
+	int d = y[^2];
+	int j = 3;
+	int e = y[^j];
+	int f = x[^j];
 }

 /* #expect: test.ll

-; Function Attrs:
-define void @test.test(ptr byval([10 x i32]) align 8 %0, ptr byval(<10 x i32>) align 64 %1) #0 {
+define void @test.test(ptr byval([16 x i32]) align 8 %0, ptr byval(<16 x i32>) align 64 %1) #0 {
 entry:
  %a = alloca i32, align 4
  %b = alloca i32, align 4
@@ -28,25 +28,25 @@ entry:
  %ptradd = getelementptr inbounds i8, ptr %0, i64 16
  %2 = load i32, ptr %ptradd, align 4
  store i32 %2, ptr %a, align 4
-  %ptradd1 = getelementptr inbounds i8, ptr %0, i64 32
+  %ptradd1 = getelementptr inbounds i8, ptr %0, i64 56
  %3 = load i32, ptr %ptradd1, align 4
  store i32 %3, ptr %b, align 4
-  %4 = load <10 x i32>, ptr %1, align 64
-  %5 = extractelement <10 x i32> %4, i64 4
+  %4 = load <16 x i32>, ptr %1, align 64
+  %5 = extractelement <16 x i32> %4, i64 4
  store i32 %5, ptr %c, align 4
-  %6 = load <10 x i32>, ptr %1, align 64
-  %7 = extractelement <10 x i32> %6, i64 8
+  %6 = load <16 x i32>, ptr %1, align 64
+  %7 = extractelement <16 x i32> %6, i64 14
  store i32 %7, ptr %d, align 4
  store i32 3, ptr %j, align 4
-  %8 = load <10 x i32>, ptr %1, align 64
+  %8 = load <16 x i32>, ptr %1, align 64
  %9 = load i32, ptr %j, align 4
  %sext = sext i32 %9 to i64
-  %10 = sub nuw i64 10, %sext
-  %11 = extractelement <10 x i32> %8, i64 %10
+  %10 = sub nuw i64 16, %sext
+  %11 = extractelement <16 x i32> %8, i64 %10
  store i32 %11, ptr %e, align 4
  %12 = load i32, ptr %j, align 4
  %sext2 = sext i32 %12 to i64
-  %13 = sub nuw i64 10, %sext2
+  %13 = sub nuw i64 16, %sext2
  %ptroffset = getelementptr inbounds [4 x i8], ptr %0, i64 %13
  %14 = load i32, ptr %ptroffset, align 4
  store i32 %14, ptr %f, align 4
--- a/test/test_suite/builtins/matrix_builtin.c3t
+++ b/test/test_suite/builtins/matrix_builtin.c3t
@@ -26,13 +26,17 @@ entry:
  %2 = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %0, <4 x i32> %1, i32 2, i32 2, i32 2)
  store <4 x i32> %2, ptr %z, align 16
  store <2 x i32> <i32 1, i32 2>, ptr %a, align 8
-  store <3 x i32> <i32 1, i32 2, i32 3>, ptr %b, align 16
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 undef>, ptr %b, align 16
  %3 = load <2 x i32>, ptr %a, align 8
-  %4 = load <3 x i32>, ptr %b, align 16
-  %5 = call <6 x i32> @llvm.matrix.multiply.v6i32.v2i32.v3i32(<2 x i32> %3, <3 x i32> %4, i32 2, i32 1, i32 3)
-  store <6 x i32> %5, ptr %c, align 32
-  %6 = load <6 x i32>, ptr %c, align 32
-  %7 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %6, i32 2, i32 3)
-  store <6 x i32> %7, ptr %c, align 32
+  %4 = load <4 x i32>, ptr %b, align 16
+  %extractvec = shufflevector <4 x i32> %4, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %5 = call <6 x i32> @llvm.matrix.multiply.v6i32.v2i32.v3i32(<2 x i32> %3, <3 x i32> %extractvec, i32 2, i32 1, i32 3)
+  %expandvec = shufflevector <6 x i32> %5, <6 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+  store <8 x i32> %expandvec, ptr %c, align 32
+  %6 = load <8 x i32>, ptr %c, align 32
+  %extractvec1 = shufflevector <8 x i32> %6, <8 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+  %7 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %extractvec1, i32 2, i32 3)
+  %expandvec2 = shufflevector <6 x i32> %7, <6 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+  store <8 x i32> %expandvec2, ptr %c, align 32
  ret i32 0
 }
--- a/test/test_suite/builtins/shufflevector.c3t
+++ b/test/test_suite/builtins/shufflevector.c3t
@@ -3,6 +3,8 @@
 module test;
 import std::io;

+typedef Vc2 = float[<2>] @simd;
+typedef Vc4 = float[<4>] @simd;
 struct Matrix2x2
 {
 	union
@@ -14,34 +16,34 @@ struct Matrix2x2
 		}
 		struct
 		{
-			float[<2>] m0;
-			float[<2>] m1;
+			Vc2 m0;
+			Vc2 m1;
 		}
-		float[<4>] m;
+		Vc4 m;
 	}
 }

-fn float[<2>] apply1(Matrix2x2* mat, float[<2>] vec)
+fn Vc2 apply1(Matrix2x2* mat, Vc2 vec)
 {
-	return (float[<2>]) {
+	return (Vc2) {
 		mat.m00 * vec[0] + mat.m01 * vec[1],
 		mat.m10 * vec[0] + mat.m11 * vec[1],
 	};
 }

-fn float[<2>] apply2(Matrix2x2* mat, float[<2>] vec)
+fn Vc2 apply2(Matrix2x2* mat, Vc2 vec)
 {
-	return (float[<2>]) {
+	return (Vc2) {
 		mat.m0[0] * vec[0] + mat.m0[1] * vec[1],
 		mat.m1[0] * vec[0] + mat.m1[1] * vec[1],
 	};
 }

-fn float[<2>] apply3(Matrix2x2* mat, float[<2>] vec)
+fn Vc2 apply3(Matrix2x2* mat, Vc2 vec)
 {
-	float[<2>] a = $$swizzle2(mat.m0, mat.m1, 0, 3);
-	float[<2>] b = $$swizzle2(mat.m0, mat.m1, 1, 2);
-	float[<2>] flip = $$swizzle(vec, 1, 0);
+	Vc2 a = (Vc2)$$swizzle2(mat.m0, mat.m1, 0, 3);
+	Vc2 b = (Vc2)$$swizzle2(mat.m0, mat.m1, 1, 2);
+	Vc2 flip = (Vc2)$$swizzle(vec, 1, 0);
 	return a * vec + b * flip;
 }

@@ -49,9 +51,9 @@ fn float[<2>] apply3(Matrix2x2* mat, float[<2>] vec)
 fn void main()
 {
 	Matrix2x2 a = { 1, -3, 5, -7 };
-	io::printfn("1: %s", apply1(&a, (float[<2>]) { 11, 13 }));
-	io::printfn("2: %s", apply2(&a, (float[<2>]) { 11, 13 }));
-	io::printfn("3: %s", apply3(&a, (float[<2>]) { 11, 13 }));
+	io::printfn("1: %s", apply1(&a, (Vc2) { 11, 13 }));
+	io::printfn("2: %s", apply2(&a, (Vc2) { 11, 13 }));
+	io::printfn("3: %s", apply3(&a, (Vc2) { 11, 13 }));
 }

 /* #expect: test.ll
--- a/test/test_suite/cast/implicit_infer_len_cast.c3t
+++ b/test/test_suite/cast/implicit_infer_len_cast.c3t
@@ -6,6 +6,7 @@ macro int test(int[*][*]* y)
 	$typeof(*y) z = *y;
 	return z[1][1];
 }
+
 fn void main()
 {
 	int[2][*] x = { { 2, 3}, { 5, 6 }};
@@ -19,19 +20,19 @@ fn void main()
 /* #expect: test.ll

  %x = alloca [2 x [2 x i32]], align 16
-  %y = alloca [1 x <2 x i32>], align 8
-  %z = alloca [1 x <2 x i32>], align 8
-  %w = alloca [1 x <2 x i32>], align 8
+  %y = alloca [1 x <2 x i32>], align 4
+  %z = alloca [1 x <2 x i32>], align 4
+  %w = alloca [1 x <2 x i32>], align 4
  %aa = alloca %"int[<2>][]", align 8
-  %literal = alloca [1 x <2 x i32>], align 8
+  %literal = alloca [1 x <2 x i32>], align 4
  %bb = alloca [1 x %"int[]"], align 16
  %literal1 = alloca [2 x i32], align 4
  %z2 = alloca [2 x [2 x i32]], align 16
  call void @llvm.memcpy.p0.p0.i32(ptr align 16 %x, ptr align 16 @.__const, i32 16, i1 false)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %y, ptr align 8 @.__const.1, i32 8, i1 false)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %z, ptr align 8 %y, i32 8, i1 false)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %w, ptr align 8 %z, i32 8, i1 false)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %literal, ptr align 8 @.__const.2, i32 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %y, ptr align 4 @.__const.1, i32 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %z, ptr align 4 %y, i32 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %w, ptr align 4 %z, i32 8, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %literal, ptr align 4 @.__const.2, i32 8, i1 false)
  %0 = insertvalue %"int[<2>][]" undef, ptr %literal, 0
  %1 = insertvalue %"int[<2>][]" %0, i64 1, 1
  store %"int[<2>][]" %1, ptr %aa, align 8
--- a/test/test_suite/clang/2002-04.c3t
+++ b/test/test_suite/clang/2002-04.c3t
@@ -146,23 +146,19 @@ entry:
 ; Function Attrs:
 define void @test.test2(ptr byval(%FooSt) align 8 %0) #0 {
 entry:
-  %indirectarg = alloca %FooSt, align 8
-  %indirectarg4 = alloca %FooSt, align 8
  %ptradd = getelementptr inbounds i8, ptr %0, i64 2
  %ptradd1 = getelementptr inbounds i8, ptr %0, i64 4
  %ptradd2 = getelementptr inbounds i8, ptr %0, i64 8
  %ptradd3 = getelementptr inbounds i8, ptr %0, i64 12
  %1 = load i16, ptr %ptradd3, align 4
  %sext = sext i16 %1 to i32
-  %2 = load i8, ptr %0, align 4
+  %2 = load i8, ptr %0, align 8
  %3 = load i16, ptr %ptradd, align 2
  %4 = load i8, ptr %ptradd1, align 4
-  %5 = load i32, ptr %ptradd2, align 4
+  %5 = load i32, ptr %ptradd2, align 8
  %6 = call i32 @testE(i8 zeroext %2, i16 signext %3, i8 zeroext %4, i32 %5, i32 %sext, float 0x3FB99999A0000000)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %indirectarg, ptr align 4 %0, i32 20, i1 false)
-  %7 = call i32 @testF(ptr byval(%FooSt) align 8 %indirectarg, float 0x3FB99999A0000000)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %indirectarg4, ptr align 4 %0, i32 20, i1 false)
-  call void @test.test2(ptr byval(%FooSt) align 8 %indirectarg4)
+  %7 = call i32 @testF(ptr byval(%FooSt) align 8 %0, float 0x3FB99999A0000000)
+  call void @test.test2(ptr byval(%FooSt) align 8 %0)
  call void @test.test3(ptr %0)
  ret void
 }
--- a/test/test_suite/compile_time/compile_time_access_subscript.c3t
+++ b/test/test_suite/compile_time/compile_time_access_subscript.c3t
@@ -71,7 +71,7 @@ fn void main()
  store ptr null, ptr %z6, align 8
  store i8 0, ptr %z7, align 1
  store i64 0, ptr %z8, align 8
-  store <3 x i32> zeroinitializer, ptr %z9, align 16
+  store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr %z9, align 16
  store float 0.000000e+00, ptr %z10, align 4
  %ptradd11 = getelementptr inbounds i8, ptr %z10, i64 4
  store float 0.000000e+00, ptr %ptradd11, align 4
--- a/test/test_suite/compile_time/untyped_conversions.c3t
+++ b/test/test_suite/compile_time/untyped_conversions.c3t
@@ -5,7 +5,8 @@ struct Foo
 {
 	int a; int b;
 }
-fn void test(int[2] a, int[] b, int[<2>] c)
+typedef Int2V = int[<2>] @simd;
+fn void test(int[2] a, int[] b, Int2V c)
 {
 	io::printfn("%s %s %s", a, b, c);
 }
@@ -30,11 +31,12 @@ fn void main()
 %any = type { ptr, i64 }

@"$ct.test.Foo" = linkonce global %.introspect { i8 10, i64 0, ptr null, i64 8, i64 0, i64 2, [0 x i64] zeroinitializer }, align 8
+@"$ct.test.Int2V" = linkonce global %.introspect { i8 18, i64 0, ptr null, i64 8, i64 ptrtoint (ptr @"$ct.v2$int" to i64), i64 0, [0 x i64] zeroinitializer }, align 8
+@"$ct.v2$int" = linkonce global %.introspect { i8 17, i64 0, ptr null, i64 8, i64 ptrtoint (ptr @"$ct.int" to i64), i64 2, [0 x i64] zeroinitializer }, align 8
+@"$ct.int" = linkonce global %.introspect { i8 2, i64 0, ptr null, i64 4, i64 0, i64 0, [0 x i64] zeroinitializer }, align 8
@.str = private unnamed_addr constant [9 x i8] c"%s %s %s\00", align 1
@"$ct.a2$int" = linkonce global %.introspect { i8 15, i64 0, ptr null, i64 8, i64 ptrtoint (ptr @"$ct.int" to i64), i64 2, [0 x i64] zeroinitializer }, align 8
-@"$ct.int" = linkonce global %.introspect { i8 2, i64 0, ptr null, i64 4, i64 0, i64 0, [0 x i64] zeroinitializer }, align 8
@"$ct.sa$int" = linkonce global %.introspect { i8 16, i64 0, ptr null, i64 16, i64 ptrtoint (ptr @"$ct.int" to i64), i64 0, [0 x i64] zeroinitializer }, align 8
-@"$ct.v2$int" = linkonce global %.introspect { i8 17, i64 0, ptr null, i64 8, i64 ptrtoint (ptr @"$ct.int" to i64), i64 2, [0 x i64] zeroinitializer }, align 8
@.__const = private unnamed_addr constant [1 x %Foo] [%Foo { i32 1, i32 2 }], align 4
@.__const.1 = private unnamed_addr constant %Foo { i32 1, i32 2 }, align 4
@.__const.2 = private unnamed_addr constant [1 x [2 x i32]] [[2 x i32] [i32 1, i32 2]], align 4
@@ -70,7 +72,7 @@ entry:
  %ptradd1 = getelementptr inbounds i8, ptr %varargslots, i64 16
  store %any %7, ptr %ptradd1, align 16
  %8 = insertvalue %any undef, ptr %c, 0
-  %9 = insertvalue %any %8, i64 ptrtoint (ptr @"$ct.v2$int" to i64), 1
+  %9 = insertvalue %any %8, i64 ptrtoint (ptr @"$ct.test.Int2V" to i64), 1
  %ptradd2 = getelementptr inbounds i8, ptr %varargslots, i64 32
  store %any %9, ptr %ptradd2, align 16
  %10 = call i64 @std.io.printfn(ptr %retparam, ptr @.str, i64 8, ptr %varargslots, i64 3)
--- a/test/test_suite/debug_symbols/defer_macro.c3t
+++ b/test/test_suite/debug_symbols/defer_macro.c3t
@@ -86,7 +86,7 @@ macro Id unique()
 }


-typedef Color = float[<4>];
+typedef Color = float[<4>] @simd;

 const Color BLACK = {0, 0, 0, 1};
 const Color WHITE = {1, 1, 1, 1};
--- a/test/test_suite/expressions/ternary_infer.c3t
+++ b/test/test_suite/expressions/ternary_infer.c3t
@@ -1,14 +1,16 @@
 // #target: macos-x64
 module test;
-fn int[<2>] foo(int x)
+typedef Int2V = int[<2>] @simd;
+
+fn Int2V foo(int x)
 {
-  return x > 0 ? {0, 0} : {255, 255};
+	return x > 0 ? {0, 0} : {255, 255};
 }


 fn int main()
 {
-  return 0;
+	return 0;
 }
 /* #expect: test.ll

--- a/test/test_suite/functions/test_regression.c3t
+++ b/test/test_suite/functions/test_regression.c3t
@@ -292,7 +292,7 @@ entry:
  %c = alloca %Bobo, align 4
  %indirectarg = alloca %Bobo, align 8
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %de, ptr align 4 @.__const, i32 12, i1 false)
-  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %c, ptr align 4 %1, i32 20, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %c, ptr align 8 %1, i32 20, i1 false)
  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %indirectarg, ptr align 4 %c, i32 20, i1 false)
  %2 = call i32 @test.helo(double 1.000000e+00, ptr byval(%Bobo) align 8 %indirectarg)
  ret i32 1
@@ -452,9 +452,9 @@ loop.exit:                                        ; preds = %loop.cond
  store i32 3, ptr %elements, align 4
  %11 = call i32 (ptr, ...) @printf(ptr @.str.4)
  call void @llvm.memset.p0.i64(ptr align 8 %array, i8 0, i64 40, i1 false)
-  call void @"std_collections_list$int$.List.push"(ptr %array, i32 100)
-  call void @"std_collections_list$int$.List.push"(ptr %array, i32 200)
-  call void @"std_collections_list$int$.List.push"(ptr %array, i32 400)
+  call void @"std_collections_list$int$.List.push"(ptr %array, i32 100) #3
+  call void @"std_collections_list$int$.List.push"(ptr %array, i32 200) #3
+  call void @"std_collections_list$int$.List.push"(ptr %array, i32 400) #3
  call void @"std_collections_list$int$.List.push"(ptr %array, i32 600) #3
  call void @"std_collections_list$int$.List.insert_at"(ptr %array, i64 2, i32 300)
  store i32 0, ptr %i1, align 4
--- a/test/test_suite/slices/slice_to_slice_vector_assign.c3t
+++ b/test/test_suite/slices/slice_to_slice_vector_assign.c3t
@@ -41,8 +41,8 @@ entry:
  %retparam14 = alloca i64, align 8
  %varargslots16 = alloca [1 x %any], align 16
  %retparam17 = alloca i64, align 8
-  store <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, ptr %z, align 32
-  store <6 x i32> zeroinitializer, ptr %y, align 32
+  store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>, ptr %z, align 32
+  store <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef>, ptr %y, align 32
  %ptradd = getelementptr inbounds i8, ptr %z, i64 12
  %0 = insertvalue %"int[]" undef, ptr %ptradd, 0
  %1 = insertvalue %"int[]" %0, i64 3, 1
--- a/test/test_suite/statements/foreach_common.c3t
+++ b/test/test_suite/statements/foreach_common.c3t
@@ -82,13 +82,13 @@ entry:
  %.anon65 = alloca i64, align 8
  %i69 = alloca i64, align 8
  %a70 = alloca float, align 4
-  %.anon74 = alloca i64, align 8
-  %i78 = alloca i8, align 1
-  %a80 = alloca double, align 8
-  %.anon85 = alloca i64, align 8
-  %a89 = alloca double, align 8
+  %.anon75 = alloca i64, align 8
+  %i79 = alloca i8, align 1
+  %a81 = alloca double, align 8
+  %.anon87 = alloca i64, align 8
+  %a91 = alloca double, align 8
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %foo, ptr align 4 @.__const, i32 12, i1 false)
-  store <3 x float> <float 2.000000e+00, float 4.500000e+00, float 8.000000e+00>, ptr %foo2, align 16
+  store <4 x float> <float 2.000000e+00, float 4.500000e+00, float 8.000000e+00, float undef>, ptr %foo2, align 16
  store i64 0, ptr %.anon, align 8
  br label %loop.cond

@@ -242,9 +242,10 @@ loop.cond48:                                      ; preds = %loop.body50, %loop.
  br i1 %gt49, label %loop.body50, label %loop.exit54

 loop.body50:                                      ; preds = %loop.cond48
-  %37 = load <3 x float>, ptr %foo2, align 16
+  %37 = load <4 x float>, ptr %foo2, align 16
+  %extractvec = shufflevector <4 x float> %37, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %38 = load i64, ptr %.anon47, align 8
-  %39 = extractelement <3 x float> %37, i64 %38
+  %39 = extractelement <3 x float> %extractvec, i64 %38
  store float %39, ptr %a51, align 4
  %40 = load float, ptr %a51, align 4
  %fpfpext52 = fpext float %40 to double
@@ -287,73 +288,76 @@ loop.exit64:                                      ; preds = %loop.cond56
 loop.cond66:                                      ; preds = %loop.body68, %loop.exit64
  %49 = load i64, ptr %.anon65, align 8
  %gt67 = icmp ugt i64 3, %49
-  br i1 %gt67, label %loop.body68, label %loop.exit73
+  br i1 %gt67, label %loop.body68, label %loop.exit74

 loop.body68:                                      ; preds = %loop.cond66
  %50 = load i64, ptr %.anon65, align 8
  store i64 %50, ptr %i69, align 8
-  %51 = load <3 x float>, ptr %foo2, align 16
+  %51 = load <4 x float>, ptr %foo2, align 16
+  %extractvec71 = shufflevector <4 x float> %51, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %52 = load i64, ptr %.anon65, align 8
-  %53 = extractelement <3 x float> %51, i64 %52
+  %53 = extractelement <3 x float> %extractvec71, i64 %52
  store float %53, ptr %a70, align 4
  %54 = load float, ptr %a70, align 4
-  %fpfpext71 = fpext float %54 to double
+  %fpfpext72 = fpext float %54 to double
  %55 = load i64, ptr %i69, align 8
-  call void (ptr, ...) @printf(ptr @.str.8, i64 %55, double %fpfpext71)
+  call void (ptr, ...) @printf(ptr @.str.8, i64 %55, double %fpfpext72)
  %56 = load i64, ptr %.anon65, align 8
-  %addnuw72 = add nuw i64 %56, 1
-  store i64 %addnuw72, ptr %.anon65, align 8
+  %addnuw73 = add nuw i64 %56, 1
+  store i64 %addnuw73, ptr %.anon65, align 8
  br label %loop.cond66

-loop.exit73:                                      ; preds = %loop.cond66
-  store i64 0, ptr %.anon74, align 8
-  br label %loop.cond75
+loop.exit74:                                      ; preds = %loop.cond66
+  store i64 0, ptr %.anon75, align 8
+  br label %loop.cond76

-loop.cond75:                                      ; preds = %loop.body77, %loop.exit73
-  %57 = load i64, ptr %.anon74, align 8
-  %gt76 = icmp ugt i64 3, %57
-  br i1 %gt76, label %loop.body77, label %loop.exit84
+loop.cond76:                                      ; preds = %loop.body78, %loop.exit74
+  %57 = load i64, ptr %.anon75, align 8
+  %gt77 = icmp ugt i64 3, %57
+  br i1 %gt77, label %loop.body78, label %loop.exit86

-loop.body77:                                      ; preds = %loop.cond75
-  %58 = load i64, ptr %.anon74, align 8
-  %trunc79 = trunc i64 %58 to i8
-  store i8 %trunc79, ptr %i78, align 1
-  %59 = load <3 x float>, ptr %foo2, align 16
-  %60 = load i64, ptr %.anon74, align 8
-  %61 = extractelement <3 x float> %59, i64 %60
-  %fpfpext81 = fpext float %61 to double
-  store double %fpfpext81, ptr %a80, align 8
-  %62 = load i8, ptr %i78, align 1
-  %zext82 = zext i8 %62 to i32
-  %63 = load double, ptr %a80, align 8
-  call void (ptr, ...) @printf(ptr @.str.9, i32 %zext82, double %63)
-  %64 = load i64, ptr %.anon74, align 8
-  %addnuw83 = add nuw i64 %64, 1
-  store i64 %addnuw83, ptr %.anon74, align 8
-  br label %loop.cond75
+loop.body78:                                      ; preds = %loop.cond76
+  %58 = load i64, ptr %.anon75, align 8
+  %trunc80 = trunc i64 %58 to i8
+  store i8 %trunc80, ptr %i79, align 1
+  %59 = load <4 x float>, ptr %foo2, align 16
+  %extractvec82 = shufflevector <4 x float> %59, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %60 = load i64, ptr %.anon75, align 8
+  %61 = extractelement <3 x float> %extractvec82, i64 %60
+  %fpfpext83 = fpext float %61 to double
+  store double %fpfpext83, ptr %a81, align 8
+  %62 = load i8, ptr %i79, align 1
+  %zext84 = zext i8 %62 to i32
+  %63 = load double, ptr %a81, align 8
+  call void (ptr, ...) @printf(ptr @.str.9, i32 %zext84, double %63)
+  %64 = load i64, ptr %.anon75, align 8
+  %addnuw85 = add nuw i64 %64, 1
+  store i64 %addnuw85, ptr %.anon75, align 8
+  br label %loop.cond76

-loop.exit84:                                      ; preds = %loop.cond75
-  store i64 0, ptr %.anon85, align 8
-  br label %loop.cond86
+loop.exit86:                                      ; preds = %loop.cond76
+  store i64 0, ptr %.anon87, align 8
+  br label %loop.cond88

-loop.cond86:                                      ; preds = %loop.body88, %loop.exit84
-  %65 = load i64, ptr %.anon85, align 8
-  %gt87 = icmp ugt i64 3, %65
-  br i1 %gt87, label %loop.body88, label %loop.exit92
+loop.cond88:                                      ; preds = %loop.body90, %loop.exit86
+  %65 = load i64, ptr %.anon87, align 8
+  %gt89 = icmp ugt i64 3, %65
+  br i1 %gt89, label %loop.body90, label %loop.exit95

-loop.body88:                                      ; preds = %loop.cond86
-  %66 = load <3 x float>, ptr %foo2, align 16
-  %67 = load i64, ptr %.anon85, align 8
-  %68 = extractelement <3 x float> %66, i64 %67
-  %fpfpext90 = fpext float %68 to double
-  store double %fpfpext90, ptr %a89, align 8
-  %69 = load double, ptr %a89, align 8
+loop.body90:                                      ; preds = %loop.cond88
+  %66 = load <4 x float>, ptr %foo2, align 16
+  %extractvec92 = shufflevector <4 x float> %66, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %67 = load i64, ptr %.anon87, align 8
+  %68 = extractelement <3 x float> %extractvec92, i64 %67
+  %fpfpext93 = fpext float %68 to double
+  store double %fpfpext93, ptr %a91, align 8
+  %69 = load double, ptr %a91, align 8
  call void (ptr, ...) @printf(ptr @.str.10, double %69)
-  %70 = load i64, ptr %.anon85, align 8
-  %addnuw91 = add nuw i64 %70, 1
-  store i64 %addnuw91, ptr %.anon85, align 8
-  br label %loop.cond86
+  %70 = load i64, ptr %.anon87, align 8
+  %addnuw94 = add nuw i64 %70, 1
+  store i64 %addnuw94, ptr %.anon87, align 8
+  br label %loop.cond88

-loop.exit92:                                      ; preds = %loop.cond86
+loop.exit95:                                      ; preds = %loop.cond88
  ret void
 }
--- a/test/test_suite/statements/foreach_r_common.c3t
+++ b/test/test_suite/statements/foreach_r_common.c3t
@@ -82,13 +82,13 @@ entry:
  %.anon65 = alloca i64, align 8
  %i70 = alloca i64, align 8
  %a71 = alloca float, align 4
-  %.anon74 = alloca i64, align 8
-  %i79 = alloca i8, align 1
-  %a81 = alloca double, align 8
-  %.anon85 = alloca i64, align 8
-  %a90 = alloca double, align 8
+  %.anon75 = alloca i64, align 8
+  %i80 = alloca i8, align 1
+  %a82 = alloca double, align 8
+  %.anon87 = alloca i64, align 8
+  %a92 = alloca double, align 8
  call void @llvm.memcpy.p0.p0.i32(ptr align 4 %foo, ptr align 4 @.__const, i32 12, i1 false)
-  store <3 x float> <float 2.000000e+00, float 4.500000e+00, float 8.000000e+00>, ptr %foo2, align 16
+  store <4 x float> <float 2.000000e+00, float 4.500000e+00, float 8.000000e+00, float undef>, ptr %foo2, align 16
  store i64 3, ptr %.anon, align 8
  br label %loop.cond

@@ -245,9 +245,10 @@ loop.body50:                                      ; preds = %loop.cond48
  %37 = load i64, ptr %.anon47, align 8
  %subnuw51 = sub nuw i64 %37, 1
  store i64 %subnuw51, ptr %.anon47, align 8
-  %38 = load <3 x float>, ptr %foo2, align 16
+  %38 = load <4 x float>, ptr %foo2, align 16
+  %extractvec = shufflevector <4 x float> %38, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %39 = load i64, ptr %.anon47, align 8
-  %40 = extractelement <3 x float> %38, i64 %39
+  %40 = extractelement <3 x float> %extractvec, i64 %39
  store float %40, ptr %a52, align 4
  %41 = load float, ptr %a52, align 4
  %fpfpext53 = fpext float %41 to double
@@ -287,7 +288,7 @@ loop.exit64:                                      ; preds = %loop.cond56
 loop.cond66:                                      ; preds = %loop.body68, %loop.exit64
  %49 = load i64, ptr %.anon65, align 8
  %gt67 = icmp ugt i64 %49, 0
-  br i1 %gt67, label %loop.body68, label %loop.exit73
+  br i1 %gt67, label %loop.body68, label %loop.exit74

 loop.body68:                                      ; preds = %loop.cond66
  %50 = load i64, ptr %.anon65, align 8
@@ -295,65 +296,68 @@ loop.body68:                                      ; preds = %loop.cond66
  store i64 %subnuw69, ptr %.anon65, align 8
  %51 = load i64, ptr %.anon65, align 8
  store i64 %51, ptr %i70, align 8
-  %52 = load <3 x float>, ptr %foo2, align 16
+  %52 = load <4 x float>, ptr %foo2, align 16
+  %extractvec72 = shufflevector <4 x float> %52, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %53 = load i64, ptr %.anon65, align 8
-  %54 = extractelement <3 x float> %52, i64 %53
+  %54 = extractelement <3 x float> %extractvec72, i64 %53
  store float %54, ptr %a71, align 4
  %55 = load float, ptr %a71, align 4
-  %fpfpext72 = fpext float %55 to double
+  %fpfpext73 = fpext float %55 to double
  %56 = load i64, ptr %i70, align 8
-  call void (ptr, ...) @printf(ptr @.str.8, i64 %56, double %fpfpext72)
+  call void (ptr, ...) @printf(ptr @.str.8, i64 %56, double %fpfpext73)
  br label %loop.cond66

-loop.exit73:                                      ; preds = %loop.cond66
-  store i64 3, ptr %.anon74, align 8
-  br label %loop.cond75
+loop.exit74:                                      ; preds = %loop.cond66
+  store i64 3, ptr %.anon75, align 8
+  br label %loop.cond76

-loop.cond75:                                      ; preds = %loop.body77, %loop.exit73
-  %57 = load i64, ptr %.anon74, align 8
-  %gt76 = icmp ugt i64 %57, 0
-  br i1 %gt76, label %loop.body77, label %loop.exit84
+loop.cond76:                                      ; preds = %loop.body78, %loop.exit74
+  %57 = load i64, ptr %.anon75, align 8
+  %gt77 = icmp ugt i64 %57, 0
+  br i1 %gt77, label %loop.body78, label %loop.exit86

-loop.body77:                                      ; preds = %loop.cond75
-  %58 = load i64, ptr %.anon74, align 8
-  %subnuw78 = sub nuw i64 %58, 1
-  store i64 %subnuw78, ptr %.anon74, align 8
-  %59 = load i64, ptr %.anon74, align 8
-  %trunc80 = trunc i64 %59 to i8
-  store i8 %trunc80, ptr %i79, align 1
-  %60 = load <3 x float>, ptr %foo2, align 16
-  %61 = load i64, ptr %.anon74, align 8
-  %62 = extractelement <3 x float> %60, i64 %61
-  %fpfpext82 = fpext float %62 to double
-  store double %fpfpext82, ptr %a81, align 8
-  %63 = load i8, ptr %i79, align 1
-  %zext83 = zext i8 %63 to i32
-  %64 = load double, ptr %a81, align 8
-  call void (ptr, ...) @printf(ptr @.str.9, i32 %zext83, double %64)
-  br label %loop.cond75
+loop.body78:                                      ; preds = %loop.cond76
+  %58 = load i64, ptr %.anon75, align 8
+  %subnuw79 = sub nuw i64 %58, 1
+  store i64 %subnuw79, ptr %.anon75, align 8
+  %59 = load i64, ptr %.anon75, align 8
+  %trunc81 = trunc i64 %59 to i8
+  store i8 %trunc81, ptr %i80, align 1
+  %60 = load <4 x float>, ptr %foo2, align 16
+  %extractvec83 = shufflevector <4 x float> %60, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %61 = load i64, ptr %.anon75, align 8
+  %62 = extractelement <3 x float> %extractvec83, i64 %61
+  %fpfpext84 = fpext float %62 to double
+  store double %fpfpext84, ptr %a82, align 8
+  %63 = load i8, ptr %i80, align 1
+  %zext85 = zext i8 %63 to i32
+  %64 = load double, ptr %a82, align 8
+  call void (ptr, ...) @printf(ptr @.str.9, i32 %zext85, double %64)
+  br label %loop.cond76

-loop.exit84:                                      ; preds = %loop.cond75
-  store i64 3, ptr %.anon85, align 8
-  br label %loop.cond86
+loop.exit86:                                      ; preds = %loop.cond76
+  store i64 3, ptr %.anon87, align 8
+  br label %loop.cond88

-loop.cond86:                                      ; preds = %loop.body88, %loop.exit84
-  %65 = load i64, ptr %.anon85, align 8
-  %gt87 = icmp ugt i64 %65, 0
-  br i1 %gt87, label %loop.body88, label %loop.exit92
+loop.cond88:                                      ; preds = %loop.body90, %loop.exit86
+  %65 = load i64, ptr %.anon87, align 8
+  %gt89 = icmp ugt i64 %65, 0
+  br i1 %gt89, label %loop.body90, label %loop.exit95

-loop.body88:                                      ; preds = %loop.cond86
-  %66 = load i64, ptr %.anon85, align 8
-  %subnuw89 = sub nuw i64 %66, 1
-  store i64 %subnuw89, ptr %.anon85, align 8
-  %67 = load <3 x float>, ptr %foo2, align 16
-  %68 = load i64, ptr %.anon85, align 8
-  %69 = extractelement <3 x float> %67, i64 %68
-  %fpfpext91 = fpext float %69 to double
-  store double %fpfpext91, ptr %a90, align 8
-  %70 = load double, ptr %a90, align 8
+loop.body90:                                      ; preds = %loop.cond88
+  %66 = load i64, ptr %.anon87, align 8
+  %subnuw91 = sub nuw i64 %66, 1
+  store i64 %subnuw91, ptr %.anon87, align 8
+  %67 = load <4 x float>, ptr %foo2, align 16
+  %extractvec93 = shufflevector <4 x float> %67, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %68 = load i64, ptr %.anon87, align 8
+  %69 = extractelement <3 x float> %extractvec93, i64 %68
+  %fpfpext94 = fpext float %69 to double
+  store double %fpfpext94, ptr %a92, align 8
+  %70 = load double, ptr %a92, align 8
  call void (ptr, ...) @printf(ptr @.str.10, double %70)
-  br label %loop.cond86
+  br label %loop.cond88

-loop.exit92:                                      ; preds = %loop.cond86
+loop.exit95:                                      ; preds = %loop.cond88
  ret void
 }
--- a/test/test_suite/union/designated_union_zeroing.c3t
+++ b/test/test_suite/union/designated_union_zeroing.c3t
@@ -1,13 +1,16 @@
 // #target: macos-x64
 module test;
-union Rect {
-  struct { float[<2>] min, max; }
+typedef Float2 = float[<2>] @simd;
+union Rect
+{
+	struct { Float2 min, max; }
 }

-fn Rect test_rect(float[<2>] max) {
-  Rect rect = {.max = max};
-  assert(rect.min == {});
-  return rect;
+fn Rect test_rect(Float2 max)
+{
+	Rect rect = { .max = max };
+	assert(rect.min == {});
+	return rect;
 }

 /* #expect: test.ll
--- a/test/test_suite/vector/vector_consts.c3t
+++ b/test/test_suite/vector/vector_consts.c3t
@@ -1,7 +1,8 @@
 // #target: macos-x64
 module foo;
 import std::math;
-fn int x(char[<8>] a, char[<8>] b)
+typedef Char8 = inline char[<8>] @simd;
+fn int x(Char8 a, Char8 b)
 {
 	bool[<8>] z = a.comp_eq(b);
 	return ((char[<8>]) { [0..7] = 255 } & (char[<8>])z + ~(char[<8>])z & (char[<8>]) { 0, 1, 2, 3, 4, 5, 6, 7 }).min();
--- a/test/test_suite/vector/vector_init_regression.c3t
+++ b/test/test_suite/vector/vector_init_regression.c3t
@@ -95,24 +95,27 @@ entry:
  %b = alloca [4 x <4 x float>], align 16
  %.anon = alloca i64, align 8
  %v = alloca <4 x float>, align 16
-  %.anon90 = alloca i64, align 8
-  %v94 = alloca <4 x float>, align 16
+  %.anon92 = alloca i64, align 8
+  %v96 = alloca <4 x float>, align 16
  store float 0x3FE921CAC0000000, ptr %radians, align 4
-  store <3 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, ptr %axis, align 16
+  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float undef>, ptr %axis, align 16
  %0 = load float, ptr %radians, align 4
  %1 = call reassoc arcp contract float @llvm.cos.f32(float %0)
  store float %1, ptr %cosr, align 4
  %2 = load float, ptr %radians, align 4
  %3 = call reassoc arcp contract float @llvm.sin.f32(float %2)
  store float %3, ptr %sinr, align 4
-  %4 = load <3 x float>, ptr %axis, align 16
-  %5 = extractelement <3 x float> %4, i64 0
+  %4 = load <4 x float>, ptr %axis, align 16
+  %extractvec = shufflevector <4 x float> %4, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %5 = extractelement <3 x float> %extractvec, i64 0
  store float %5, ptr %x, align 4
-  %6 = load <3 x float>, ptr %axis, align 16
-  %7 = extractelement <3 x float> %6, i64 1
+  %6 = load <4 x float>, ptr %axis, align 16
+  %extractvec1 = shufflevector <4 x float> %6, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %7 = extractelement <3 x float> %extractvec1, i64 1
  store float %7, ptr %y, align 4
-  %8 = load <3 x float>, ptr %axis, align 16
-  %9 = extractelement <3 x float> %8, i64 2
+  %8 = load <4 x float>, ptr %axis, align 16
+  %extractvec2 = shufflevector <4 x float> %8, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %9 = extractelement <3 x float> %extractvec2, i64 2
  store float %9, ptr %z, align 4
  call void @llvm.memset.p0.i64(ptr align 16 %a, i8 0, i64 64, i1 false)
  %10 = load float, ptr %cosr, align 4
@@ -127,224 +130,224 @@ entry:
  %15 = insertelement <4 x float> undef, float %14, i64 0
  %16 = load float, ptr %x, align 4
  %17 = load float, ptr %y, align 4
-  %fmul1 = fmul reassoc arcp contract float %16, %17
+  %fmul3 = fmul reassoc arcp contract float %16, %17
  %18 = load float, ptr %cosr, align 4
-  %fpfpext2 = fpext
-  %fsub3 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext2
-  %fpfptrunc4 = fptrunc
+  %fpfpext4 = fpext
+  %fsub5 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext4
+  %fpfptrunc6 = fptrunc
  %19 = load float, ptr %z, align 4
  %20 = load float, ptr %sinr, align 4
-  %fmul5 = fmul reassoc arcp contract float %19, %20
-  %21 = fneg reassoc arcp contract float %fmul5
-  %22 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul1, float %fpfptrunc4, float %21)
+  %fmul7 = fmul reassoc arcp contract float %19, %20
+  %21 = fneg reassoc arcp contract float %fmul7
+  %22 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul3, float %fpfptrunc6, float %21)
  %23 = insertelement <4 x float> %15, float %22, i64 1
  %24 = load float, ptr %x, align 4
  %25 = load float, ptr %z, align 4
-  %fmul6 = fmul reassoc arcp contract float %24, %25
+  %fmul8 = fmul reassoc arcp contract float %24, %25
  %26 = load float, ptr %cosr, align 4
-  %fpfpext7 = fpext
-  %fsub8 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext7
-  %fpfptrunc9 = fptrunc
+  %fpfpext9 = fpext
+  %fsub10 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext9
+  %fpfptrunc11 = fptrunc
  %27 = load float, ptr %y, align 4
  %28 = load float, ptr %sinr, align 4
-  %fmul10 = fmul reassoc arcp contract float %27, %28
-  %29 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul6, float %fpfptrunc9, float %fmul10)
+  %fmul12 = fmul reassoc arcp contract float %27, %28
+  %29 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul8, float %fpfptrunc11, float %fmul12)
  %30 = insertelement <4 x float> %23, float %29, i64 2
  %31 = insertelement <4 x float> %30, float 0.000000e+00, i64 3
  store <4 x float> %31, ptr %a, align 16
  %32 = load float, ptr %y, align 4
  %33 = load float, ptr %x, align 4
-  %fmul11 = fmul reassoc arcp contract float %32, %33
+  %fmul13 = fmul reassoc arcp contract float %32, %33
  %34 = load float, ptr %cosr, align 4
-  %fpfpext12 = fpext
-  %fsub13 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext12
-  %fpfptrunc14 = fptrunc
+  %fpfpext14 = fpext
+  %fsub15 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext14
+  %fpfptrunc16 = fptrunc
  %35 = load float, ptr %z, align 4
  %36 = load float, ptr %sinr, align 4
-  %fmul15 = fmul reassoc arcp contract float %35, %36
-  %37 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul11, float %fpfptrunc14, float %fmul15)
+  %fmul17 = fmul reassoc arcp contract float %35, %36
+  %37 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul13, float %fpfptrunc16, float %fmul17)
  %38 = insertelement <4 x float> undef, float %37, i64 0
  %39 = load float, ptr %cosr, align 4
  %40 = load float, ptr %y, align 4
  %41 = load float, ptr %y, align 4
-  %fmul16 = fmul reassoc arcp contract float %40, %41
+  %fmul18 = fmul reassoc arcp contract float %40, %41
  %42 = load float, ptr %cosr, align 4
-  %fpfpext17 = fpext
-  %fsub18 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext17
-  %fpfptrunc19 = fptrunc
-  %43 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul16, float %fpfptrunc19, float %39)
+  %fpfpext19 = fpext
+  %fsub20 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext19
+  %fpfptrunc21 = fptrunc
+  %43 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul18, float %fpfptrunc21, float %39)
  %44 = insertelement <4 x float> %38, float %43, i64 1
  %45 = load float, ptr %y, align 4
  %46 = load float, ptr %z, align 4
-  %fmul20 = fmul reassoc arcp contract float %45, %46
+  %fmul22 = fmul reassoc arcp contract float %45, %46
  %47 = load float, ptr %cosr, align 4
-  %fpfpext21 = fpext
-  %fsub22 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext21
-  %fpfptrunc23 = fptrunc
+  %fpfpext23 = fpext
+  %fsub24 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext23
+  %fpfptrunc25 = fptrunc
  %48 = load float, ptr %x, align 4
  %49 = load float, ptr %sinr, align 4
-  %fmul24 = fmul reassoc arcp contract float %48, %49
-  %50 = fneg reassoc arcp contract float %fmul24
-  %51 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul20, float %fpfptrunc23, float %50)
+  %fmul26 = fmul reassoc arcp contract float %48, %49
+  %50 = fneg reassoc arcp contract float %fmul26
+  %51 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul22, float %fpfptrunc25, float %50)
  %52 = insertelement <4 x float> %44, float %51, i64 2
  %53 = insertelement <4 x float> %52, float 0.000000e+00, i64 3
  %ptradd = getelementptr inbounds i8, ptr %a, i64 16
  store <4 x float> %53, ptr %ptradd, align 16
  %54 = load float, ptr %z, align 4
  %55 = load float, ptr %x, align 4
-  %fmul25 = fmul reassoc arcp contract float %54, %55
+  %fmul27 = fmul reassoc arcp contract float %54, %55
  %56 = load float, ptr %cosr, align 4
-  %fpfpext26 = fpext
-  %fsub27 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext26
-  %fpfptrunc28 = fptrunc
+  %fpfpext28 = fpext
+  %fsub29 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext28
+  %fpfptrunc30 = fptrunc
  %57 = load float, ptr %y, align 4
  %58 = load float, ptr %sinr, align 4
-  %fmul29 = fmul reassoc arcp contract float %57, %58
-  %59 = fneg reassoc arcp contract float %fmul29
-  %60 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul25, float %fpfptrunc28, float %59)
+  %fmul31 = fmul reassoc arcp contract float %57, %58
+  %59 = fneg reassoc arcp contract float %fmul31
+  %60 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul27, float %fpfptrunc30, float %59)
  %61 = insertelement <4 x float> undef, float %60, i64 0
  %62 = load float, ptr %z, align 4
  %63 = load float, ptr %y, align 4
-  %fmul30 = fmul reassoc arcp contract float %62, %63
+  %fmul32 = fmul reassoc arcp contract float %62, %63
  %64 = load float, ptr %cosr, align 4
-  %fpfpext31 = fpext
-  %fsub32 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext31
-  %fpfptrunc33 = fptrunc
+  %fpfpext33 = fpext
+  %fsub34 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext33
+  %fpfptrunc35 = fptrunc
  %65 = load float, ptr %x, align 4
  %66 = load float, ptr %sinr, align 4
-  %fmul34 = fmul reassoc arcp contract float %65, %66
-  %67 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul30, float %fpfptrunc33, float %fmul34)
+  %fmul36 = fmul reassoc arcp contract float %65, %66
+  %67 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul32, float %fpfptrunc35, float %fmul36)
  %68 = insertelement <4 x float> %61, float %67, i64 1
  %69 = load float, ptr %cosr, align 4
  %70 = load float, ptr %z, align 4
  %71 = load float, ptr %z, align 4
-  %fmul35 = fmul reassoc arcp contract float %70, %71
+  %fmul37 = fmul reassoc arcp contract float %70, %71
  %72 = load float, ptr %cosr, align 4
-  %fpfpext36 = fpext
-  %fsub37 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext36
-  %fpfptrunc38 = fptrunc
-  %73 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul35, float %fpfptrunc38, float %69)
+  %fpfpext38 = fpext
+  %fsub39 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext38
+  %fpfptrunc40 = fptrunc
+  %73 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul37, float %fpfptrunc40, float %69)
  %74 = insertelement <4 x float> %68, float %73, i64 2
  %75 = insertelement <4 x float> %74, float 0.000000e+00, i64 3
-  %ptradd39 = getelementptr inbounds i8, ptr %a, i64 32
-  store <4 x float> %75, ptr %ptradd39, align 16
-  %ptradd40 = getelementptr inbounds i8, ptr %a, i64 48
-  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, ptr %ptradd40, align 16
+  %ptradd41 = getelementptr inbounds i8, ptr %a, i64 32
+  store <4 x float> %75, ptr %ptradd41, align 16
+  %ptradd42 = getelementptr inbounds i8, ptr %a, i64 48
+  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, ptr %ptradd42, align 16
  %76 = load float, ptr %cosr, align 4
  %77 = load float, ptr %x, align 4
  %78 = load float, ptr %x, align 4
-  %fmul41 = fmul reassoc arcp contract float %77, %78
+  %fmul43 = fmul reassoc arcp contract float %77, %78
  %79 = load float, ptr %cosr, align 4
-  %fpfpext42 = fpext
-  %fsub43 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext42
-  %fpfptrunc44 = fptrunc
-  %80 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul41, float %fpfptrunc44, float %76)
+  %fpfpext44 = fpext
+  %fsub45 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext44
+  %fpfptrunc46 = fptrunc
+  %80 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul43, float %fpfptrunc46, float %76)
  %81 = insertelement <4 x float> undef, float %80, i64 0
  %82 = load float, ptr %x, align 4
  %83 = load float, ptr %y, align 4
-  %fmul45 = fmul reassoc arcp contract float %82, %83
+  %fmul47 = fmul reassoc arcp contract float %82, %83
  %84 = load float, ptr %cosr, align 4
-  %fpfpext46 = fpext
-  %fsub47 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext46
-  %fpfptrunc48 = fptrunc
+  %fpfpext48 = fpext
+  %fsub49 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext48
+  %fpfptrunc50 = fptrunc
  %85 = load float, ptr %z, align 4
  %86 = load float, ptr %sinr, align 4
-  %fmul49 = fmul reassoc arcp contract float %85, %86
-  %87 = fneg reassoc arcp contract float %fmul49
-  %88 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul45, float %fpfptrunc48, float %87)
+  %fmul51 = fmul reassoc arcp contract float %85, %86
+  %87 = fneg reassoc arcp contract float %fmul51
+  %88 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul47, float %fpfptrunc50, float %87)
  %89 = insertelement <4 x float> %81, float %88, i64 1
  %90 = load float, ptr %x, align 4
  %91 = load float, ptr %z, align 4
-  %fmul50 = fmul reassoc arcp contract float %90, %91
+  %fmul52 = fmul reassoc arcp contract float %90, %91
  %92 = load float, ptr %cosr, align 4
-  %fpfpext51 = fpext
-  %fsub52 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext51
-  %fpfptrunc53 = fptrunc
+  %fpfpext53 = fpext
+  %fsub54 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext53
+  %fpfptrunc55 = fptrunc
  %93 = load float, ptr %y, align 4
  %94 = load float, ptr %sinr, align 4
-  %fmul54 = fmul reassoc arcp contract float %93, %94
-  %95 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul50, float %fpfptrunc53, float %fmul54)
+  %fmul56 = fmul reassoc arcp contract float %93, %94
+  %95 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul52, float %fpfptrunc55, float %fmul56)
  %96 = insertelement <4 x float> %89, float %95, i64 2
  %97 = insertelement <4 x float> %96, float 0.000000e+00, i64 3
  store <4 x float> %97, ptr %b, align 16
-  %ptradd55 = getelementptr inbounds i8, ptr %b, i64 16
+  %ptradd57 = getelementptr inbounds i8, ptr %b, i64 16
  %98 = load float, ptr %y, align 4
  %99 = load float, ptr %x, align 4
-  %fmul56 = fmul reassoc arcp contract float %98, %99
+  %fmul58 = fmul reassoc arcp contract float %98, %99
  %100 = load float, ptr %cosr, align 4
-  %fpfpext57 = fpext
-  %fsub58 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext57
-  %fpfptrunc59 = fptrunc
+  %fpfpext59 = fpext
+  %fsub60 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext59
+  %fpfptrunc61 = fptrunc
  %101 = load float, ptr %z, align 4
  %102 = load float, ptr %sinr, align 4
-  %fmul60 = fmul reassoc arcp contract float %101, %102
-  %103 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul56, float %fpfptrunc59, float %fmul60)
+  %fmul62 = fmul reassoc arcp contract float %101, %102
+  %103 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul58, float %fpfptrunc61, float %fmul62)
  %104 = insertelement <4 x float> undef, float %103, i64 0
  %105 = load float, ptr %cosr, align 4
  %106 = load float, ptr %y, align 4
  %107 = load float, ptr %y, align 4
-  %fmul61 = fmul reassoc arcp contract float %106, %107
+  %fmul63 = fmul reassoc arcp contract float %106, %107
  %108 = load float, ptr %cosr, align 4
-  %fpfpext62 = fpext
-  %fsub63 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext62
-  %fpfptrunc64 = fptrunc
-  %109 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul61, float %fpfptrunc64, float %105)
+  %fpfpext64 = fpext
+  %fsub65 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext64
+  %fpfptrunc66 = fptrunc
+  %109 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul63, float %fpfptrunc66, float %105)
  %110 = insertelement <4 x float> %104, float %109, i64 1
  %111 = load float, ptr %y, align 4
  %112 = load float, ptr %z, align 4
-  %fmul65 = fmul reassoc arcp contract float %111, %112
+  %fmul67 = fmul reassoc arcp contract float %111, %112
  %113 = load float, ptr %cosr, align 4
-  %fpfpext66 = fpext
-  %fsub67 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext66
-  %fpfptrunc68 = fptrunc
+  %fpfpext68 = fpext
+  %fsub69 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext68
+  %fpfptrunc70 = fptrunc
  %114 = load float, ptr %x, align 4
  %115 = load float, ptr %sinr, align 4
-  %fmul69 = fmul reassoc arcp contract float %114, %115
-  %116 = fneg reassoc arcp contract float %fmul69
-  %117 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul65, float %fpfptrunc68, float %116)
+  %fmul71 = fmul reassoc arcp contract float %114, %115
+  %116 = fneg reassoc arcp contract float %fmul71
+  %117 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul67, float %fpfptrunc70, float %116)
  %118 = insertelement <4 x float> %110, float %117, i64 2
  %119 = insertelement <4 x float> %118, float 0.000000e+00, i64 3
-  store <4 x float> %119, ptr %ptradd55, align 16
-  %ptradd70 = getelementptr inbounds i8, ptr %b, i64 32
+  store <4 x float> %119, ptr %ptradd57, align 16
+  %ptradd72 = getelementptr inbounds i8, ptr %b, i64 32
  %120 = load float, ptr %z, align 4
  %121 = load float, ptr %x, align 4
-  %fmul71 = fmul reassoc arcp contract float %120, %121
+  %fmul73 = fmul reassoc arcp contract float %120, %121
  %122 = load float, ptr %cosr, align 4
-  %fpfpext72 = fpext
-  %fsub73 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext72
-  %fpfptrunc74 = fptrunc
+  %fpfpext74 = fpext
+  %fsub75 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext74
+  %fpfptrunc76 = fptrunc
  %123 = load float, ptr %y, align 4
  %124 = load float, ptr %sinr, align 4
-  %fmul75 = fmul reassoc arcp contract float %123, %124
-  %125 = fneg reassoc arcp contract float %fmul75
-  %126 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul71, float %fpfptrunc74, float %125)
+  %fmul77 = fmul reassoc arcp contract float %123, %124
+  %125 = fneg reassoc arcp contract float %fmul77
+  %126 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul73, float %fpfptrunc76, float %125)
  %127 = insertelement <4 x float> undef, float %126, i64 0
  %128 = load float, ptr %z, align 4
  %129 = load float, ptr %y, align 4
-  %fmul76 = fmul reassoc arcp contract float %128, %129
+  %fmul78 = fmul reassoc arcp contract float %128, %129
  %130 = load float, ptr %cosr, align 4
-  %fpfpext77 = fpext
-  %fsub78 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext77
-  %fpfptrunc79 = fptrunc
+  %fpfpext79 = fpext
+  %fsub80 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext79
+  %fpfptrunc81 = fptrunc
  %131 = load float, ptr %x, align 4
  %132 = load float, ptr %sinr, align 4
-  %fmul80 = fmul reassoc arcp contract float %131, %132
-  %133 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul76, float %fpfptrunc79, float %fmul80)
+  %fmul82 = fmul reassoc arcp contract float %131, %132
+  %133 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul78, float %fpfptrunc81, float %fmul82)
  %134 = insertelement <4 x float> %127, float %133, i64 1
  %135 = load float, ptr %cosr, align 4
  %136 = load float, ptr %z, align 4
  %137 = load float, ptr %z, align 4
-  %fmul81 = fmul reassoc arcp contract float %136, %137
+  %fmul83 = fmul reassoc arcp contract float %136, %137
  %138 = load float, ptr %cosr, align 4
-  %fpfpext82 = fpext
-  %fsub83 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext82
-  %fpfptrunc84 = fptrunc
-  %139 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul81, float %fpfptrunc84, float %135)
+  %fpfpext84 = fpext
+  %fsub85 = fsub reassoc arcp contract double 1.000000e+00, %fpfpext84
+  %fpfptrunc86 = fptrunc
+  %139 = call reassoc arcp contract float @llvm.fmuladd.f32(float %fmul83, float %fpfptrunc86, float %135)
  %140 = insertelement <4 x float> %134, float %139, i64 2
  %141 = insertelement <4 x float> %140, float 0.000000e+00, i64 3
-  store <4 x float> %141, ptr %ptradd70, align 16
-  %ptradd85 = getelementptr inbounds i8, ptr %b, i64 48
-  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, ptr %ptradd85, align 16
+  store <4 x float> %141, ptr %ptradd72, align 16
+  %ptradd87 = getelementptr inbounds i8, ptr %b, i64 48
+  store <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, ptr %ptradd87, align 16
  store i64 0, ptr %.anon, align 8
  br label %loop.cond

@@ -360,17 +363,17 @@ loop.body:                                        ; preds = %loop.cond
  store <4 x float> %144, ptr %v, align 16
  %145 = load <4 x float>, ptr %v, align 16
  %146 = extractelement <4 x float> %145, i64 0
-  %fpfpext86 = fpext
+  %fpfpext88 = fpext
  %147 = load <4 x float>, ptr %v, align 16
  %148 = extractelement <4 x float> %147, i64 1
-  %fpfpext87 = fpext
+  %fpfpext89 = fpext
  %149 = load <4 x float>, ptr %v, align 16
  %150 = extractelement <4 x float> %149, i64 2
-  %fpfpext88 = fpext
+  %fpfpext90 = fpext
  %151 = load <4 x float>, ptr %v, align 16
  %152 = extractelement <4 x float> %151, i64 3
-  %fpfpext89 = fpext
-  %153 = call i32 (ptr, ...) @printf(ptr @.str, double %fpfpext86, double %fpfpext87, double %fpfpext88, double %fpfpext89)
+  %fpfpext91 = fpext
+  %153 = call i32 (ptr, ...) @printf(ptr @.str, double %fpfpext88, double %fpfpext89, double %fpfpext90, double %fpfpext91)
  %154 = load i64, ptr %.anon, align 8
  %addnuw = add nuw i64 %154, 1
  store i64 %addnuw, ptr %.anon, align 8
@@ -378,37 +381,37 @@ loop.body:                                        ; preds = %loop.cond

 loop.exit:                                        ; preds = %loop.cond
  %155 = call i32 (ptr, ...) @printf(ptr @.str.1)
-  store i64 0, ptr %.anon90, align 8
-  br label %loop.cond91
+  store i64 0, ptr %.anon92, align 8
+  br label %loop.cond93

-loop.cond91:                                      ; preds = %loop.body93, %loop.exit
-  %156 = load i64, ptr %.anon90, align 8
-  %gt92 = icmp ugt i64 4, %156
-  br i1 %gt92, label %loop.body93, label %loop.exit101
+loop.cond93:                                      ; preds = %loop.body95, %loop.exit
+  %156 = load i64, ptr %.anon92, align 8
+  %gt94 = icmp ugt i64 4, %156
+  br i1 %gt94, label %loop.body95, label %loop.exit103

-loop.body93:                                      ; preds = %loop.cond91
-  %157 = load i64, ptr %.anon90, align 8
-  %ptroffset95 = getelementptr inbounds [16 x i8], ptr %b, i64 %157
-  %158 = load <4 x float>, ptr %ptroffset95, align 16
-  store <4 x float> %158, ptr %v94, align 16
-  %159 = load <4 x float>, ptr %v94, align 16
+loop.body95:                                      ; preds = %loop.cond93
+  %157 = load i64, ptr %.anon92, align 8
+  %ptroffset97 = getelementptr inbounds [16 x i8], ptr %b, i64 %157
+  %158 = load <4 x float>, ptr %ptroffset97, align 16
+  store <4 x float> %158, ptr %v96, align 16
+  %159 = load <4 x float>, ptr %v96, align 16
  %160 = extractelement <4 x float> %159, i64 0
-  %fpfpext96 = fpext
-  %161 = load <4 x float>, ptr %v94, align 16
-  %162 = extractelement <4 x float> %161, i64 1
-  %fpfpext97 = fpext
-  %163 = load <4 x float>, ptr %v94, align 16
-  %164 = extractelement <4 x float> %163, i64 2
  %fpfpext98 = fpext
-  %165 = load <4 x float>, ptr %v94, align 16
-  %166 = extractelement <4 x float> %165, i64 3
+  %161 = load <4 x float>, ptr %v96, align 16
+  %162 = extractelement <4 x float> %161, i64 1
  %fpfpext99 = fpext
-  %167 = call i32 (ptr, ...) @printf(ptr @.str.2, double %fpfpext96, double %fpfpext97, double %fpfpext98, double %fpfpext99)
-  %168 = load i64, ptr %.anon90, align 8
-  %addnuw100 = add nuw i64 %168, 1
-  store i64 %addnuw100, ptr %.anon90, align 8
-  br label %loop.cond91
+  %163 = load <4 x float>, ptr %v96, align 16
+  %164 = extractelement <4 x float> %163, i64 2
+  %fpfpext100 = fpext
+  %165 = load <4 x float>, ptr %v96, align 16
+  %166 = extractelement <4 x float> %165, i64 3
+  %fpfpext101 = fpext
+  %167 = call i32 (ptr, ...) @printf(ptr @.str.2, double %fpfpext98, double %fpfpext99, double %fpfpext100, double %fpfpext101)
+  %168 = load i64, ptr %.anon92, align 8
+  %addnuw102 = add nuw i64 %168, 1
+  store i64 %addnuw102, ptr %.anon92, align 8
+  br label %loop.cond93

-loop.exit101:                                     ; preds = %loop.cond91
+loop.exit103:                                     ; preds = %loop.cond93
  ret void
 }
--- a/test/test_suite/vector/vector_param.c3t
+++ b/test/test_suite/vector/vector_param.c3t
@@ -1,7 +1,9 @@
 // #target: macos-x64
 module test;

-fn void test(int[<4>] x)
+typedef Int4V = int[<4>] @simd;
+
+fn void test(Int4V x)
 {
 	x[1] = 123;
 	int y = x[1];
--- a/test/unit/stdlib/math/matrix.c3
+++ b/test/unit/stdlib/math/matrix.c3
@@ -1,92 +1,91 @@
 module math_matrix @test;
 import std::math;

-fn void test_mat4()
+fn void test_mat4_translate()
 {
-	{
-		Matrix4 mat = MATRIX4_IDENTITY;
-		Matrix4 mat2 = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
-		Matrix4 calc = mat.mul(mat2);
-		assert(calc.m == mat.m);
-		assert(mat * mat2 == mat);
+	Matrix4 mat = MATRIX4_IDENTITY;
+	Matrix4 mat2 = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 };
+	Matrix4 calc = mat.mul(mat2);
+	assert(calc.m == mat.m);
+	assert(mat * mat2 == mat);

-		Matrix4 translated = mat.translate({0.0, 0.0, 0.0});
-		assert(translated.m == mat.m);
-	};
-
-	{
-		Matrix4 mat = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
-		Matrix4 mat2 = { 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 };
-		Matrix4 calc = mat.mul(mat2);
-		Matrix4 value = { 56, 46, 36, 26, 152, 126, 100, 74, 56, 46, 36, 26, 152, 126, 100, 74 };
-		assert(calc.m == value.m);
-		assert(mat * mat2 == value);
-	};
-
-	{
-		Matrix4 result = {
-			0.988936, 0.000000, -0.148340, -0.988936, 
-			-0.014599, 0.995146, -0.097325, -2.970838, 
-			0.147620, 0.098414, 0.984136, -20.765262, 
-			0.000000, 0.000000, 0.000000, 1.000000
-		};
-
-		Matrix4f result_f = {
-			0.988936, 0.000000, -0.148340, -0.988936, 
-			-0.014599, 0.995146, -0.097325, -2.970838, 
-			0.147620, 0.098414, 0.984136, -20.765262, 
-			0.000000, 0.000000, 0.000000, 1.000000
-		};
-
-		Matrix4 result_transposed = {
-			0.988936, -0.014599, 0.147620, 0.000000, 
-			0.000000, 0.995146, 0.098414, 0.000000, 
-			-0.148340, -0.097325, 0.984136, 0.000000, 
-			-0.988936, -2.970838, -20.765262, 1.000000
-		};
-
-		Matrix4f result_transposed_f = {
-			0.988936, -0.014599, 0.147620, 0.000000, 
-			0.000000, 0.995146, 0.098414, 0.000000, 
-			-0.148340, -0.097325, 0.984136, 0.000000, 
-			-0.988936, -2.970838, -20.765262, 1.000000
-		};
-
-		Matrix4 look_at = matrix::look_at{double}({4.0, 5.0, 20.0}, {1.0, 3.0, 0.0}, {0.0, 1.0, 0.0});
-		Matrix4f look_at_f = matrix::look_at{float}({4.0, 5.0, 20.0}, {1.0, 3.0, 0.0}, {0.0, 1.0, 0.0});
-
-		assert(math::round_to_decimals((double[<16>])look_at.m, 4) == math::round_to_decimals((double[<16>])result.m, 4));
-		assert(math::round_to_decimals((float[<16>])look_at_f.m, 4) == math::round_to_decimals((float[<16>])result_f.m, 4));
-
-		assert(math::round_to_decimals((double[<16>])result_transposed.m, 4) == math::round_to_decimals((double[<16>])look_at.transpose().m, 4));
-		assert(math::round_to_decimals((float[<16>])result_transposed_f.m, 4) == math::round_to_decimals((float[<16>])look_at_f.transpose().m, 4));
-	};
-
-	{
-		Matrix4 result = {
-			1.857087, 0.000000, 0.000000, 
-			0.000000, 0.000000, 2.414214, 
-			0.000000, 0.000000, 0.000000, 0.000000, 
-			-1.000200, -0.200020, 0.000000, 0.000000, 
-			-1.000000, 0.000000
-		};
-
-		Matrix4f result_f = {
-			1.857087, 0.000000, 0.000000, 
-			0.000000, 0.000000, 2.414214, 
-			0.000000, 0.000000, 0.000000, 0.000000, 
-			-1.000200, -0.200020, 0.000000, 0.000000, 
-			-1.000000, 0.000000
-		};
-
-		Matrix4 perspective = matrix4_perspective(math::deg_to_rad(45), 1.3, 0.1, 1000);
-		Matrix4f perspective_f = matrix4f_perspective((float)math::deg_to_rad(45), 1.3, 0.1, 1000);
-
-		assert(math::round_to_decimals((double[<16>])result.m, 4) == math::round_to_decimals((double[<16>])perspective.m, 4));
-		assert(math::round_to_decimals((float[<16>])result_f.m, 4) == math::round_to_decimals((float[<16>])perspective_f.m, 4));
-	};
+	Matrix4 translated = mat.translate({0.0, 0.0, 0.0});
+	assert(translated.m == mat.m);
 }

+fn void test_mat4_mul()
+{
+	Matrix4 mat = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
+	Matrix4 mat2 = { 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 };
+	Matrix4 calc = mat.mul(mat2);
+	Matrix4 value = { 56, 46, 36, 26, 152, 126, 100, 74, 56, 46, 36, 26, 152, 126, 100, 74 };
+	assert(calc.m == value.m);
+	assert(mat * mat2 == value);
+}
+fn void test_mat4_lookat()
+{
+	Matrix4 result = {
+		0.988936, 0.000000, -0.148340, -0.988936,
+		-0.014599, 0.995146, -0.097325, -2.970838,
+		0.147620, 0.098414, 0.984136, -20.765262,
+		0.000000, 0.000000, 0.000000, 1.000000
+	};
+
+	Matrix4f result_f = {
+		0.988936, 0.000000, -0.148340, -0.988936,
+		-0.014599, 0.995146, -0.097325, -2.970838,
+		0.147620, 0.098414, 0.984136, -20.765262,
+		0.000000, 0.000000, 0.000000, 1.000000
+	};
+
+	Matrix4 result_transposed = {
+		0.988936, -0.014599, 0.147620, 0.000000,
+		0.000000, 0.995146, 0.098414, 0.000000,
+		-0.148340, -0.097325, 0.984136, 0.000000,
+		-0.988936, -2.970838, -20.765262, 1.000000
+	};
+
+	Matrix4f result_transposed_f = {
+		0.988936, -0.014599, 0.147620, 0.000000,
+		0.000000, 0.995146, 0.098414, 0.000000,
+		-0.148340, -0.097325, 0.984136, 0.000000,
+		-0.988936, -2.970838, -20.765262, 1.000000
+	};
+
+	Matrix4 look_at = matrix::look_at{double}({4.0, 5.0, 20.0}, {1.0, 3.0, 0.0}, {0.0, 1.0, 0.0});
+	Matrix4f look_at_f = matrix::look_at{float}({4.0, 5.0, 20.0}, {1.0, 3.0, 0.0}, {0.0, 1.0, 0.0});
+
+	assert(math::round_to_decimals((double[<16>])look_at.m, 4) == math::round_to_decimals((double[<16>])result.m, 4));
+	assert(math::round_to_decimals((float[<16>])look_at_f.m, 4) == math::round_to_decimals((float[<16>])result_f.m, 4));
+
+	assert(math::round_to_decimals((double[<16>])result_transposed.m, 4) == math::round_to_decimals((double[<16>])look_at.transpose().m, 4));
+	assert(math::round_to_decimals((float[<16>])result_transposed_f.m, 4) == math::round_to_decimals((float[<16>])look_at_f.transpose().m, 4));
+}
+
+fn void test_mat4_perspective()
+{
+	Matrix4 result = {
+		1.857087, 0.000000, 0.000000,
+		0.000000, 0.000000, 2.414214,
+		0.000000, 0.000000, 0.000000, 0.000000,
+		-1.000200, -0.200020, 0.000000, 0.000000,
+		-1.000000, 0.000000
+	};
+
+	Matrix4f result_f = {
+		1.857087, 0.000000, 0.000000,
+		0.000000, 0.000000, 2.414214,
+		0.000000, 0.000000, 0.000000, 0.000000,
+		-1.000200, -0.200020, 0.000000, 0.000000,
+		-1.000000, 0.000000
+	};
+
+	Matrix4 perspective = matrix4_perspective(math::deg_to_rad(45), 1.3, 0.1, 1000);
+	Matrix4f perspective_f = matrix4f_perspective((float)math::deg_to_rad(45), 1.3, 0.1, 1000);
+
+	assert(math::round_to_decimals((double[<16>])result.m, 4) == math::round_to_decimals((double[<16>])perspective.m, 4));
+	assert(math::round_to_decimals((float[<16>])result_f.m, 4) == math::round_to_decimals((float[<16>])perspective_f.m, 4));
+}

 fn void test_mat3()
 {