diff --git a/src/compiler/llvm_codegen_builtins.c b/src/compiler/llvm_codegen_builtins.c
index 0050c4080..5e6975d48 100644
--- a/src/compiler/llvm_codegen_builtins.c
+++ b/src/compiler/llvm_codegen_builtins.c
@@ -430,7 +430,7 @@ INLINE void llvm_emit_prefetch(GenContext *c, BEValue *be_value, Expr *expr)
 	llvm_value_set(be_value, result, type_void);
 }
 
-void llvm_emit_reduce_int_builtin(GenContext *c, unsigned intrinsic, BEValue *be_value, Expr *expr)
+static void llvm_emit_reduce_int_builtin(GenContext *c, unsigned intrinsic, BEValue *be_value, Expr *expr)
 {
 	Expr **args = expr->call_expr.arguments;
 	LLVMValueRef arg_slots[1];
@@ -685,6 +685,7 @@ static void llvm_emit_veccomp(GenContext *c, BEValue *value, Expr *expr, Builtin
 {
 	Expr **args = expr->call_expr.arguments;
 	unsigned count = vec_size(args);
+	(void)count;
 	assert(count == 2);
 
 	LLVMValueRef mask;
diff --git a/src/compiler/llvm_codegen_expr.c b/src/compiler/llvm_codegen_expr.c
index 01f1245c7..733258af7 100644
--- a/src/compiler/llvm_codegen_expr.c
+++ b/src/compiler/llvm_codegen_expr.c
@@ -3702,6 +3702,48 @@ static inline void llvm_emit_memcmp(GenContext *c, BEValue *be_value, LLVMValueR
 	llvm_value_set(be_value, function, type_cint);
 }
 
+// Compares two float arrays of `len` elements as vectors and stores the single bool result in be_value.
+// NOTE(review): LLVMRealONE is false for a lane holding NaN, so with NaN present both `==` and `!=` can be
+// false, whereas the looped comparison for long arrays appears to negate `oeq` - confirm this is intended.
+static inline void llvm_emit_fp_vector_compare(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op, Type *base_type, unsigned len)
+{
+	bool is_eq = binary_op == BINARYOP_EQ;
+	LLVMTypeRef vec_type = LLVMVectorType(llvm_get_type(c, base_type), len);
+	LLVMTypeRef mask_type = LLVMVectorType(c->bool_type, len);
+	llvm_value_addr(c, lhs);
+	llvm_value_addr(c, rhs);
+	LLVMValueRef lhs_vec = llvm_load(c, vec_type, lhs->value, lhs->alignment, "lhs");
+	LLVMValueRef rhs_vec = llvm_load(c, vec_type, rhs->value, rhs->alignment, "rhs");
+	LLVMRealPredicate pred = is_eq ? LLVMRealOEQ : LLVMRealONE;
+	LLVMValueRef lanes = LLVMBuildFCmp(c->builder, pred, lhs_vec, rhs_vec, "cmp");
+	// `==` needs every lane to match (and-reduce); `!=` needs any lane to differ (or-reduce).
+	unsigned reduce_id = is_eq ? intrinsic_id.vector_reduce_and : intrinsic_id.vector_reduce_or;
+	LLVMValueRef result = llvm_emit_call_intrinsic(c, reduce_id, &mask_type, 1, &lanes, 1);
+	llvm_value_set(be_value, result, type_bool);
+}
+
+// Compares two bool arrays of `len` elements as vectors and stores the single bool result in be_value.
+// Elements are loaded as c->byte_type lanes and truncated to c->bool_type lanes before the compare.
+static inline void llvm_emit_bool_vector_compare(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op, unsigned len)
+{
+	bool is_eq = binary_op == BINARYOP_EQ;
+	LLVMTypeRef mask_type = LLVMVectorType(c->bool_type, len);
+	LLVMTypeRef storage_type = LLVMVectorType(c->byte_type, len);
+	llvm_value_addr(c, lhs);
+	llvm_value_addr(c, rhs);
+	// Emission order (load, load, trunc, trunc) is kept so the generated IR stays the same.
+	LLVMValueRef lhs_bits = llvm_load(c, storage_type, lhs->value, lhs->alignment, "lhs");
+	LLVMValueRef rhs_bits = llvm_load(c, storage_type, rhs->value, rhs->alignment, "rhs");
+	lhs_bits = LLVMBuildTrunc(c->builder, lhs_bits, mask_type, "");
+	rhs_bits = LLVMBuildTrunc(c->builder, rhs_bits, mask_type, "");
+	LLVMIntPredicate pred = is_eq ? LLVMIntEQ : LLVMIntNE;
+	LLVMValueRef lanes = LLVMBuildICmp(c->builder, pred, lhs_bits, rhs_bits, "cmp");
+	// `==` needs every lane to match (and-reduce); `!=` needs any lane to differ (or-reduce).
+	unsigned reduce_id = is_eq ? intrinsic_id.vector_reduce_and : intrinsic_id.vector_reduce_or;
+	LLVMValueRef result = llvm_emit_call_intrinsic(c, reduce_id, &mask_type, 1, &lanes, 1);
+	llvm_value_set(be_value, result, type_bool);
+}
+
 static void llvm_emit_array_comp(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op)
 {
 	Type *array_base = type_flatten(lhs->type->array.base);
@@ -3744,11 +3786,11 @@ MEMCMP:
 		case TYPE_TYPEINFO:
 		case TYPE_MEMBER:
 			UNREACHABLE
+		case TYPE_BOOL:
 		case ALL_FLOATS:
 		case TYPE_SLICE:
 		case TYPE_ARRAY:
 		case TYPE_FLEXIBLE_ARRAY:
-		case TYPE_BOOL:
 			break;
 	}
 
@@ -3758,6 +3800,16 @@ MEMCMP:
 	LLVMTypeRef array_type = llvm_get_type(c, lhs->type);
 	if (should_inline_array_comp(len, array_base_type))
 	{
+		if (array_base_type == type_bool)
+		{
+			llvm_emit_bool_vector_compare(c, be_value, lhs, rhs, binary_op, len);
+			return;
+		}
+		if (type_is_float(array_base_type))
+		{
+			llvm_emit_fp_vector_compare(c, be_value, lhs, rhs, binary_op, array_base_type, len);
+			return;
+		}
 		LLVMBasicBlockRef blocks[17];
 		LLVMValueRef value_block[17];
 		LLVMBasicBlockRef ok_block = llvm_basic_block_new(c, "match");
diff --git a/test/test_suite/arrays/array_comparison.c3t b/test/test_suite/arrays/array_comparison.c3t
index be9f2991a..ef0f26e47 100644
--- a/test/test_suite/arrays/array_comparison.c3t
+++ b/test/test_suite/arrays/array_comparison.c3t
@@ -78,94 +78,46 @@ entry:
   %y3 = alloca i8, align 1
   call void @get(ptr %a)
   call void @get(ptr %b)
-  %0 = load double, ptr %a, align 8
-  %1 = load double, ptr %b, align 8
-  %eq = fcmp oeq double %0, %1
-  br i1 %eq, label %next_check, label %exit
-next_check:                                       ; preds = %entry
-  %ptradd = getelementptr inbounds i8, ptr %a, i64 8
-  %ptradd1 = getelementptr inbounds i8, ptr %b, i64 8
-  %2 = load double, ptr %ptradd, align 8
-  %3 = load double, ptr %ptradd1, align 8
-  %eq2 = fcmp oeq double %2, %3
-  br i1 %eq2, label %match, label %exit
-match:                                            ; preds = %next_check
-  br label %exit
-exit:                                             ; preds = %match, %next_check, %entry
-  %array_cmp_phi = phi i1 [ false, %entry ], [ false, %next_check ], [ true, %match ]
-  %4 = zext i1 %array_cmp_phi to i8
-  store i8 %4, ptr %x, align 1
-  %5 = load double, ptr %a, align 8
-  %6 = load double, ptr %b, align 8
-  %eq3 = fcmp oeq double %5, %6
-  br i1 %eq3, label %next_check4, label %exit9
-next_check4:                                      ; preds = %exit
-  %ptradd5 = getelementptr inbounds i8, ptr %a, i64 8
-  %ptradd6 = getelementptr inbounds i8, ptr %b, i64 8
-  %7 = load double, ptr %ptradd5, align 8
-  %8 = load double, ptr %ptradd6, align 8
-  %eq7 = fcmp oeq double %7, %8
-  br i1 %eq7, label %match8, label %exit9
-match8:                                           ; preds = %next_check4
-  br label %exit9
-exit9:                                            ; preds = %match8, %next_check4, %exit
-  %array_cmp_phi10 = phi i1 [ true, %exit ], [ true, %next_check4 ], [ false, %match8 ]
-  %9 = zext i1 %array_cmp_phi10 to i8
-  store i8 %9, ptr %y, align 1
+  %lhs = load <2 x double>, ptr %a, align 16
+  %rhs = load <2 x double>, ptr %b, align 16
+  %cmp = fcmp oeq <2 x double> %lhs, %rhs
+  %0 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %cmp)
+  %1 = zext i1 %0 to i8
+  store i8 %1, ptr %x, align 1
+  %lhs1 = load <2 x double>, ptr %a, align 16
+  %rhs2 = load <2 x double>, ptr %b, align 16
+  %cmp3 = fcmp one <2 x double> %lhs1, %rhs2
+  %2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cmp3)
+  %3 = zext i1 %2 to i8
+  store i8 %3, ptr %y, align 1
   call void @get2(ptr %a2)
   call void @get2(ptr %b2)
-  %cmp = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
-  %eq11 = icmp eq i32 %cmp, 0
-  %10 = zext i1 %eq11 to i8
-  store i8 %10, ptr %x2, align 1
-  %cmp12 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
-  %neq = icmp ne i32 %cmp12, 0
-  %11 = zext i1 %neq to i8
-  store i8 %11, ptr %y2, align 1
+  %cmp4 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
+  %eq = icmp eq i32 %cmp4, 0
+  %4 = zext i1 %eq to i8
+  store i8 %4, ptr %x2, align 1
+  %cmp5 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
+  %neq = icmp ne i32 %cmp5, 0
+  %5 = zext i1 %neq to i8
+  store i8 %5, ptr %y2, align 1
   call void @get3(ptr %a3)
   call void @get3(ptr %b3)
-  %12 = load i8, ptr %a3, align 1
-  %13 = trunc i8 %12 to i1
-  %14 = load i8, ptr %b3, align 1
-  %15 = trunc i8 %14 to i1
-  %eq13 = icmp eq i1 %13, %15
-  br i1 %eq13, label %next_check14, label %exit19
-next_check14:                                     ; preds = %exit9
-  %ptradd15 = getelementptr inbounds i8, ptr %a3, i64 1
-  %ptradd16 = getelementptr inbounds i8, ptr %b3, i64 1
-  %16 = load i8, ptr %ptradd15, align 1
-  %17 = trunc i8 %16 to i1
-  %18 = load i8, ptr %ptradd16, align 1
-  %19 = trunc i8 %18 to i1
-  %eq17 = icmp eq i1 %17, %19
-  br i1 %eq17, label %match18, label %exit19
-match18:                                          ; preds = %next_check14
-  br label %exit19
-exit19:                                           ; preds = %match18, %next_check14, %exit9
-  %array_cmp_phi20 = phi i1 [ false, %exit9 ], [ false, %next_check14 ], [ true, %match18 ]
-  %20 = zext i1 %array_cmp_phi20 to i8
-  store i8 %20, ptr %x3, align 1
-  %21 = load i8, ptr %a3, align 1
-  %22 = trunc i8 %21 to i1
-  %23 = load i8, ptr %b3, align 1
-  %24 = trunc i8 %23 to i1
-  %eq21 = icmp eq i1 %22, %24
-  br i1 %eq21, label %next_check22, label %exit27
-next_check22:                                     ; preds = %exit19
-  %ptradd23 = getelementptr inbounds i8, ptr %a3, i64 1
-  %ptradd24 = getelementptr inbounds i8, ptr %b3, i64 1
-  %25 = load i8, ptr %ptradd23, align 1
-  %26 = trunc i8 %25 to i1
-  %27 = load i8, ptr %ptradd24, align 1
-  %28 = trunc i8 %27 to i1
-  %eq25 = icmp eq i1 %26, %28
-  br i1 %eq25, label %match26, label %exit27
-match26:                                          ; preds = %next_check22
-  br label %exit27
-exit27:                                           ; preds = %match26, %next_check22, %exit19
-  %array_cmp_phi28 = phi i1 [ true, %exit19 ], [ true, %next_check22 ], [ false, %match26 ]
-  %29 = zext i1 %array_cmp_phi28 to i8
-  store i8 %29, ptr %y3, align 1
+  %lhs6 = load <2 x i8>, ptr %a3, align 1
+  %rhs7 = load <2 x i8>, ptr %b3, align 1
+  %6 = trunc <2 x i8> %lhs6 to <2 x i1>
+  %7 = trunc <2 x i8> %rhs7 to <2 x i1>
+  %cmp8 = icmp eq <2 x i1> %6, %7
+  %8 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %cmp8)
+  %9 = zext i1 %8 to i8
+  store i8 %9, ptr %x3, align 1
+  %lhs9 = load <2 x i8>, ptr %a3, align 1
+  %rhs10 = load <2 x i8>, ptr %b3, align 1
+  %10 = trunc <2 x i8> %lhs9 to <2 x i1>
+  %11 = trunc <2 x i8> %rhs10 to <2 x i1>
+  %cmp11 = icmp ne <2 x i1> %10, %11
+  %12 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cmp11)
+  %13 = zext i1 %12 to i8
+  store i8 %13, ptr %y3, align 1
   ret void
 }
 define void @test.test2() #0 {
@@ -190,6 +142,7 @@ entry:
   call void @aget(ptr %b)
   store i64 0, ptr %cmp.idx, align 8
   br label %array_loop_start
+
 array_loop_start:                                 ; preds = %array_loop_comparison, %entry
   %0 = load i64, ptr %cmp.idx, align 8
   %ptroffset = getelementptr inbounds [8 x i8], ptr %a, i64 %0
@@ -198,17 +151,20 @@ array_loop_start:                                 ; preds = %array_loop_comparis
   %2 = load double, ptr %ptroffset1, align 8
   %eq = fcmp oeq double %1, %2
   br i1 %eq, label %array_loop_comparison, label %array_cmp_exit
+
 array_loop_comparison:                            ; preds = %array_loop_start
   %inc = add i64 %0, 1
   store i64 %inc, ptr %cmp.idx, align 8
   %lt = icmp ult i64 %inc, 200
   br i1 %lt, label %array_loop_start, label %array_cmp_exit
+
 array_cmp_exit:                                   ; preds = %array_loop_comparison, %array_loop_start
   %array_cmp_phi = phi i1 [ true, %array_loop_comparison ], [ false, %array_loop_start ]
   %3 = zext i1 %array_cmp_phi to i8
   store i8 %3, ptr %x, align 1
   store i64 0, ptr %cmp.idx2, align 8
   br label %array_loop_start3
+
 array_loop_start3:                                ; preds = %array_loop_comparison7, %array_cmp_exit
   %4 = load i64, ptr %cmp.idx2, align 8
   %ptroffset4 = getelementptr inbounds [8 x i8], ptr %a, i64 %4
@@ -217,11 +173,13 @@ array_loop_start3:                                ; preds = %array_loop_comparis
   %6 = load double, ptr %ptroffset5, align 8
   %eq6 = fcmp oeq double %5, %6
   br i1 %eq6, label %array_loop_comparison7, label %array_cmp_exit10
+
 array_loop_comparison7:                           ; preds = %array_loop_start3
   %inc8 = add i64 %4, 1
   store i64 %inc8, ptr %cmp.idx2, align 8
   %lt9 = icmp ult i64 %inc8, 200
   br i1 %lt9, label %array_loop_start3, label %array_cmp_exit10
+
 array_cmp_exit10:                                 ; preds = %array_loop_comparison7, %array_loop_start3
   %array_cmp_phi11 = phi i1 [ false, %array_loop_comparison7 ], [ true, %array_loop_start3 ]
   %7 = zext i1 %array_cmp_phi11 to i8
@@ -240,6 +198,7 @@ array_cmp_exit10:                                 ; preds = %array_loop_comparis
   call void @aget3(ptr %b3)
   store i64 0, ptr %cmp.idx14, align 8
   br label %array_loop_start15
+
 array_loop_start15:                               ; preds = %array_loop_comparison18, %array_cmp_exit10
   %10 = load i64, ptr %cmp.idx14, align 8
   %ptradd = getelementptr inbounds i8, ptr %a3, i64 %10
@@ -250,17 +209,20 @@ array_loop_start15:                               ; preds = %array_loop_comparis
   %14 = trunc i8 %13 to i1
   %eq17 = icmp eq i1 %12, %14
   br i1 %eq17, label %array_loop_comparison18, label %array_cmp_exit21
+
 array_loop_comparison18:                          ; preds = %array_loop_start15
   %inc19 = add i64 %10, 1
   store i64 %inc19, ptr %cmp.idx14, align 8
   %lt20 = icmp ult i64 %inc19, 200
   br i1 %lt20, label %array_loop_start15, label %array_cmp_exit21
+
 array_cmp_exit21:                                 ; preds = %array_loop_comparison18, %array_loop_start15
   %array_cmp_phi22 = phi i1 [ true, %array_loop_comparison18 ], [ false, %array_loop_start15 ]
   %15 = zext i1 %array_cmp_phi22 to i8
   store i8 %15, ptr %x3, align 1
   store i64 0, ptr %cmp.idx23, align 8
   br label %array_loop_start24
+
 array_loop_start24:                               ; preds = %array_loop_comparison28, %array_cmp_exit21
   %16 = load i64, ptr %cmp.idx23, align 8
   %ptradd25 = getelementptr inbounds i8, ptr %a3, i64 %16
@@ -271,11 +233,13 @@ array_loop_start24:                               ; preds = %array_loop_comparis
   %20 = trunc i8 %19 to i1
   %eq27 = icmp eq i1 %18, %20
   br i1 %eq27, label %array_loop_comparison28, label %array_cmp_exit31
+
 array_loop_comparison28:                          ; preds = %array_loop_start24
   %inc29 = add i64 %16, 1
   store i64 %inc29, ptr %cmp.idx23, align 8
   %lt30 = icmp ult i64 %inc29, 200
   br i1 %lt30, label %array_loop_start24, label %array_cmp_exit31
+
 array_cmp_exit31:                                 ; preds = %array_loop_comparison28, %array_loop_start24
   %array_cmp_phi32 = phi i1 [ false, %array_loop_comparison28 ], [ true, %array_loop_start24 ]
   %21 = zext i1 %array_cmp_phi32 to i8