diff --git a/src/compiler/llvm_codegen_builtins.c b/src/compiler/llvm_codegen_builtins.c
index 0050c4080..5e6975d48 100644
--- a/src/compiler/llvm_codegen_builtins.c
+++ b/src/compiler/llvm_codegen_builtins.c
@@ -430,7 +430,7 @@ INLINE void llvm_emit_prefetch(GenContext *c, BEValue *be_value, Expr *expr)
 	llvm_value_set(be_value, result, type_void);
 }
 
-void llvm_emit_reduce_int_builtin(GenContext *c, unsigned intrinsic, BEValue *be_value, Expr *expr)
+static void llvm_emit_reduce_int_builtin(GenContext *c, unsigned intrinsic, BEValue *be_value, Expr *expr)
 {
 	Expr **args = expr->call_expr.arguments;
 	LLVMValueRef arg_slots[1];
@@ -685,6 +685,7 @@ static void llvm_emit_veccomp(GenContext *c, BEValue *value, Expr *expr, Builtin
 {
 	Expr **args = expr->call_expr.arguments;
 	unsigned count = vec_size(args);
+	(void)count;
 	assert(count == 2);
 
 	LLVMValueRef mask;
diff --git a/src/compiler/llvm_codegen_expr.c b/src/compiler/llvm_codegen_expr.c
index 01f1245c7..733258af7 100644
--- a/src/compiler/llvm_codegen_expr.c
+++ b/src/compiler/llvm_codegen_expr.c
@@ -3702,6 +3702,67 @@ static inline void llvm_emit_memcmp(GenContext *c, BEValue *be_value, LLVMValueR
 	llvm_value_set(be_value, function, type_cint);
 }
 
+/**
+ * Compare two float arrays for (in)equality by loading each side as a single
+ * <len x base_type> vector, comparing lane-wise and reducing the i1 lanes to
+ * one bool that is stored in be_value.
+ *
+ * ==  ->  every lane ordered-and-equal (fcmp oeq + reduce.and)
+ * !=  ->  any lane unordered-or-unequal (fcmp une + reduce.or)
+ *
+ * `une` rather than `one` keeps `a != b` the exact negation of `a == b`, NaN
+ * lanes included, which is what the non-vectorized array comparisons compute.
+ */
+static inline void llvm_emit_fp_vector_compare(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op, Type *base_type, unsigned len)
+{
+	LLVMTypeRef fp_vec = LLVMVectorType(llvm_get_type(c, base_type), len);
+	LLVMTypeRef bool_vec = LLVMVectorType(c->bool_type, len);
+	// Operands are reloaded through their addresses as whole vectors.
+	llvm_value_addr(c, lhs);
+	llvm_value_addr(c, rhs);
+	LLVMValueRef left = llvm_load(c, fp_vec, lhs->value, lhs->alignment, "lhs");
+	LLVMValueRef right = llvm_load(c, fp_vec, rhs->value, rhs->alignment, "rhs");
+	LLVMValueRef cmp = LLVMBuildFCmp(c->builder, binary_op == BINARYOP_EQ ? LLVMRealOEQ : LLVMRealUNE, left, right, "cmp");
+	if (binary_op == BINARYOP_EQ)
+	{
+		cmp = llvm_emit_call_intrinsic(c, intrinsic_id.vector_reduce_and, &bool_vec, 1, &cmp, 1);
+	}
+	else
+	{
+		cmp = llvm_emit_call_intrinsic(c, intrinsic_id.vector_reduce_or, &bool_vec, 1, &cmp, 1);
+	}
+	llvm_value_set(be_value, cmp, type_bool);
+}
+
+/**
+ * Compare two bool arrays for (in)equality. Each side is loaded as a
+ * <len x byte> vector and truncated to <len x bool> before the lane-wise
+ * icmp; the lanes are then reduced with `and` (==) or `or` (!=) into a single
+ * bool that is stored in be_value.
+ */
+static inline void llvm_emit_bool_vector_compare(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op, unsigned len)
+{
+	LLVMTypeRef bool_vec = LLVMVectorType(c->bool_type, len);
+	LLVMTypeRef load_vec = LLVMVectorType(c->byte_type, len);
+	llvm_value_addr(c, lhs);
+	llvm_value_addr(c, rhs);
+	LLVMValueRef left = llvm_load(c, load_vec, lhs->value, lhs->alignment, "lhs");
+	LLVMValueRef right = llvm_load(c, load_vec, rhs->value, rhs->alignment, "rhs");
+	// Only the low bit of each loaded byte takes part in the comparison.
+	left = LLVMBuildTrunc(c->builder, left, bool_vec, "");
+	right = LLVMBuildTrunc(c->builder, right, bool_vec, "");
+	LLVMValueRef cmp = LLVMBuildICmp(c->builder, binary_op == BINARYOP_EQ ? LLVMIntEQ : LLVMIntNE, left, right, "cmp");
+	if (binary_op == BINARYOP_EQ)
+	{
+		cmp = llvm_emit_call_intrinsic(c, intrinsic_id.vector_reduce_and, &bool_vec, 1, &cmp, 1);
+	}
+	else
+	{
+		cmp = llvm_emit_call_intrinsic(c, intrinsic_id.vector_reduce_or, &bool_vec, 1, &cmp, 1);
+	}
+	llvm_value_set(be_value, cmp, type_bool);
+}
+
 static void llvm_emit_array_comp(GenContext *c, BEValue *be_value, BEValue *lhs, BEValue *rhs, BinaryOp binary_op)
 {
 	Type *array_base = type_flatten(lhs->type->array.base);
@@ -3744,11 +3805,11 @@ MEMCMP:
 		case TYPE_TYPEINFO:
 		case TYPE_MEMBER:
 			UNREACHABLE
+		case TYPE_BOOL:
 		case ALL_FLOATS:
 		case TYPE_SLICE:
 		case TYPE_ARRAY:
 		case TYPE_FLEXIBLE_ARRAY:
-		case TYPE_BOOL:
 			break;
 	}
 
@@ -3758,6 +3819,16 @@ MEMCMP:
 	LLVMTypeRef array_type = llvm_get_type(c, lhs->type);
 	if (should_inline_array_comp(len, array_base_type))
 	{
+		if (array_base_type == type_bool)
+		{
+			llvm_emit_bool_vector_compare(c, be_value, lhs, rhs, binary_op, len);
+			return;
+		}
+		if (type_is_float(array_base_type))
+		{
+			llvm_emit_fp_vector_compare(c, be_value, lhs, rhs, binary_op, array_base_type, len);
+			return;
+		}
 		LLVMBasicBlockRef blocks[17];
 		LLVMValueRef value_block[17];
 		LLVMBasicBlockRef ok_block = llvm_basic_block_new(c, "match");
diff --git a/test/test_suite/arrays/array_comparison.c3t b/test/test_suite/arrays/array_comparison.c3t
index be9f2991a..ef0f26e47 100644
--- a/test/test_suite/arrays/array_comparison.c3t
+++ b/test/test_suite/arrays/array_comparison.c3t
@@ -78,94 +78,46 @@ entry:
   %y3 = alloca i8, align 1
   call void @get(ptr %a)
   call void @get(ptr %b)
-  %0 = load double, ptr %a, align 8
-  %1 = load double, ptr %b, align 8
-  %eq = fcmp oeq double %0, %1
-  br i1 %eq, label %next_check, label %exit
-next_check: ; preds = %entry
-  %ptradd = getelementptr inbounds i8, ptr %a, i64 8
-  %ptradd1 = getelementptr inbounds i8, ptr %b, i64 8
-  %2 = load double, ptr %ptradd, align 8
-  %3 = load double, ptr %ptradd1, align 8
-  %eq2 = fcmp oeq double %2, %3
-  br i1 %eq2, label %match, label %exit
-match: ; preds = %next_check
-  br label %exit
-exit: ; preds = %match, %next_check, %entry
-  %array_cmp_phi = phi i1 [ false, %entry ], [ false, %next_check ], [ true, %match ]
-  %4 = zext i1 %array_cmp_phi to i8
-  store i8 %4, ptr %x, align 1
-  %5 = load double, ptr %a, align 8
-  %6 = load double, ptr %b, align 8
-  %eq3 = fcmp oeq double %5, %6
-  br i1 %eq3, label %next_check4, label %exit9
-next_check4: ; preds = %exit
-  %ptradd5 = getelementptr inbounds i8, ptr %a, i64 8
-  %ptradd6 = getelementptr inbounds i8, ptr %b, i64 8
-  %7 = load double, ptr %ptradd5, align 8
-  %8 = load double, ptr %ptradd6, align 8
-  %eq7 = fcmp oeq double %7, %8
-  br i1 %eq7, label %match8, label %exit9
-match8: ; preds = %next_check4
-  br label %exit9
-exit9: ; preds = %match8, %next_check4, %exit
-  %array_cmp_phi10 = phi i1 [ true, %exit ], [ true, %next_check4 ], [ false, %match8 ]
-  %9 = zext i1 %array_cmp_phi10 to i8
-  store i8 %9, ptr %y, align 1
+  %lhs = load <2 x double>, ptr %a, align 16
+  %rhs = load <2 x double>, ptr %b, align 16
+  %cmp = fcmp oeq <2 x double> %lhs, %rhs
+  %0 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %cmp)
+  %1 = zext i1 %0 to i8
+  store i8 %1, ptr %x, align 1
+  %lhs1 = load <2 x double>, ptr %a, align 16
+  %rhs2 = load <2 x double>, ptr %b, align 16
+  %cmp3 = fcmp une <2 x double> %lhs1, %rhs2
+  %2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cmp3)
+  %3 = zext i1 %2 to i8
+  store i8 %3, ptr %y, align 1
   call void @get2(ptr %a2)
   call void @get2(ptr %b2)
-  %cmp = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
-  %eq11 = icmp eq i32 %cmp, 0
-  %10 = zext i1 %eq11 to i8
-  store i8 %10, ptr %x2, align 1
-  %cmp12 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
-  %neq = icmp ne i32 %cmp12, 0
-  %11 = zext i1 %neq to i8
-  store i8 %11, ptr %y2, align 1
+  %cmp4 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
+  %eq = icmp eq i32 %cmp4, 0
+  %4 = zext i1 %eq to i8
+  store i8 %4, ptr %x2, align 1
+  %cmp5 = call i32 @memcmp(ptr %a2, ptr %b2, i64 8)
+  %neq = icmp ne i32 %cmp5, 0
+  %5 = zext i1 %neq to i8
+  store i8 %5, ptr %y2, align 1
   call void @get3(ptr %a3)
   call void @get3(ptr %b3)
-  %12 = load i8, ptr %a3, align 1
-  %13 = trunc i8 %12 to i1
-  %14 = load i8, ptr %b3, align 1
-  %15 = trunc i8 %14 to i1
-  %eq13 = icmp eq i1 %13, %15
-  br i1 %eq13, label %next_check14, label %exit19
-next_check14: ; preds = %exit9
-  %ptradd15 = getelementptr inbounds i8, ptr %a3, i64 1
-  %ptradd16 = getelementptr inbounds i8, ptr %b3, i64 1
-  %16 = load i8, ptr %ptradd15, align 1
-  %17 = trunc i8 %16 to i1
-  %18 = load i8, ptr %ptradd16, align 1
-  %19 = trunc i8 %18 to i1
-  %eq17 = icmp eq i1 %17, %19
-  br i1 %eq17, label %match18, label %exit19
-match18: ; preds = %next_check14
-  br label %exit19
-exit19: ; preds = %match18, %next_check14, %exit9
-  %array_cmp_phi20 = phi i1 [ false, %exit9 ], [ false, %next_check14 ], [ true, %match18 ]
-  %20 = zext i1 %array_cmp_phi20 to i8
-  store i8 %20, ptr %x3, align 1
-  %21 = load i8, ptr %a3, align 1
-  %22 = trunc i8 %21 to i1
-  %23 = load i8, ptr %b3, align 1
-  %24 = trunc i8 %23 to i1
-  %eq21 = icmp eq i1 %22, %24
-  br i1 %eq21, label %next_check22, label %exit27
-next_check22: ; preds = %exit19
-  %ptradd23 = getelementptr inbounds i8, ptr %a3, i64 1
-  %ptradd24 = getelementptr inbounds i8, ptr %b3, i64 1
-  %25 = load i8, ptr %ptradd23, align 1
-  %26 = trunc i8 %25 to i1
-  %27 = load i8, ptr %ptradd24, align 1
-  %28 = trunc i8 %27 to i1
-  %eq25 = icmp eq i1 %26, %28
-  br i1 %eq25, label %match26, label %exit27
-match26: ; preds = %next_check22
-  br label %exit27
-exit27: ; preds = %match26, %next_check22, %exit19
-  %array_cmp_phi28 = phi i1 [ true, %exit19 ], [ true, %next_check22 ], [ false, %match26 ]
-  %29 = zext i1 %array_cmp_phi28 to i8
-  store i8 %29, ptr %y3, align 1
+  %lhs6 = load <2 x i8>, ptr %a3, align 1
+  %rhs7 = load <2 x i8>, ptr %b3, align 1
+  %6 = trunc <2 x i8> %lhs6 to <2 x i1>
+  %7 = trunc <2 x i8> %rhs7 to <2 x i1>
+  %cmp8 = icmp eq <2 x i1> %6, %7
+  %8 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %cmp8)
+  %9 = zext i1 %8 to i8
+  store i8 %9, ptr %x3, align 1
+  %lhs9 = load <2 x i8>, ptr %a3, align 1
+  %rhs10 = load <2 x i8>, ptr %b3, align 1
+  %10 = trunc <2 x i8> %lhs9 to <2 x i1>
+  %11 = trunc <2 x i8> %rhs10 to <2 x i1>
+  %cmp11 = icmp ne <2 x i1> %10, %11
+  %12 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cmp11)
+  %13 = zext i1 %12 to i8
+  store i8 %13, ptr %y3, align 1
   ret void
 }
 define void @test.test2() #0 {
@@ -190,6 +142,7 @@ entry:
   call void @aget(ptr %b)
   store i64 0, ptr %cmp.idx, align 8
   br label %array_loop_start
+
 array_loop_start: ; preds = %array_loop_comparison, %entry
   %0 = load i64, ptr %cmp.idx, align 8
   %ptroffset = getelementptr inbounds [8 x i8], ptr %a, i64 %0
@@ -198,17 +151,20 @@ array_loop_start: ; preds = %array_loop_comparis
   %2 = load double, ptr %ptroffset1, align 8
   %eq = fcmp oeq double %1, %2
   br i1 %eq, label %array_loop_comparison, label %array_cmp_exit
+
 array_loop_comparison: ; preds = %array_loop_start
   %inc = add i64 %0, 1
   store i64 %inc, ptr %cmp.idx, align 8
   %lt = icmp ult i64 %inc, 200
   br i1 %lt, label %array_loop_start, label %array_cmp_exit
+
 array_cmp_exit: ; preds = %array_loop_comparison, %array_loop_start
   %array_cmp_phi = phi i1 [ true, %array_loop_comparison ], [ false, %array_loop_start ]
   %3 = zext i1 %array_cmp_phi to i8
   store i8 %3, ptr %x, align 1
   store i64 0, ptr %cmp.idx2, align 8
   br label %array_loop_start3
+
 array_loop_start3: ; preds = %array_loop_comparison7, %array_cmp_exit
   %4 = load i64, ptr %cmp.idx2, align 8
   %ptroffset4 = getelementptr inbounds [8 x i8], ptr %a, i64 %4
@@ -217,11 +173,13 @@ array_loop_start3: ; preds = %array_loop_comparis
   %6 = load double, ptr %ptroffset5, align 8
   %eq6 = fcmp oeq double %5, %6
   br i1 %eq6, label %array_loop_comparison7, label %array_cmp_exit10
+
 array_loop_comparison7: ; preds = %array_loop_start3
   %inc8 = add i64 %4, 1
   store i64 %inc8, ptr %cmp.idx2, align 8
   %lt9 = icmp ult i64 %inc8, 200
   br i1 %lt9, label %array_loop_start3, label %array_cmp_exit10
+
 array_cmp_exit10: ; preds = %array_loop_comparison7, %array_loop_start3
   %array_cmp_phi11 = phi i1 [ false, %array_loop_comparison7 ], [ true, %array_loop_start3 ]
   %7 = zext i1 %array_cmp_phi11 to i8
@@ -240,6 +198,7 @@ array_cmp_exit10: ; preds = %array_loop_comparis
   call void @aget3(ptr %b3)
   store i64 0, ptr %cmp.idx14, align 8
   br label %array_loop_start15
+
 array_loop_start15: ; preds = %array_loop_comparison18, %array_cmp_exit10
   %10 = load i64, ptr %cmp.idx14, align 8
   %ptradd = getelementptr inbounds i8, ptr %a3, i64 %10
@@ -250,17 +209,20 @@ array_loop_start15: ; preds = %array_loop_comparis
   %14 = trunc i8 %13 to i1
   %eq17 = icmp eq i1 %12, %14
   br i1 %eq17, label %array_loop_comparison18, label %array_cmp_exit21
+
 array_loop_comparison18: ; preds = %array_loop_start15
   %inc19 = add i64 %10, 1
   store i64 %inc19, ptr %cmp.idx14, align 8
   %lt20 = icmp ult i64 %inc19, 200
   br i1 %lt20, label %array_loop_start15, label %array_cmp_exit21
+
 array_cmp_exit21: ; preds = %array_loop_comparison18, %array_loop_start15
   %array_cmp_phi22 = phi i1 [ true, %array_loop_comparison18 ], [ false, %array_loop_start15 ]
   %15 = zext i1 %array_cmp_phi22 to i8
   store i8 %15, ptr %x3, align 1
   store i64 0, ptr %cmp.idx23, align 8
   br label %array_loop_start24
+
 array_loop_start24: ; preds = %array_loop_comparison28, %array_cmp_exit21
   %16 = load i64, ptr %cmp.idx23, align 8
   %ptradd25 = getelementptr inbounds i8, ptr %a3, i64 %16
@@ -271,11 +233,13 @@ array_loop_start24: ; preds = %array_loop_comparis
   %20 = trunc i8 %19 to i1
   %eq27 = icmp eq i1 %18, %20
   br i1 %eq27, label %array_loop_comparison28, label %array_cmp_exit31
+
 array_loop_comparison28: ; preds = %array_loop_start24
   %inc29 = add i64 %16, 1
   store i64 %inc29, ptr %cmp.idx23, align 8
   %lt30 = icmp ult i64 %inc29, 200
   br i1 %lt30, label %array_loop_start24, label %array_cmp_exit31
+
 array_cmp_exit31: ; preds = %array_loop_comparison28, %array_loop_start24
   %array_cmp_phi32 = phi i1 [ false, %array_loop_comparison28 ], [ true, %array_loop_start24 ]
   %21 = zext i1 %array_cmp_phi32 to i8