Open
Description
This is an issue to track the code size regression caused by the scheduling model changes from PR #144564.
The results: https://lnt.lukelau.me/db_default/v4/nts/674?compare_to=673
After updating the SpacemiT X60 scheduling model with hardware-measured latencies, code size increased due to extra vector register spills. The following test, reduced from Blender, shows the issue:
; Reduced test case (from Blender's do_cross_effect_byte) demonstrating the
; extra vector register spills introduced by the scheduling model change.
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
; Deinterleaves a loaded <vscale x 64 x i8> (and an all-zero vector) into four
; i8 lanes each, combines corresponding lanes with widening multiply-adds,
; truncates back to i8, and re-interleaves the four results.
define fastcc <vscale x 64 x i8> @do_cross_effect_byte(ptr %rect2, i16 %0, i16 %1, <vscale x 16 x i16> %2) {
entry:
; %4 is %1 splatted across all elements; %5 holds %0 in element 0 only
; (it is never run through a splat shuffle, unlike %3 -> %4).
%3 = insertelement <vscale x 16 x i16> zeroinitializer, i16 %1, i64 0
%4 = shufflevector <vscale x 16 x i16> %3, <vscale x 16 x i16> zeroinitializer, <vscale x 16 x i32> zeroinitializer
%5 = insertelement <vscale x 16 x i16> zeroinitializer, i16 %0, i64 0
; Deinterleave of an all-zero vector — presumably an artifact of test
; reduction; all four extracted lanes (%6..%9) are zero.
%strided.vec142 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> zeroinitializer)
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec142, 0
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec142, 1
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec142, 2
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec142, 3
; Lane 0: widen both i8 inputs to i16, multiply-accumulate, narrow to i8.
%10 = zext <vscale x 16 x i8> %6 to <vscale x 16 x i16>
%11 = mul <vscale x 16 x i16> %4, %10
; Load the interleaved source data and split it into its four lanes.
%wide.vec143 = load <vscale x 64 x i8>, ptr %rect2, align 1
%strided.vec144 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.vec143)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec144, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec144, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec144, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %strided.vec144, 3
%16 = zext <vscale x 16 x i8> %12 to <vscale x 16 x i16>
%17 = mul <vscale x 16 x i16> %2, %16
%18 = add <vscale x 16 x i16> %17, %11
%19 = trunc <vscale x 16 x i16> %18 to <vscale x 16 x i8>
; Lane 1: same widen / mul / add / trunc pattern as lane 0.
%20 = zext <vscale x 16 x i8> %7 to <vscale x 16 x i16>
%21 = mul <vscale x 16 x i16> %4, %20
%22 = zext <vscale x 16 x i8> %13 to <vscale x 16 x i16>
%23 = mul <vscale x 16 x i16> %2, %22
%24 = add <vscale x 16 x i16> %23, %21
%25 = trunc <vscale x 16 x i16> %24 to <vscale x 16 x i8>
; Lane 2: same pattern.
%26 = zext <vscale x 16 x i8> %8 to <vscale x 16 x i16>
%27 = mul <vscale x 16 x i16> %4, %26
%28 = zext <vscale x 16 x i8> %14 to <vscale x 16 x i16>
%29 = mul <vscale x 16 x i16> %2, %28
%30 = add <vscale x 16 x i16> %29, %27
%31 = trunc <vscale x 16 x i16> %30 to <vscale x 16 x i8>
; Lane 3. NOTE(review): this lane multiplies by %5 (element-0-only value)
; rather than the splat %4 used by lanes 0-2 — presumably another artifact
; of test reduction.
%32 = zext <vscale x 16 x i8> %9 to <vscale x 16 x i16>
%33 = mul <vscale x 16 x i16> %4, %32
%34 = zext <vscale x 16 x i8> %15 to <vscale x 16 x i16>
%35 = mul <vscale x 16 x i16> %5, %34
%36 = add <vscale x 16 x i16> %35, %33
%37 = trunc <vscale x 16 x i16> %36 to <vscale x 16 x i8>
; Re-interleave the four processed lanes into the final result vector.
%interleaved.vec145 = tail call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %19, <vscale x 16 x i8> %25, <vscale x 16 x i8> %31, <vscale x 16 x i8> %37)
ret <vscale x 64 x i8> %interleaved.vec145
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8>) #0
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) #0
; uselistorder directives (kept — use-list order can affect codegen and thus
; the reproducibility of the regression)
uselistorder ptr @llvm.vector.deinterleave4.nxv64i8, { 1, 0 }
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
After the patch, we start to see extra `vs4r.v` and `vl4r.v` instructions (vector register spills and reloads).
While debugging the issue, I've tried a couple of things:
- Increasing the latency of vector load/store instructions: this didn't fix the issue. Even going as high as 1000 cycles of latency doesn't change the final generated code.
- Adding `ReleaseAtCycle` (scaled with LMUL) to all vector integer instructions: this fixes the issue. I'm still investigating why. I got this idea from looking at the P400 scheduling model.