    commit 633f6502
    optimize: optimize using nonzero bits
    Paolo Bonzini authored
    
    This adds two optimizations using the nonzero bit mask.  First, in some
    cases involving shifts or ANDs the value can become known to be zero,
    and the operation can thus be optimized to a move of zero.  Second, a
    useless zero-extension, or an AND with a constant, can be detected when
    it would only clear bits that are already zero.
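
    The core idea, as a minimal stand-alone sketch (the struct, field and
    helper names below are illustrative, not the actual tcg/optimize.c
    code): track for each temp a mask of bits that may be nonzero, and use
    it to decide when an AND or zero-extension is redundant and when the
    result is known to be zero.

     /* Sketch only: hypothetical names, not QEMU's implementation. */
     #include <stdint.h>
     #include <stdbool.h>

     typedef struct {
         uint64_t nonzero_mask;  /* 1 bits may be set; 0 bits are known zero */
     } TempInfo;

     /* Bits that may still be nonzero after keeping only 'kept_bits'
      * (0xff for ext8u, 0xffff for ext16u, 0xffffffff for ext32u,
      * or the constant operand of an AND). */
     uint64_t result_mask(const TempInfo *src, uint64_t kept_bits)
     {
         return src->nonzero_mask & kept_bits;
     }

     /* The op would only clear bits that are already zero: it is a mov. */
     bool becomes_copy(const TempInfo *src, uint64_t kept_bits)
     {
         return (src->nonzero_mask & ~kept_bits) == 0;
     }

     /* Every possibly-nonzero bit is cleared: it is a move of zero. */
     bool becomes_zero(const TempInfo *src, uint64_t kept_bits)
     {
         return (src->nonzero_mask & kept_bits) == 0;
     }

     int main(void)
     {
         /* Only bit 8 can be set; AND with 0x0f clears every bit that
          * could be set, so the op becomes a move of constant zero. */
         TempInfo t = { .nonzero_mask = 0x100 };
         return becomes_zero(&t, 0x0f) ? 0 : 1;
     }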
    
    The main advantage of this optimization is that it turns zero-extensions
    into moves, thus enabling much better copy propagation (around 1% code
    reduction).  Here is, for example, a "test $0xff0000,%ecx + je" before
    optimization:
    
     mov_i64 tmp0,rcx
     movi_i64 tmp1,$0xff0000
     discard cc_src
     and_i64 cc_dst,tmp0,tmp1
     movi_i32 cc_op,$0x1c
     ext32u_i64 tmp0,cc_dst
     movi_i64 tmp12,$0x0
     brcond_i64 tmp0,tmp12,eq,$0x0
    
    and after (without patch on the left, with on the right):
    
     movi_i64 tmp1,$0xff0000                 movi_i64 tmp1,$0xff0000
     discard cc_src                          discard cc_src
     and_i64 cc_dst,rcx,tmp1                 and_i64 cc_dst,rcx,tmp1
     movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
     ext32u_i64 tmp0,cc_dst
     movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
     brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0
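
    A worked check for this example (a sketch under the same assumptions as
    above, not QEMU code): cc_dst was produced by an AND with 0xff0000, so
    at most those bits can be set, and ext32u_i64 keeps bits 0..31, so the
    zero-extension clears nothing that could be set.  It is therefore
    rewritten as a mov, and copy propagation lets brcond_i64 read cc_dst
    directly.

     #include <assert.h>
     #include <stdint.h>

     int main(void)
     {
         uint64_t cc_dst_nonzero = 0xff0000;     /* from and_i64 with 0xff0000 */
         uint64_t ext32u_kept    = 0xffffffffu;  /* bits kept by ext32u_i64 */

         /* No possibly-set bit is cleared by the op, so it is redundant. */
         assert((cc_dst_nonzero & ~ext32u_kept) == 0);
         return 0;
     }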
    
    Other similar cases: "test %eax, %eax + jne", where %eax already holds a
    zero-extended 32-bit value (after optimization, without patch on the
    left, with on the right):
    
     discard cc_src                          discard cc_src
     mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
     movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
     ext32u_i64 tmp0,cc_dst
     movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
     brcond_i64 tmp0,tmp12,ne,$0x0           brcond_i64 rax,tmp12,ne,$0x0
    
    "test $0x1, %dl + je":
    
     movi_i64 tmp1,$0x1                      movi_i64 tmp1,$0x1
     discard cc_src                          discard cc_src
     and_i64 cc_dst,rdx,tmp1                 and_i64 cc_dst,rdx,tmp1
     movi_i32 cc_op,$0x1a                    movi_i32 cc_op,$0x1a
     ext8u_i64 tmp0,cc_dst
     movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
     brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0
    
    In some cases TCG even outsmarts GCC. :)  Here the input code has
    "and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer,
    thanks to copy propagation, produces the following (without patch on
    the left, with on the right):
    
     movi_i64 tmp12,$0x2                     movi_i64 tmp12,$0x2
     and_i64 rax,rax,tmp12                   and_i64 rax,rax,tmp12
     mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
     ext32s_i64 tmp0,rax                  -> nop
     mov_i64 rbx,tmp0                     -> mov_i64 rbx,cc_dst
     and_i64 cc_dst,rbx,rbx               -> nop
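
    The ext32s_i64 disappears for a similar reason (again a sketch of the
    reasoning under the assumptions above, not the patch's code): after the
    AND with 0x2 only bit 1 can be set, so bit 31 and everything above it
    are known zero, and sign-extending the low 32 bits cannot change the
    value; the op can be treated as a copy, which matches the nops shown
    above once copy propagation has run.

     #include <assert.h>
     #include <stdint.h>

     int main(void)
     {
         uint64_t rax_nonzero    = 0x2;                     /* after and_i64 rax,rax,$0x2 */
         uint64_t bits_31_and_up = ~UINT64_C(0x7fffffff);   /* sign bit and high half */

         /* ext32s_i64 could only change these bits, and none can be set. */
         assert((rax_nonzero & bits_31_and_up) == 0);
         return 0;
     }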
    
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
    Signed-off-by: Richard Henderson <rth@twiddle.net>
    Signed-off-by: Blue Swirl <blauwirbel@gmail.com>