For some value of optimisation we can replace the division with a
bitwise AND, since the divisor (align_mask + 1) is always a power of
two. It even shrinks the code. Before:
6c9: 53 push %rbx
6ca: 4c 8b 47 08 mov 0x8(%rdi),%r8
6ce: 31 d2 xor %edx,%edx
6d0: 48 89 fb mov %rdi,%rbx
6d3: 8b 87 c8 05 00 00 mov 0x5c8(%rdi),%eax
6d9: 41 8b 48 04 mov 0x4(%r8),%ecx
6dd: f7 d0 not %eax
6df: 21 c8 and %ecx,%eax
6e1: 83 c1 01 add $0x1,%ecx
6e4: 83 c0 01 add $0x1,%eax
6e7: f7 f1 div %ecx
6e9: 89 d6 mov %edx,%esi
6eb: 41 ff 90 88 00 00 00 call *0x88(%r8)
After:
6c9: 53 push %rbx
6ca: 48 8b 57 08 mov 0x8(%rdi),%rdx
6ce: 48 89 fb mov %rdi,%rbx
6d1: 8b 87 c8 05 00 00 mov 0x5c8(%rdi),%eax
6d7: 8b 72 04 mov 0x4(%rdx),%esi
6da: f7 d0 not %eax
6dc: 21 f0 and %esi,%eax
6de: 83 c0 01 add $0x1,%eax
6e1: 21 c6 and %eax,%esi
6e3: ff 92 88 00 00 00 call *0x88(%rdx)
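
The identity being exploited is the usual power-of-two trick: for any
x, x % n == x & (n - 1) when n is a power of two, and here n is
align_mask + 1. A minimal userspace sketch of the equivalence follows;
the helper names and the 0x3f mask are illustrative only, not taken
from the driver:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Old form: explicit modulo, which the compiler lowers to a div. */
static uint32_t pad_count_div(uint32_t wptr, uint32_t align_mask)
{
	uint32_t count = align_mask + 1 - (wptr & align_mask);

	count %= align_mask + 1;
	return count;
}

/* New form: align_mask + 1 is a power of two, so x % (m + 1) == x & m. */
static uint32_t pad_count_and(uint32_t wptr, uint32_t align_mask)
{
	uint32_t count = align_mask + 1 - (wptr & align_mask);

	count &= align_mask;
	return count;
}

int main(void)
{
	const uint32_t align_mask = 0x3f; /* illustrative mask only */

	for (uint32_t wptr = 0; wptr < 1024; wptr++)
		assert(pad_count_div(wptr, align_mask) ==
		       pad_count_and(wptr, align_mask));

	/* An already aligned wptr needs no padding in either form. */
	printf("aligned: %u\n", pad_count_and(0, align_mask)); /* 0 */
	printf("one in:  %u\n", pad_count_and(1, align_mask)); /* 63 */
	return 0;
}

Dropping the div also drops the xor %edx,%edx needed to zero the high
half of the dividend, and the result can stay in %esi instead of being
moved out of %edx, which is where the size saving above comes from.
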
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
 	/* We pad to match fetch size */
 	count = ring->funcs->align_mask + 1 -
 		(ring->wptr & ring->funcs->align_mask);
-	count %= ring->funcs->align_mask + 1;
+	count &= ring->funcs->align_mask;
 	if (count != 0)
 		ring->funcs->insert_nop(ring, count);