Some more optimizations on RISC-V

Using A0/A1 for the parameters (the emulated PC and the context pointer), and A0 for the return value (the updated PC):

#define BRANCH_COND(a) { s8 depl=(s8)PC[1]; PC=PC+2; if (a) PC+=depl; TICKS(3); return PC;}

uint8_t* op_bcc(uint8_t *PC, struct hd6301_regs *ctx) { BRANCH_COND(ctx->CC_C==0) }

00002b78 <op_bcc>:
    2b78:	0155c703          	lbu	a4,21(a1)
    2b7c:	00154783          	lbu	a5,1(a0)
    2b80:	0509                	addi	a0,a0,2
    2b82:	e701                	bnez	a4,2b8a <op_bcc+0x12>
    2b84:	07e2                	slli	a5,a5,0x18
    2b86:	87e1                	srai	a5,a5,0x18
    2b88:	953e                	add	a0,a0,a5
    2b8a:	499c                	lw	a5,16(a1)
    2b8c:	078d                	addi	a5,a5,3
    2b8e:	c99c                	sw	a5,16(a1)
    2b90:	8082                	ret
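
For reference, here is a minimal sketch of the definitions the snippet assumes. The struct layout is my guess; only the two offsets visible in the disassembly (the tick counter at +16, CC_C at +21) are pinned down:

#include <stdint.h>

typedef int8_t s8;

struct hd6301_regs {
    uint8_t  regs[16];  /* A, B, X, SP, ... (placeholder layout)    */
    uint32_t ticks;     /* offset 16: cycle counter (lw/sw 16(a1))  */
    uint8_t  pad;       /* offset 20 (placeholder)                  */
    uint8_t  CC_C;      /* offset 21: carry flag (lbu 21(a1))       */
};

/* TICKS(n) bumps the cycle counter through the ctx pointer */
#define TICKS(n) (ctx->ticks += (n))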

Some weird results when trying to just swap 2 bytes:


(I opened a GCC bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116560 )

#include <stdint.h>

uint8_t test_swap(uint8_t *ptr, uint32_t *res)
{
    uint16_t temp=(ptr[0]<<8)|ptr[1];
    *res=temp;
    if (temp==0) return 1;
    return 0;
}

Up to 9.x, with -O2, the generated code is great, with a single OR:

test_swap:
        lbu     a5,0(a0)
        slli    a5,a5,8
        lbu     a0,1(a0)
        or      a0,a5,a0
        sw      a0,0(a1)
        seqz    a0,a0
        ret

After 9.x (tested on godbolt.org: https://godbolt.org/z/f38EaosxM ),
there are useless slli/srli shifts and two ORs...

test_swap:
        lbu     a5,1(a0)
        lbu     a4,0(a0)
        slli    a0,a5,8
        or      a0,a0,a4  // ptr[1]<<8 | ptr[0] ??? why? We want the opposite
        slli    a5,a0,8   // ptr[1]<<16 | ptr[0]<<8 | 0
        srli    a0,a0,8   // ptr[1]
        or      a0,a5,a0  // ptr[1]<<16 | ptr[0]<<8 | ptr[1]
        slli    a0,a0,16  // and 0xFFFF
        srli    a0,a0,16
        sw      a0,0(a1)  // ptr[0]<<8 | ptr[1]
        seqz    a0,a0
        ret

Actually, up to 12.x with -O1, the code is the same as with 8.x/9.x.
Starting with 13.x and -O1, a pair of useless slli/srli instructions
(i.e. an AND with 0xFFFF) is added:

test_swap:
        lbu     a5,0(a0)
        slli    a5,a5,8
        lbu     a4,1(a0)
        or      a0,a5,a4
        slli    a0,a0,16  // not necessary
        srli    a0,a0,16  // not necessary
        sw      a0,0(a1)
        seqz    a0,a0
        ret
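
For completeness, a more portable attempt would be a plain 16-bit load followed by an explicit __builtin_bswap16 (a sketch, not benchmarked here; without the Zbb extension the builtin may itself expand to shift/or sequences, so it does not remove the version dependence):

#include <stdint.h>
#include <string.h>

uint8_t test_swap_builtin(uint8_t *ptr, uint32_t *res)
{
    uint16_t raw;
    memcpy(&raw, ptr, sizeof raw);           /* alignment-safe little-endian load */
    uint16_t temp = __builtin_bswap16(raw);  /* gives ptr[0]<<8 | ptr[1]          */
    *res = temp;
    if (temp == 0) return 1;
    return 0;
}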

Since the result is highly dependent on compiler options (-O1/-O2) and compiler version, I have used a custom macro for byte swapping:

The equivalent of:
#define swap16b(res,ptr,offset) { res=(ptr[offset+0]<<8)|ptr[offset+1]; }

Becomes:
#define swap16b(res,ptr,offset) { u32 _dum;                            \
        asm volatile("lbu  %[tmp],%[hb] \n"                            \
                     "lbu  %[sw],%[lb] \n"                             \
                     "slli %[tmp],%[tmp],8 \n"                         \
                     "or   %[sw],%[sw],%[tmp]"                         \
                     : [sw] "=&r"(res), [tmp] "=&r"(_dum)              \
                     : [hb] "m"(*(ptr+offset)), [lb] "m"(*(ptr+offset+1)), "r"(ptr)); }
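
Rewritten with the macro, the test function might look like this (a sketch, assuming the u32 typedef used inside swap16b):

#include <stdint.h>

typedef uint32_t u32;

/* swap16b(res, ptr, offset) as defined above */

uint8_t test_swap(uint8_t *ptr, uint32_t *res)
{
    u32 temp;
    swap16b(temp, ptr, 0);  /* forces the lbu/lbu/slli/or sequence */
    *res = temp;
    if (temp == 0) return 1;
    return 0;
}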

And always produces:
test_swap:
        lbu     a4,0(a0)
        lbu     a5,1(a0)
        slli    a4,a4,8
        or      a5,a5,a4
        sw      a5,0(a1)
        seqz    a0,a5
        ret
