Bon j'ai en principe remplacé par celle-ci...
Elle ressemble très peu à l'originale. Quant aux optimisations :
- la mémoire se copie par longwords (au lieu de words), ce qui réduit le nombre d'accès
- si le décalage est >= 16, on fait un décalage vers la droite
- au lieu de faire des tests de wraparound dans le main loop, on les fait avant et on modifie le code
Pour les cycles, le main loop (celui qui se répète 144 fois, donc le plus important), avant faisait :
[116,176]*10=[1160,1760] (selon le nombre de décalages effectués)
maintenant, il fait :
[114,174]*5=[570,870] (selon le nombre de décalages effectués vers la droite)
[130,190]*5=[650,950] (selon le nombre de décalages effectués vers la gauche)
Ce qui fait carrément la moitié. Quelqu'un peut mieux faire ?
Et pas moins important, on dirait que ça marche

.
;-----------------------------------------------------------------------
; bg_draw (entry + left-shift path)
; Copies the visible 160x144 window of the scrolling background to the
; two LCD bitplanes, one longword (32 pixels of one plane) at a time.
; X and Y wraparound are handled WITHOUT per-iteration tests: the loop
; counts are patched directly into the code each call (self-modifying:
; the immediate bytes at lc1nc+1 / lc2nc+1 and the word at llnc+2).
; In:    a4 -> I/O state (scx/scy scroll registers are read from it)
;        agblcd0/agblcd1/abuffs -> LCD planes and background buffers
; Out:   nothing; all registers saved/restored on the stack
; NOTE(review): assumes a background line is 32 bytes (asl.w #5 below)
; and the full map 32*256 bytes — confirm against the buffer layout
; declared elsewhere in the file.
;-----------------------------------------------------------------------
movem.l d0-d7/a0-a6,-(a7)
;copy the visible portion of the background to the lcd
lea bg_draw(PC),a1 ;used to self-modify the code
move.l agblcd0(PC),a6 ;(a6)=gblcd0
move.l agblcd1(PC),a5 ;(a5)=gblcd1
move.l abuffs(PC),a3
lea buffs_bgbuff0(a3),a2 ;(a2)=bgbuff0
lea buffs_bgbuff1(a3),a3 ;(a3)=bgbuff1
clr.w d5 ;d5=scx, zero-extended to a word
move.b scx(a4),d5
move.w d5,d7 ;d7=number of bits to shift left
andi.b #$1f,d7 ; = scx mod 32
cmp.b #16,d7 ;if(d7>=16) faster to rotate right
bpl r ;take the right-shift path below
;scx c1nc c2nc scx/32 scx/32*4
;-----------------------------------------------
;[224,256) -1 4 7 28
;[192,224) 0 3 6 24
;[160,192) 1 2 5 20
;[128,160) 2 1 4 16
;[96,128) 3 0 3 12
;[64,96) 4 -1 2 8
;[32,64) 4 -1 1 4
;[0,32) 4 -1 0 0
lsr.b #5,d5 ;add to a2 and a3 the offset of
move.w d5,d0 ;the 1st column to copy
lsl.b #2,d5 ; (scx/32 longword columns -> bytes)
add.w d5,a2
add.w d5,a3
subq.w #2,d0 ;patch ncycles after X wraparound
bpl lpl ; c2nc=max(scx/32-2,0)-1, a dbra count
clr.w d0 ; (-1 disables that copy loop)
lpl subq.w #1,d0
move.b d0,lc2nc+1-bg_draw(a1) ;rewrite the moveq immediate at lc2nc
subq.w #3,d0 ;patch ncycles before X wraparound
neg.w d0 ; c1nc=3-c2nc: both loops together
move.b d0,lc1nc+1-bg_draw(a1) ;always copy 5 longwords = 160 px/line
move.w #$ffff,d2 ;d2=mask and d3=~mask
lsl.w d7,d2 ; d2 selects the bits carried over to
move.w d2,d3 ; the next output longword, d3 the
not.w d3 ; bits merged into the current one
clr.w d5 ;add to a2 and a3 the offset of
move.b scy(a4),d5 ;the 1st line to copy
move.w d5,d6
asl.w #5,d5 ; scy*32 (32 bytes per bg line)
add.w d5,a2
add.w d5,a3
move.w #144-1,d1 ;d1=nlines before Y wraparound
cmpi.w #112,d6 ; =min(scy+144,256)-scy
bmi lally
move.w #256-1,d1
sub.w d6,d1
lally
move.w #144-2,d0 ;patch nlines after Y wraparound
sub.w d1,d0
move.w d0,llnc+2-bg_draw(a1)
llc move.l (a2)+,d5 ;prime d5/d6 with the 1st longword of
move.l (a3)+,d6 ;each plane's line,
lsl.l d7,d5 ;pre-shifted so the loop below only
lsl.l d7,d6 ;has to OR in the following bits
lc1nc moveq #-1,d0 ;this number gets patched (c1nc)
bmi lc1e ;negative count: wrap immediately
lc1c move.l (a2)+,d4 ;12 -> [52,82]
rol.l d7,d4 ;8+[2*0,2*15]
move.l d4,a0 ;4 (save the rotated longword)
and.w d3,d4 ;4 (bits completing the current lw)
or.w d4,d5 ;4
move.l d5,(a6)+ ;12 (emit one plane-0 longword)
move.l a0,d5 ;4
and.w d2,d5 ;4 (keep the carry for the next lw)
move.l (a3)+,d4 ;same sequence for plane 1
rol.l d7,d4
move.l d4,a0
and.w d3,d4
or.w d4,d6
move.l d6,(a5)+
move.l a0,d6
and.w d2,d6
dbra.w d0,lc1c
lc1e lea -32(a2),a2 ;X wraparound: back to the start of
lea -32(a3),a3 ;this 32-byte background line
lc2nc moveq #-1,d0 ;this number gets patched (c2nc)
bmi lc2e
lc2c move.l (a2)+,d4 ;same copy loop, after the X wrap
rol.l d7,d4
move.l d4,a0
and.w d3,d4
or.w d4,d5
move.l d5,(a6)+
move.l a0,d5
and.w d2,d5
move.l (a3)+,d4
rol.l d7,d4
move.l d4,a0
and.w d3,d4
or.w d4,d6
move.l d6,(a5)+
move.l a0,d6
and.w d2,d6
dbra.w d0,lc2c
lc2e lea 40(a2),a2 ;step to the same column of the next
lea 40(a3),a3 ;bg line (net move over the line: +32)
dbra.w d1,llc
lea -32*256(a2),a2 ;wraparound and continue
lea -32*256(a3),a3 ; (back to the top of the 256-line map)
llnc move.w #-1,d1 ;this number gets patched (lines left)
bmi lle ;negative: all 144 lines are done
move.w #-1,llnc+2-bg_draw(a1) ;prevent any further loops
bra llc
lle
lret movem.l (a7)+,d0-d7/a0-a6 ;restore everything and return
rts
;scx c1nc c2nc (scx+160)/32+1 (scx+160)/32*4
;-------------------------------------------------------------
;[224,256) 3 0 5 20
;[192,224) 2 1 4 16
;[160,192) 1 2 3 12
;[128,160) 0 3 2 8
;[96,128) -1 4 1 4
;[64,96) 4 -1 8 32
;[32,64) 4 -1 7 28
;[0,32) 4 -1 6 24
;
;c1nc=min(6,(scx+160)/32+1)-2 (FIX: was "max", but the code clamps
;c2nc=3-c1nc downward and the table values match min)
;-----------------------------------------------------------------------
; bg_draw, right-shift path: taken when scx mod 32 >= 16. The copy then
; runs backwards (-(aN) addressing, high address to low) rotating right
; by 32-(scx mod 32), which costs fewer shift cycles than a long left
; shift. Loop counts are patched into the code (rc1nc+1, rc2nc+1,
; rlnc+2) exactly as in the left-shift path.
; On entry from above: d5=scx, d7=scx mod 32, a2/a3=bg buffers,
; a5/a6=LCD planes, a1=bg_draw (patch base), a4=I/O state.
;-----------------------------------------------------------------------
r lea 160*144/8(a6),a6 ;copying backwards: point a6/a5 just
lea 160*144/8(a5),a5 ;past the end of each 160x144 plane
sub.b #32,d7 ;d7=number to shift right
neg.b d7 ; = 32-(scx mod 32)
add.b #160,d5 ;add to a2 and a3 the offset of
lsr.b #5,d5 ;the last column to copy + 1
addq.b #1,d5 ; (byte add wraps mod 256 on purpose)
move.w d5,d0
lsl.b #2,d5 ; longword columns -> bytes (*4)
add.w d5,a2
add.w d5,a3
cmp.w #6,d0 ;patch ncycles before X wraparound
bmi rmi ; c1nc=min(d0,6)-2 (see table above)
moveq #6,d0
rmi subq.w #2,d0
move.b d0,rc1nc+1-bg_draw(a1) ;rewrite the moveq immediate at rc1nc
subq.w #3,d0 ;patch ncycles after X wraparound
neg.w d0 ; c2nc=3-c1nc
move.b d0,rc2nc+1-bg_draw(a1)
move.w #$ffff,d2 ;d2=mask and d3=~mask (FIX: was
lsr.w d7,d2 ;#$ffffffff, out of range for a .w
move.w d2,d3 ;immediate; matches the left path and
not.w d3 ;only the low word is ever used)
clr.w d5 ;add to a2 and a3 the offset of
move.b scy(a4),d5 ;the last line to copy
add.b #144-1,d5 ; = (scy+143) mod 256 (byte wrap wanted)
move.w d5,d1
asl.w #5,d5 ; *32 bytes per bg line
add.w d5,a2
add.w d5,a3
cmpi.w #144-1,d1 ;d1=nlines before Y wraparound
bmi rally ; =min(scy+144-1,144-1)
move.w #144-1,d1
rally
move.w #144-2,d0 ;patch nlines after Y wraparound
sub.w d1,d0
move.w d0,rlnc+2-bg_draw(a1)
rlc move.l -(a2),d5 ;prime d5/d6 with the last longword of
move.l -(a3),d6 ;each plane's line
lsr.l d7,d5
lsr.l d7,d6
swap d5 ;word-swap so the merges below can
swap d6 ;work on the low word
rc1nc moveq #-1,d0 ;this number gets patched (c1nc)
bmi rc1e ;negative count: wrap immediately
rc1c move.l -(a2),d4 ;12 -> [60,90]
ror.l d7,d4 ;8+[2*0,2*15]
swap d4 ;4
move.l d4,a0 ;4 (save the rotated+swapped longword)
and.w d3,d4 ;4 (bits completing the current lw)
or.w d4,d5 ;4
swap d5 ;4 (restore word order before writing)
move.l d5,-(a6) ;12 (emit one plane-0 longword)
move.l a0,d5 ;4
and.w d2,d5 ;4 (keep the carry for the next lw)
move.l -(a3),d4 ;same sequence for plane 1
ror.l d7,d4
swap d4
move.l d4,a0
and.w d3,d4
or.w d4,d6
swap d6
move.l d6,-(a5)
move.l a0,d6
and.w d2,d6
dbra.w d0,rc1c
rc1e lea 32(a2),a2 ;X wraparound: forward to the end of
lea 32(a3),a3 ;this 32-byte background line
rc2nc moveq #-1,d0 ;this number gets patched (c2nc)
bmi rc2e
rc2c move.l -(a2),d4 ;same copy loop, after the X wrap
ror.l d7,d4
swap d4
move.l d4,a0
and.w d3,d4
or.w d4,d5
swap d5
move.l d5,-(a6)
move.l a0,d5
and.w d2,d5
move.l -(a3),d4
ror.l d7,d4
swap d4
move.l d4,a0
and.w d3,d4
or.w d4,d6
swap d6
move.l d6,-(a5)
move.l a0,d6
and.w d2,d6
dbra.w d0,rc2c
rc2e lea -40(a2),a2 ;step to the same column of the
lea -40(a3),a3 ;previous bg line (net move: -32)
dbra.w d1,rlc
lea 32*256(a2),a2 ;wraparound and continue
lea 32*256(a3),a3 ; (down to the bottom of the 256-line map)
rlnc move.w #-1,d1 ;this number gets patched (lines left)
bmi rle ;negative: all 144 lines are done
move.w #-1,rlnc+2-bg_draw(a1) ;prevent any further loops
bra rlc
rle
rret movem.l (a7)+,d0-d7/a0-a6 ;restore everything and return
rts