;******************************************************;
;* PLASMA.ASM -- Simple plasma display, with rotating *;
;* palette. Requires VGA, 386. *;
;******************************************************;
;
; Submitter: Paul Hsieh
; Size: 390 bytes (total of 169 bytes savings from original)
; Date: Mon, Feb 17.
;
;------------------------------------------------------------------------------
;
; Techniques used in size optimizing from the original source as given in
; the PT&T 2.0 contest.
;
; (1) There were numerous gimme's such as: only using a byte where the
; original was using a word, or word instead of dword; removing unnecessary
; stack frames; skipping the "wait for vertical blank end" before the "wait
; for vertical blank start"; removing redundant "test" operations when an
; arithmetic operation had occurred just before it; optimizing stack and
; register references.
;
; (2) The next thing I did was try to rearrange the small routines (SetPal,
; SetPixel, GetPixel, Rand and rand192) in terms of content and register API.
; It was fairly straight forward removing the stack frames and minimizing all
; but the SetPal routine. (At this point the SetPixel function looked more
; like the mirror image of the current GetPixel function.) The SetPal
; routine needed special attention, since it was artificially tied down to
; the data structure "Palette[bx]". By changing the palette initialization
; function and inlining the SetPal function into the inner loop, I was able
; to drop the redundant copy for the second half of the Palette table. I
; also changed the "Palette" offset declaration to a constant so that it
; could simultaneously initialize bl for whatever extraneous purpose. Only in
; the final phase of optimization did I see a way to take advantage of it!
;
; (3) With all the simple optimization tasks out of the way, I set about
; trying to understand how the algorithm works. After a while I realized
; that the algorithm was essentially a recursive stochastic mountain fractal
; that is color mapped rather than height mapped. By understanding this at a
; high level I was able to picture the whole algorithm in my head and it
; became instantly clear what the next big redundancy in the algorithm was:
; The plotting routines had 3 complicated termination conditions which were
; glutting up the code. I more fully characterized the essential "singular"
; recursive subdivision configuration as being a rectangle of height 1 and
; the termination condition being a rectangle of width 1 or 0. All under the
; proviso that no pixel is ever drawn twice, and "Adjust" calls are always
; made before "Plasma" recurse calls. Rather than pulling the logic of the
; duplicated pixels into the subdivision algorithm, I put it into the SetPixel
; routine itself, where I took advantage of GetPixel's address offset
; calculation as a side effect. The recursive structure changed somewhat, and
; I had to play with the stack to accomodate it. Although I only got modest
; gains from it (since it was really just a trade in complexity) it was by far
; the most complicated of all the changes. I actually implemented some
; debugging code to trace through so that I had everything clear in my mind.
;
; (4) After all these rearrangements other redundancies were uncovered that
; were hidden to me before. Many more references to the stack were actually
; redundant since nearly all the values were already in registers. Two more
; "gimmes" using the fact that all Y values are < 256 and thus the high byte
; is set to zero. At this point there were only two xor reg,reg commands.
; I found a way to change "Rand" yet again to more closely match the contexts
; of the callers, and save the xor edx,edx in the process. I took a second
; look at the stack frames and realized that I could get rid of all of them
; easily. After careful consideration, I've decided that without compromise,
; the add constant in the random number generator can be 1.
;
; (5) After a nights sleep and dreaming about 16 bit optimizations (a very
; twisted state to be in!) I found 4 more gimmes. (1) Set palette buffer
; address so that as a side effect after the last InitPal, bx=A000. (2) Use
; dl instead of al in SetPixel. (3) Remove ds override on self modifying
; code (boy was *that* ever stupid ...) (4) Set EBX to 95, via xor ebx,ebx
; then setting bl.
;
; (6) Re-implemented all recursive calls, as a massive return chain setup.
; That is, the parameters for all the calls are worked out in advance, so they
; along with the start addresses of the next call (in lieu of the actual return
; addresses) are stored on the stack in the correct call order. Doing this
; allows me to take advantage of call address redundancy, by sticking the call
; addresses in a register (bp) and just pushing them onto the stack. Code was
; then rearranged so that many jmps became unneccessary (they just fell
; through.) At this point, the code becomes extremely unmaintainable, as the
; recursive call structure is now completely hidden.
;
; (7) I implemented the call chain idea in the two other places where is
; seemed to work out. Put color accumulator into the GetPixel call itself.
; A few more gimme's (folding the init of ebx and pushing of zeros together.)
; At this point, finding new optimization areas is getting painful. Optimize
; the clamping to take sign of dx into account from previous sar, and unsigned
; byte comparison to check against 192 (251, is maximal possible value.)
;
; (8) Use only 16 bit bx as divisor (range is small enough) without
; compromising randomness, use CX = 0 side effect for zero rather than
; clearing ebx. Reduce pushad/popad to pusha/popa. Swap roles of si and bx
; so that I can use lea si,[bx+di] command!! Hey! In most cases cwd is the
; same as xor dx,dx!! Did a recheck on the (dx+ax*2)>>1 and realized it
; simplifies to (dx>>1)+ax since there can never be a bottom bit carry.
;
; (9) Fixed problem with esp. Win 3.0 does not clear the upper half of esp
; on DOS box initialization, which must do wonders for stability. Anyhow, I
; went back to using bp, and lost two bytes.
;
; (10) I went over the code again, but in its current state it has so far
; resisted further ideas from me. In this vein I hand it over to others
; (though lets see how I do in the competition first.) In the event that I
; win (and my approach is worthwhile analyzing further), I would like to
; offer the following suggestions for further optimizations that I suspect
; may lead to something, but I myself could not work out how:
;
; - The random number generator is horrendously bloated, given its use of 32
; bit registers. Using a simple LFSR may be potential alternative, but its
; not clear that it will both be "random" and shorter than the current
; implementation. Either way, something should be done about it; there's a
; 9 byte instruction in there! (T. Remmel disallowed use of the FPU other-
; wise I would have tried to maintain "RandNum" in the FPU stack.)
;
; - The initial palette table clear feels very redundant considering that the
; InitPal call also walks over the same structure. There ought to be a way
; reinvest less then 9 bytes into clearing that table somehow.
;
; - The self modification for the initial palette set up was an after thought
; (once I learned that not setting the palette on start up constituted lack
; of duplication.) So there might be room for rearchitecting in there
; somewhere to save some of the 6 bytes used in the self modification write
; instruction itself.
;
; - There may be a way to further reduce the partial stack frame used in
; "Adjust". BP is a completely free register, and Adjust does not use any
; more stack, except for the call to "Rand" and tail usage via SetPixel.
; Perhaps retargetting the stack via bp for the pusha/popa's then ripping
; the parameters off the stack with pop's can somehow be made to work.
;
;------------------------------------------------------------------------------
;
; I've had way too much fun with this code. I especially enjoyed it because
; it was not so terribly off from the very beginning as to be too much work
; from the very get go. It really exercised my abstract thinking and there
; were a lot of things that I could do to squeeze bytes here and there.
;
;------------------------------------------------------------------------------
.Model Tiny
.386
.Code
Org 100h
Prog Proc
mov ax,13h ;Set video mode 13h
int 10h
mov bx,9d00h
mov cx,576
mov di,bx ;Zero the palette
xor ax,ax
rep stosw
call InitPal ;First half of palette
mov bx,9d00h+0240h
call InitPal ;Second half of palette
call RotateStart
mov byte ptr [MungePoint],0b4h ; Re-patch opcode.
push 40h ;ES = 40h
pop es
mov eax,es:[6Ch] ;Seed RNG with current time.
mov RandNum,eax
mov bp,offset rand192
mov es,bx ;Side effect, BX = 0A000h, ES points to VGA
mov bx,319 ;Small redundancy savings.
push word ptr 199 ;Draw plasma
push bx
push cx ;Side effect CX = 0.
push cx
push offset Reconnect
push cx
push bp
push 63999
push bp
push 63680
push bp
push bx
rand192:
mov si,95 ;Get random number 1-191
call Rand
add ax,si
pop bx
mov es:[bx],al
ret
Reconnect:
call Plasma
RotateStart:
mov si,9d00h
Rotate: mov dx,03DAh ;Status port
VRTloop2: in al,dx ;Wait for next vertical blank interval
test al,8
jz VRTloop2
mov dl,0C8h ;Index port
mov al,1 ;Set register 1
out dx,al
inc dx ;Data port
mov cx,0240h
rep outsb
add si,3-576 ;Advance palette
cmp si,9d00h+0240h ;Check for wrap
je RotateStart ;Loop back...
MungePoint:
ret
db 01h ; Self-modded to "mov ah,1".
int 16h ; Check for keypress. (Dont wait.)
jz Rotate
Done: mov ax,3 ;Set text mode
int 10h
ret ;Return (exit)
Prog EndP
;**************************** InitPal -- Setup rainbow palette values.
InitPal Proc
mov ah,3fh
PalLoop: mov [bx],ah ;Set palette values
mov [bx+2],al
mov word ptr [bx+193],ax
mov word ptr [bx+384],ax
add bx,3 ;Advance pointers
add ax,0ff01h ;Loop back
jns PalLoop
ret
InitPal Endp
P_Done: push bp
ret ;Return, pop args
;**************************** Plasma -- Recursive plasma procedure
Plasma Proc
pop bp ; Save return address
pop si ; Pull values off stack.
pop ax
pop cx
pop dx
mov bx,si ;x = (x1 + x2) / 2
add bx,cx
sar bx,1
mov di,ax ;y = (y1 + y2) / 2
add di,dx
sar di,1
cmp bx,si ; Implies that |x2-x1|<=1
je P_Done ; Actual recursive termination condition.
push dx
push bx
push di
push si
push bp ;Actual return address.
mov bp,offset Plasma
push dx ;Plasma x, y, x2, y2
push cx
push di
push bx
push bp ;Ret-chain back to plasma.
push dx ;Adjust bottom side
push bx
push dx
push cx
push dx
push si
push bp ;Ret-chain back to plasma.
mov bp,offset Adjust
push ax ;Adjust top side
push bx
push ax
push cx
push ax
push si
push bp
cmp di,ax ; Implies that |y2-y1|<=1
je Adjust
push bp
mov bp,sp
mov word ptr [bp+8*2],offset P_Center
pop bp
push di ;Adjust right side
push cx
push dx
push cx
push ax
push cx
push bp
push di ;Adjust left side
push si
push dx
push si
push ax
push si
push bp
Plasma EndP
;**************************** Adjust -- Random adjust pixel midpoint
Adjust Proc
mov bp,sp
pusha ;Save all registers
cwd ;Sends DX->0 since 0<=ax<200
mov bx,[bp+2]
mov di,[bp+4]
call GetPixel
lea cx,[bx+di]
mov bx,[bp+6]
mov di,[bp+8]
call GetPixel
lea si,[bx+di]
sub si,cx ;SI = |x2 - x1| + |y2 - y1|
call Rand
sar dx,1 ;We know this is equivalent because a carry
add dx,ax ;can never happen on the last bit.
jg $+4
mov dl,1
cmp dl,192
jbe $+4
mov dl,192
mov bx,[bp+10] ;Get values
mov di,[bp+12]
call SetPixel
A_Done: popa ;Restore registers
ret 12 ;Return, pop args
Adjust EndP
;**************************** Additional Plasma fragments
P_Center:
mov bp,offset Plasma
push di ;Plasma x, y1, x2, y
push cx
push ax
push bx
push bp ;Ret-chain back to plasma.
push di ;Plasma x1, y1, x, y
push bx
push ax
push si
push bp ;Ret-chain back to plasma.
push bp
mov bp,offset GetPixel
push bx
push di
mov bx,si
mov di,ax
push di
push dx
cwd ;Same as xor dx,dx since 0<=ax<200
call bp
pop di
call bp
mov bx,cx
call bp
pop di
call bp
pop di
pop bx
shr dx,2 ;DX = average of corners
;**************************** SetPixel -- Write a pixel
SetPixel Proc
call GetPixel
test al,al ; Is pixel already plotted?
jnz SP_Skip ; If so just punt. (DX destroyed.)
mov es:[si+bx],dl ; SI and DL, side effects from GetPixel
SP_Skip: ret
SetPixel EndP
;**************************** GetPixel -- Read a pixel
GetPixel Proc
imul word ptr si,di,0140H
mov al,es:[si+bx]
add dx,ax
ret
GetPixel EndP
;**************************** Rand -- Random number generator
; Random number between in the range {-si, ..., 0, ..., si}.
Rand Proc
pusha ;Save registers
inc si ;SI = maximum
imul dword ptr eax,RandNum,19660Dh
inc ax ;EAX = RandNum * 19660Dh + 1h
mov RandNum,eax ;Save random number
mov ax,word ptr RandNum[2] ;Get top half. (retain randomness)
cwd ;DX:AX = number
idiv si ;Divide by max (note: si can never be 0)
mov bp,sp
mov [bp+0eh],dx ;Change pushed AX
popa ;Restore registers
ret ;Return
Rand EndP
;**************************** DATA section
RandNum dd ?
End Prog