Some code written a long time ago, far far away, to bump map one pixel of a 2d surface.

     
       ; Body of the per-pixel step (compare the onePixel macro in bump.nas below).
       ; %1 is the macro's byte-offset parameter. Register setup is done by
       ; doBumpASM: esi/edi = source/dest, eax = width*4, ebx = bump map,
       ; mm3 = 0, mm4 = 1 in its low word, mm7 = running spot position.
       ; NOTE(review): the two neighbour offsets here carry an extra +2 compared
       ; with the macro in bump.nas - confirm which variant is intended.
       movd mm1,[esi+eax+4+%1]          ; mm1 = 4-byte pixel at offset eax+4+%1 from esi
      movd edx,mm1                      ; edx = copy of that pixel; dl is its lowest byte
       sub dl,128                       ; bias the lowest byte by -128 (8-bit wraparound)
      mov dh,dl                         ; duplicate the biased byte into dh
      sub dh,[esi+%1+eax+4-1+2]         ; dh -= byte at offset eax+%1+5
      sub dl,[esi+%1+4+3+2]             ; dl -= byte at offset %1+9 (no eax term)
      movd mm2,edx                      ; mm2 low dword = dh:dl plus the untouched upper pixel bytes
      punpcklbw mm2,mm3                 ; interleave with mm3 (zero): each byte becomes a 16-bit word
      paddsw mm2,mm7                    ; add mm7 word-wise with signed saturation
      psrlw mm2,1                       ; logical shift: halve every word
      packuswb mm2,mm3                  ; pack words back to bytes with unsigned saturation
      movd edx,mm2                      ; edx = the four packed bytes
      and edx,0x0000FFFF                ; keep the two low bytes as a 16-bit table index
      movd mm2,[ebx+edx*4]              ; load the 4-byte table entry at ebx + index*4
      paddusb mm1,mm2                   ; add it to the original pixel, saturating per byte
      movd [edi+eax+4+%1],mm1           ; store the result at the same offset from edi
      paddw mm7,mm4                     ; advance mm7 by mm4 for the next pixel

Do not ask for an explanation; your puny humanoid brains cannot possibly fathom its subtle complexities.

OK OK, here's the stuff that puts it in context. Sorry about the tabs; the original source doesn't have the wonked-out tabs. MSG me if you'd like the entire source and VC workspace; it's very old code of mine and I don't have much use for it anymore.

-----------bump.cpp--------------------

#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <iostream.h>
#include <math.h>

// Lookup table built by bumpInit(): 256 x 256 entries of 4 bytes each,
// addressed as bumpMap[(x + y*256)*4 + channel]. It is handed to doBumpASM
// by doBump().
unsigned char* bumpMap;

// Allocates bumpMap and fills it with a highlight centred on the table.
//
// For every entry the column/row index is mapped from [0,255] to [-1,1],
// intensity = 1 - dx^2 - dy^2 is raised to the 15th power, scaled by 256 and
// clamped to 255. The same byte is written to all four channels of the entry.
//
// Each call allocates a fresh buffer with new[]; any previous buffer is not
// released here.
void bumpInit(void ) {
    const long kSide = 256;
    const long kChannels = 4;

    bumpMap = new unsigned char[kSide * kSide * kChannels];

    for (long i = 0; i < kSide; i++) {
        for (long j = 0; j < kSide; j++) {
            // Squared distance from the centre, subtracted from 1.
            long double intensity = 1 - pow((i / 127.5) - 1, 2)
                                      - pow((j / 127.5) - 1, 2);

            long double temp = 0;
            if (intensity > 0) {
                temp = pow(intensity, 15);
                temp = temp * 256;
            }
            if (temp > 255)
                temp = 255;

            // temp-128-128 lies in [-256,-1]. Converting a negative floating
            // value directly to unsigned char is undefined behaviour, so go
            // through a signed integer first: the value is truncated toward
            // zero and the integer-to-unsigned conversion then wraps modulo
            // 256, which is well defined.
            const unsigned char level =
                (unsigned char)(long)(temp - 128 - 128);

            unsigned char* texel = bumpMap + (i + j * kSide) * kChannels;
            for (long c = 0; c < kChannels; c++)
                texel[c] = level;
        }
    }
}

// Thin C++ front end for the assembly routine doBumpASM.
//
// Packs the low 16 bits of spotX1 (low half) and the low 16 bits of spotY1
// (high half) into one 32-bit value and forwards everything, together with
// the global bumpMap table, to doBumpASM.
//
// NOTE(review): the doBumpASM shown in bump.nas reads only source, dest,
// width, height, the packed spot value and bumpMap from its stack frame;
// spotY1, spotX2 and spotY2 are passed but not read there.
void doBump(int width, int height,
            unsigned char *source, unsigned char *dest,
            signed long spotX1, signed long spotY1,
            signed long spotX2, signed long spotY2) {
    // Open question from the original author: what to do with the final row
    // and final column? Not handled right now.

    const unsigned short spotLow = (unsigned short)spotX1;
    const unsigned short spotHigh = (unsigned short)spotY1;

    doBumpASM(source, dest, width, height,
              spotLow + (spotHigh << 16),
              spotY1, bumpMap, spotX2, spotY2);
}







-----------bump.nas--------------------
;-----------------------------------------------------------------------
; onePixel <byte offset>
;
; Lights one 4-byte pixel. %1 is a constant byte offset added to every
; address, so the macro can be unrolled (doBumpASM uses 0, 4, 8, 12).
;
; Expects (as set up by doBumpASM):
;   esi = source cursor      edi = destination cursor
;   eax = width * 4          ebx = bump map table
;   mm3 = 0                  mm4 = per-pixel increment for mm7
;   mm7 = running spot position (two 16-bit words in the low dword)
; Clobbers: edx, mm1, mm2, flags. Advances mm7 by mm4.
;-----------------------------------------------------------------------
%macro onePixel 1
        ; Fetch the pixel being lit: one copy in edx for the delta math,
        ; one in mm1 for the final saturating add.
        mov         edx, [esi + eax + %1 + 4]
        movd        mm1, edx

        ; Build two byte deltas from the pixel's lowest byte (biased by 128):
        ;   dh = biased byte - byte just before the pixel (offset eax+%1+3)
        ;   dl = biased byte - byte at offset %1+7, addressed without eax
        sub         dl, 128
        mov         dh, dl
        sub         dh, [esi + eax + %1 + 3]
        sub         dl, [esi + %1 + 7]
        ; TODO (original author): a "not dx" here was slated for eradication.

        ; Widen the bytes to words, offset them by the spot position,
        ; halve, and pack back down to bytes with unsigned saturation.
        movd        mm2, edx
        punpcklbw   mm2, mm3
        paddsw      mm2, mm7
        psrlw       mm2, 1
        packuswb    mm2, mm3

        ; The two low bytes form the table index; each entry is 4 bytes.
        movd        edx, mm2
        and         edx, 0xFFFF             ; TODO (original author): eliminate
        movd        mm2, [ebx + edx*4]

        ; Add the table entry to the original pixel and store it.
        paddusb     mm1, mm2
        movd        [edi + eax + %1 + 4], mm1

        ; Step the spot position for the next pixel.
        paddw       mm7, mm4
%endmacro


bits 32

; Argument offsets relative to ebp once the prologue has run. Seven registers
; (28 bytes) are pushed, so [ebp+24] is the saved ebp, [ebp+28] the return
; address, and the first stack argument sits at [ebp+32].
BUMP_ARG_SOURCE     equ 32
BUMP_ARG_DEST       equ 36
BUMP_ARG_WIDTH      equ 40
BUMP_ARG_HEIGHT     equ 44
BUMP_ARG_SPOT       equ 48              ; spotX in low word, spotY in high word
BUMP_ARG_BUMPMAP    equ 56

BUMP_UNROLL_BYTES   equ 16              ; four onePixel expansions per pass
BUMP_MIN_WIDTH      equ 8               ; width/4 - 1 must be at least 1
BUMP_MIN_HEIGHT     equ 2               ; height - 1 must be at least 1

; Executable code belongs in .text; a .data section is not executable on
; systems that enforce DEP/NX.
section .text

GLOBAL  _doBumpASM
GLOBAL  doBumpASM

;-----------------------------------------------------------------------
; doBumpASM - bump-map a whole image with MMX.
;
; Called from doBump (bump.cpp) as
;   doBumpASM(source, dest, width, height, packedSpot, spotY1,
;             bumpMap, spotX2, spotY2)
; Arguments are read from the stack; only source, dest, width, height,
; packedSpot and bumpMap are used.
; NOTE(review): assumes 32-bit cdecl with 4-byte arguments - confirm
; against the C++ declaration, which is not part of this file.
;
; Work done: (height - 1) passes over rows, each running (width/4 - 1)
; groups of four onePixel expansions. onePixel addresses its pixel at
; offset width*4 + 4 from the cursor.
; Does nothing if width < 8 or height < 2.
;
; Preserves: all general-purpose registers (saved and restored here).
; Clobbers:  MMX registers mm1-mm5 and mm7, flags; executes emms before
;            returning. The caller's packedSpot stack slot is rewritten
;            once per row.
;-----------------------------------------------------------------------
doBumpASM:
_doBumpASM:
        push    ebp
        push    eax
        push    ecx
        push    esi
        push    ebx
        push    edi
        push    edx
        mov     ebp, esp

        ; Reject sizes whose loop counters would start at zero or below:
        ; the loops test the counter only after decrementing it, so such a
        ; counter would wrap around and run far past the buffers.
        mov     eax, [ebp + BUMP_ARG_WIDTH]
        cmp     eax, BUMP_MIN_WIDTH
        jl      .done
        mov     ecx, [ebp + BUMP_ARG_HEIGHT]
        cmp     ecx, BUMP_MIN_HEIGHT
        jl      .done
        dec     ecx                             ; ecx = rows to process = height - 1

        shl     eax, 2                          ; eax = width * 4, used by onePixel and as row step
        mov     ebx, [ebp + BUMP_ARG_BUMPMAP]
        mov     esi, [ebp + BUMP_ARG_SOURCE]
        mov     edi, [ebp + BUMP_ARG_DEST]

        pxor    mm3, mm3                        ; zero operand for onePixel's unpack/pack
        mov     edx, 0x00000001
        movd    mm4, edx                        ; +1 on the low word per pixel
        mov     edx, 0x00010000
        movd    mm5, edx                        ; +1 on the high word per row

.outerLoop:
        push    ecx                             ; rows remaining
        push    esi                             ; row start (source)
        push    edi                             ; row start (dest)

        mov     ecx, [ebp + BUMP_ARG_WIDTH]
        shr     ecx, 2
        dec     ecx                             ; ecx = groups of four pixels = width/4 - 1

        ; Reload the spot position for the start of this row, bump its high
        ; word, and write it back so the next row continues from here.
        movd    mm7, [ebp + BUMP_ARG_SPOT]
        paddw   mm7, mm5
        movd    [ebp + BUMP_ARG_SPOT], mm7

.innerLoop:
        onePixel 0
        onePixel 4
        onePixel 8
        onePixel 12

        add     esi, BUMP_UNROLL_BYTES
        add     edi, BUMP_UNROLL_BYTES
        dec     ecx
        jnz     .innerLoop

        ; Rewind to the row start and step both cursors down one row.
        pop     edi
        pop     esi
        add     edi, eax
        add     esi, eax
        pop     ecx
        dec     ecx
        jnz     .outerLoop

.done:
        pop     edx
        pop     edi
        pop     ebx
        pop     esi
        pop     ecx
        pop     eax
        pop     ebp
        emms                                    ; leave MMX state so x87 code can run again
        ret
align 16