3 ; This file contains both 32 and 64 bit versions of the functions.
     4 ; Copyright (c) Love Nystrom.
     6 TITLE 32/64 bit assembly routines.
    17 ;==============================================================================
    18 ;  Common x86/x64 - Constants and parameterless functions
    19 ;==============================================================================
    22 ; Common code and data...
    25 ;==============================================================================
    26 IF (_WIN64 eq 0) ; 32 bit version
    27 ;==============================================================================
    28 ECHO === 32-BIT ASSEMBLY (ML) ===
    31 ;   Argument-passing: Right to left. By value, unless pointer or reference.
    32 ;   Stack-maintenance: Called function pops the stack.
    33 ;   Name-decoration: An underscore (_) is prefixed to the name. The name is 
    34 ;   followed by the at sign (@) followed by the number of bytes (in decimal)
    35 ;   in the argument list. In other words, a function declared as 
    36 ;   int func( int a, double b ); is decorated as: _func@12
    42 ; EXTERN_C COLORREF GradientColor( COLORREF c1, COLORREF c2, WORD Ix, WORD Length )
    44 ; GradientColor produces a smooth color crossfade from c1 to c2.
    45 ; Length is the desired run length from c1 to c2, and Ix is the 0-based step nr.
    47 ;    BYTE r1 = GetRValue( c1 );
    48 ;    BYTE g1 = GetGValue( c1 );
    49 ;    BYTE b1 = GetBValue( c1 );
    50 ;    int dr = int(GetRValue( c2 )) - r1;
    51 ;    int dg = int(GetGValue( c2 )) - g1;
    52 ;    int db = int(GetBValue( c2 )) - b1;
    53 ;    int N = Length-1; // Divisor needs to comply with Ix 0..Length-1
    54 ;    int r = int(r1) + ((dr * int(Ix)) / N);
    55 ;    int g = int(g1) + ((dg * int(Ix)) / N);
    56 ;    int b = int(b1) + ((db * int(Ix)) / N);
    57 ;    return RGB( byte(r), byte(g), byte(b) );
    59 ; PONDER: This could probably get much faster by use of some SIMD ops.
    60 ; AGH, can't use SIMD since there's no packed div..
    62 _GradientColor PROC ;; Implementation 1, plain CPU, no MMX
    ; 32-bit COLORREF GradientColor( COLORREF c1, COLORREF c2, WORD Ix, WORD Length ).
    ; Arguments are read relative to EBP: [ebp+12] = c2, [ebp+16] = Ix, [ebp+20] = Length
    ; (presumably [ebp+8] = c1 behind a standard EBP frame -- TODO confirm).
    ; Each channel is computed with a 16-bit signed multiply/divide and stored
    ; bytewise into a dword Result that is assembled on the stack.
    66     sub     esp, 8              ; Reserve 8 bytes of locals: [esp] = N, [esp+4] = Result
    69     mov     ecx, [ebp+12]       ; ecx = c2
    70     ; int N = Length-1; // Divisor needs to comply with Ix 0..Length-1
    71     mov     eax, [ebp+20]       ; eax = Length
    73     mov     [esp], eax          ; [esp] = N = Length-1
    75     mov     [esp+4], eax        ; [esp+4] = Result (initial value taken from eax)
    78     ; BYTE r1 = GetRValue( c1 );
    79     ; int dr = int(GetRValue( c2 )) - r1;
    83     ; int r = int(r1) + ((dr * int(Ix)) / N);
    ; NOTE(review): a one-operand 16-bit idiv raises #DE when the divisor is 0 or the
    ; quotient does not fit in AX. N would be 0 for Length == 1 -- confirm callers
    ; never pass that, or guard it here.
    84     imul    word ptr [ebp+16]   ; DX:AX = AX * Ix (signed 16-bit multiply)
    85     idiv    word ptr [esp]      ; AX = DX:AX / N, DX = remainder (signed)
    87     mov     byte ptr [esp+4], al ; Result byte 0 = r
    94     imul    word ptr [ebp+16]   ; DX:AX = AX * Ix
    95     idiv    word ptr [esp]      ; AX = DX:AX / N
    97     mov     byte ptr [esp+5], al ; Result byte 1 = g
   104     imul    word ptr [ebp+16]   ; DX:AX = AX * Ix
   105     idiv    word ptr [esp]      ; AX = DX:AX / N
   107     mov     byte ptr [esp+6], al ; Result byte 2 = b
   109     mov     eax, [esp+4]        ; eax = assembled Result
   117 IF 0 ;; FIXME: Saturation problem in 'div bh' when result would be > 255
   118 ;; Meanwhile use the C++ version in GdiUtil.cpp
   120 PUBLIC _ScaleColorRef@12 ; COLORREF __stdcall ScaleColorRef( COLORREF rgb, BYTE mul, BYTE div );
   122 _ScaleColorRef@12 PROC ; Linetest OK
    ; COLORREF __stdcall ScaleColorRef( COLORREF rgb, BYTE mul, BYTE div ).
    ; Scales the color components of rgb by mul/div, one byte at a time, and
    ; accumulates the scaled bytes in edx.
    ; Arguments are read at [esp+12], [esp+16], [esp+20]; presumably two dwords
    ; are pushed before these loads -- TODO confirm against the prologue.
   123     ; Sadly, it can't be done in MMX since there's no packed div.
   127     mov     esi, [esp+12]   ; esi = rgb (source color, consumed bytewise)
   128     xor     edx, edx        ; edx = result accumulator, starts at 0
   129     mov     bl, [esp+16]    ; bl = mul
   130     mov     bh, [esp+20]    ; bh = div
   136     ; If mul+div <= 2, return color unmodified.
   137     ; E.g.: mul or div == 0, or mul and div both == 1.
   141     mov     ecx, 3          ; Loop count: three color components
   143     mov     eax, esi        ; Get current color component into AL
   144     or     al, al          ; Set ZF if the color component is zero
   145     jz     L2              ; Zero component: skip the multiply/divide
   146     mul     bl              ; AX <- AL * BL (unsigned 8-bit multiply)
   147     cmp        bh, 1           ; Test divisor
   148     jbe        L2              ; div <= 1 (unsigned), so don't divide
    ; NOTE: 'div bh' raises #DE when the quotient exceeds 255; this is the
    ; saturation problem named in the FIXME that disables this routine.
   149     div     bh              ; AL <- AX / BH, AH <- AX % BH
   151     mov     dl, al          ; Low byte of result = scaled color component
   152     shl     edx, 8          ; Shift accumulated components up one byte
   153     shr     esi, 8          ; Shift source down to the next color component
   157     shr        eax, 8          ; Undo the last (extra) component shift
   162 _ScaleColorRef@12 ENDP
   167 ;==============================================================================
   168 ELSE ; 64 bit version
   169 ;==============================================================================
   170 ECHO === 64-BIT ASSEMBLY (ML64) ===
   172 ; Extern C assembly routines do NOT get an added underscore with MSVC + ML64.
   173 ; Hence the x64 assembly routines must be named _exactly as the C prototypes_,
   174 ; or, in case of C++ class members, the mangled C++ identifiers.
   176 ; Fastcall is used regardless of prototype declaration!
   177 ; Arguments -> RCX, RDX, R8, R9, then stack.
   179 ; The four register args are backed by unused stack cells.
   180 ; Ergo, after std prologue the fifth argument is at [RBP+48].
   182 ; Normal fastcall stack cleanup convention (function pop args) is *not used*.
   183 ; Functions end with 'ret 0' even if they had stack args.
   185 ; RAX, RCX, RDX, R8, R9, R10, R11 are considered volatile.
   186 ; RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are nonvolatile
   187 ; and must be saved and restored by a function that uses them.
   189 ; frame$ = 10h ; Offset from rbp to first shadow arg after 'enter 0,0'
   190 ; x64 fastcall arguments:
   191 ;   rcx = arg1 (rbp+10h)
   192 ;   rdx = arg2 (rbp+18h)
   193 ;   r8  = arg3 (rbp+20h)
   194 ;   r9  = arg4 (rbp+28h)
   201 ; COLORREF GradientColor( COLORREF c1, COLORREF c2, WORD Ix, WORD Length )
   ; 64-bit GradientColor body: the Result dword is assembled bytewise in a
   ; stack cell at [rsp] and returned in eax. All stack accesses must use
   ; the full 64-bit rsp; a 32-bit base register truncates the address.
   209    mov     r10d, edx   ; r10d = c2 (second argument arrives in edx)
   210     sub     rsp, 8      ; Reserve 8 bytes; [rsp] = Result (needs bytewise access).
   212     dec     r9w         ; r9w = N = Length-1 (Length is the fourth argument, in r9)
   214     mov     [rsp], eax  ; [rsp] = Result (initial value taken from eax)
   219     sub     ax, dx      ; ax = dr = r2-r1
   223     mov     byte ptr [rsp], al ; Result byte 0 = R
   233     mov     byte ptr [rsp+1], al ; Result byte 1 = G
   243     mov     byte ptr [rsp+2], al ; Result byte 2 = B (rsp, not esp: 64-bit stack address)
   245     mov     eax, [rsp] ; eax = assembled Result
   248     ret     ; Caller balances the stack