diff --git a/CMakeLists.txt b/CMakeLists.txt
index e37ed34..3310e97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,7 +75,13 @@ else()
     message(STATUS "Unsupported platform")
 endif()
 
-target_include_directories(${PROJECT_NAME} PRIVATE "vendor")
+target_include_directories(${PROJECT_NAME} PRIVATE vendor)
+target_sources(${PROJECT_NAME} PRIVATE "vendor/SDL3_gfx/SDL3_framerate.c")
+target_sources(${PROJECT_NAME} PRIVATE "vendor/SDL3_gfx/SDL3_gfxPrimitives.c")
+target_sources(${PROJECT_NAME} PRIVATE "vendor/SDL3_gfx/SDL3_imageFilter.c")
+target_sources(${PROJECT_NAME} PRIVATE "vendor/SDL3_gfx/SDL3_rotozoom.c")
+target_sources(${PROJECT_NAME} PRIVATE "vendor/clay/clay_renderer_SDL3.c")
+
 target_link_libraries(${PROJECT_NAME} PRIVATE
 	SDL3::SDL3-static
 	SDL3_ttf::SDL3_ttf-static
diff --git a/main.c b/main.c
index b39fbcb..9a0b3eb 100644
--- a/main.c
+++ b/main.c
@@ -6,7 +6,7 @@
 
 #define CLAY_IMPLEMENTATION
 #include <clay/clay.h>
-#include <clay/clay_renderer_SDL3.c>
+#include <clay/clay_renderer_SDL3.h>
 
 #include "ui/clay_video_demo.c"
 
diff --git a/vendor/SDL3_gfx/SDL3_framerate.c b/vendor/SDL3_gfx/SDL3_framerate.c
new file mode 100644
index 0000000..5bff7e2
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_framerate.c
@@ -0,0 +1,189 @@
+/*
+
+SDL3_framerate.c: framerate manager
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include "SDL3_framerate.h"
+
+/*!
+\brief Internal wrapper to SDL_GetTicks that ensures a non-zero return value.
+
+\return The tick count.
+*/
+Uint64 _getTicks()
+{
+	/*
+	* A baseticks value of 0 marks a manager as uninitialized (see
+	* SDL_framerateDelay), so this wrapper must never hand out 0.
+	* SDL_GetTicks can still report 0 when it is called before the
+	* counter has advanced; that single case is mapped to 1.
+	*/
+	const Uint64 now = SDL_GetTicks();
+
+	if (now != 0) {
+		return now;
+	}
+
+	return 1;
+}
+
+/*!
+\brief Initialize the framerate manager.
+
+Initialize the framerate manager, set default framerate of 30Hz and
+reset delay interpolation.
+
+\param manager Pointer to the framerate manager.
+*/
+void SDL_initFramerate(FPSmanager * manager)
+{
+	/* NULL is tolerated here just as in the getters and SDL_framerateDelay. */
+	if (manager == NULL) {
+		return;
+	}
+	manager->framecount = 0;
+	manager->rate = FPS_DEFAULT;
+	manager->rateticks = (1000.0f / (float) FPS_DEFAULT);
+	manager->baseticks = _getTicks();
+	manager->lastticks = manager->baseticks;
+}
+
+/*!
+\brief Set the framerate in Hz 
+
+Sets a new framerate for the manager and reset delay interpolation.
+Rate values must be between FPS_LOWER_LIMIT and FPS_UPPER_LIMIT inclusive to be accepted.
+
+\param manager Pointer to the framerate manager.
+\param rate The new framerate in Hz (frames per second).
+
+\return 0 for success and -1 for error.
+*/
+int SDL_setFramerate(FPSmanager * manager, Uint32 rate)
+{
+	/* A missing manager or a rate outside the supported range is an error. */
+	if ((manager == NULL) || (rate < FPS_LOWER_LIMIT) || (rate > FPS_UPPER_LIMIT)) {
+		return -1;
+	}
+	manager->framecount = 0;
+	manager->rate = rate;
+	manager->rateticks = (1000.0f / (float) rate);
+	return 0;
+}
+
+/*!
+\brief Return the current target framerate in Hz 
+
+Get the currently set framerate of the manager.
+
+\param manager Pointer to the framerate manager.
+
+\return Current framerate in Hz or -1 for error.
+*/
+int SDL_getFramerate(FPSmanager * manager)
+{
+	/* Without a manager there is no rate to report. */
+	if (!manager) {
+		return -1;
+	}
+	return (int) manager->rate;
+}
+
+/*!
+\brief Return the current framecount.
+
+Get the current framecount from the framerate manager. 
+A frame is counted each time SDL_framerateDelay is called.
+
+\param manager Pointer to the framerate manager.
+
+\return Current frame count or -1 for error.
+*/
+int SDL_getFramecount(FPSmanager * manager)
+{
+	/* Without a manager there is no frame counter to report. */
+	if (!manager) {
+		return -1;
+	}
+	return (int) manager->framecount;
+}
+
+/*!
+\brief Delay execution to maintain a constant framerate and calculate fps.
+
+Generate a delay to accommodate the currently set framerate. Call once in the
+graphics/rendering loop. If the computer cannot keep up with the rate (i.e.
+drawing too slow), the delay is zero and the delay interpolation is reset.
+
+\param manager Pointer to the framerate manager.
+
+\return The time that passed since the last call to the function in ms. May return 0.
+*/
+Uint64 SDL_framerateDelay(FPSmanager * manager)
+{
+	Uint64 current_ticks;
+	Uint64 target_ticks;
+	Uint64 the_delay;
+	Uint64 time_passed = 0;
+
+	/*
+	* Without a manager there is nothing to pace: report 0 and do not sleep.
+	*/
+	if (manager == NULL) {
+		return 0;
+	}
+
+	/*
+	* baseticks == 0 marks a manager that was never initialized; set it up now.
+	*/
+	if (manager->baseticks == 0) {
+		SDL_initFramerate(manager);
+	}
+
+	/*
+	* Count this call as one more frame of the current schedule.
+	*/
+	manager->framecount++;
+
+	/*
+	* Measure time since the previous call, then compute when this frame is due.
+	*/
+	current_ticks = _getTicks();
+	time_passed = current_ticks - manager->lastticks;
+	manager->lastticks = current_ticks;
+	target_ticks = manager->baseticks + (Uint64) ((float) manager->framecount * manager->rateticks);
+	/* On time or early: sleep until the due tick. Late: restart the schedule. */
+	if (current_ticks <= target_ticks) {
+		the_delay = target_ticks - current_ticks;
+		SDL_Delay(the_delay);
+	} else {
+		manager->framecount = 0;
+		manager->baseticks = _getTicks();
+	}
+
+	return time_passed;
+}
diff --git a/vendor/SDL3_gfx/SDL3_framerate.h b/vendor/SDL3_gfx/SDL3_framerate.h
new file mode 100644
index 0000000..43bf8fc
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_framerate.h
@@ -0,0 +1,100 @@
+/*
+
+SDL3_framerate.h: framerate manager
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL3_framerate_h
+#define _SDL3_framerate_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* --- */
+
+#include <SDL3/SDL.h>
+
+	/* --------- Definitions */
+
+	/*!
+	\brief Highest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_UPPER_LIMIT		200
+
+	/*!
+	\brief Lowest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_LOWER_LIMIT		1
+
+	/*!
+	\brief Default rate of framerate controller in Hz (1/s).
+	*/
+#define FPS_DEFAULT		30
+
+	/*! 
+	\brief Structure holding the state and timing information of the framerate controller. 
+	*/
+	typedef struct {
+		Uint32 framecount;	/* frames counted since the counter was last reset to 0 */
+		float rateticks;	/* ticks per frame, computed as 1000 / rate */
+		Uint64 baseticks;	/* tick count the delay schedule is measured from; 0 = uninitialized */
+		Uint64 lastticks;	/* tick count stored by SDL_initFramerate or the latest SDL_framerateDelay */
+		Uint32 rate;		/* target framerate in Hz */
+	} FPSmanager;
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL3_GFX_DLL_IMPORT)
+#    define SDL3_FRAMERATE_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL3_GFX_DLL_IMPORT
+#      define SDL3_FRAMERATE_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL3_FRAMERATE_SCOPE
+#  define SDL3_FRAMERATE_SCOPE extern
+#endif
+
+	/* Functions return 0 or value for success and -1 for error */
+
+	SDL3_FRAMERATE_SCOPE void SDL_initFramerate(FPSmanager * manager);
+	SDL3_FRAMERATE_SCOPE int SDL_setFramerate(FPSmanager * manager, Uint32 rate);
+	SDL3_FRAMERATE_SCOPE int SDL_getFramerate(FPSmanager * manager);
+	SDL3_FRAMERATE_SCOPE int SDL_getFramecount(FPSmanager * manager);
+	SDL3_FRAMERATE_SCOPE Uint64 SDL_framerateDelay(FPSmanager * manager);
+
+	/* --- */
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL3_framerate_h */
diff --git a/vendor/SDL3_gfx/SDL3_gfxPrimitives.c b/vendor/SDL3_gfx/SDL3_gfxPrimitives.c
new file mode 100644
index 0000000..c67d806
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_gfxPrimitives.c
@@ -0,0 +1,3784 @@
+/* 
+
+SDL3_gfxPrimitives.c: graphics primitives for SDL3 renderers
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "SDL3_gfxPrimitives.h"
+#include "SDL3_rotozoom.h"
+#include "SDL3_gfxPrimitives_font.h"
+
+/* ---- Pixel */
+
+/*!
+\brief Draw pixel  in currently set color.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+
+\returns Returns true on success, false on failure.
+*/
+bool pixel(SDL_Renderer *renderer, Sint16 x, Sint16 y)
+{
+	return SDL_RenderPoint(renderer, (float) x, (float) y);
+}
+
+/*!
+\brief Draw pixel with blending enabled if a<255.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
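+\param color The color value of the pixel to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).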
+
+\returns Returns true on success, false on failure.
+*/
+bool pixelColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return pixelRGBA(renderer, x, y, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw pixel with blending enabled if a<255.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param r The red color value of the pixel to draw. 
+\param g The green color value of the pixel to draw.
+\param b The blue color value of the pixel to draw.
+\param a The alpha value of the pixel to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool pixelRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Opaque pixels bypass blending; all three calls run regardless of earlier failures. */
+	const bool blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	const bool color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	const bool point_ok = SDL_RenderPoint(renderer, x, y);
+	return blend_ok && color_ok && point_ok;
+}
+
+/*!
+\brief Draw pixel with blending enabled and using alpha weight on color.
+
+\param renderer The renderer to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param r The red color value of the pixel to draw. 
+\param g The green color value of the pixel to draw.
+\param b The blue color value of the pixel to draw.
+\param a The alpha value of the pixel to draw.
+\param weight The weight multiplied into the alpha value of the pixel.
+
+\returns Returns true on success, false on failure.
+*/
+bool pixelRGBAWeight(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a, Uint32 weight)
+{
+	/*
+	* Scale alpha by weight/256 in 32-bit unsigned arithmetic and
+	* saturate the result at full opacity.
+	*/
+	const Uint32 scaled = ((Uint32) a * weight) >> 8;
+	Uint8 weighted = 255;
+
+	if (scaled <= 255) {
+		weighted = (Uint8) scaled;
+	}
+
+	return pixelRGBA(renderer, x, y, r, g, b, weighted);
+}
+
+/* ---- Hline */
+
+/*!
+\brief Draw horizontal line in currently set color
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+
+\returns Returns true on success, false on failure.
+*/
+bool hline(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y)
+{
+	return SDL_RenderLine(renderer, (float) x1, (float) y, (float) x2, (float) y);
+}
+
+
+/*!
+\brief Draw horizontal line with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
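+\param color The color value of the line to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).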
+
+\returns Returns true on success, false on failure.
+*/
+bool hlineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return hlineRGBA(renderer, x1, x2, y, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw horizontal line with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw. 
+
+\returns Returns true on success, false on failure.
+*/
+bool hlineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Opaque lines bypass blending; all three calls run regardless of earlier failures. */
+	const bool blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	const bool color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	const bool line_ok = SDL_RenderLine(renderer, x1, y, x2, y);
+	return blend_ok && color_ok && line_ok;
+}
+
+/* ---- Vline */
+
+/*!
+\brief Draw vertical line in currently set color
+
+\param renderer The renderer to draw on.
+\param x X coordinate of points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+
+\returns Returns true on success, false on failure.
+*/
+bool vline(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2)
+{
+	return SDL_RenderLine(renderer, (float) x, (float) y1, (float) x, (float) y2);
+}
+
+/*!
+\brief Draw vertical line with blending.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
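+\param color The color value of the line to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).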
+
+\returns Returns true on success, false on failure.
+*/
+bool vlineColor(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return vlineRGBA(renderer, x, y1, y2, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw vertical line with blending.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw. 
+
+\returns Returns true on success, false on failure.
+*/
+bool vlineRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Opaque lines bypass blending; all three calls run regardless of earlier failures. */
+	const bool blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	const bool color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	const bool line_ok = SDL_RenderLine(renderer, x, y1, x, y2);
+	return blend_ok && color_ok && line_ok;
+}
+
+/* ---- Rectangle */
+
+/*!
+\brief Draw rectangle with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the rectangle.
+\param y1 Y coordinate of the first point (i.e. top right) of the rectangle.
+\param x2 X coordinate of the second point (i.e. bottom left) of the rectangle.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the rectangle.
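+\param color The color value of the rectangle to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).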
+
+\returns Returns true on success, false on failure.
+*/
+bool rectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return rectangleRGBA(renderer, x1, y1, x2, y2, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw rectangle with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the rectangle.
+\param y1 Y coordinate of the first point (i.e. top right) of the rectangle.
+\param x2 X coordinate of the second point (i.e. bottom left) of the rectangle.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the rectangle.
+\param r The red value of the rectangle to draw. 
+\param g The green value of the rectangle to draw. 
+\param b The blue value of the rectangle to draw. 
+\param a The alpha value of the rectangle to draw. 
+
+\returns Returns true on success, false on failure.
+*/
+bool rectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	SDL_FRect outline;
+	Sint16 left, right, top, bottom;
+	bool blend_ok, color_ok, draw_ok;
+
+	/*
+	* Degenerate rectangles are delegated: a single point when both
+	* corners coincide, otherwise a vertical or horizontal line.
+	*/
+	if ((x1 == x2) && (y1 == y2)) {
+		return pixelRGBA(renderer, x1, y1, r, g, b, a);
+	}
+	if (x1 == x2) {
+		return vlineRGBA(renderer, x1, y1, y2, r, g, b, a);
+	}
+	if (y1 == y2) {
+		return hlineRGBA(renderer, x1, x2, y1, r, g, b, a);
+	}
+
+	/*
+	* Order the x coordinates
+	*/
+	if (x1 < x2) {
+		left = x1;
+		right = x2;
+	} else {
+		left = x2;
+		right = x1;
+	}
+
+	/*
+	* Order the y coordinates
+	*/
+	if (y1 < y2) {
+		top = y1;
+		bottom = y2;
+	} else {
+		top = y2;
+		bottom = y1;
+	}
+
+	/*
+	* Anchor the outline at the smaller corner; size is the plain difference
+	*/
+	outline.x = left;
+	outline.y = top;
+	outline.w = right - left;
+	outline.h = bottom - top;
+
+	/* Opaque outlines bypass blending; all three calls run regardless of earlier failures. */
+	blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	draw_ok = SDL_RenderRect(renderer, &outline);
+	return blend_ok && color_ok && draw_ok;
+}
+
+/* ---- Rounded Rectangle */
+
+/*!
+\brief Draw rounded-corner rectangle with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the rectangle.
+\param y1 Y coordinate of the first point (i.e. top right) of the rectangle.
+\param x2 X coordinate of the second point (i.e. bottom left) of the rectangle.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the rectangle.
+\param rad The radius of the corner arc.
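+\param color The color value of the rectangle to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).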
+
+\returns Returns true on success, false on failure.
+*/
+bool roundedRectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return roundedRectangleRGBA(renderer, x1, y1, x2, y2, rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw rounded-corner rectangle with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the rectangle.
+\param y1 Y coordinate of the first point (i.e. top right) of the rectangle.
+\param x2 X coordinate of the second point (i.e. bottom left) of the rectangle.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the rectangle.
+\param rad The radius of the corner arc.
+\param r The red value of the rectangle to draw. 
+\param g The green value of the rectangle to draw. 
+\param b The blue value of the rectangle to draw. 
+\param a The alpha value of the rectangle to draw. 
+
+\returns Returns true on success, false on failure.
+*/
+bool roundedRectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result = true;	/* AND-accumulated below, so it must start out true to ever report success */
+	Sint16 tmp;
+	Sint16 w, h;
+	Sint16 xx1, xx2;
+	Sint16 yy1, yy2;
+	
+	/*
+	* Check renderer
+	*/
+	if (renderer == NULL)
+	{
+		return false;
+	}
+
+	/*
+	* Check radius for valid range
+	*/
+	if (rad < 0) {
+		return false;
+	}
+
+	/*
+	* Special case - no rounding
+	*/
+	if (rad <= 1) {
+		return rectangleRGBA(renderer, x1, y1, x2, y2, r, g, b, a);
+	}
+
+	/*
+	* Test for special cases of straight lines or single point 
+	*/
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		}
+	} else {
+		if (y1 == y2) {
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		}
+	}
+
+	/*
+	* Swap x1, x2 if required 
+	*/
+	if (x1 > x2) {
+		tmp = x1;
+		x1 = x2;
+		x2 = tmp;
+	}
+
+	/*
+	* Swap y1, y2 if required 
+	*/
+	if (y1 > y2) {
+		tmp = y1;
+		y1 = y2;
+		y2 = tmp;
+	}
+
+	/*
+	* Calculate width&height 
+	*/
+	w = x2 - x1;
+	h = y2 - y1;
+
+	/*
+	* Clamp the radius so two corner arcs fit along each side
+	*/
+	if ((rad * 2) > w)  
+	{
+		rad = w / 2;
+	}
+	if ((rad * 2) > h)
+	{
+		rad = h / 2;
+	}
+
+	/*
+	* Draw corners: one quarter arc around each inset corner center
+	*/
+	xx1 = x1 + rad;
+	xx2 = x2 - rad;
+	yy1 = y1 + rad;
+	yy2 = y2 - rad;
+	result &= arcRGBA(renderer, xx1, yy1, rad, 180, 270, r, g, b, a);
+	result &= arcRGBA(renderer, xx2, yy1, rad, 270, 360, r, g, b, a);
+	result &= arcRGBA(renderer, xx1, yy2, rad,  90, 180, r, g, b, a);
+	result &= arcRGBA(renderer, xx2, yy2, rad,   0,  90, r, g, b, a);
+
+	/*
+	* Draw the straight edges between the arcs
+	*/
+	if (xx1 <= xx2) {
+		result &= hlineRGBA(renderer, xx1, xx2, y1, r, g, b, a);
+		result &= hlineRGBA(renderer, xx1, xx2, y2, r, g, b, a);
+	}
+	if (yy1 <= yy2) {
+		result &= vlineRGBA(renderer, x1, yy1, yy2, r, g, b, a);
+		result &= vlineRGBA(renderer, x2, yy1, yy2, r, g, b, a);
+	}
+
+	return result;
+}
+
+/* ---- Rounded Box */
+
+/*!
+\brief Draw rounded-corner box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the box.
+\param y1 Y coordinate of the first point (i.e. top right) of the box.
+\param x2 X coordinate of the second point (i.e. bottom left) of the box.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the box.
+\param rad The radius of the corner arcs of the box.
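+\param color The color value of the box to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).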
+
+\returns Returns true on success, false on failure.
+*/
+bool roundedBoxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return roundedBoxRGBA(renderer, x1, y1, x2, y2, rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw rounded-corner box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the box.
+\param y1 Y coordinate of the first point (i.e. top right) of the box.
+\param x2 X coordinate of the second point (i.e. bottom left) of the box.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the box.
+\param rad The radius of the corner arcs of the box.
+\param r The red value of the box to draw. 
+\param g The green value of the box to draw. 
+\param b The blue value of the box to draw. 
+\param a The alpha value of the box to draw. 
+
+\returns Returns true on success, false on failure.
+*/
+bool roundedBoxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+	Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result;
+	Sint16 w, h, r2, tmp;
+	Sint16 cx = 0;
+	Sint16 cy;
+	Sint16 ocx = (Sint16) 0xffff;
+	Sint16 ocy = (Sint16) 0xffff;
+	Sint16 df;
+	Sint16 d_e = 3;
+	Sint16 d_se;
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Sint16 x, y, dx, dy;
+
+	/* 
+	* Check destination renderer 
+	*/
+	if (renderer == NULL)
+	{
+		return false;
+	}
+
+	/*
+	* Check radius for valid range
+	*/
+	if (rad < 0) {
+		return false;
+	}
+
+	/*
+	* Special case - no rounding
+	*/
+	if (rad <= 1) {
+		return boxRGBA(renderer, x1, y1, x2, y2, r, g, b, a);
+	}
+
+	/*
+	* Test for special cases of straight lines or single point 
+	*/
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		}
+	} else {
+		if (y1 == y2) {
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		}
+	}
+
+	/*
+	* Swap x1, x2 if required 
+	*/
+	if (x1 > x2) {
+		tmp = x1;
+		x1 = x2;
+		x2 = tmp;
+	}
+
+	/*
+	* Swap y1, y2 if required 
+	*/
+	if (y1 > y2) {
+		tmp = y1;
+		y1 = y2;
+		y2 = tmp;
+	}
+
+	/*
+	* Calculate width&height 
+	*/
+	w = x2 - x1 + 1;
+	h = y2 - y1 + 1;
+
+	/*
+	* Clamp the radius so two corner circles fit along each side
+	*/
+	r2 = rad + rad;
+	if (r2 > w)  
+	{
+		rad = w / 2;
+		r2 = rad + rad;
+	}
+	if (r2 > h)
+	{
+		rad = h / 2;
+	}
+
+	/* Setup filled circle drawing for corners; the stepping state must use the clamped radius */
+	cy = rad;
+	df = 1 - rad;
+	d_se = -2 * rad + 5;
+	x = x1 + rad;
+	y = y1 + rad;
+	dx = x2 - x1 - rad - rad;
+	dy = y2 - y1 - rad - rad;
+	/* Set color once; the hline() calls below draw with this renderer state */
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw corners as horizontal spans, stretched by dx/dy to cover both sides
+	*/
+	do {
+		xpcx = x + cx;
+		xmcx = x - cx;
+		xpcy = x + cy;
+		xmcy = x - cy;
+		if (ocy != cy) {
+			if (cy > 0) {
+				ypcy = y + cy;
+				ymcy = y - cy;
+				result &= hline(renderer, xmcx, xpcx + dx, ypcy + dy);
+				result &= hline(renderer, xmcx, xpcx + dx, ymcy);
+			} else {
+				result &= hline(renderer, xmcx, xpcx + dx, y);
+			}
+			ocy = cy;
+		}
+		if (ocx != cx) {
+			if (cx != cy) {
+				if (cx > 0) {
+					ypcx = y + cx;
+					ymcx = y - cx;
+					result &= hline(renderer, xmcy, xpcy + dx, ymcx);
+					result &= hline(renderer, xmcy, xpcy + dx, ypcx + dy);
+				} else {
+					result &= hline(renderer, xmcy, xpcy + dx, y);
+				}
+			}
+			ocx = cx;
+		}
+
+		/*
+		* Update 
+		*/
+		if (df < 0) {
+			df += d_e;
+			d_e += 2;
+			d_se += 2;
+		} else {
+			df += d_se;
+			d_e += 2;
+			d_se += 4;
+			cy--;
+		}
+		cx++;
+	} while (cx <= cy);
+
+	/* Inside: rows y+1 .. y+dy are drawn by no corner span, so fill them whenever dy > 0 (even if dx <= 0) */
+	if (dy > 0) {
+		result &= boxRGBA(renderer, x1, y1 + rad + 1, x2, y2 - rad, r, g, b, a);
+	}
+
+	return (result);
+}
+
+/* ---- Box */
+
+/*!
+\brief Draw box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the box.
+\param y1 Y coordinate of the first point (i.e. top right) of the box.
+\param x2 X coordinate of the second point (i.e. bottom left) of the box.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the box.
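+\param color The color value of the box to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).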
+
+\returns Returns true on success, false on failure.
+*/
+bool boxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return boxRGBA(renderer, x1, y1, x2, y2, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. top right) of the box.
+\param y1 Y coordinate of the first point (i.e. top right) of the box.
+\param x2 X coordinate of the second point (i.e. bottom left) of the box.
+\param y2 Y coordinate of the second point (i.e. bottom left) of the box.
+\param r The red value of the box to draw. 
+\param g The green value of the box to draw. 
+\param b The blue value of the box to draw. 
+\param a The alpha value of the box to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool boxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	SDL_FRect area;
+	Sint16 left, right, top, bottom;
+	bool blend_ok, color_ok, fill_ok;
+
+	/*
+	* Degenerate boxes are delegated: a single point when both
+	* corners coincide, otherwise a vertical or horizontal line.
+	*/
+	if ((x1 == x2) && (y1 == y2)) {
+		return pixelRGBA(renderer, x1, y1, r, g, b, a);
+	}
+	if (x1 == x2) {
+		return vlineRGBA(renderer, x1, y1, y2, r, g, b, a);
+	}
+	if (y1 == y2) {
+		return hlineRGBA(renderer, x1, x2, y1, r, g, b, a);
+	}
+
+	/*
+	* Order the x coordinates
+	*/
+	if (x1 < x2) {
+		left = x1;
+		right = x2;
+	} else {
+		left = x2;
+		right = x1;
+	}
+
+	/*
+	* Order the y coordinates
+	*/
+	if (y1 < y2) {
+		top = y1;
+		bottom = y2;
+	} else {
+		top = y2;
+		bottom = y1;
+	}
+
+	/*
+	* Anchor the area at the smaller corner; both end coordinates are inclusive
+	*/
+	area.x = left;
+	area.y = top;
+	area.w = right - left + 1;
+	area.h = bottom - top + 1;
+
+	/* Opaque boxes bypass blending; all three calls run regardless of earlier failures. */
+	blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	fill_ok = SDL_RenderFillRect(renderer, &area);
+	return blend_ok && color_ok && fill_ok;
+}
+
+/* ----- Line */
+
+/*!
+\brief Draw line with alpha blending using the currently set color.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+
+\returns Returns true on success, false on failure.
+*/
+bool line(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2)
+{
+	/*
+	* No color or blend mode is set here; only the line itself is issued.
+	*/
+	return SDL_RenderLine(renderer, (float) x1, (float) y1, (float) x2, (float) y2);
+}
+
+/*!
+\brief Draw line with alpha blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
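+\param y2 Y coordinate of the second point of the line.
+\param color The color value of the line to draw; its bytes are read in memory order as R, G, B, A (0xRRGGBBAA only on big-endian hosts, 0xAABBGGRR on little-endian).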
+
+\returns Returns true on success, false on failure.
+*/
+bool lineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *) &color;	/* channels taken in memory order */
+	return lineRGBA(renderer, x1, y1, x2, y2, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw line with alpha blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool lineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Opaque lines bypass blending; all three calls run regardless of earlier failures.
+	*/
+	const bool blend_ok = SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	const bool color_ok = SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	const bool line_ok = SDL_RenderLine(renderer, x1, y1, x2, y2);
+
+	return blend_ok && color_ok && line_ok;
+}
+
+/* ---- AA Line */
+
+#define AAlevels 256
+#define AAbits 8
+
+/*!
+\brief Internal function to draw anti-aliased line with alpha blending and endpoint control.
+
+This implementation of the Wu antialiasing code is based on Mike Abrash's
+DDJ article which was reprinted as Chapter 42 of his Graphics Programming
+Black Book, but has been optimized to work with SDL and utilizes 32-bit
+fixed-point arithmetic by A. Schiffler. The endpoint control allows
+suppressing the last pixel, which is useful for rendering continuous aa-lines
+with alpha<255.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param r The red value of the aa-line to draw.
+\param g The green value of the aa-line to draw.
+\param b The blue value of the aa-line to draw.
+\param a The alpha value of the aa-line to draw.
+\param draw_endpoint Flag indicating if the endpoint should be drawn; draw if non-zero.
+
+\returns Returns true (non-zero) on success, false (zero) on failure; the return type is int.
+*/
+int _aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a, int draw_endpoint)
+{
+	Sint32 xx0, yy0, xx1, yy1;
+	bool result;
+	Uint32 intshift, erracc, erradj;
+	Uint32 erracctmp, wgt, wgtcompmask;
+	int dx, dy, tmp, xdir, y0p1, x0pxdir;
+
+	/*
+	* Keep on working with 32bit numbers
+	*/
+	xx0 = x1;
+	yy0 = y1;
+	xx1 = x2;
+	yy1 = y2;
+
+	/*
+	* Reorder points to make dy positive
+	*/
+	if (yy0 > yy1) {
+		tmp = yy0;
+		yy0 = yy1;
+		yy1 = tmp;
+		tmp = xx0;
+		xx0 = xx1;
+		xx1 = tmp;
+	}
+
+	/*
+	* Calculate distance
+	*/
+	dx = xx1 - xx0;
+	dy = yy1 - yy0;
+
+	/*
+	* Adjust for negative dx and set xdir
+	*/
+	if (dx >= 0) {
+		xdir = 1;
+	} else {
+		xdir = -1;
+		dx = (-dx);
+	}
+
+	/*
+	* Check for special cases
+	*/
+	if (dx == 0) {
+		/*
+		* Vertical line
+		*/
+		if (draw_endpoint)
+		{
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		} else {
+			if (dy > 0) {
+				return (vlineRGBA(renderer, x1, yy0, yy0+dy, r, g, b, a));
+			} else {
+				return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+			}
+		}
+	} else if (dy == 0) {
+		/*
+		* Horizontal line
+		*/
+		if (draw_endpoint)
+		{
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		} else {
+			if (dx > 0) {
+				return (hlineRGBA(renderer, xx0, xx0+(xdir*dx), y1, r, g, b, a));
+			} else {
+				return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+			}
+		}
+	} else if ((dx == dy) && (draw_endpoint)) {
+		/*
+		* Diagonal line (with endpoint)
+		*/
+		return (lineRGBA(renderer, x1, y1, x2, y2,  r, g, b, a));
+	}
+
+
+	/*
+	* Line is not horizontal, vertical or diagonal (with endpoint)
+	*/
+	result = true;
+
+	/*
+	* Zero accumulator
+	*/
+	erracc = 0;
+
+	/*
+	* # of bits by which to shift erracc to get intensity level
+	*/
+	intshift = 32 - AAbits;
+
+	/*
+	* Mask used to flip all bits in an intensity weighting
+	*/
+	wgtcompmask = AAlevels - 1;
+
+	/*
+	* Draw the initial pixel in the foreground color
+	*/
+	result &= pixelRGBA(renderer, x1, y1, r, g, b, a);
+
+	/*
+	* x-major or y-major?
+	*/
+	if (dy > dx) {
+
+		/*
+		* y-major.  Calculate 16-bit fixed point fractional part of a pixel that
+		* X advances every time Y advances 1 pixel, truncating the result so that
+		* we won't overrun the endpoint along the X axis
+		*/
+		/*
+		* Unsigned math: dx can reach 65535, so a signed (dx << 16) would overflow. 64-bit form: ((Uint64)dx << 32) / (Uint64)dy
+		*/
+		erradj = (((Uint32)dx << 16) / (Uint32)dy) << 16;
+
+		/*
+		* draw all pixels other than the first and last
+		*/
+		x0pxdir = xx0 + xdir;
+		while (--dy) {
+			erracctmp = erracc;
+			erracc += erradj;
+			if (erracc <= erracctmp) {
+				/*
+				* rollover in error accumulator, x coord advances
+				*/
+				xx0 = x0pxdir;
+				x0pxdir += xdir;
+			}
+			yy0++;		/* y-major so always advance Y */
+
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel.
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result &= pixelRGBAWeight (renderer, xx0, yy0, r, g, b, a, 255 - wgt);
+			result &= pixelRGBAWeight (renderer, x0pxdir, yy0, r, g, b, a, wgt);
+		}
+
+	} else {
+
+		/*
+		* x-major line.  Calculate 16-bit fixed-point fractional part of a pixel
+		* that Y advances each time X advances 1 pixel, truncating the result so
+		* that we won't overrun the endpoint along the Y axis.
+		*/
+		/*
+		* Unsigned again; when dx == dy (no endpoint) the quotient 65536 wraps erradj to 0, so Y advances on every step.
+		*/
+		erradj = (((Uint32)dy << 16) / (Uint32)dx) << 16;
+
+		/*
+		* draw all pixels other than the first and last
+		*/
+		y0p1 = yy0 + 1;
+		while (--dx) {
+
+			erracctmp = erracc;
+			erracc += erradj;
+			if (erracc <= erracctmp) {
+				/*
+				* Accumulator turned over, advance y
+				*/
+				yy0 = y0p1;
+				y0p1++;
+			}
+			xx0 += xdir;	/* x-major so always advance X */
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel.
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result &= pixelRGBAWeight (renderer, xx0, yy0, r, g, b, a, 255 - wgt);
+			result &= pixelRGBAWeight (renderer, xx0, y0p1, r, g, b, a, wgt);
+		}
+	}
+
+	/*
+	* Do we have to draw the endpoint
+	*/
+	if (draw_endpoint) {
+		/*
+		* Draw final pixel, always exactly intersected by the line and doesn't
+		* need to be weighted.
+		*/
+		result &= pixelRGBA (renderer, x2, y2, r, g, b, a);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw an anti-aliased line in a packed color, endpoint included.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool aalineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _aalineRGBA(renderer, x1, y1, x2, y2, rgba[0], rgba[1], rgba[2], rgba[3], /* draw_endpoint */ 1) != 0;
+}
+
+/*!
+\brief Draw an anti-aliased line in the given color, endpoint included.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param r The red value of the aa-line to draw.
+\param g The green value of the aa-line to draw.
+\param b The blue value of the aa-line to draw.
+\param a The alpha value of the aa-line to draw.
+
+\returns Returns true on success, false on failure (the int result of _aalineRGBA() as a bool).
+*/
+bool aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return _aalineRGBA(renderer, x1, y1, x2, y2, r, g, b, a, /* draw_endpoint */ 1) != 0;
+}
+
+/* ----- Circle */
+
+/*!
+\brief Draw a circle outline in a packed color; forwards to ellipseRGBA() with equal radii.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool circleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return ellipseRGBA(renderer, x, y, /* rx */ rad, /* ry */ rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw a circle outline in the given color; forwards to ellipseRGBA() with equal radii.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle; used for both ellipse radii.
+\param r The red value of the circle to draw.
+\param g The green value of the circle to draw.
+\param b The blue value of the circle to draw.
+\param a The alpha value of the circle to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool circleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return ellipseRGBA(renderer, x, y, /* rx */ rad, /* ry */ rad, r, g, b, a);
+}
+
+/* ----- Arc */
+
+/*!
+\brief Draw an arc in a packed color; thin wrapper around arcRGBA().
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc.
+\param start Starting angle in degrees of the arc, passed through to arcRGBA() unchanged.
+\param end Ending angle in degrees of the arc, passed through to arcRGBA() unchanged.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool arcColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return arcRGBA(renderer, x, y, rad, start, end, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Arc with blending.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc.
+\param start Starting angle in degrees of the arc. 0 degrees points along +x; angles increase towards +y (see octant diagram below).
+\param end Ending angle in degrees of the arc. 0 degrees points along +x; angles increase towards +y (see octant diagram below).
+\param r The red value of the arc to draw.
+\param g The green value of the arc to draw.
+\param b The blue value of the arc to draw.
+\param a The alpha value of the arc to draw.
+
+\returns Returns true on success, false on failure (including a negative radius).
+*/
+/* TODO: rewrite algorithm; arc endpoints are not always drawn */
+bool arcRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result;
+	Sint16 cx = 0;
+	Sint16 cy = rad;
+	Sint32 df = 1 - rad;			/* midpoint decision variable */
+	Sint32 d_e = 3;					/* 32-bit: grows by 2 per step and passes the Sint16 range for large radii */
+	Sint32 d_se = -2 * rad + 5;		/* 32-bit: -2*rad+5 no longer fits Sint16 once rad > 16386 */
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Uint8 drawoct;
+	int startoct, endoct, oct, stopval_start = 0, stopval_end = 0;
+	double dstart, dend, temp = 0.;
+
+	/*
+	* Sanity check radius
+	*/
+	if (rad < 0) {
+		return (false);
+	}
+
+	/*
+	* Special case for rad=0 - draw a point
+	*/
+	if (rad == 0) {
+		return (pixelRGBA(renderer, x, y, r, g, b, a));
+	}
+
+	/*
+	 Octant labeling
+
+	  \ 5 | 6 /
+	   \  |  /
+	  4 \ | / 7
+	     \|/
+	------+------ +x
+	     /|\
+	  3 / | \ 0
+	   /  |  \
+	  / 2 | 1 \
+	      +y
+
+	 Initially reset bitmask to 0 (binary 00000000),
+	 then set whether or not to keep drawing a given octant.
+	 For example: binary 00111100 means we're drawing in octants 2-5
+	*/
+	drawoct = 0;
+
+	/*
+	* Fixup angles
+	*/
+	start %= 360;
+	end %= 360;
+	/* 0 <= start & end < 360; note that sometimes start > end - if so, arc goes back through 0. */
+	while (start < 0) start += 360;
+	while (end < 0) end += 360;
+	start %= 360;
+	end %= 360;
+
+	/* now, we find which octants we're drawing in. */
+	startoct = start / 45;
+	endoct = end / 45;
+	oct = startoct - 1;
+
+	/* stopval_start, stopval_end; what values of cx to stop at. */
+	do {
+		oct = (oct + 1) % 8;
+
+		if (oct == startoct) {
+			/* need to compute stopval_start for this octant.  Look at picture above if this is unclear */
+			dstart = (double)start;
+			switch (oct)
+			{
+			case 0:
+			case 3:
+				temp = sin(dstart * M_PI / 180.);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dstart * M_PI / 180.);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dstart * M_PI / 180.);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dstart * M_PI / 180.);
+				break;
+			}
+			temp *= rad;
+			stopval_start = (int)temp;
+
+			/*
+			This isn't arbitrary, but requires graph paper to explain well.
+			The basic idea is that we're always changing drawoct after we draw, so we
+			stop immediately after we render the last sensible pixel at x = ((int)temp).
+			and whether to draw in this octant initially
+			*/
+			if (oct % 2) drawoct |= (1 << oct);			/* this is basically like saying drawoct[oct] = true, if drawoct were a bool array */
+			else		 drawoct &= 255 - (1 << oct);	/* this is basically like saying drawoct[oct] = false */
+		}
+		if (oct == endoct) {
+			/* need to compute stopval_end for this octant */
+			dend = (double)end;
+			switch (oct)
+			{
+			case 0:
+			case 3:
+				temp = sin(dend * M_PI / 180);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dend * M_PI / 180);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dend * M_PI / 180);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dend * M_PI / 180);
+				break;
+			}
+			temp *= rad;
+			stopval_end = (int)temp;
+
+			/* and whether to draw in this octant initially */
+			if (startoct == endoct)	{
+				/* note:      we start drawing, stop, then start again in this case */
+				/* otherwise: we only draw in this octant, so initialize it to false, it will get set back to true */
+				if (start > end) {
+					/* unfortunately, if we're in the same octant and need to draw over the whole circle, */
+					/* we need to set the rest to true, because the while loop will end at the bottom. */
+					drawoct = 255;
+				} else {
+					drawoct &= 255 - (1 << oct);
+				}
+			}
+			else if (oct % 2) drawoct &= 255 - (1 << oct);
+			else			  drawoct |= (1 << oct);
+		} else if (oct != startoct) { /* already verified that it's != endoct */
+			drawoct |= (1 << oct); /* draw this entire segment */
+		}
+	} while (oct != endoct);
+
+	/* so now we have what octants to draw and when to draw them. all that's left is the actual raster code. */
+
+	/*
+	* Set color
+	*/
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw arc
+	*/
+	do {
+		ypcy = y + cy;
+		ymcy = y - cy;
+		if (cx > 0) {
+			xpcx = x + cx;
+			xmcx = x - cx;
+
+			/* always check if we're drawing a certain octant before adding a pixel to that octant. */
+			if (drawoct & 4)  result &= pixel(renderer, xmcx, ypcy);
+			if (drawoct & 2)  result &= pixel(renderer, xpcx, ypcy);
+			if (drawoct & 32) result &= pixel(renderer, xmcx, ymcy);
+			if (drawoct & 64) result &= pixel(renderer, xpcx, ymcy);
+		} else {
+			if (drawoct & 96) result &= pixel(renderer, x, ymcy);
+			if (drawoct & 6)  result &= pixel(renderer, x, ypcy);
+		}
+
+		xpcy = x + cy;
+		xmcy = x - cy;
+		if (cx > 0 && cx != cy) {
+			ypcx = y + cx;
+			ymcx = y - cx;
+			if (drawoct & 8)   result &= pixel(renderer, xmcy, ypcx);
+			if (drawoct & 1)   result &= pixel(renderer, xpcy, ypcx);
+			if (drawoct & 16)  result &= pixel(renderer, xmcy, ymcx);
+			if (drawoct & 128) result &= pixel(renderer, xpcy, ymcx);
+		} else if (cx == 0) {
+			if (drawoct & 24)  result &= pixel(renderer, xmcy, y);
+			if (drawoct & 129) result &= pixel(renderer, xpcy, y);
+		}
+
+		/*
+		* Update whether we're drawing an octant
+		*/
+		if (stopval_start == cx) {
+			/* works like an on-off switch. */
+			/* This is just in case start & end are in the same octant. */
+			if (drawoct & (1 << startoct)) drawoct &= 255 - (1 << startoct);
+			else						   drawoct |= (1 << startoct);
+		}
+		if (stopval_end == cx) {
+			if (drawoct & (1 << endoct)) drawoct &= 255 - (1 << endoct);
+			else						 drawoct |= (1 << endoct);
+		}
+
+		/*
+		* Update pixels
+		*/
+		if (df < 0) {
+			df += d_e;
+			d_e += 2;
+			d_se += 2;
+		} else {
+			df += d_se;
+			d_e += 2;
+			d_se += 4;
+			cy--;
+		}
+		cx++;
+	} while (cx <= cy);
+
+	return (result);
+}
+
+/* ----- AA Circle */
+
+/*!
+\brief Draw an anti-aliased circle in a packed color; forwards to aaellipseRGBA() with equal radii.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool aacircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return aaellipseRGBA(renderer, x, y, /* rx */ rad, /* ry */ rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw an anti-aliased circle in the given color; forwards to aaellipseRGBA().
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle; used for both ellipse radii.
+\param r The red value of the aa-circle to draw.
+\param g The green value of the aa-circle to draw.
+\param b The blue value of the aa-circle to draw.
+\param a The alpha value of the aa-circle to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool aacircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Equal horizontal and vertical radii turn the aa-ellipse into a circle. */
+	const Sint16 rx = rad;
+	const Sint16 ry = rad;
+	return aaellipseRGBA(renderer, x, y, rx, ry, r, g, b, a);
+}
+
+/* ----- Ellipse */
+
+/*!
+\brief Internal helper: plot the mirror images of one offset around a center.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center point.
+\param y Y coordinate of the center point.
+\param dx Horizontal offset in pixels from the center.
+\param dy Vertical offset in pixels from the center.
+\param f Non-zero joins the upper and lower mirror points with vertical lines; zero plots them as single pixels.
+
+\returns Returns true on success, false on failure.
+*/
+int _drawQuadrants(SDL_Renderer * renderer,  Sint16 x, Sint16 y, Sint16 dx, Sint16 dy, Sint32 f)
+{
+	bool ok = true;
+	Sint16 right, left;
+	Sint16 bottom, top;
+
+	/* Degenerate offset: everything collapses onto the center pixel. */
+	if (dx == 0 && dy == 0) {
+		ok &= pixel(renderer, x, y);
+		return ok;
+	}
+
+	bottom = y + dy;
+	top = y - dy;
+	if (dx == 0) {
+		if (f) {
+			ok &= vline(renderer, x, top, bottom);
+		} else {
+			ok &= pixel(renderer, x, bottom);
+			ok &= pixel(renderer, x, top);
+		}
+		return ok;
+	}
+	right = x + dx;
+	left = x - dx;
+	if (f) {
+		ok &= vline(renderer, right, top, bottom);
+		ok &= vline(renderer, left, top, bottom);
+	} else {
+		ok &= pixel(renderer, right, bottom);
+		ok &= pixel(renderer, left, bottom);
+		ok &= pixel(renderer, right, top);
+		ok &= pixel(renderer, left, top);
+	}
+
+	return ok;
+}
+
+/*!
+\brief Internal function to draw ellipse or filled ellipse with blending.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param r The red value of the ellipse to draw.
+\param g The green value of the ellipse to draw.
+\param b The blue value of the ellipse to draw.
+\param a The alpha value of the ellipse to draw.
+\param f Flag indicating if the ellipse should be filled (1) or not (0).
+
+\returns Returns true on success, false on failure (including a negative radius).
+*/
+#define DEFAULT_ELLIPSE_OVERSCAN	4
+bool _ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a, Sint32 f)
+{
+	bool result;
+	Sint32 rxi, ryi;
+	Sint64 rx2, ry2, rx22, ry22;
+	Sint64 error;
+	Sint32 curX, curY, curXp1, curYm1;
+	Sint32 scrX, scrY, oldX, oldY;
+	Sint64 deltaX, deltaY;
+	Sint32 ellipseOverscan;
+
+	/*
+	* Sanity check radii
+	*/
+	if ((rx < 0) || (ry < 0)) {
+		return (false);
+	}
+
+	/*
+	* Set color
+	*/
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Special cases for rx=0 and/or ry=0: draw a hline/vline/pixel (a failed color setup above is still reported)
+	*/
+	if (rx == 0) {
+		if (ry == 0) {
+			return (pixel(renderer, x, y) && result);
+		} else {
+			return (vline(renderer, x, y - ry, y + ry) && result);
+		}
+	} else {
+		if (ry == 0) {
+			return (hline(renderer, x - rx, x + rx, y) && result);
+		}
+	}
+
+	/*
+	 * Adjust overscan
+	 */
+	rxi = rx;
+	ryi = ry;
+	if (rxi >= 512 || ryi >= 512)
+	{
+		ellipseOverscan = DEFAULT_ELLIPSE_OVERSCAN / 4;
+	}
+	else if (rxi >= 256 || ryi >= 256)
+	{
+		ellipseOverscan = DEFAULT_ELLIPSE_OVERSCAN / 2;
+	}
+	else
+	{
+		ellipseOverscan = DEFAULT_ELLIPSE_OVERSCAN / 1;
+	}
+
+	/*
+	 * Top/bottom center points.
+	 */
+	oldX = scrX = 0;
+	oldY = scrY = ryi;
+	result &= _drawQuadrants(renderer, x, y, 0, ry, f);
+
+	/* Midpoint ellipse algorithm with overdraw; products of squared radii need 64 bits (rx2 * ry2 passes 2^31 once the scaled radii reach 216) */
+	rxi *= ellipseOverscan;
+	ryi *= ellipseOverscan;
+	rx2 = (Sint64)rxi * rxi;
+	rx22 = rx2 + rx2;
+	ry2 = (Sint64)ryi * ryi;
+	ry22 = ry2 + ry2;
+	curX = 0;
+	curY = ryi;
+	deltaX = 0;
+	deltaY = rx22 * curY;
+
+	/* Points in segment 1 */
+	error = ry2 - rx2 * ryi + rx2 / 4;
+	while (deltaX <= deltaY)
+	{
+		curX++;
+		deltaX += ry22;
+
+		error += deltaX + ry2;
+		if (error >= 0)
+		{
+			curY--;
+			deltaY -= rx22;
+			error -= deltaY;
+		}
+
+		scrX = curX / ellipseOverscan;
+		scrY = curY / ellipseOverscan;
+		if (scrX != oldX) {	/* redraw only when the screen column changes, whatever scrY did */
+			result &= _drawQuadrants(renderer, x, y, scrX, scrY, f);
+			oldX = scrX;
+			oldY = scrY;
+		}
+	}
+
+	/* Points in segment 2 */
+	if (curY > 0)
+	{
+		curXp1 = curX + 1;
+		curYm1 = curY - 1;
+		error = ry2 * curX * curXp1 + ((ry2 + 3) / 4) + rx2 * curYm1 * curYm1 - rx2 * ry2;
+		while (curY > 0)
+		{
+			curY--;
+			deltaY -= rx22;
+
+			error += rx2;
+			error -= deltaY;
+
+			if (error <= 0)
+			{
+				curX++;
+				deltaX += ry22;
+				error += deltaX;
+			}
+
+			scrX = curX / ellipseOverscan;
+			scrY = curY / ellipseOverscan;
+			if (scrX != oldX) {	/* same column-change test as in segment 1 */
+				oldY--;
+				for (;oldY >= scrY; oldY--) {
+					result &= _drawQuadrants(renderer, x, y, scrX, oldY, f);
+					/* prevent overdraw */
+					if (f) {
+						oldY = scrY - 1;
+					}
+				}
+				oldX = scrX;
+				oldY = scrY;
+			}
+		}
+
+		/* Remaining points in vertical */
+		if (!f) {
+			oldY--;
+			for (;oldY >= 0; oldY--) {
+				result &= _drawQuadrants(renderer, x, y, scrX, oldY, f);
+			}
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw an ellipse outline in a packed color; forwards to _ellipseRGBA() with the fill flag cleared.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool ellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _ellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3], /* f: outline */ 0);
+}
+
+/*!
+\brief Draw an ellipse outline in the given color; forwards to _ellipseRGBA() with the fill flag cleared.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param r The red value of the ellipse to draw.
+\param g The green value of the ellipse to draw.
+\param b The blue value of the ellipse to draw.
+\param a The alpha value of the ellipse to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return _ellipseRGBA(renderer, x, y, rx, ry, r, g, b, a, /* f: outline */ 0);
+}
+
+/* ----- Filled Circle */
+
+/*!
+\brief Draw a filled circle in a packed color; forwards to filledEllipseRGBA() with equal radii.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled circle.
+\param y Y coordinate of the center of the filled circle.
+\param rad Radius in pixels of the filled circle.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool filledCircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return filledEllipseRGBA(renderer, x, y, /* rx */ rad, /* ry */ rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw a filled circle in the given color; forwards to _ellipseRGBA() with equal radii and the fill flag set.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled circle.
+\param y Y coordinate of the center of the filled circle.
+\param rad Radius in pixels of the filled circle; used for both ellipse radii.
+\param r The red value of the filled circle to draw.
+\param g The green value of the filled circle to draw.
+\param b The blue value of the filled circle to draw.
+\param a The alpha value of the filled circle to draw.
+
+\returns Returns true on success, false on failure.
+*/
+bool filledCircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return _ellipseRGBA(renderer, x, y, /* rx */ rad, /* ry */ rad, r, g, b, a, /* f: filled */ 1);
+}
+
+
+/* ----- AA Ellipse */
+
+/* Older MSVC (_MSC_VER < 1920) is treated as lacking lrint, so a local version is provided, chosen by target architecture */
+#if defined(_MSC_VER) && _MSC_VER < 1920
+/* x64: intrinsic version; note the parameter is float, so a double argument is narrowed before conversion */
+#ifdef _M_X64
+#include <emmintrin.h>
+static __inline long 
+	lrint(float f) 
+{
+	return _mm_cvtss_si32(_mm_load_ss(&f));
+}
+#elif defined(_M_IX86) /* 32-bit x86: inline assembly (fld / fistp) */
+__inline long int
+	lrint (double flt)
+{	
+	int intgr;
+	_asm
+	{
+		fld flt
+			fistp intgr
+	};
+	return intgr;
+}
+#elif defined(_M_ARM) /* ARM: naked function built from raw instruction words */
+#include <armintr.h>
+#pragma warning(push)
+#pragma warning(disable: 4716)
+__declspec(naked) long int
+	lrint (double flt)
+{
+	__emit(0xEC410B10); // fmdrr  d0, r0, r1
+	__emit(0xEEBD0B40); // ftosid s0, d0
+	__emit(0xEE100A10); // fmrs   r0, s0
+	__emit(0xE12FFF1E); // bx     lr
+}
+#pragma warning(pop)
+#else
+#error lrint needed for MSVC on non X86/AMD64/ARM targets.
+#endif
+#endif
+
+/*!
+\brief Draw an anti-aliased ellipse in a packed color; thin wrapper around aaellipseRGBA().
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-ellipse.
+\param y Y coordinate of the center of the aa-ellipse.
+\param rx Horizontal radius in pixels of the aa-ellipse.
+\param ry Vertical radius in pixels of the aa-ellipse.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool aaellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return aaellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw anti-aliased ellipse with blending, as two passes of weighted pixel pairs mirrored around the center.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-ellipse.
+\param y Y coordinate of the center of the aa-ellipse.
+\param rx Horizontal radius in pixels of the aa-ellipse.
+\param ry Vertical radius in pixels of the aa-ellipse.
+\param r The red value of the aa-ellipse to draw. 
+\param g The green value of the aa-ellipse to draw. 
+\param b The blue value of the aa-ellipse to draw. 
+\param a The alpha value of the aa-ellipse to draw.
+
+\returns Returns true on success, false on failure (a negative radius also returns false).
+*/
+bool aaellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result;
+	int i;
+	int a2, b2, ds, dt, dxt, t, s, d;
+	Sint16 xp, yp, xs, ys, dyt, od, xx, yy, xc2, yc2;
+	float cp;
+	double sab;
+	Uint8 weight, iweight;
+
+	/*
+	* Sanity check radii 
+	*/
+	if ((rx < 0) || (ry < 0)) {
+		return (false);
+	}
+
+	/*
+	* Special cases for rx=0 and/or ry=0: draw a hline/vline/pixel 
+	*/
+	if (rx == 0) {
+		if (ry == 0) {
+			return (pixelRGBA(renderer, x, y, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x, y - ry, y + ry, r, g, b, a));
+		}
+	} else {
+		if (ry == 0) {
+			return (hlineRGBA(renderer, x - rx, x + rx, y, r, g, b, a));
+		}
+	}
+
+	/* Variable setup: squared radii, their doubles, and the doubled center used for mirroring */
+	a2 = rx * rx;
+	b2 = ry * ry;
+
+	ds = 2 * a2;
+	dt = 2 * b2;
+
+	xc2 = 2 * x; /* xc2 - xp is xp mirrored across the center column */
+	yc2 = 2 * y; /* yc2 - yp is yp mirrored across the center row */
+
+	sab = sqrt((double)(a2 + b2));
+	od = (Sint16)lrint(sab*0.01) + 1; /* introduce some overdraw */
+	dxt = (Sint16)lrint((double)a2 / sab) + od; /* iteration count of the first pass */
+
+	t = 0;
+	s = -2 * a2 * ry; /* NOTE(review): int arithmetic; exceeds a 32-bit int for radii beyond roughly 1024 - confirm the supported radius range */
+	d = 0;
+
+	xp = x;
+	yp = y - ry;
+
+	/* Draw */
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+
+	/* "End points": with xp == x the four calls hit (x, y - ry) twice and (x, y + ry) twice */
+	result &= pixelRGBA(renderer, xp, yp, r, g, b, a);
+	result &= pixelRGBA(renderer, xc2 - xp, yp, r, g, b, a);
+	result &= pixelRGBA(renderer, xp, yc2 - yp, r, g, b, a);
+	result &= pixelRGBA(renderer, xc2 - xp, yc2 - yp, r, g, b, a);
+
+	for (i = 1; i <= dxt; i++) { /* first pass: xp moves one column per iteration */
+		xp--;
+		d += t - b2;
+
+		if (d >= 0)
+			ys = yp - 1;
+		else if ((d - s - a2) > 0) {
+			if ((2 * d - s - a2) >= 0)
+				ys = yp + 1;
+			else {
+				ys = yp;
+				yp++;
+				d -= s + a2;
+				s += ds;
+			}
+		} else {
+			yp++;
+			ys = yp + 1;
+			d -= s + a2;
+			s += ds;
+		}
+
+		t -= dt;
+
+		/* Calculate alpha: ratio |d| / |s|, clamped to 1.0 */
+		if (s != 0) {
+			cp = (float) abs(d) / (float) abs(s);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weights: weight goes to the ys row, its complement to the yp row */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Upper half */
+		xx = xc2 - xp;
+		result &= pixelRGBAWeight(renderer, xp, yp, r, g, b, a, iweight);
+		result &= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, iweight);
+
+		result &= pixelRGBAWeight(renderer, xp, ys, r, g, b, a, weight);
+		result &= pixelRGBAWeight(renderer, xx, ys, r, g, b, a, weight);
+
+		/* Lower half */
+		yy = yc2 - yp;
+		result &= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, iweight);
+		result &= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, iweight);
+
+		yy = yc2 - ys;
+		result &= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, weight);
+		result &= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, weight);
+	}
+
+	/* Replaces original approximation code dyt = abs(yp - yc); */
+	dyt = (Sint16)lrint((double)b2 / sab ) + od;    
+
+	for (i = 1; i <= dyt; i++) { /* second pass: yp moves one row per iteration */
+		yp++;
+		d -= s + a2;
+
+		if (d <= 0)
+			xs = xp + 1;
+		else if ((d + t - b2) < 0) {
+			if ((2 * d + t - b2) <= 0)
+				xs = xp - 1;
+			else {
+				xs = xp;
+				xp--;
+				d += t - b2;
+				t -= dt;
+			}
+		} else {
+			xp--;
+			xs = xp - 1;
+			d += t - b2;
+			t -= dt;
+		}
+
+		s += ds;
+
+		/* Calculate alpha: ratio |d| / |t|, clamped to 1.0 */
+		if (t != 0) {
+			cp = (float) abs(d) / (float) abs(t);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weight: weight goes to the xs column, its complement to the xp column */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Pixels at column xp (weight iweight), mirrored into all four quadrants */
+		xx = xc2 - xp;
+		yy = yc2 - yp;
+		result &= pixelRGBAWeight(renderer, xp, yp, r, g, b, a, iweight);
+		result &= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, iweight);
+
+		result &= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, iweight);
+		result &= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, iweight);
+
+		/* Pixels at the neighbouring column xs (weight weight), mirrored the same way */
+		xx = xc2 - xs;
+		result &= pixelRGBAWeight(renderer, xs, yp, r, g, b, a, weight);
+		result &= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, weight);
+
+		result &= pixelRGBAWeight(renderer, xs, yy, r, g, b, a, weight);
+		result &= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, weight);		
+	}
+
+	return (result);
+}
+
+/* ---- Filled Ellipse */
+
+/*!
+\brief Draw a filled ellipse in a packed color; forwards to _ellipseRGBA() with the fill flag set.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled ellipse.
+\param y Y coordinate of the center of the filled ellipse.
+\param rx Horizontal radius in pixels of the filled ellipse.
+\param ry Vertical radius in pixels of the filled ellipse.
+\param color Packed color; its bytes are read in memory order as r, g, b, a (so 0xRRGGBBAA only on big-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool filledEllipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _ellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3], /* f: filled */ 1);
+}
+
+/*!
+\brief Draw a filled ellipse using separate red, green, blue and alpha values.
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled ellipse.
+\param y Y coordinate of the center of the filled ellipse.
+\param rx Horizontal radius in pixels of the filled ellipse.
+\param ry Vertical radius in pixels of the filled ellipse.
+\param r The red value of the filled ellipse to draw.
+\param g The green value of the filled ellipse to draw.
+\param b The blue value of the filled ellipse to draw.
+\param a The alpha value of the filled ellipse to draw.
+
+\returns Returns the status reported by _ellipseRGBA (true on success, false on failure).
+*/
+bool filledEllipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool drawn = _ellipseRGBA(renderer, x, y, rx, ry, r, g, b, a, 1);
+	return drawn;
+}
+
+/* ----- Pie */
+
+/*!
+\brief Internal float (low-speed) pie implementation that renders through the polygon routines.
+
+Note: Builds a vertex list (the center followed by points along the arc) and passes it
+to polygonRGBA or filledPolygonRGBA. Both angles are reduced modulo 360 first, so equal
+start and end values (including 0 and 360) give a zero-length arc that is drawn as a
+single line; when start is greater than end the arc wraps through 0 degrees.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie.
+\param end Ending angle in degrees of the pie.
+\param r The red value of the pie to draw.
+\param g The green value of the pie to draw.
+\param b The blue value of the pie to draw.
+\param a The alpha value of the pie to draw.
+\param filled Flag indicating if the pie should be filled (non-zero) or outlined (0).
+
+\returns Returns true on success, false on failure (negative radius, failed
+allocation, or a failure reported by the drawing routine that was used).
+*/
+/* TODO: rewrite algorithm; pie is not always accurate */
+bool _pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end,  Uint8 r, Uint8 g, Uint8 b, Uint8 a, Uint8 filled)
+{
+	bool result;
+	double radius, step;
+	double theta, theta_first, theta_last;
+	int count;
+	int idx;
+	Sint16 *xs;
+	Sint16 *ys;
+
+	/*
+	* A negative radius is rejected outright
+	*/
+	if (rad < 0) {
+		return (false);
+	}
+
+	/*
+	* Reduce both angles with the remainder operator (the sign of the input is kept)
+	*/
+	start = start % 360;
+	end = end % 360;
+
+	/*
+	* A zero radius collapses to the center pixel
+	*/
+	if (rad == 0) {
+		return (pixelRGBA(renderer, x, y, r, g, b, a));
+	}
+
+	/*
+	* Angular step (smaller for larger radii) and the arc limits in radians
+	*/
+	radius = (double) rad;
+	step = 3.0 / radius;
+	theta_first = (double) start *(2.0 * M_PI / 360.0);
+	theta_last = (double) end *(2.0 * M_PI / 360.0);
+	if (start > end) {
+		/* The arc wraps past 0 degrees: extend the end by one full turn */
+		theta_last += (2.0 * M_PI);
+	}
+
+	/*
+	* The center and the first arc vertex always exist; add one vertex per arc step
+	*/
+	count = 2;
+	for (theta = theta_first; theta < theta_last; theta += step) {
+		count++;
+	}
+
+	/*
+	* One allocation holds both arrays: X values first, Y values right after them
+	*/
+	xs = (Sint16 *) malloc(2 * sizeof(Uint16) * count);
+	if (xs == NULL) {
+		return (false);
+	}
+	ys = xs + count;
+
+	/*
+	* Vertex 0 is the center, vertex 1 the point on the circle at the start angle
+	*/
+	xs[0] = x;
+	ys[0] = y;
+	xs[1] = x + (int) (radius * cos(theta_first));
+	ys[1] = y + (int) (radius * sin(theta_first));
+
+	if (count >= 3) {
+		/* Walk the arc again, clamping the final step onto the end angle */
+		theta = theta_first;
+		for (idx = 2; theta < theta_last; idx++) {
+			theta += step;
+			if (theta > theta_last) {
+				theta = theta_last;
+			}
+			xs[idx] = x + (int) (radius * cos(theta));
+			ys[idx] = y + (int) (radius * sin(theta));
+		}
+
+		/* Hand the vertex list to the filled or outline polygon routine */
+		if (filled) {
+			result = filledPolygonRGBA(renderer, xs, ys, count, r, g, b, a);
+		} else {
+			result = polygonRGBA(renderer, xs, ys, count, r, g, b, a);
+		}
+	} else {
+		/* No arc steps were counted: only the center-to-start line is drawn */
+		result = lineRGBA(renderer, xs[0], ys[0], xs[1], ys[1], r, g, b, a);
+	}
+
+	/*
+	* Releasing the X pointer frees the Y half as well
+	*/
+	free(xs);
+
+	return (result);
+}
+
+/*!
+\brief Draw a pie outline, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie.
+\param end Ending angle in degrees of the pie.
+\param color Packed color; its four bytes in memory order are used as R, G, B, A.
+
+\returns Returns the status reported by _pieRGBA (true on success, false on failure).
+*/
+bool pieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return _pieRGBA(renderer, x, y, rad, start, end, channel[0], channel[1], channel[2], channel[3], 0);
+}
+
+/*!
+\brief Draw a pie outline using separate red, green, blue and alpha values.
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie.
+\param end Ending angle in degrees of the pie.
+\param r The red value of the pie to draw.
+\param g The green value of the pie to draw.
+\param b The blue value of the pie to draw.
+\param a The alpha value of the pie to draw.
+
+\returns Returns the status reported by _pieRGBA (true on success, false on failure).
+*/
+bool pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool drawn = _pieRGBA(renderer, x, y, rad, start, end, r, g, b, a, 0);
+	return drawn;
+}
+
+/*!
+\brief Draw a filled pie, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled pie.
+\param y Y coordinate of the center of the filled pie.
+\param rad Radius in pixels of the filled pie.
+\param start Starting angle in degrees of the filled pie.
+\param end Ending angle in degrees of the filled pie.
+\param color Packed color; its four bytes in memory order are used as R, G, B, A.
+
+\returns Returns the status reported by _pieRGBA (true on success, false on failure).
+*/
+bool filledPieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return _pieRGBA(renderer, x, y, rad, start, end, channel[0], channel[1], channel[2], channel[3], 1);
+}
+
+/*!
+\brief Draw a filled pie using separate red, green, blue and alpha values.
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the filled pie.
+\param y Y coordinate of the center of the filled pie.
+\param rad Radius in pixels of the filled pie.
+\param start Starting angle in degrees of the filled pie.
+\param end Ending angle in degrees of the filled pie.
+\param r The red value of the filled pie to draw.
+\param g The green value of the filled pie to draw.
+\param b The blue value of the filled pie to draw.
+\param a The alpha value of the filled pie to draw.
+
+\returns Returns the status reported by _pieRGBA (true on success, false on failure).
+*/
+bool filledPieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool drawn = _pieRGBA(renderer, x, y, rad, start, end, r, g, b, a, 1);
+	return drawn;
+}
+
+/* ------ Trigon */
+
+/*!
+\brief Draw trigon (triangle outline) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to polygonColor with a vertex count of 3; the polygon
+routines also connect the last corner back to the first one.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the trigon.
+\param y1 Y coordinate of the first point of the trigon.
+\param x2 X coordinate of the second point of the trigon.
+\param y2 Y coordinate of the second point of the trigon.
+\param x3 X coordinate of the third point of the trigon.
+\param y3 Y coordinate of the third point of the trigon.
+\param color Packed color, forwarded unchanged to polygonColor (which reads
+its four bytes in memory order as R, G, B, A).
+
+\returns Returns the status reported by polygonColor (true on success,
+false on failure).
+*/
+bool trigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (polygonColor(renderer, corner_x, corner_y, TRIGON_CORNERS, color));
+}
+
+/*!
+\brief Draw trigon (triangle outline) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to polygonRGBA with a vertex count of 3; the polygon
+routines also connect the last corner back to the first one.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the trigon.
+\param y1 Y coordinate of the first point of the trigon.
+\param x2 X coordinate of the second point of the trigon.
+\param y2 Y coordinate of the second point of the trigon.
+\param x3 X coordinate of the third point of the trigon.
+\param y3 Y coordinate of the third point of the trigon.
+\param r The red value of the trigon to draw.
+\param g The green value of the trigon to draw.
+\param b The blue value of the trigon to draw.
+\param a The alpha value of the trigon to draw.
+
+\returns Returns the status reported by polygonRGBA (true on success, false on failure).
+*/
+bool trigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (polygonRGBA(renderer, corner_x, corner_y, TRIGON_CORNERS, r, g, b, a));
+}
+
+/* ------ AA-Trigon */
+
+/*!
+\brief Draw anti-aliased trigon (triangle outline) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to aapolygonColor with a vertex count of 3; the aapolygon
+routines also connect the last corner back to the first one.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-trigon.
+\param y1 Y coordinate of the first point of the aa-trigon.
+\param x2 X coordinate of the second point of the aa-trigon.
+\param y2 Y coordinate of the second point of the aa-trigon.
+\param x3 X coordinate of the third point of the aa-trigon.
+\param y3 Y coordinate of the third point of the aa-trigon.
+\param color Packed color, forwarded unchanged to aapolygonColor (which reads
+its four bytes in memory order as R, G, B, A).
+
+\returns Returns the status reported by aapolygonColor (true on success,
+false on failure).
+*/
+bool aatrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (aapolygonColor(renderer, corner_x, corner_y, TRIGON_CORNERS, color));
+}
+
+/*!
+\brief Draw anti-aliased trigon (triangle outline) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to aapolygonRGBA with a vertex count of 3; that routine
+also connects the last corner back to the first one.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-trigon.
+\param y1 Y coordinate of the first point of the aa-trigon.
+\param x2 X coordinate of the second point of the aa-trigon.
+\param y2 Y coordinate of the second point of the aa-trigon.
+\param x3 X coordinate of the third point of the aa-trigon.
+\param y3 Y coordinate of the third point of the aa-trigon.
+\param r The red value of the aa-trigon to draw.
+\param g The green value of the aa-trigon to draw.
+\param b The blue value of the aa-trigon to draw.
+\param a The alpha value of the aa-trigon to draw.
+
+\returns Returns the status reported by aapolygonRGBA (true on success, false on failure).
+*/
+bool aatrigonRGBA(SDL_Renderer * renderer,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (aapolygonRGBA(renderer, corner_x, corner_y, TRIGON_CORNERS, r, g, b, a));
+}
+
+/* ------ Filled Trigon */
+
+/*!
+\brief Draw filled trigon (triangle) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to the filledPolygonColor routine with a vertex count
+of 3.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the filled trigon.
+\param y1 Y coordinate of the first point of the filled trigon.
+\param x2 X coordinate of the second point of the filled trigon.
+\param y2 Y coordinate of the second point of the filled trigon.
+\param x3 X coordinate of the third point of the filled trigon.
+\param y3 Y coordinate of the third point of the filled trigon.
+\param color Packed color, forwarded unchanged to filledPolygonColor (which
+reads its four bytes in memory order as R, G, B, A).
+
+\returns Returns the status reported by filledPolygonColor (true on success,
+false on failure).
+*/
+bool filledTrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (filledPolygonColor(renderer, corner_x, corner_y, TRIGON_CORNERS, color));
+}
+
+/*!
+\brief Draw filled trigon (triangle) with alpha blending.
+
+Note: The three corners are copied into local vertex arrays and the drawing
+itself is delegated to the filledPolygonRGBA routine with a vertex count
+of 3. The arrays only live for the duration of this call and are passed to
+that routine as read-only data.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the filled trigon.
+\param y1 Y coordinate of the first point of the filled trigon.
+\param x2 X coordinate of the second point of the filled trigon.
+\param y2 Y coordinate of the second point of the filled trigon.
+\param x3 X coordinate of the third point of the filled trigon.
+\param y3 Y coordinate of the third point of the filled trigon.
+\param r The red value of the filled trigon to draw.
+\param g The green value of the filled trigon to draw.
+\param b The blue value of the filled trigon to draw.
+\param a The alpha value of the filled trigon to draw.
+
+\returns Returns the status reported by filledPolygonRGBA (true on success,
+false on failure).
+*/
+bool filledTrigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	enum { TRIGON_CORNERS = 3 };
+
+	/* Split the corners into the parallel X and Y arrays the polygon API takes */
+	const Sint16 corner_x[TRIGON_CORNERS] = { x1, x2, x3 };
+	const Sint16 corner_y[TRIGON_CORNERS] = { y1, y2, y3 };
+
+	return (filledPolygonRGBA(renderer, corner_x, corner_y, TRIGON_CORNERS, r, g, b, a));
+}
+
+/* ---- Polygon */
+
+/*!
+\brief Draw a polygon outline, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color Packed color; its four bytes in memory order are used as R, G, B, A.
+
+\returns Returns the status reported by polygonRGBA (true on success, false on failure).
+*/
+bool polygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return polygonRGBA(renderer, vx, vy, n, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw polygon with the currently set color and blend mode.
+
+Note: The vertices are converted to SDL_FPoint values, vertex 0 is appended
+once more to close the outline, and the result is drawn with one
+SDL_RenderLines call.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+
+\returns Returns true on success, false on failure (NULL vertex array, fewer
+than 3 points, failed allocation or a failed SDL_RenderLines call).
+*/
+bool polygon(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n)
+{
+	bool drawn = true;
+	int idx;
+	int total;
+	SDL_FPoint *outline;
+
+	/*
+	* Reject missing vertex arrays and anything with fewer than 3 corners
+	*/
+	if ((vx == NULL) || (vy == NULL) || (n < 3)) {
+		return (false);
+	}
+
+	/*
+	* One extra point is reserved so the last segment can return to vertex 0
+	*/
+	total = n + 1;
+	outline = (SDL_FPoint*)malloc(sizeof(SDL_FPoint) * total);
+	if (outline == NULL) {
+		return (false);
+	}
+
+	/*
+	* Convert the integer vertices to float points; the final slot (idx == n)
+	* wraps around to vertex 0 and closes the outline
+	*/
+	for (idx = 0; idx < total; idx++) {
+		const int src = idx % n;
+
+		outline[idx].x = vx[src];
+		outline[idx].y = vy[src];
+	}
+
+	/*
+	* Draw all segments with the renderer's current color and blend mode
+	*/
+	drawn &= SDL_RenderLines(renderer, outline, total);
+
+	/*
+	* The point buffer is only needed for the draw call
+	*/
+	free(outline);
+
+	return (drawn);
+}
+
+/*!
+\brief Draw polygon with alpha blending.
+
+Note: Selects the draw blend mode from the alpha value (no blending when a is
+255, alpha blending otherwise), sets the draw color and then calls polygon().
+The renderer's draw color and blend mode are not restored afterwards.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+Must not be NULL and must hold at least n entries.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+Must not be NULL and must hold at least n entries.
+\param n Number of points in the vertex array. Minimum number is 3; smaller
+values make the call fail without drawing.
+\param r The red value of the polygon to draw.
+\param g The green value of the polygon to draw.
+\param b The blue value of the polygon to draw.
+\param a The alpha value of the polygon to draw.
+
+\returns Returns true on success, false on failure (NULL vertex array, fewer
+than 3 points, or a failing SDL or polygon() call).
+*/
+bool polygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Status flag: starts out true and is cleared by any failing step below
+	*/
+	bool result = true;
+
+	/*
+	* Vertex array NULL check
+	*/
+	if (vx == NULL) {
+		return (false);
+	}
+	if (vy == NULL) {
+		return (false);
+	}
+
+	/*
+	* Sanity check: a polygon needs at least 3 points
+	*/
+	if (n < 3) {
+		return (false);
+	}
+
+	/*
+	* Set color; fully opaque colors skip blending, all others are
+	* alpha-blended onto the target
+	*/
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw; the outline itself is produced by polygon() from the same
+	* vertex arrays
+	*/
+	result &= polygon(renderer, vx, vy, n);
+
+	return (result);
+}
+
+/* ---- AA-Polygon */
+
+/*!
+\brief Draw an anti-aliased polygon outline, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the aa-polygon.
+\param vy Vertex array containing Y coordinates of the points of the aa-polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color Packed color; its four bytes in memory order are used as R, G, B, A.
+
+\returns Returns the status reported by aapolygonRGBA (true on success, false on failure).
+*/
+bool aapolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return aapolygonRGBA(renderer, vx, vy, n, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw anti-aliased polygon with alpha blending.
+
+Note: Every edge is drawn with _aalineRGBA (last argument 0), in vertex order,
+followed by a final edge that joins the last vertex to the first one. The
+return value is the AND of all individual line results, so the remaining
+edges are still attempted after one of them has failed.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the aa-polygon.
+\param vy Vertex array containing Y coordinates of the points of the aa-polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the aa-polygon to draw.
+\param g The green value of the aa-polygon to draw.
+\param b The blue value of the aa-polygon to draw.
+\param a The alpha value of the aa-polygon to draw.
+
+\returns Returns true on success, false on failure (NULL vertex array, fewer
+than 3 points, or at least one edge that could not be drawn).
+*/
+bool aapolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result = true;
+	int edge;
+
+	/*
+	* Vertex array NULL check
+	*/
+	if (vx == NULL) {
+		return (false);
+	}
+	if (vy == NULL) {
+		return (false);
+	}
+
+	/*
+	* Sanity check: at least 3 points are required
+	*/
+	if (n < 3) {
+		return (false);
+	}
+
+	/*
+	* Draw the n-1 edges between consecutive vertices
+	*/
+	for (edge = 1; edge < n; edge++) {
+		const Sint16 from_x = vx[edge - 1];
+		const Sint16 from_y = vy[edge - 1];
+		const Sint16 to_x = vx[edge];
+		const Sint16 to_y = vy[edge];
+
+		result &= _aalineRGBA(renderer, from_x, from_y, to_x, to_y, r, g, b, a, 0);
+	}
+
+	/*
+	* Close the outline with an edge from the last vertex back to vertex 0
+	*/
+	result &= _aalineRGBA(renderer, vx[n - 1], vy[n - 1], vx[0], vy[0], r, g, b, a, 0);
+
+	return (result);
+}
+
+/* ---- Filled Polygon */
+
+/*!
+\brief Internal qsort callback that orders two int values ascending.
+Note: Uses comparisons instead of a subtraction, which could overflow for operands far apart.
+\param a Pointer to the first int to compare.
+\param b Pointer to the second int to compare.
+\returns Returns 0 if *a==*b, a negative number if *a<*b or a positive number if *a>*b.
+*/
+int _gfxPrimitivesCompareInt(const void *a, const void *b)
+{
+	const int lhs = *(const int *) a, rhs = *(const int *) b;
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*!
+\brief Global scratch array used by filledPolygonRGBAMT and texturedPolygonMT when no caller-supplied cache is given.
+
+Note: Selected whenever polyInts or polyAllocated is NULL, i.e. the non-multithreaded (default) mode of those functions.
+*/
+static int *gfxPrimitivesPolyIntsGlobal = NULL;
+
+/*!
+\brief Number of ints allocated for the global scratch array; a value of 0 is treated as "not allocated yet".
+
+Note: Despite the name this stores an element count rather than a plain flag; it is read and written together with gfxPrimitivesPolyIntsGlobal.
+*/
+static int gfxPrimitivesPolyAllocatedGlobal = 0;
+
+/*!
+\brief Draw filled polygon with alpha blending (multi-threaded capable).
+
+Note: The last two parameters are optional, but are required for multithreaded operation; when either is NULL the file-level global cache is used.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the filled polygon to draw. 
+\param g The green value of the filled polygon to draw. 
+\param b The blue value of the filled polygon to draw. 
+\param a The alpha value of the filled polygon to draw.
+\param polyInts Pointer to the caller's temporary int array used for sorting intersections; it is allocated or grown here and written back. NULL selects the global cache.
+\param polyAllocated Pointer to the number of ints allocated for *polyInts (0 means not allocated yet); updated here. NULL selects the global cache.
+
+\returns Returns true on success, false on failure (NULL vertex array, fewer than 3 points, failed allocation, or a failed SDL or hline call on any scanline).
+*/
+int filledPolygonRGBAMT(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a, int **polyInts, int *polyAllocated)
+{
+	bool result;
+	int i;
+	int y, xa, xb;
+	int miny, maxy;
+	int x1, y1;
+	int x2, y2;
+	int ind1, ind2;
+	int ints;
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsNew = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+
+	/*
+	* Vertex array NULL check 
+	*/
+	if (vx == NULL) {
+		return (false);
+	}
+	if (vy == NULL) {
+		return (false);
+	}
+
+	/*
+	* Sanity check number of edges
+	*/
+	if (n < 3) {
+		return false;
+	}
+
+	/*
+	* Map polygon cache  
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array 
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsNew = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (!gfxPrimitivesPolyIntsNew) {
+				/*
+				* Realloc failed: the old block is still valid and still owned by
+				* the cache, so keep the cache as it is and fail only this call
+				*/
+				return (false);
+			}
+			gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsNew;
+			gfxPrimitivesPolyAllocated = n;
+		}
+	}
+
+	/*
+	* Check temp array (a failed malloc must not leave a non-zero size behind)
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/*
+	* Update cache variables
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/*
+	* Check temp array again
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(false);
+	}
+
+	/*
+	* Determine Y maxima 
+	*/
+	miny = vy[0];
+	maxy = vy[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+	}
+
+	/*
+	* Set color once; result now accumulates failures across all scanlines
+	*/
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw, scanning y 
+	*/
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				/* 16.16 fixed-point X of the edge on this scanline; 64-bit math keeps wide edges from overflowing int */
+				gfxPrimitivesPolyInts[ints++] = (int) ((((Sint64) 65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + ((Sint64) 65536 * x1));
+			} 	    
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result &= hline(renderer, xa, xb, y);
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw a filled polygon, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color Packed color; its four bytes in memory order are used as R, G, B, A.
+
+\returns Returns the status reported by filledPolygonRGBAMT when called with the global cache (true on success, false on failure).
+*/
+bool filledPolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return filledPolygonRGBAMT(renderer, vx, vy, n, channel[0], channel[1], channel[2], channel[3], NULL, NULL);
+}
+
+/*!
+\brief Draw a filled polygon using separate red, green, blue and alpha values.
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the filled polygon to draw.
+\param g The green value of the filled polygon to draw.
+\param b The blue value of the filled polygon to draw.
+\param a The alpha value of the filled polygon to draw.
+
+\returns Returns the status reported by filledPolygonRGBAMT when called with the global cache (true on success, false on failure).
+*/
+bool filledPolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool drawn = filledPolygonRGBAMT(renderer, vx, vy, n, r, g, b, a, NULL, NULL);
+	return drawn;
+}
+
+/* ---- Textured Polygon */
+
+/*!
+\brief Internal function to draw a textured horizontal line, repeating the texture row as often as needed.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param texture The texture to retrieve color information from.
+\param texture_w The width of the texture. Must be greater than 0.
+\param texture_h The height of the texture. Must be greater than 0.
+\param texture_dx The X offset for the texture lookup.
+\param texture_dy The Y offset for the texture lookup.
+
+\returns Returns true if every SDL_RenderTexture call succeeded, false on failure or if the texture size is not positive.
+*/
+bool _HLineTextured(SDL_Renderer *renderer, Sint16 x1, Sint16 x2, Sint16 y, SDL_Texture *texture, int texture_w, int texture_h, int texture_dx, int texture_dy)
+{
+	int w;
+	Sint16 xtmp;
+	bool result = true;
+	int texture_x_walker;    
+	int texture_y_start;    
+	SDL_FRect source_rect,dst_rect;
+	int pixels_written,write_width;
+
+	/* A texture without area cannot be tiled (and would divide by zero below) */
+	if ((texture_w <= 0) || (texture_h <= 0)) {
+		return (false);
+	}
+
+	/*
+	* Swap x1, x2 if required to ensure x1<=x2
+	*/
+	if (x1 > x2) {
+		xtmp = x1;
+		x1 = x2;
+		x2 = xtmp;
+	}
+
+	/*
+	* Calculate width to draw (kept as int so the widest Sint16 span cannot overflow)
+	*/
+	w = x2 - x1 + 1;
+
+	/*
+	* Determine where in the texture we start drawing
+	*/
+	texture_x_walker =   (x1 - texture_dx)  % texture_w;
+	if (texture_x_walker < 0){
+		texture_x_walker = texture_w + texture_x_walker ;
+	}
+
+	texture_y_start = (y + texture_dy) % texture_h;
+	if (texture_y_start < 0){
+		texture_y_start = texture_h + texture_y_start;
+	}
+
+	/* setup the source rectangle; we are only drawing one horizontal line */
+	source_rect.y = texture_y_start;
+	source_rect.x = texture_x_walker;
+	source_rect.h = 1;
+
+	/* we will draw to the current y */
+	dst_rect.y = y;
+	dst_rect.h = 1;
+
+	/* if enough pixels are left in the current texture row, draw it all at once */
+	if (w <= texture_w -texture_x_walker){
+		source_rect.w = w;
+		dst_rect.x= x1;
+		dst_rect.w = source_rect.w;
+		result &= SDL_RenderTexture(renderer, texture, &source_rect, &dst_rect);
+	} else { 
+		/* we need to draw multiple times; the first segment runs to the end of the texture row */
+		pixels_written = texture_w  - texture_x_walker;
+		source_rect.w = pixels_written;
+		dst_rect.x= x1;
+		dst_rect.w = source_rect.w;
+		result &= SDL_RenderTexture(renderer, texture, &source_rect, &dst_rect);
+		write_width = texture_w;
+
+		/* now draw the rest, every further segment starting at source x 0 */
+		source_rect.x = 0;
+		while (pixels_written < w){
+			if (write_width >= w - pixels_written) {
+				write_width =  w - pixels_written;
+			}
+			source_rect.w = write_width;
+			dst_rect.x = x1 + pixels_written;
+			dst_rect.w = source_rect.w;
+			result &= SDL_RenderTexture(renderer, texture, &source_rect, &dst_rect);
+			pixels_written += write_width;
+		}
+	}
+
+	return result;
+}
+
+/*!
+\brief Draws a polygon filled with the given texture (Multi-Threading Capable). 
+
+\param renderer The renderer to draw on.
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the number of vectors in the vx and vy arrays (minimum 3)
+\param texture the SDL surface to use to fill the polygon; converted to a temporary texture for this call
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels 
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+\param polyInts Preallocated temp array storage for vertex sorting (used for multi-threaded operation); NULL selects the global cache
+\param polyAllocated Number of ints allocated for the temp array (used for multi-threaded operation); NULL selects the global cache
+
+\returns Returns true on success, false on failure (NULL argument, fewer than 3 points, failed allocation, failed texture creation or a failed line draw).
+*/
+bool texturedPolygonMT(SDL_Renderer *renderer, const Sint16 * vx, const Sint16 * vy, int n, 
+	SDL_Surface * texture, int texture_dx, int texture_dy, int **polyInts, int *polyAllocated)
+{
+	bool result;
+	int i;
+	int y, xa, xb;
+	int miny, maxy;
+	int x1, y1;
+	int x2, y2;
+	int ind1, ind2;
+	int ints;
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsTemp = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+	SDL_Texture *textureAsTexture = NULL;
+
+	/*
+	* Vertex array and texture NULL check
+	*/
+	if ((vx == NULL) || (vy == NULL) || (texture == NULL)) {
+		return (false);
+	}
+
+	/*
+	* Sanity check number of edges
+	*/
+	if (n < 3) {
+		return false;
+	}
+
+	/*
+	* Map polygon cache  
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array 
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsTemp = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (gfxPrimitivesPolyIntsTemp == NULL) {
+				/* Realloc failed - keeps original memory block, but fails this operation */
+				return(false);
+			}
+			gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsTemp;
+			gfxPrimitivesPolyAllocated = n;
+		}
+	}
+
+	/*
+	* Check temp array
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/*
+	* Update cache variables
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/*
+	* Check temp array again
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(false);
+	}
+
+	/* Determine Y minima,maxima (only the Y extent drives the scanline loop) */
+	miny = vy[0];
+	maxy = vy[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+	}
+
+	/* Create a renderer texture from the surface for this call only; */
+	/* it is destroyed again before returning */
+	textureAsTexture = SDL_CreateTextureFromSurface(renderer, texture);
+	if (textureAsTexture == NULL)
+	{
+		return false;
+	}
+	SDL_SetTextureBlendMode(textureAsTexture, SDL_BLENDMODE_BLEND);
+
+	/*
+	* Draw, scanning y 
+	*/
+	result = true;
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				/* 16.16 fixed-point X of the edge on this scanline; 64-bit math keeps wide edges from overflowing int */
+				gfxPrimitivesPolyInts[ints++] = (int) ((((Sint64) 65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + ((Sint64) 65536 * x1));
+			} 
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result &= _HLineTextured(renderer, xa, xb, y, textureAsTexture, texture->w, texture->h, texture_dx, texture_dy);
+		}
+	}
+
+	SDL_DestroyTexture(textureAsTexture);
+
+	return (result);
+}
+
+/*!
+\brief Draws a polygon filled with the given texture.
+
+This standard version calls the multithreaded version (texturedPolygonMT) with NULL cache parameters.
+
+\param renderer The renderer to draw on.
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the number of vectors in the vx and vy arrays
+\param texture the SDL surface to use to fill the polygon
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+
+\returns Returns true on success, false on failure.
+*/
+bool texturedPolygon(SDL_Renderer *renderer, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface *texture, int texture_dx, int texture_dy)
+{
+	/*
+	* Draw; the NULL cache pointers make texturedPolygonMT use its global intersection buffer
+	*/
+	return (texturedPolygonMT(renderer, vx, vy, n, texture, texture_dx, texture_dy, NULL, NULL));
+}
+
+/* ---- Character */
+
+/*!
+\brief Global cache of per-character textures, indexed by character code and filled on first draw.
+*/
+static SDL_Texture *gfxPrimitivesFont[256];
+
+/*!
+\brief Pointer to the current font data. Default is an 8x8 pixel internal font.
+*/
+static const unsigned char *currentFontdata = gfxPrimitivesFontdata;
+
+/*!
+\brief Width of the current font in pixels. Default is 8.
+*/
+static Uint32 charWidth = 8;
+
+/*!
+\brief Height of the current font in pixels. Default is 8.
+*/
+static Uint32 charHeight = 8;
+
+/*!
+\brief Width for rendering; equals charHeight when the rotation is 1 or 3, charWidth otherwise.
+*/
+static Uint32 charWidthLocal = 8;
+
+/*!
+\brief Height for rendering; equals charWidth when the rotation is 1 or 3, charHeight otherwise.
+*/
+static Uint32 charHeightLocal = 8;
+
+/*!
+\brief Pitch (bytes per glyph row) of the current font, (charWidth+7)/8. Default is 1.
+*/
+static Uint32 charPitch = 1;
+
+/*!
+\brief Number of 90deg clockwise rotation steps applied to characters. Default is 0. Max is 3.
+*/
+static Uint32 charRotation = 0;
+
+/*!
+\brief Character data size in bytes of the current font, charPitch * charHeight. Default is 8.
+*/
+static Uint32 charSize = 8;
+
+/*!
+\brief Sets or resets the current global font data.
+
+The font data array is organized as follows:
+[fontdata] = [character 0][character 1]...[character 255] where
+[character n] = [byte 1 row 1][byte 2 row 1]...[byte {pitch} row 1][byte 1 row 2] ...[byte {pitch} row height] where
+[byte n] = [bit 7]...[bit 0] (characterRGBA consumes each byte starting at mask 0x80) where
+[bit n] = [0 for transparent pixel|1 for colored pixel]
+
+\param fontdata Pointer to array of font data; the pointer is stored, not copied. NULL resets to the default 8x8 font.
+\param cw Width of character in pixels ({pitch} = (cw+7)/8 bytes per row). If 0, the default font is used.
+\param ch Height of character in pixels (rows). If 0, the default font is used.
+*/
+void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch)
+{
+	int i;
+
+	if ((fontdata) && (cw) && (ch)) {
+		currentFontdata = (unsigned char *)fontdata;
+		charWidth = cw;
+		charHeight = ch;
+	} else {
+		currentFontdata = gfxPrimitivesFontdata;
+		charWidth = 8;
+		charHeight = 8;
+	}
+
+	charPitch = (charWidth+7)/8; /* bytes per glyph row, rounded up */
+	charSize = charPitch * charHeight; /* bytes per character */
+
+	/* Maybe flip width/height for rendering (odd rotations swap the two) */
+	if ((charRotation==1) || (charRotation==3))
+	{
+		charWidthLocal = charHeight;
+		charHeightLocal = charWidth;
+	}
+	else
+	{
+		charWidthLocal = charWidth;
+		charHeightLocal = charHeight;
+	}
+
+	/* Clear character cache so glyphs are rebuilt from the new font on their next draw */
+	for (i = 0; i < 256; i++) {
+		if (gfxPrimitivesFont[i]) {
+			SDL_DestroyTexture(gfxPrimitivesFont[i]);
+			gfxPrimitivesFont[i] = NULL;
+		}
+	}
+}
+
+/*!
+\brief Sets current global font character rotation steps.
+
+Default is 0 (no rotation). 1 = 90deg clockwise. 2 = 180deg clockwise. 3 = 270deg clockwise.
+Changing the rotation will reset the character cache; setting the current rotation again does nothing.
+
+\param rotation Number of 90deg clockwise steps to rotate; only the low two bits are used.
+*/
+void gfxPrimitivesSetFontRotation(Uint32 rotation)
+{
+	int i;
+
+	rotation = rotation & 3; /* reduce to 0..3 */
+	if (charRotation != rotation)
+	{
+		/* Store rotation */
+		charRotation = rotation;
+
+		/* Maybe flip width/height for rendering (odd rotations swap the two) */
+		if ((charRotation==1) || (charRotation==3))
+		{
+			charWidthLocal = charHeight;
+			charHeightLocal = charWidth;
+		}
+		else
+		{
+			charWidthLocal = charWidth;
+			charHeightLocal = charHeight;
+		}
+
+		/* Clear character cache so glyphs are rebuilt with the new rotation */
+		for (i = 0; i < 256; i++) {
+			if (gfxPrimitivesFont[i]) {
+				SDL_DestroyTexture(gfxPrimitivesFont[i]);
+				gfxPrimitivesFont[i] = NULL;
+			}
+		}
+	}
+}
+
+/*!
+\brief Draw a character of the currently set font.
+
+\param renderer The Renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw; converted to unsigned char to index the font data and the cache.
+\param r The red value of the character to draw.
+\param g The green value of the character to draw.
+\param b The blue value of the character to draw.
+\param a The alpha value of the character to draw.
+
+\returns Returns true on success, false if the glyph surface or texture cannot be created or an SDL call fails.
+*/
+bool characterRGBA(SDL_Renderer *renderer, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	SDL_FRect srect;
+	SDL_FRect drect;
+	bool result;
+	Uint32 ix, iy;
+	const unsigned char *charpos;
+	Uint8 *curpos;
+	Uint8 patt, mask;
+	Uint8 *linepos;
+	Uint32 pitch;
+	SDL_Surface *character;
+	SDL_Surface *rotatedCharacter;
+	Uint32 ci;
+
+	/*
+	* Setup source rectangle (the whole cached glyph, in rendering dimensions)
+	*/
+	srect.x = 0;
+	srect.y = 0;
+	srect.w = charWidthLocal;
+	srect.h = charHeightLocal;
+
+	/*
+	* Setup destination rectangle (same size as the source, so no scaling)
+	*/
+	drect.x = x;
+	drect.y = y;
+	drect.w = charWidthLocal;
+	drect.h = charHeightLocal;
+
+	/* Character index in cache; the cast keeps the index in 0..255 even where char is signed */
+	ci = (unsigned char) c;
+
+	/*
+	* Create new charWidth x charHeight bitmap surface if not already present.
+	* It is rotated below when charRotation is non-zero, then cached as a texture.
+	*/
+	if (gfxPrimitivesFont[ci] == NULL) {
+		/*
+		* Redraw character into surface (4 bytes per pixel)
+		*/
+		character =	SDL_CreateSurface(
+			charWidth, charHeight, SDL_PIXELFORMAT_RGBA8888);
+		if (character == NULL) {
+			return (false);
+		}
+
+		charpos = currentFontdata + ci * charSize; /* first pattern byte of this glyph */
+				linepos = (Uint8 *)character->pixels;
+		pitch = character->pitch;
+
+		/*
+		* Drawing loop: one pattern bit per pixel, most significant bit first
+		*/
+		patt = 0;
+		for (iy = 0; iy < charHeight; iy++) {
+			mask = 0x00; /* forces a fresh pattern byte at the start of every row */
+			curpos = linepos;
+			for (ix = 0; ix < charWidth; ix++) {
+				if (!(mask >>= 1)) { /* mask exhausted: fetch next byte, restart at bit 7 */
+					patt = *charpos++;
+					mask = 0x80;
+				}
+				if (patt & mask) {
+					*(Uint32 *)curpos = 0xffffffff; /* set pixel: all channels at maximum */
+				} else {
+					*(Uint32 *)curpos = 0; /* clear pixel: all channels zero */
+				}
+				curpos += 4;
+			}
+			linepos += pitch;
+		}
+
+		/* Maybe rotate; the unrotated surface is released and replaced by the rotated one */
+		if (charRotation>0)
+		{
+			rotatedCharacter = rotateSurface90Degrees(character, charRotation);
+			SDL_DestroySurface(character);
+			character = rotatedCharacter;
+		}
+
+		/* Convert temp surface into texture. NOTE(review): cache is keyed by character only, not by renderer - confirm single-renderer use */
+		gfxPrimitivesFont[ci] = SDL_CreateTextureFromSurface(renderer, character);
+		SDL_DestroySurface(character);
+
+		/*
+		* Check pointer; the cache slot stays NULL, so the next call retries
+		*/
+		if (gfxPrimitivesFont[ci] == NULL) {
+			return (false);
+		}
+	}
+
+	/*
+	* Set color and alpha modulation on the cached texture for this draw
+	*/
+	result = true;
+	result &= SDL_SetTextureColorMod(gfxPrimitivesFont[ci], r, g, b);
+	result &= SDL_SetTextureAlphaMod(gfxPrimitivesFont[ci], a);
+
+	/*
+	* Draw texture onto destination
+	*/
+	result &= SDL_RenderTexture(renderer, gfxPrimitivesFont[ci], &srect, &drect);
+
+	return (result);
+}
+
+
+/*!
+\brief Draw a character of the currently set font.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw.
+\param color The color of the character. Its bytes are read in memory order as R,G,B,A (0xRRGGBBAA on big-endian, 0xAABBGGRR on little-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool characterColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint32 color)
+{
+	Uint8 *co = (Uint8 *)&color; /* view the 32-bit color as four bytes in memory order */
+	return characterRGBA(renderer, x, y, c, co[0], co[1], co[2], co[3]);
+}
+
+
+/*!
+\brief Draw a string in the currently set font.
+
+The spacing between consecutive characters in the string is the fixed number of pixels
+of the character width of the current global font.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The string to draw.
+\param color The color of the string. Its bytes are read in memory order as R,G,B,A (0xRRGGBBAA on big-endian, 0xAABBGGRR on little-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool stringColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; /* view the 32-bit color as four bytes in memory order */
+	return stringRGBA(renderer, x, y, s, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw a string in the currently set font.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The string to draw (NUL-terminated).
+\param r The red value of the string to draw.
+\param g The green value of the string to draw.
+\param b The blue value of the string to draw.
+\param a The alpha value of the string to draw.
+
+\returns Returns true on success, false on failure; drawing stops at the first character that fails.
+*/
+bool stringRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result = true;
+	Sint16 curx = x;
+	Sint16 cury = y;
+	const char *curchar = s;
+
+	while (*curchar && result) {
+		result &= characterRGBA(renderer, curx, cury, *curchar, r, g, b, a);
+		switch (charRotation) /* advance the pen in the direction given by the font rotation */
+		{
+		case 0:
+			curx += charWidthLocal; /* no rotation: advance right */
+			break;
+		case 2:
+			curx -= charWidthLocal; /* 180deg: advance left */
+			break;
+		case 1:
+			cury += charHeightLocal; /* 90deg: advance down */
+			break;
+		case 3:
+			cury -= charHeightLocal; /* 270deg: advance up */
+			break;
+		}
+		curchar++;
+	}
+
+	return (result);
+}
+
+/* ---- Bezier curve */
+
+/*!
+\brief Internal function to calculate bezier interpolator of data array with ndata values at position 't'.
+
+\param data Array of values (control points along one axis).
+\param ndata Size of array.
+\param t Position for which to calculate interpolated value. t should be between [0, ndata].
+
+\returns Interpolated value at position t, data[0] when t<0, data[ndata-1] when t>=ndata.
+*/
+double _evaluateBezier (double *data, int ndata, double t) 
+{
+	double mu, result;
+	int n,k,kn,nn,nkn;
+	double blend,muk,munk;
+
+	/* Sanity check bounds: clamp to the first/last value outside [0, ndata) */
+	if (t<0.0) {
+		return(data[0]);
+	}
+	if (t>=(double)ndata) {
+		return(data[ndata-1]);
+	}
+
+	/* Adjust t to the range 0.0 to 1.0 (mu < 1 here because t < ndata) */
+	mu=t/(double)ndata;
+
+	/* Calculate interpolate: sum over k of data[k] * C(n,k) * mu^k * (1-mu)^(n-k) */
+	n=ndata-1;
+	result=0.0;
+	muk = 1; /* mu^k, starts at k = 0 */
+	munk = pow(1-mu,(double)n); /* (1-mu)^(n-k), starts at k = 0 */
+	for (k=0;k<=n;k++) {
+		nn = n;
+		kn = k;
+		nkn = n - k;
+		blend = muk * munk;
+		muk *= mu; /* advance to mu^(k+1) for the next term */
+		munk /= (1-mu); /* advance to (1-mu)^(n-k-1) for the next term */
+		while (nn >= 1) { /* multiply in n!/(k!(n-k)!) one factor at a time */
+			blend *= nn;
+			nn--;
+			if (kn > 1) {
+				blend /= (double)kn;
+				kn--;
+			}
+			if (nkn > 1) {
+				blend /= (double)nkn;
+				nkn--;
+			}
+		}
+		result += data[k] * blend;
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw a bezier curve with alpha blending.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param color The color of the curve. Its bytes are read in memory order as R,G,B,A (0xRRGGBBAA on big-endian, 0xAABBGGRR on little-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool bezierColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; /* view the 32-bit color as four bytes in memory order */
+	return bezierRGBA(renderer, vx, vy, n, s, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw a bezier curve with alpha blending.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param r The red value of the bezier curve to draw.
+\param g The green value of the bezier curve to draw.
+\param b The blue value of the bezier curve to draw.
+\param a The alpha value of the bezier curve to draw.
+
+\returns Returns true on success, false on failure (n < 3, s < 2, allocation failure or a failed draw call).
+*/
+bool bezierRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	bool result;
+	int i;
+	double *x, *y, t, stepsize;
+	Sint16 x1, y1, x2, y2;
+
+	/*
+	* Sanity check
+	*/
+	if (n < 3) {
+		return (false);
+	}
+	if (s < 2) {
+		return (false);
+	}
+
+	/*
+	* Variable setup: t advances by 1/s per drawn segment
+	*/
+	stepsize=(double)1.0/(double)s;
+
+	/* Transfer vertices into double arrays with one extra slot for a copy of the first point */
+	if ((x=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		return(false);
+	}
+	if ((y=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		free(x);
+		return(false);
+	}    
+	for (i=0; i<n; i++) {
+		x[i]=(double)vx[i];
+		y[i]=(double)vy[i];
+	}      
+	x[n]=(double)vx[0];
+	y[n]=(double)vy[0];
+
+	/*
+	* Set color; blending is only enabled when the color is not fully opaque
+	*/
+	result = true;
+	result &= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result &= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw the curve as n*s+1 straight segments; past t = n the evaluator returns the last vertex
+	*/
+	t=0.0;
+	x1=(Sint16)lrint(_evaluateBezier(x,n+1,t));
+	y1=(Sint16)lrint(_evaluateBezier(y,n+1,t));
+	for (i = 0; i <= (n*s); i++) {
+		t += stepsize;
+		x2=(Sint16)_evaluateBezier(x,n,t); /* NOTE(review): ndata is n here but n+1 for the start point, and this truncates where the start point rounds - confirm intent */
+		y2=(Sint16)_evaluateBezier(y,n,t);
+		result &= line(renderer, x1, y1, x2, y2);
+		x1 = x2;
+		y1 = y2;
+	}
+
+	/* Clean up temporary array */
+	free(x);
+	free(y);
+
+	return (result);
+}
+
+
+/*!
+\brief Draw a thick line with alpha blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param color The color of the line. Its bytes are read in memory order as R,G,B,A (0xRRGGBBAA on big-endian, 0xAABBGGRR on little-endian hosts).
+
+\returns Returns true on success, false on failure.
+*/
+bool thickLineColor(SDL_Renderer *renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint32 color)
+{	
+	Uint8 *c = (Uint8 *)&color; /* view the 32-bit color as four bytes in memory order */
+	return thickLineRGBA(renderer, x1, y1, x2, y2, width, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw a thick line with alpha blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param r The red value of the line to draw.
+\param g The green value of the line to draw.
+\param b The blue value of the line to draw.
+\param a The alpha value of the line to draw.
+
+\returns Returns true on success, false on failure (including a NULL renderer or a width of 0).
+*/	
+bool thickLineRGBA(SDL_Renderer *renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int wh;
+	double dx, dy, dx1, dy1, dx2, dy2;
+	double l, wl2, nx, ny, ang, adj;
+	Sint16 px[4], py[4];
+
+	if (renderer == NULL) {
+		return false;
+	}
+
+	if (width < 1) {
+		return false;
+	}
+
+	/* Special case: thick "point" (both endpoints coincide), drawn as a filled box */
+	if ((x1 == x2) && (y1 == y2)) {
+		wh = width / 2;
+		return boxRGBA(renderer, x1 - wh, y1 - wh, x2 + width, y2 + width, r, g, b, a);		/* NOTE(review): near corner uses -width/2, far corner +width - confirm intended extent */
+	}
+
+	/* Special case: width == 1, an ordinary line */
+	if (width == 1) {
+		return lineRGBA(renderer, x1, y1, x2, y2, r, g, b, a);		
+	}
+
+	/* Calculate offsets for sides; l is non-zero because coincident endpoints returned above */
+	dx = (double)(x2 - x1);
+	dy = (double)(y2 - y1);
+	l = SDL_sqrt(dx*dx + dy*dy);
+	ang = SDL_atan2(dx, dy); /* angle measured from the y axis; only |cos(2*ang)| is used */
+	adj = 0.1 + 0.9 * SDL_fabs(SDL_cos(2.0 * ang)); /* 1.0 for axis-aligned lines, down to 0.1 at 45 degrees */
+	wl2 = ((double)width - adj)/(2.0 * l); /* half of the adjusted width, relative to the line length */
+	nx = dx * wl2; /* (nx, ny) points along the line with length (width - adj)/2 */
+	ny = dy * wl2;
+
+	/* Build polygon: each endpoint is offset by +/-(ny, -nx), i.e. perpendicular to the line */
+	dx1 = (double)x1;
+	dy1 = (double)y1;
+	dx2 = (double)x2;
+	dy2 = (double)y2;
+	px[0] = (Sint16)(dx1 + ny);
+	px[1] = (Sint16)(dx1 - ny);
+	px[2] = (Sint16)(dx2 - ny);
+	px[3] = (Sint16)(dx2 + ny);
+	py[0] = (Sint16)(dy1 - nx);
+	py[1] = (Sint16)(dy1 + nx);
+	py[2] = (Sint16)(dy2 + nx);
+	py[3] = (Sint16)(dy2 - nx);
+
+	/* Draw polygon (the four corners form the thick line's outline) */
+	return filledPolygonRGBA(renderer, px, py, 4, r, g, b, a);
+}
diff --git a/vendor/SDL3_gfx/SDL3_gfxPrimitives.h b/vendor/SDL3_gfx/SDL3_gfxPrimitives.h
new file mode 100644
index 0000000..2d13b62
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_gfxPrimitives.h
@@ -0,0 +1,241 @@
+/* 
+
+SDL3_gfxPrimitives.h: graphics primitives for SDL
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL3_gfxPrimitives_h
+#define _SDL3_gfxPrimitives_h
+
+#include <math.h>
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#include <SDL3/SDL.h>
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ----- Versioning */
+
+#define SDL3_GFXPRIMITIVES_MAJOR	1
+#define SDL3_GFXPRIMITIVES_MINOR	0
+#define SDL3_GFXPRIMITIVES_MICRO	0
+
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL3_GFX_DLL_IMPORT)
+#    define SDL3_GFXPRIMITIVES_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL3_GFX_DLL_IMPORT
+#      define SDL3_GFXPRIMITIVES_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL3_GFXPRIMITIVES_SCOPE
+#  define SDL3_GFXPRIMITIVES_SCOPE extern
+#endif
+
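+	/* Note: ___Color routines read the color's bytes in memory order as R,G,B,A, i.e. 0xRRGGBBAA on big-endian and 0xAABBGGRR on little-endian hosts (seen in characterColor, stringColor, bezierColor, thickLineColor; TODO confirm for the rest) */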
+
+	/* Pixel */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool pixelColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool pixelRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Horizontal line */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool hlineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool hlineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Vertical line */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool vlineColor(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool vlineRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rectangle */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool rectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool rectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Rectangle */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool roundedRectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool roundedRectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled rectangle (Box) */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool boxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool boxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Filled rectangle (Box) */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool roundedBoxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool roundedBoxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Line */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool lineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool lineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Line */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool aalineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Thick Line */
+	SDL3_GFXPRIMITIVES_SCOPE bool thickLineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2,
+		Uint8 width, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool thickLineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2,
+		Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Circle */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool circleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool circleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Arc */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool arcColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool arcRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Circle */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool aacircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool aacircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Circle */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool filledCircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 r, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool filledCircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ellipse */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool ellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Ellipse */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool aaellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool aaellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Ellipse */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool filledEllipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool filledEllipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Pie */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool pieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Pie */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool filledPieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool filledPieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Trigon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool trigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool trigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Trigon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool aatrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool aatrigonRGBA(SDL_Renderer * renderer,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Trigon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool filledTrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool filledTrigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Polygon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool polygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool polygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Polygon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool aapolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool aapolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Polygon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool filledPolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool filledPolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx,
+		const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Textured Polygon */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool texturedPolygon(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface * texture,int texture_dx,int texture_dy);
+
+	/* Bezier */
+
+	SDL3_GFXPRIMITIVES_SCOPE bool bezierColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool bezierRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Characters/Strings */
+
+	SDL3_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch);
+	SDL3_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFontRotation(Uint32 rotation);
+	SDL3_GFXPRIMITIVES_SCOPE bool characterColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool characterRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+	SDL3_GFXPRIMITIVES_SCOPE bool stringColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint32 color);
+	SDL3_GFXPRIMITIVES_SCOPE bool stringRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL3_gfxPrimitives_h */
diff --git a/vendor/SDL3_gfx/SDL3_gfxPrimitives_font.h b/vendor/SDL3_gfx/SDL3_gfxPrimitives_font.h
new file mode 100644
index 0000000..41cb552
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_gfxPrimitives_font.h
@@ -0,0 +1,3106 @@
+/*
+
+SDL3_gfxPrimitives_font.h: 8x8 font definition
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#define GFX_FONTDATAMAX (8*256)	/* byte size of the font table declared below: 8 bytes per glyph (one byte per 8-pixel row) x 256 glyph slots */
+
+static unsigned char gfxPrimitivesFontdata[GFX_FONTDATAMAX] = {
+
+	/*
+	* 0 0x00 '^@' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 1 0x01 '^A' 
+	*/
+	0x7e,			/* 01111110 */
+	0x81,			/* 10000001 */
+	0xa5,			/* 10100101 */
+	0x81,			/* 10000001 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0x81,			/* 10000001 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 2 0x02 '^B' 
+	*/
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xdb,			/* 11011011 */
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 3 0x03 '^C' 
+	*/
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 4 0x04 '^D' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 5 0x05 '^E' 
+	*/
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 6 0x06 '^F' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 7 0x07 '^G' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 8 0x08 '^H' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xe7,			/* 11100111 */
+	0xc3,			/* 11000011 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 9 0x09 '^I' 
+	*/
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x42,			/* 01000010 */
+	0x42,			/* 01000010 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 10 0x0a '^J' 
+	*/
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0x99,			/* 10011001 */
+	0xbd,			/* 10111101 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0xc3,			/* 11000011 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 11 0x0b '^K' 
+	*/
+	0x0f,			/* 00001111 */
+	0x07,			/* 00000111 */
+	0x0f,			/* 00001111 */
+	0x7d,			/* 01111101 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 12 0x0c '^L' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 13 0x0d '^M' 
+	*/
+	0x3f,			/* 00111111 */
+	0x33,			/* 00110011 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x70,			/* 01110000 */
+	0xf0,			/* 11110000 */
+	0xe0,			/* 11100000 */
+
+	/*
+	* 14 0x0e '^N' 
+	*/
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x63,			/* 01100011 */
+	0x67,			/* 01100111 */
+	0xe6,			/* 11100110 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 15 0x0f '^O' 
+	*/
+	0x18,			/* 00011000 */
+	0xdb,			/* 11011011 */
+	0x3c,			/* 00111100 */
+	0xe7,			/* 11100111 */
+	0xe7,			/* 11100111 */
+	0x3c,			/* 00111100 */
+	0xdb,			/* 11011011 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 16 0x10 '^P' 
+	*/
+	0x80,			/* 10000000 */
+	0xe0,			/* 11100000 */
+	0xf8,			/* 11111000 */
+	0xfe,			/* 11111110 */
+	0xf8,			/* 11111000 */
+	0xe0,			/* 11100000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 17 0x11 '^Q' 
+	*/
+	0x02,			/* 00000010 */
+	0x0e,			/* 00001110 */
+	0x3e,			/* 00111110 */
+	0xfe,			/* 11111110 */
+	0x3e,			/* 00111110 */
+	0x0e,			/* 00001110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 18 0x12 '^R' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 19 0x13 '^S' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 20 0x14 '^T' 
+	*/
+	0x7f,			/* 01111111 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7b,			/* 01111011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 21 0x15 '^U' 
+	*/
+	0x3e,			/* 00111110 */
+	0x61,			/* 01100001 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x86,			/* 10000110 */
+	0x7c,			/* 01111100 */
+
+	/*
+	* 22 0x16 '^V' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 23 0x17 '^W' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 24 0x18 '^X' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 25 0x19 '^Y' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 26 0x1a '^Z' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 27 0x1b '^[' 
+	*/
+	0x00,			/* 00000000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xfe,			/* 11111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 28 0x1c '^\' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 29 0x1d '^]' 
+	*/
+	0x00,			/* 00000000 */
+	0x24,			/* 00100100 */
+	0x66,			/* 01100110 */
+	0xff,			/* 11111111 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 30 0x1e '^^' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 31 0x1f '^_' 
+	*/
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 32 0x20 ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 33 0x21 '!' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 34 0x22 '"' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 35 0x23 '#' 
+	*/
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 36 0x24 '$' 
+	*/
+	0x18,			/* 00011000 */
+	0x3e,			/* 00111110 */
+	0x60,			/* 01100000 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0x7c,			/* 01111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 37 0x25 '%' 
+	*/
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 38 0x26 '&' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 39 0x27 ''' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 40 0x28 '(' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 41 0x29 ')' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 42 0x2a '*' 
+	*/
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0xff,			/* 11111111 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 43 0x2b '+' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 44 0x2c ',' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 45 0x2d '-' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 46 0x2e '.' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 47 0x2f '/' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 48 0x30 '0' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 49 0x31 '1' 
+	*/
+	0x18,			/* 00011000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 50 0x32 '2' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x1c,			/* 00011100 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 51 0x33 '3' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 52 0x34 '4' 
+	*/
+	0x1c,			/* 00011100 */
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 53 0x35 '5' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 54 0x36 '6' 
+	*/
+	0x38,			/* 00111000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 55 0x37 '7' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 56 0x38 '8' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 57 0x39 '9' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 58 0x3a ':' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 59 0x3b ';' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 60 0x3c '<' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 61 0x3d '=' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 62 0x3e '>' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 63 0x3f '?' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 64 0x40 '@' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xc0,			/* 11000000 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 65 0x41 'A' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 66 0x42 'B' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 67 0x43 'C' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 68 0x44 'D' 
+	*/
+	0xf8,			/* 11111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 69 0x45 'E' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x62,			/* 01100010 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 70 0x46 'F' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 71 0x47 'G' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xce,			/* 11001110 */
+	0x66,			/* 01100110 */
+	0x3a,			/* 00111010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 72 0x48 'H' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 73 0x49 'I' 
+	*/
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 74 0x4a 'J' 
+	*/
+	0x1e,			/* 00011110 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 75 0x4b 'K' 
+	*/
+	0xe6,			/* 11100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 76 0x4c 'L' 
+	*/
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x62,			/* 01100010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 77 0x4d 'M' 
+	*/
+	0xc6,			/* 11000110 */
+	0xee,			/* 11101110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 78 0x4e 'N' 
+	*/
+	0xc6,			/* 11000110 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 79 0x4f 'O' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 80 0x50 'P' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 81 0x51 'Q' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xce,			/* 11001110 */
+	0x7c,			/* 01111100 */
+	0x0e,			/* 00001110 */
+
+	/*
+	* 82 0x52 'R' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 83 0x53 'S' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 84 0x54 'T' 
+	*/
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x5a,			/* 01011010 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 85 0x55 'U' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 86 0x56 'V' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 87 0x57 'W' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 88 0x58 'X' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 89 0x59 'Y' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 90 0x5a 'Z' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x8c,			/* 10001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 91 0x5b '[' 
+	*/
+	0x3c,			/* 00111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 92 0x5c '\' 
+	*/
+	0xc0,			/* 11000000 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 93 0x5d ']' 
+	*/
+	0x3c,			/* 00111100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 94 0x5e '^' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 95 0x5f '_' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 96 0x60 '`' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 97 0x61 'a' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 98 0x62 'b' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 99 0x63 'c' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 100 0x64 'd' 
+	*/
+	0x1c,			/* 00011100 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 101 0x65 'e' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 102 0x66 'f' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x60,			/* 01100000 */
+	0xf8,			/* 11111000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 103 0x67 'g' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0xf8,			/* 11111000 */
+
+	/*
+	* 104 0x68 'h' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x6c,			/* 01101100 */
+	0x76,			/* 01110110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 105 0x69 'i' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 106 0x6a 'j' 
+	*/
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+
+	/*
+	* 107 0x6b 'k' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 108 0x6c 'l' 
+	*/
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 109 0x6d 'm' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xec,			/* 11101100 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 110 0x6e 'n' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 111 0x6f 'o' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 112 0x70 'p' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 113 0x71 'q' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+
+	/*
+	* 114 0x72 'r' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 115 0x73 's' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 116 0x74 't' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0xfc,			/* 11111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x36,			/* 00110110 */
+	0x1c,			/* 00011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 117 0x75 'u' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 118 0x76 'v' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 119 0x77 'w' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 120 0x78 'x' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 121 0x79 'y' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 122 0x7a 'z' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x4c,			/* 01001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 123 0x7b '{' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 124 0x7c '|' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 125 0x7d '}' 
+	*/
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 126 0x7e '~' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 127 0x7f '^?' 
+	*/
+	0x00,			/* 00000000 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 128 0x80 'Ç' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 129 0x81 'ü' 
+	*/
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 130 0x82 'é' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 131 0x83 'â' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 132 0x84 'ä' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 133 0x85 'à' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 134 0x86 'å' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 135 0x87 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x0c,			/* 00001100 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 136 0x88 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 137 0x89 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 138 0x8a '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 139 0x8b '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 140 0x8c '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 141 0x8d '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 142 0x8e '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 143 0x8f '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 144 0x90 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xf8,			/* 11111000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 145 0x91 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 146 0x92 '�' 
+	*/
+	0x3e,			/* 00111110 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 147 0x93 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 148 0x94 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 149 0x95 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 150 0x96 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x84,			/* 10000100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 151 0x97 '�' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 152 0x98 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 153 0x99 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 154 0x9a '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 155 0x9b '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 156 0x9c '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x64,			/* 01100100 */
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 157 0x9d '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 158 0x9e '�' 
+	*/
+	0xf8,			/* 11111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xfa,			/* 11111010 */
+	0xc6,			/* 11000110 */
+	0xcf,			/* 11001111 */
+	0xc6,			/* 11000110 */
+	0xc7,			/* 11000111 */
+
+	/*
+	* 159 0x9f '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 160 0xa0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 161 0xa1 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 162 0xa2 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 163 0xa3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 164 0xa4 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 165 0xa5 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 166 0xa6 '�' 
+	*/
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 167 0xa7 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 168 0xa8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x63,			/* 01100011 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 169 0xa9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 170 0xaa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 171 0xab '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7e,			/* 01111110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 172 0xac '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7a,			/* 01111010 */
+	0x36,			/* 00110110 */
+	0x6a,			/* 01101010 */
+	0xdf,			/* 11011111 */
+	0x06,			/* 00000110 */
+
+	/*
+	* 173 0xad '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 174 0xae '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 175 0xaf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 176 0xb0 '�' 
+	*/
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+
+	/*
+	* 177 0xb1 '�' 
+	*/
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+
+	/*
+	* 178 0xb2 '�' 
+	*/
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+
+	/*
+	* 179 0xb3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 180 0xb4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 181 0xb5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 182 0xb6 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 183 0xb7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 184 0xb8 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 185 0xb9 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 186 0xba '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 187 0xbb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 188 0xbc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 189 0xbd '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 190 0xbe '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 191 0xbf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 192 0xc0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 193 0xc1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 194 0xc2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 195 0xc3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 196 0xc4 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 197 0xc5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 198 0xc6 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 199 0xc7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 200 0xc8 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 201 0xc9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 202 0xca '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 203 0xcb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 204 0xcc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 205 0xcd '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 206 0xce '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 207 0xcf '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 208 0xd0 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 209 0xd1 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 210 0xd2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 211 0xd3 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 212 0xd4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 213 0xd5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 214 0xd6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 215 0xd7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 216 0xd8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 217 0xd9 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 218 0xda '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 219 0xdb '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 220 0xdc '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 221 0xdd '�' 
+	*/
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 222 0xde '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 223 0xdf '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 224 0xe0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xc8,			/* 11001000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 225 0xe1 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xd8,			/* 11011000 */
+	0xcc,			/* 11001100 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 226 0xe2 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 227 0xe3 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 228 0xe4 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 229 0xe5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 230 0xe6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 231 0xe7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 232 0xe8 '�' 
+	*/
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 233 0xe9 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 234 0xea '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xee,			/* 11101110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 235 0xeb '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x3e,			/* 00111110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 236 0xec '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 237 0xed '�' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 238 0xee '�' 
+	*/
+	0x1e,			/* 00011110 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 239 0xef '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 240 0xf0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 241 0xf1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 242 0xf2 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 243 0xf3 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 244 0xf4 '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 245 0xf5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+
+	/*
+	* 246 0xf6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 247 0xf7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 248 0xf8 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 249 0xf9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 250 0xfa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 251 0xfb '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xec,			/* 11101100 */
+	0x6c,			/* 01101100 */
+	0x3c,			/* 00111100 */
+	0x1c,			/* 00011100 */
+
+	/*
+	* 252 0xfc '�' 
+	*/
+	0x6c,			/* 01101100 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 253 0xfd '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 254 0xfe '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 255 0xff ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+};
diff --git a/vendor/SDL3_gfx/SDL3_imageFilter.c b/vendor/SDL3_gfx/SDL3_imageFilter.c
new file mode 100644
index 0000000..7417de3
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_imageFilter.c
@@ -0,0 +1,7371 @@
+/*
+
+SDL3_imageFilter.c: byte-image "filter" routines
+
+Copyright (C) 2012-2014  Andreas Schiffler
+Copyright (C) 2013  Sylvain Beucler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+   3. This notice may not be removed or altered from any source
+   distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+/*
+
+Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
+
+Note: Most of the MMX code is based on published routines 
+by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
+him for his work.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <SDL3/SDL.h>
+
+/* Use GCC intrinsics if available: they support both i386 and x86_64,
+   provide ASM-grade performances, and lift the PUSHA/POPA issues. */
+#ifdef __GNUC__
+#  ifdef USE_MMX
+#    include <mmintrin.h>
+#  endif
+#  include <SDL3/SDL_cpuinfo.h>
+#endif
+
+#include "SDL3_imageFilter.h"
+
+/*!
+\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.). Evaluates x four times; presumably x is an unsigned 32-bit value (with a signed or wider x the shifts would not give a clean swap) - TODO confirm at the call sites.
+*/
+#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
+
+/* ------ Static variables ----- */
+
+/*!
+\brief Override flag read by SDL_imageFilterMMXdetect(): 0 forces the C code paths, non-zero allows the MMX routines. Enabled (1) by default.
+*/
+static int SDL_imageFilterUseMMX = 1;
+
+/* Detect GCC-compatible compilers: GCC__ selects the intrinsics / GNU asm code paths below instead of the `__asm { }` blocks */
+#if defined(__GNUC__)
+#define GCC__
+#endif
+
+/*!
+\brief MMX detection routine (with override flag and compile-time check).
+
+\returns 1 if the MMX code paths were compiled in (USE_MMX), are not switched off, and the CPU has MMX; 0 otherwise.
+*/
+int SDL_imageFilterMMXdetect(void)
+{
+#ifdef USE_MMX
+	return ((SDL_imageFilterUseMMX != 0) && SDL_HasMMX()) ? 1 : 0;	/* override flag first, then the CPU check */
+#else
+	/* The *MMX routines are stubs returning -1 in this build: never select them */
+	return (0);
+#endif
+}
+
+/*!
+\brief Disable the MMX check for filter functions and force the use of the non-MMX, C-based code.
+*/
+void SDL_imageFilterMMXoff(void)
+{
+	SDL_imageFilterUseMMX = 0;	/* SDL_imageFilterMMXdetect() checks this override flag */
+}
+
+/*!
+\brief Enable the MMX check for filter functions so that the MMX code is used if available.
+*/
+void SDL_imageFilterMMXon(void)
+{
+	SDL_imageFilterUseMMX = 1;	/* SDL_imageFilterMMXdetect() checks this override flag */
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using Add: D = saturation255(S1 + S2)
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1	/* load Src1 address into eax */
+			mov ebx, Src2	/* load Src2 address into ebx */
+			mov edi, Dest	/* load Dest address into edi */
+			mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16	/* 16 byte alignment of the loop entry */
+L1010:
+		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
+		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
+		movq [edi], mm1	/* store result in Dest */
+			add eax, 8	/* increase Src1, Src2 and Dest  */
+			add ebx, 8	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter */
+			jnz L1010	/* loop again while the group counter is non-zero */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_paddusb(*mSrc1, *mSrc2);	/* Src1+Src2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Add: D = saturation255(S1 + S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Use the MMX routine for the complete 8-byte groups. If it reports
+	   failure (it is a stub returning -1 when built without USE_MMX), the
+	   C loop below processes the whole image instead of only the tail. */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) && (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mean: D = S1/2 + S2/2
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+\param Mask Mask array containing 8 bytes with 0x7F value; it clears the top bit of every byte after the word-wise shift, i.e. the bit shifted in from the neighbouring byte.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
+						   unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov edx, Mask /* load Mask address into edx */
+			movq mm0, [edx] /* load Mask into mm0 */
+		mov eax, Src1 /* load Src1 address into eax */
+			mov ebx, Src2 /* load Src2 address into ebx */
+			mov edi, Dest /* load Dest address into edi */
+			mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16	/* 16 byte alignment of the loop entry */
+L21011:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		/* --- Byte shift via Word shift: shift the words, then mask off the bit that crossed a byte boundary --- */
+		psrlw mm1, 1 	/* shift 4 WORDS of mm1 1 bit to the right */
+			psrlw mm2, 1 	/* shift 4 WORDS of mm2 1 bit to the right */
+			pand mm1, mm0   /* apply Mask to 8 BYTES of mm1 */
+			/* byte     0x0f, 0xdb, 0xc8 */
+			pand mm2, mm0   /* apply Mask to 8 BYTES of mm2 */
+			/* byte     0x0f, 0xdb, 0xd0 */
+			paddusb mm1,  mm2 	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx 	/* decrease loop counter */
+			jnz L21011	/* loop again while the group counter is non-zero */
+			emms	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = *mSrc1,
+		      mm2 = *mSrc2;
+		mm1 = _m_psrlwi(mm1, 1);	/* shift 4 WORDS of mm1 1 bit to the right */
+		mm2 = _m_psrlwi(mm2, 1);	/* shift 4 WORDS of mm2 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);	/* apply Mask to 8 BYTES of mm1 */
+		mm2 = _m_pand(mm2, *mMask);	/* apply Mask to 8 BYTES of mm2 */
+		*mDest = _m_paddusb(mm1, mm2);	/* mm1+mm2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Mean: D = S1/2 + S2/2 (each operand is halved, rounding down, before the addition)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine for the complete 8-byte groups; if it fails (stub returning
+	   -1 when built without USE_MMX) the C loop handles the whole image */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) && (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0)) {
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax,  Src1 	/* load Src1 address into eax */
+			mov ebx,  Src2 	/* load Src2 address into ebx */
+			mov edi,  Dest 	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16 /* 16 byte alignment of the loop entry */
+L1012:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter */
+			jnz L1012	/* loop again while the group counter is non-zero */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Sub: D = saturation0(S1 - S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine for the complete 8-byte groups; if it fails (stub returning
+	   -1 when built without USE_MMX) the C loop handles the whole image */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) && (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - (int) *cursrc2;
+		if (result < 0)
+			result = 0;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1  	/* load Src1 address into eax */
+			mov ebx, Src2 	/* load Src2 address into ebx */
+			mov edi, Dest 	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16	/* 16 byte alignment of the loop entry */
+L1013:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		psubusb mm2,  [eax] 	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
+		por mm1,  mm2 	/* one of the two saturated differences is 0 per byte, so OR yields |Src1-Src2| */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter */
+			jnz L1013    	/* loop again while the group counter is non-zero */
+			emms         /* exit MMX state */
+			popa
+	}
+#else
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = _m_psubusb(*mSrc2, *mSrc1);	/* Src2-Src1 (sub 8 bytes with saturation) */
+		__m64 mm2 = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		*mDest = _m_por(mm1, mm2);		/* one of the two is 0 per byte, so OR yields |Src1-Src2| */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AbsDiff: D = | S1 - S2 |
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine for the complete 8-byte groups; if it fails (stub returning
+	   -1 when built without USE_MMX) the C loop handles the whole image */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) && (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = abs((int) *cursrc1 - (int) *cursrc2);
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   /* load Src1 address into eax */
+			mov ebx, Src2   /* load Src2 address into ebx */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
+			shr ecx, 3   /* counter/8 (MMX loads 8 bytes at a time) */
+			pxor mm0, mm0   /* zero mm0 register */
+			align 16      	/* 16 byte alignment of the loop entry */
+L1014:
+		movq mm1, [eax]   /* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   /* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   /* copy mm1 into mm2 */
+			movq mm4, mm3   /* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   /* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   /* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   /* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   /* unpack high bytes of Src2 into words */
+			pmullw mm1, mm3   /* mul low  bytes of Src1 and Src2  */
+			pmullw mm2, mm4   /* mul high bytes of Src1 and Src2 */
+			/* Take abs value of the results: packuswb reads the words as signed, so products >= 0x8000 must be made positive to saturate to 255 rather than 0 */
+			movq mm5, mm1   /* copy mm1 into mm5 */
+			movq mm6, mm2   /* copy mm2 into mm6 */
+			psraw mm5, 15   /* fill mm5 words with word sign bit */
+			psraw mm6, 15   /* fill mm6 words with word sign bit */
+			pxor mm1, mm5   /* take 1's complement of only neg. words */
+			pxor mm2, mm6   /* take 1's complement of only neg. words */
+			psubsw mm1, mm5   /* add 1 to only neg. words, W-(-1) or W-0 */
+			psubsw mm2, mm6   /* add 1 to only neg. words, W-(-1) or W-0 */
+			packuswb mm1, mm2   /* pack words back into bytes with saturation */
+			movq [edi], mm1   /* store result in Dest */
+			add eax, 8   /* increase Src1, Src2 and Dest  */
+			add ebx, 8   /* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter */
+			jnz L1014	/* loop again while the group counter is non-zero */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* Disabled i386 GNU inline-asm variant of the loop, kept for reference (the intrinsics further down are what is compiled): */
+	/* asm volatile ( */
+	/* 	"shr $3, %%ecx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	"pxor      %%mm0, %%mm0 \n\t"	/\* zero mm0 register *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%eax), %%mm1 \n\t"     /\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"movq    (%%ebx), %%mm3 \n\t"	/\* load 8 bytes from Src2 into mm3 *\/ */
+	/* 	"movq      %%mm1, %%mm2 \n\t"	/\* copy mm1 into mm2 *\/ */
+	/* 	"movq      %%mm3, %%mm4 \n\t"	/\* copy mm3 into mm4  *\/ */
+	/* 	"punpcklbw %%mm0, %%mm1 \n\t"	/\* unpack low  bytes of Src1 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm2 \n\t"	/\* unpack high bytes of Src1 into words *\/ */
+	/* 	"punpcklbw %%mm0, %%mm3 \n\t"	/\* unpack low  bytes of Src2 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm4 \n\t"	/\* unpack high bytes of Src2 into words *\/ */
+	/* 	"pmullw    %%mm3, %%mm1 \n\t"	/\* mul low  bytes of Src1 and Src2  *\/ */
+	/* 	"pmullw    %%mm4, %%mm2 \n\t"	/\* mul high bytes of Src1 and Src2 *\/ */
+	/* 	/\* Take abs value of the results (signed words) *\/ */
+	/* 	"movq      %%mm1, %%mm5 \n\t"	/\* copy mm1 into mm5 *\/ */
+	/* 	"movq      %%mm2, %%mm6 \n\t"	/\* copy mm2 into mm6 *\/ */
+	/* 	"psraw       $15, %%mm5 \n\t"	/\* fill mm5 words with word sign bit *\/ */
+	/* 	"psraw       $15, %%mm6 \n\t"	/\* fill mm6 words with word sign bit *\/ */
+	/* 	"pxor      %%mm5, %%mm1 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"pxor      %%mm6, %%mm2 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"psubsw    %%mm5, %%mm1 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"psubsw    %%mm6, %%mm2 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"packuswb  %%mm2, %%mm1 \n\t"	/\* pack words back into bytes with saturation *\/ */
+	/* 	"movq    %%mm1, (%%edi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%eax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%ebx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%edi \n\t" */
+	/* 	"dec %%ecx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm0","mm1","mm2","mm3","mm4","mm5","mm6"	/\* registers modified *\/ */
+	/* ); */
+
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		mm5 = _m_psrawi(mm1, 15);		/* fill mm5 words with word sign bit */
+		mm6 = _m_psrawi(mm2, 15);		/* fill mm6 words with word sign bit */
+		mm1 = _m_pxor(mm1, mm5);		/* take 1's complement of only neg. words */
+		mm2 = _m_pxor(mm2, mm6);		/* take 1's complement of only neg. words */
+		mm1 = _m_psubsw(mm1, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm2 = _m_psubsw(mm2, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Mult: D = saturation255(S1 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine for the complete 8-byte groups; if it fails (stub returning
+	   -1 when built without USE_MMX) the C loop handles the whole image */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) && (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+
+		/* The MMX routine multiplies the bytes as 16-bit words and packs them back with unsigned saturation; here the product is clamped to 255 */
+
+		result = (int) *cursrc1 * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using MultNor: D = S1 * S2 (byte-wise; only the low 8 bits of each product are stored)
+\note Works one byte at a time, so all SrcLength bytes are processed (there is no 8-byte grouping here).
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; 0 is accepted and does nothing.
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+	if (SrcLength == 0) return (0);	/* the loops below test the counter only after the first pass */
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Src1   /* load Src1 address into edx */
+			mov esi, Src2   /* load Src2 address into esi */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
+			align 16 	/* 16 byte alignment of the loop entry */
+L10141:
+		mov al, [edx]   /* load a byte from Src1 */
+		mul [esi] 	/* mul with a byte from Src2 */
+		mov [edi], al   /* move a byte result to Dest */
+			inc edx 	/* increment Src1, Src2, Dest */
+			inc esi 		/* pointer registers by one */
+			inc edi
+			dec ecx	/* decrease loop counter */
+			jnz L10141  	/* check loop termination, proceed if required */
+			popa
+	}
+#else
+	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
+	/* __asm__ / __i386__ spellings are used so the block also builds in strict ISO modes (-std=c99/c11) */
+	__asm__ __volatile__ (
+		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
+#  if defined(i386) || defined(__i386__)
+		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest */
+		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx      \n\t"	/* decrease loop counter */
+#  elif defined(__x86_64__)
+		"1:mov  (%%rdx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%rdi)  \n\t"       /* move a byte result to Dest */
+		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%rcx      \n\t"	/* decrease loop counter */
+#  endif
+		"jnz 1b         \n\t"	/* check loop termination, proceed if required */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "rax"
+		);
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using MultNor: D = S1 * S2 (no saturation; only the low 8 bits of each product are stored)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 if any of the pointers is NULL.
+*/
+int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (SDL_imageFilterMMXdetect()) {
+		/*
+		 * ASM routine. Unlike the MMX filters it works byte by byte, so on
+		 * success all `length` bytes have been written and there is no
+		 * unaligned tail to finish in C.
+		 *
+		 * (Re-running the C loop over the last (length & 7) bytes after a
+		 * successful ASM pass would multiply those bytes a second time
+		 * whenever Dest aliases Src1 or Src2, i.e. for in-place use.)
+		 */
+		if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0) {
+			/* Whole image processed - we are done */
+			return (0);
+		}
+
+		/*
+		 * The routine reported failure (it is a stub returning -1 when
+		 * built without USE_MMX) and wrote nothing, so fall through and
+		 * let the C loop process the whole image.
+		 */
+	}
+
+	/* Setup to process whole image */
+	istart = 0;
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		*curdst = (int)*cursrc1 * (int)*cursrc2;  // (int) for efficiency
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
+\note Processes only the SrcLength/8 complete 8-byte groups; any remaining bytes are left untouched for the caller. S1/2 is at most 127, so each product is at most 127*255 = 32385 and stays below 0x8000.
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays. The inline-asm loop tests its counter only after the first pass, so it relies on SrcLength >= 8.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed in that case).
+*/
+static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			pxor mm0,  mm0 	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1015:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
+		movq mm2,  mm1 	/* copy mm1 into mm2 */
+			movq mm4,  mm3 	/* copy mm3 into mm4  */
+			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
+			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
+			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2  */
+			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
+			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L1015       	/* loop again while the group counter is non-zero */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC-compatible compilers: MMX intrinsics (i386 and x86_64) */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;	/* NOTE(review): mm5 and mm6 are never used in this routine */
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; it returns -1 (Dest untouched) when built without USE_MMX, */
+	/* in which case the C routine below must process the whole image instead */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0)) {
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine: D = min(255, (S1 / 2) * S2) for every remaining byte */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			pxor mm0, mm0   	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1016:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   	/* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   	/* copy mm1 into mm2 */
+			movq mm4, mm3   	/* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src2 into words */
+			psrlw mm1, 1   	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2, 1   	/* divide mm2 words by 2, Src1 high bytes */
+			psrlw mm3, 1   	/* divide mm3 words by 2, Src2 low bytes */
+			psrlw mm4, 1   	/* divide mm4 words by 2, Src2 high bytes */
+			pmullw mm1, mm3   	/* mul low  bytes of Src1 and Src2  */
+			pmullw mm2, mm4   	/* mul high bytes of Src1 and Src2 */
+			packuswb mm1, mm2   	/* pack words back into bytes with saturation */
+			movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L1016       	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* i386 and x86_64: same computation, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		__m64 mm1, mm2, mm3, mm4;
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm3 = _m_psrlwi(mm3, 1);		/* divide mm3 words by 2, Src2 low bytes */
+		mm4 = _m_psrlwi(mm4, 1);		/* divide mm4 words by 2, Src2 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; it returns -1 (Dest untouched) when built without USE_MMX, */
+	/* in which case the C routine below must process the whole image instead */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0)) {
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine: D = min(255, (S1 / 2) * (S2 / 2)) for every remaining byte */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitAnd: D = S1 & S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		pand mm1, [ebx]   	/* mm1=Src1&Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi, 8
+			dec ecx        	/* decrease loop counter */
+			jnz L1017       	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* Disabled x86_64 ASM variant with constraints, kept for reference only: */
+	/* asm volatile ( */
+	/* 	"shr $3, %%rcx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%rax), %%mm1 \n\t"	/\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"pand    (%%rbx), %%mm1 \n\t"	/\* mm1=Src1&Src2 *\/ */
+	/* 	"movq    %%mm1, (%%rdi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%rax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%rbx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%rdi \n\t" */
+	/* 	"dec %%rcx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm1"			/\* register mm1 modified *\/ */
+	/* ); */
+
+	/* i386 and x86_64: the active implementation, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		*mDest = _m_pand(*mSrc1, *mSrc2);	/* Src1&Src2 */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitAnd: D = S1 & S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Call MMX routine; it returns -1 (Dest untouched) when built without */
+	/* USE_MMX, in which case the C routine below must process the whole */
+	/* image instead of only the unaligned tail */
+	if ((SDL_imageFilterMMXdetect()>0) && (length>7) &&
+	    (SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine: D = S1 & S2 for every remaining byte */
+	for (i = istart; i < length; i++) {
+		*curdst = (*cursrc1) & (*cursrc2);
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitOr: D = S1 | S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		por mm1, [ebx]   	/* mm1=Src1|Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L91017      	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* i386 and x86_64: same computation, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		*mDest = _m_por(*mSrc1, *mSrc2);	/* Src1|Src2 */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitOr: D = S1 | S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; it returns -1 (Dest untouched) when built without USE_MMX, */
+	/* in which case the C routine below must process the whole image instead */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine: D = S1 | S2 for every remaining byte */
+	for (i = istart; i < length; i++) {
+		*curdst = *cursrc1 | *cursrc2;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using Div: D = S1 / S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			mov edx, Src1   	/* load Src1 address into edx */
+			mov esi, Src2   	/* load Src2 address into esi */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx; one byte is handled per iteration */
+			align 16        	/* 16 byte alignment of the loop entry */
+L10191:
+		mov bl, [esi]   	/* load a byte from Src2 */
+		cmp bl, 0   	/* check if it zero */
+			jnz L10192
+			mov [edi], 255   	/* division by zero = 255 !!! */
+			jmp  L10193
+L10192:
+		xor ah, ah   	/* prepare AX, zero AH register */
+			mov al, [edx]   	/* load a byte from Src1 into AL */
+		div   bl             	/* divide AL by BL */
+			mov [edi], al   	/* move a byte result to Dest */
+L10193:
+		inc edx    	/* increment Src1, Src2, Dest */
+			inc esi    		/* pointer registers by one */
+			inc edi
+			dec ecx       	/* decrease loop counter */
+			jnz L10191     	/* loop while counter != 0; tested after the body, so SrcLength must be non-zero */
+			popa
+	}
+#else	/* GCC__ defined: GCC extended-asm variant, one byte per iteration */
+	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
+	/* Also depends on whether the function is static (?!) */
+	/* Also depends on whether we work on malloc() or static char[] */
+	asm volatile (
+#  if defined(i386)
+		"pushl %%ebx \n\t"		/* %ebx may be the PIC register.  */
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AL by BL */
+		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx \n\t"		/* decrease loop counter */
+		"jnz 1b    \n\t"		/* loop while counter != 0; tested after the body, so SrcLength must be non-zero */
+		"popl %%ebx \n\t"		/* restore %ebx */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "rax"	/* NOTE(review): clobber names rax in the 32-bit branch - confirm this is accepted/intended */
+#  elif defined(__x86_64__)
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AL by BL */
+		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%rcx \n\t"		/* decrease loop counter */
+		"jnz 1b    \n\t"		/* loop while counter != 0; tested after the body, so SrcLength must be non-zero */
+		: "+d" (Src1),		/* load Src1 address into rdx */
+		  "+S" (Src2),		/* load Src2 address into rsi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into rcx; NOTE(review): operand is a 32-bit unsigned int but the loop decrements the full rcx - confirm the upper half is zero */
+		  "+D" (Dest)		/* load Dest address into rdi */
+		:
+		: "memory", "rax", "rbx"
+#  endif	/* NOTE(review): there is no #else branch - the asm statement is empty if neither i386 nor __x86_64__ is defined */
+		);
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no ASM code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using Div: D = S1 / S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * ASM routine: it works byte by byte, so there is never an unaligned
+	 * tail. Its loop tests the counter after the body and so needs the
+	 * non-zero length guaranteed above. It returns -1 (Dest untouched)
+	 * when built without USE_MMX; in that case fall through to the C
+	 * routine instead of reporting success for an unprocessed image.
+	 */
+	if (SDL_imageFilterMMXdetect()) {
+		if (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0) {
+			/* Never unaligned bytes - we are done */
+			return (0);
+		}
+	}
+
+	/* Setup to process whole image */
+	istart = 0;
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/*
+	 * C routine to process image.
+	 *
+	 * For every byte position:
+	 *   D = 255       if S2 == 0 (division by zero is mapped to 255)
+	 *   D = S1 / S2   otherwise (integer division, result truncated)
+	 *
+	 * The quotient of two byte values never exceeds 255, so it can be
+	 * stored without a saturation step. This matches what the ASM
+	 * routine computes.
+	 */
+	for (i = istart; i < length; i++) {
+		if (*cursrc2 == 0) {
+			*curdst = 255;
+		} else {
+			*curdst = (int)*cursrc1 / (int)*cursrc2;  // (int) for efficiency
+		}
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using BitNegation: D = ~S (bitwise complement of every byte)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91117:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into mm0 */
+		pxor mm0, mm1   	/* negate mm0 by xoring with mm1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 and Dest */
+			add edi,  8   	/* register pointers by 8 */
+			dec ecx        	/* decrease loop counter */
+			jnz L91117      	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* i386 and x86_64: same computation, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm1 = _m_from_int(0);		/* defined start value: comparing an uninitialized mm1 with itself reads an indeterminate value */
+	mm1 = _m_pcmpeqb(mm1, mm1);		/* generate all 1's in mm1 */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		*mDest = _m_pxor(*mSrc1, mm1);	/* negate the 8 source bytes by xoring with mm1 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitNegation: D = ~S (bitwise complement of every byte)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; it returns -1 (Dest untouched) when built without USE_MMX, */
+	/* in which case the C routine below must process the whole image instead */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterBitNegationMMX(Src1, Dest, length) == 0)) {
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		curdst = Dest;
+	}
+
+	/* C routine: D = ~S (bitwise complement) for every remaining byte */
+	for (i = istart; i < length; i++) {
+		*curdst = ~(*cursrc1);
+		/* Advance pointers */
+		cursrc1++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByte: D = saturation255(S + C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant value to add (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1021:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+C (add 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1021    	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* i386 and x86_64: same computation, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;	/* first holds C replicated into its bytes, later reused as the loop counter */
+	memset(&i, C, 4);	/* set 4 bytes of i to C; NOTE(review): the hard-coded 4 presumes sizeof(int) == 4 */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using AddByte: D = saturation255(S + C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant value to add (C).
+
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 - the result equals the source, so copy Src1 into Dest */
+	if (C == 0) {
+		if (Dest != Src1) memcpy(Dest, Src1, length);	/* destination is memcpy's first argument; skip the copy for in-place use */
+		return (0); 
+	}
+
+	/* MMX routine; it returns -1 (Dest untouched) when built without USE_MMX, */
+	/* in which case the C routine below must process the whole image instead */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterAddByteMMX(Src1, Dest, length, C) == 0)) {
+
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine: D = min(255, S + C) for every remaining byte */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to add (C).
+\param D Byteorder-swapped constant to add (Cs).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* inline-assembly variant */
+	__asm
+	{
+		pusha
+			/* ** Build MM1 with C in its low 4 bytes and D in its high 4 bytes ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D (held in MM2) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail is not processed */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+MM1 (add 8 bytes with saturation) */
+			movq [edi],  mm0 	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11023    	/* loop while counter != 0; tested after the body, so SrcLength must be >= 8 */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else	/* GCC__ defined: MMX intrinsics variant */
+	/* i386 and x86_64: one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1; NOTE(review): D is not used here, whereas the assembly variant puts D in the high 4 bytes - confirm which is intended */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* whole 8-byte groups only; the SrcLength%8 tail is not processed */
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to add (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 - the result equals the source, so copy Src1 into Dest */
+	if (C == 0) {
+		if (Dest != Src1) memcpy(Dest, Src1, length);	/* destination is memcpy's first argument; skip the copy for in-place use */
+		return (0); 
+	}
+
+	/* MMX routine, given C and its byte-swapped form; it returns -1 (Dest */
+	/* untouched) when built without USE_MMX, in which case the C routine */
+	/* below must process the whole image instead of only the tail */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterAddUintMMX(Src1, Dest, length, C, SWAP_32(C)) == 0)) {
+
+		/* MMX handled all whole 8-byte groups; check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image (no MMX, fewer than 8 bytes, or MMX routine failed) */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process bytes: iC[j] is byte j of C, least significant first */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; j < 4; j++) {
+			if ((i+j)<length) {	/* the last group of 4 may be incomplete */
+				result = (int) *cursrc1 + iC[j];
+				if (result > 255) result = 255;
+				*curdest = (unsigned char) result;
+				/* Advance pointers */
+				cursrc1++;
+				curdest++;
+			}
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param C Constant to add (C).
+\param Mask Pointer to 8 mask bytes of value 0x7F (ANDed in after the word shift to clear each byte's top bit).
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1022:
+		movq mm2, [eax]   	/* load 8 bytes from Src1 into MM2 */
+		psrlw mm2, 1   	/* shift 4 WORDS of MM2 1 bit to the right */
+			pand mm2, mm0        /* apply Mask to 8 BYTES of MM2 (drops the bit shifted in from the neighbouring byte) */
+			paddusb mm2,  mm1 	/* MM2=Src1/2+C (add 8 bytes with saturation) */
+			movq [edi], mm2   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1022    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);				/* replicate C into 4 bytes of i */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* (x86_64-only alternative noted by the original author: build MM1 in one step with _m_from_int64) */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm2 = _m_psrlwi(*mSrc1, 1);	/* shift 4 WORDS of MM2 1 bit to the right (this mm2 shadows the outer one) */
+		mm2 = _m_pand(mm2, *mMask);		/* apply Mask to 8 BYTES of MM2 */
+							/* byte     0x0f, 0xdb, 0xd0 (presumably the opcode bytes of the pand above) */
+		*mDest = _m_paddusb(mm1, mm2);		/* Src1/2+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)
+
+Halves every source byte, adds the constant and clamps the sum at 255. When MMX is
+detected and at least 8 bytes are supplied, the internal MMX routine is called first
+and only the remaining (length % 8) tail bytes are computed by the C loop.
+
+\param Src1 Source byte array (S); must not be NULL.
+\param Dest Destination byte array (D); must not be NULL.
+\param length Number of bytes to process; 0 succeeds without writing anything.
+\param C Constant added to each halved byte (C).
+
+\return 0 on success, -1 when Src1 or Dest is NULL.
+*/
+int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int done;
+	unsigned int remaining;
+	const unsigned char *in;
+	unsigned char *out;
+	int sum;
+
+	/* Reject missing buffers */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty array is a successful no-op */
+	if (!length) {
+		return (0);
+	}
+
+	/* Bulk part: with MMX available, complete groups of 8 bytes are handled there */
+	done = 0;
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
+
+		/* Skip what MMX covered; this equals length when no tail is left */
+		done = length & 0xfffffff8;
+	}
+
+	/* Scalar part: the whole array without MMX, otherwise only the tail */
+	in = Src1 + done;
+	out = Dest + done;
+	remaining = length - done;
+	while (remaining > 0) {
+		/* Halve, add the constant, clamp at 255 */
+		sum = (int) (*in >> 1) + (int) C;
+		if (sum > 255) {
+			sum = 255;
+		}
+		*out = (unsigned char) sum;
+
+		in++;
+		out++;
+		remaining--;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubByte: D = saturation0(S - C), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param C Constant to subtract (C).
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0,  mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);				/* replicate C into 4 bytes of i */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* (x86_64-only alternative noted by the original author: build MM1 in one step with _m_from_int64) */
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubByte: D = saturation0(S - C)
+
+\param Src1 Pointer to the start of the source byte array (S); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source arrays; 0 is accepted and nothing is written.
+\param C Constant to subtract (C); C == 0 copies S to D unchanged.
+
+\return Returns 0 for success or -1 for error (Src1 or Dest is NULL).
+*/
+int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 subtracts nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if (C == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine; the bytes past the last multiple of 8 are finished by the C routine below */
+		SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: subtract in int so a negative difference can be clamped to 0 */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - iC;
+		if (result < 0)
+			result = 0;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param C Constant to subtract (C).
+\param D Byteorder-swapped constant to subtract (Cs). NOTE(review): only the assembly path reads D.
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Build the 8-byte subtrahend in MM1: low dword = C, high dword = D ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D (low dword of MM2) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11024:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0, mm1 	/* MM0=Src1-MM1 (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11024    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: MMX intrinsics. NOTE(review): D is not used here; both halves of MM1 come from C */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1 */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* (x86_64-only alternative noted by the original author: build MM1 in one step with _m_from_int64) */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubUint: D[i] = saturation0(S[i] - Cb[i % 4]), Cb = bytes of C (C routine: low byte first)
+
+\param Src1 Pointer to the start of the source byte array (S1); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source array; 0 is accepted and nothing is written.
+\param C Constant to subtract (C); its four bytes are applied cyclically. C == 0 copies S to D.
+
+\return Returns 0 for success or -1 for error (Src1 or Dest is NULL).
+*/
+int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart, D;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 subtracts nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if (C == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine; the bytes past the last multiple of 8 are finished by the C routine below */
+		D=SWAP_32(C);
+		SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: iC[j] is byte j of C (j == 0 is the least significant byte) */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; j < 4; j++) {
+			if ((i+j)<length) {
+				result = (int) *cursrc1 - iC[j];
+				if (result < 0) result = 0;
+				*curdest = (unsigned char) result;
+				/* Advance pointers */
+				cursrc1++;
+				curdest++;
+			}
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8 (the assembly mask loop tests its counter only after a pass).
+\param Mask Byte array containing 8 bytes with 0x7F value.
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+								 unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl,  N 	/* load loop counter (N) into CL */
+			movd mm3,  ecx 	/* copy (N) into MM3  */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10240:                  	/* ** Prepare proper bit-Mask in MM1: one shift-and-mask pass per bit of N ** */
+		psrlw mm1,  1 	/* shift 4 WORDS of MM1 1 bit to the right */
+			pand mm1, mm0   /* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 (presumably the opcode bytes of the pand above) */
+			dec               cl    	/* decrease loop counter */
+			jnz            L10240    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10241:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psrlw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the right */
+			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 (drops bits shifted in from the neighbouring byte) */
+			/* byte     0x0f, 0xdb, 0xc1 (presumably the opcode bytes of the pand above) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10241    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+        __m64 mm1;
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1; NOTE(review): mm1 is read here before it is assigned */
+	/* Prepare proper bit-Mask in MM1: one shift-and-mask pass per bit of N */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psrlwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+        /* Shift all bytes of the image, 8 at a time */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psrlwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the right */
+		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRight: D = saturation0(S >> N)
+
+\param Src1 Pointer to the start of the source byte array (S); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source array; 0 is accepted and nothing is written.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8; N == 0 copies S to D unchanged.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
+*/
+int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 shifts nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine; the bytes past the last multiple of 8 are finished by the C routine below */
+		SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: the byte is promoted to int before the shift, so N == 8 simply yields 0 */
+	for (i = istart; i < length; i++) {
+		*curdest = (unsigned char) *cursrc1 >> N;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param N Number of bit-positions to shift (N); applied to each 32-bit group of the data, not to single bytes.
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L13023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psrld mm0, N   	/* shift 2 DWORDS of MM0 right; NOTE(review): N is a C variable here, not an immediate - confirm */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L13023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psrldi(*mSrc1, N);		/* shift 2 DWORDS of Src1 (N) bits to the right */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
+
+\param Src1 Pointer to the start of the source byte array (S1); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source array; bytes of a trailing partial word are not written by the C routine.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32; N == 0 copies S to D, N == 32 yields 0 words.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 32).
+*/
+int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	unsigned int result;
+	const unsigned int word = (unsigned int) sizeof(result);	/* bytes in one shifted word */
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 32) {
+		return (-1);
+	}
+
+	/* Special case: N==0 shifts nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: shift every complete word, including the last one. */
+	/* The bound is written as (length - i) so that i + word can never wrap around. */
+	for (i = istart; (length - i) >= word; i += word) {
+		/* Load through memcpy: the byte pointers need not be aligned for unsigned int access */
+		memcpy(&result, cursrc1, word);
+		/* Shifting by the full width (N == 32) is undefined in C, so that case yields 0 explicitly */
+		result = (N >= 32) ? 0u : (result >> N);
+		memcpy(curdest, &result, word);
+		/* Advance pointers */
+		cursrc1 += word;
+		curdest += word;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultByByte: D = saturation255(S * C), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param C Constant to multiply with (C); values above 128 take a second loop that adds an abs-value step before packing.
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			cmp al, 128   	/* if (C <= 128) execute more efficient code; NOTE(review): EAX was reloaded with Src1 above, so AL may no longer hold C */
+			jg             L10251    	/* otherwise use the abs-value loop at L10251 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10250:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			pmullw mm3, mm1   	/* mul low  bytes of Src1 and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of Src1 and MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10250    	/* check loop termination, proceed if required */
+			jmp            L10252    	/* done: skip the abs-value loop */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10251:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			pmullw mm3, mm1   	/* mul low  bytes of Src1 and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of Src1 and MM1 */
+			/* ** Take abs value of the results (signed words) ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10251    	/* check loop termination, proceed if required */
+L10252:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = C | C<<16;						/* C in both 16-bit halves of i */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);				/* fill higher words of MM1 with C */
+	/* (x86_64-only alternative noted by the original author: C | C<<16 | (long long)C<<32 | (long long)C<<48 */
+        /* loaded in one step with _m_from_int64) */
+	if (C <= 128) {						/* if (C <= 128) execute more efficient code */
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			/* Take abs value of the results (signed words) */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using MultByByte: D = saturation255(S * C)
+
+\param Src1 Pointer to the start of the source byte array (S); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source arrays; 0 is accepted and nothing is written.
+\param C Constant to multiply with (C); C == 1 copies S to D unchanged.
+
+\return Returns 0 for success or -1 for error (Src1 or Dest is NULL).
+*/
+int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==1 changes nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if (C == 1) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine; the bytes past the last multiple of 8 are finished by the C routine below */
+		SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: multiply in int so a product above 255 can be clamped */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 * iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C), applied to SrcLength/8 groups of 8 bytes.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; the trailing SrcLength % 8 bytes are not processed here.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error (-1 is returned when USE_MMX is not defined).
+*/
+static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+											  unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load N into CL */
+			movd mm7, ecx   	/* copy N into MM7 */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); tested only after a pass, so it must be >= 1 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1026:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psrlw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the right */
+			psrlw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the right */
+			pmullw mm3, mm1   	/* mul low  bytes of Src1 by MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of Src1 by MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1026    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);			/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = (C<<16)|C;					/* C in both 16-bit halves of i */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with C */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm3, mm4, mm5, mm6;		/* NOTE(review): mm5 and mm6 are never used in this loop */
+		mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psrlwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the right */
+		mm4 = _m_psrlwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the right */
+		mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+		mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)
+
+\param Src1 Pointer to the start of the source byte array (S); must not be NULL.
+\param Dest Pointer to the start of the destination byte array (D); must not be NULL.
+\param length The number of bytes in the source array; 0 is accepted and nothing is written.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C); with N == 0 and C == 1 the call copies S to D unchanged.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
+*/
+int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
+										   unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 && C==1 changes nothing, so copy S into D (destination first; memmove allows Src1 == Dest) */
+	if ((N == 0) && (C == 1)) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine; the bytes past the last multiple of 8 are finished by the C routine below */
+		SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: shift first, then multiply in int so a product above 255 can be clamped */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) (*cursrc1 >> N) * iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftByte: D = (S << N)
+
+Processes SrcLength/8 blocks of 8 bytes; the trailing (SrcLength % 8) bytes are
+not touched and are left to the caller. Every block is shifted left as four
+16-bit words and then ANDed with a bit-mask so that bits which crossed a byte
+boundary are cleared. The bit-mask is built by shifting an all-ones register
+left one bit at a time, N times, applying Mask after every step.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param Mask Byte array containing 8 bytes of 0xFE value.
+
+\note The inline-assembly loops decrement their counter before testing it, so
+that path presumably expects N >= 1 and SrcLength >= 8 - confirm before calling
+this helper from anywhere but the public wrapper.
+\note NOTE(review): the intrinsics path passes mm1 to _m_pcmpeqb before mm1 has
+been assigned a value.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load loop counter (N) into CL */
+			movd mm3, ecx   	/* copy (N) into MM3  */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10270:                  	/* ** Prepare proper bit-Mask in MM1 ** */
+		psllw mm1, 1   	/* shift 4 WORDS of MM1 1 bit to the left */
+			pand mm1, mm0        /* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 */
+			dec cl                  	/* decrease loop counter */
+			jnz            L10270    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10271:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psllw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the left */
+			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
+			/* byte     0x0f, 0xdb, 0xc1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10271    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+        __m64 mm1;
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 (compares mm1 with itself) */
+	/* Prepare proper bit-Mask in MM1 */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psllwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the left */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+	/* ** Shift all bytes of the image ** */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psllwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the left */
+		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftByte: D = (S << N)
+
+Each source byte is shifted left by N bits; bits shifted out of the byte are
+discarded (no saturation). When MMX is reported as available and the buffer
+holds at least 8 bytes, the leading multiple-of-8 part is handed to the MMX
+helper and only the remaining tail is processed in C. If the helper reports
+failure, the whole buffer is processed in C instead.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
+*/
+int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	/* Per-byte mask handed to the MMX helper, which uses it to clear bits shifted in across byte boundaries */
+	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
+	unsigned int i, istart;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so just copy S into D. */
+	/* memmove (destination first) is used so that Src1 == Dest is also valid. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	/* By default the C loop processes the whole buffer */
+	istart = 0;
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/* Only skip the 8-byte blocks if the MMX helper really processed them */
+		if (SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask) == 0) {
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process the (remaining) bytes */
+	for (i = istart; i < length; i++) {
+		Dest[i] = (unsigned char) (((int) Src1[i] << N) & 0xff);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)
+
+Processes SrcLength/8 blocks of 8 bytes, shifting each block left by N bits as
+two independent 32-bit values. The trailing (SrcLength % 8) bytes are not
+touched and are left to the caller.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path presumably expects SrcLength >= 8 - confirm before calling this
+helper from anywhere but the public wrapper.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L12023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		pslld mm0, N   	/* shift 2 DWORDS of MM0 (N) bits to the left */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L12023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftUint: D = ((uint)S << N)
+
+The buffer is treated as a sequence of unsigned int words (32-bit on the
+supported targets, matching the 32-bit lanes used by the MMX helper); each word
+is shifted left by N bits. Trailing bytes that do not form a complete word are
+left untouched in Dest (except for N == 0, where the whole buffer is copied).
+When MMX is reported as available and the buffer holds at least 8 bytes, the
+leading multiple-of-8 part is handed to the MMX helper and only the remaining
+tail is processed in C. If the helper reports failure, the whole buffer is
+processed in C instead.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error (NULL pointer or N > 32).
+*/
+int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned int word;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	if (N > 32) {
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so just copy S into D. */
+	/* memmove (destination first) is used so that Src1 == Dest is also valid. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	/* By default the C loop processes the whole buffer */
+	istart = 0;
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/* Only skip the 8-byte blocks if the MMX helper really processed them */
+		if (SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N) == 0) {
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process the (remaining) complete words. */
+	/* "length - i" cannot wrap because i never exceeds length, and the test */
+	/* includes the last word when it ends exactly at the end of the buffer. */
+	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
+		/* memcpy avoids misaligned / type-punned access through the byte pointers */
+		memcpy(&word, &Src1[i], sizeof(word));
+		/* Shifting by the full width is undefined in C; the result is all zero bits */
+		word = (N < 32) ? (word << N) : 0u;
+		memcpy(&Dest[i], &word, sizeof(word));
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)
+
+Processes SrcLength/8 blocks of 8 bytes; the trailing (SrcLength % 8) bytes are
+not touched and are left to the caller. Each block is unpacked into 16-bit
+words, shifted left by N and packed back into bytes with unsigned saturation.
+Two loops exist: a short one meant for N <= 7, and a second one that also takes
+the absolute value of the shifted words before packing.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\note NOTE(review): in the inline-assembly path the "cmp al, 7" that selects
+the loop runs after EAX has been reloaded with Src1, so AL may hold the low
+byte of the source address rather than N - verify.
+\note The inline-assembly loops decrement their counter before testing it, so
+that path presumably expects SrcLength >= 8 - confirm before calling this
+helper from anywhere but the public wrapper.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			xor eax, eax   	/* zero EAX */
+			mov al, N   	/* load N into AL */
+			movd mm7, eax   	/* copy N into MM7 */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			cmp al, 7   	/* intended: if (N <= 7) execute more efficient code; NOTE(review): EAX was reloaded with Src1 above - verify AL still holds N */
+			jg             L10281
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10280:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10280    	/* check loop termination, proceed if required */
+			jmp            L10282
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10281:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* ** Take abs value of the signed words ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10281    	/* check loop termination, proceed if required */
+L10282:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	int i;
+	if (N <= 7) {						/* if (N <= 7) execute more efficient code */
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* Take abs value of the signed words */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter ShiftLeft: D = saturation255(S << N)
+
+Each source byte is shifted left by N bits and the result is clamped to 255.
+When MMX is reported as available and the buffer holds at least 8 bytes, the
+leading multiple-of-8 part is handed to the MMX helper and only the remaining
+tail is processed in C. If the helper reports failure, the whole buffer is
+processed in C instead.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
+*/
+int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so just copy S into D. */
+	/* memmove (destination first) is used so that Src1 == Dest is also valid. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	/* By default the C loop processes the whole buffer */
+	istart = 0;
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/* Only skip the 8-byte blocks if the MMX helper really processed them */
+		if (SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N) == 0) {
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process the (remaining) bytes */
+	for (i = istart; i < length; i++) {
+		result = (int) Src1[i] << N;
+		if (result > 255)
+			result = 255;
+		Dest[i] = (unsigned char) result;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+Processes SrcLength/8 blocks of 8 bytes; the trailing (SrcLength % 8) bytes are
+not touched and are left to the caller. The value (0xFF - T) is added to every
+source byte with unsigned saturation, and the sum is then compared for equality
+with 0xFF, which yields 0xFF where the comparison holds and 0 elsewhere.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path presumably expects SrcLength >= 8 - confirm before calling this
+helper from anywhere but the public wrapper.
+\note NOTE(review): the intrinsics path initializes mm1 and mm2 from themselves
+(compare-with-self) before they hold a value.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate T in 8 bytes of MM3 ** */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			pcmpeqb mm2, mm2   	/* generate all 1's in mm2 */
+			mov al, T   	/* load T into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with T */
+			psubusb mm2, mm3   	/* store 0xFF - T in MM2 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1029:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm2   	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
+			pcmpeqb mm0, mm1   	/* binarize 255:0, comparing to 255 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1029    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate T in 8 bytes of MM3 */
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
+	__m64 mm2 = _m_pcmpeqb(mm2, mm2);			/* generate all 1's in mm2 */
+	int i;
+	memset(&i, T, 4);
+	__m64 mm3 = _m_from_int(i);
+	__m64 mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);			/* fill higher bytes of MM3 with T */
+	mm2 = _m_psubusb(mm2, mm3);			/* store 0xFF - T in MM2 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation) */
+		*mDest = _m_pcmpeqb(mm0, mm1);		/* binarize 255:0, comparing to 255 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+Every destination byte becomes 255 when the matching source byte is at least T
+and 0 otherwise. When MMX is reported as available and the buffer holds at
+least 8 bytes, the leading multiple-of-8 part is handed to the MMX helper and
+only the remaining tail is processed in C. If the helper reports failure, the
+whole buffer is processed in C instead.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\return Returns 0 for success or -1 for error (NULL pointer).
+*/
+int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
+{
+	unsigned int i, istart;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	/* Special case: T==0 - every byte satisfies S >= 0, so the result is all 255 */
+	if (T == 0) {
+		memset(Dest, 255, length);
+		return (0);
+	}
+
+	/* By default the C loop processes the whole buffer */
+	istart = 0;
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/* Only skip the 8-byte blocks if the MMX helper really processed them */
+		if (SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T) == 0) {
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process the (remaining) bytes */
+	for (i = istart; i < length; i++) {
+		Dest[i] = (unsigned char) ((Src1[i] >= T) ? 255 : 0);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+Processes SrcLength/8 blocks of 8 bytes; the trailing (SrcLength % 8) bytes are
+not touched and are left to the caller. The clipping is done with three
+saturating byte operations per block: add (0xFF - Tmax), subtract
+(0xFF - Tmax + Tmin), then add Tmin.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path presumably expects SrcLength >= 8 - confirm before calling this
+helper from anywhere but the public wrapper.
+\note NOTE(review): the intrinsics path initializes mm1 from itself
+(compare-with-self) before it holds a value.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
+								  unsigned char Tmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			/* ** Duplicate Tmax in 8 bytes of MM3 ** */
+			mov al, Tmax   	/* load Tmax into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with Tmax */
+			psubusb mm1, mm3   	/* store 0xFF - Tmax in MM1 */
+			/* ** Duplicate Tmin in 8 bytes of MM5 ** */
+			mov al, Tmin   	/* load Tmin into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm5, eax   	/* copy EAX into MM5 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm5, mm4   	/* fill higher bytes of MM5 with Tmin */
+			movq mm7, mm5   	/* copy MM5 into MM7 */
+			paddusb mm7, mm1   	/* store 0xFF - Tmax + Tmin in MM7 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1030:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm1   	/* MM0=Src1+(0xFF-Tmax) */
+			psubusb mm0, mm7   	/* MM0=MM0-(0xFF-Tmax+Tmin) */
+			paddusb mm0, mm5   	/* MM0=MM0+Tmin */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1030    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);	/* generate all 1's in mm1 */
+	int i;
+	/* Duplicate Tmax in 8 bytes of MM3 */
+	__m64 mm3, mm4;
+	memset(&i, Tmax, 4);
+	mm3 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);		/* fill higher bytes of MM3 with Tmax */
+	mm1 = _m_psubusb(mm1, mm3);		/* store 0xFF - Tmax in MM1 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	/* Duplicate Tmin in 8 bytes of MM5 */
+	__m64 mm5, mm7;
+	memset(&i, Tmin, 4);
+	mm5 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm5 = _m_punpckldq(mm5, mm4);		/* fill higher bytes of MM5 with Tmin */
+	mm7 = _m_paddusb(mm5, mm1);	/* store 0xFF - Tmax + Tmin in MM7 */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0;
+		mm0 = _m_paddusb(*mSrc1, mm1);	/* MM0=Src1+(0xFF-Tmax) */
+		mm0 = _m_psubusb(mm0, mm7);	/* MM0=MM0-(0xFF-Tmax+Tmin) */
+		*mDest = _m_paddusb(mm0, mm5);	/* MM0+Tmin */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+Source bytes below Tmin become Tmin, bytes above Tmax become Tmax, and all
+other bytes are copied unchanged. When MMX is reported as available and the
+buffer holds at least 8 bytes, the leading multiple-of-8 part is handed to the
+MMX helper and only the remaining tail is processed in C. If the helper reports
+failure, the whole buffer is processed in C instead.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\return Returns 0 for success or -1 for error (NULL pointer).
+*/
+int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
+							   unsigned char Tmax)
+{
+	unsigned int i, istart;
+	unsigned char value;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	/* Special case: Tmin==0 && Tmax==255 covers every byte value, so nothing is clipped. */
+	/* memmove (destination first) is used so that Src1 == Dest is also valid. */
+	if ((Tmin == 0) && (Tmax == 255)) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	/* By default the C loop processes the whole buffer */
+	istart = 0;
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/* Only skip the 8-byte blocks if the MMX helper really processed them */
+		if (SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax) == 0) {
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process the (remaining) bytes */
+	for (i = istart; i < length; i++) {
+		value = Src1[i];
+		if (value < Tmin) {
+			value = Tmin;
+		} else if (value > Tmax) {
+			value = Tmax;
+		}
+		Dest[i] = value;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+Processes SrcLength/8 blocks of 8 bytes; the trailing (SrcLength % 8) bytes are
+not touched and are left to the caller. The scale factor
+(Nmax - Nmin)/(Cmax - Cmin) is computed once with 16-bit unsigned integer
+division, and 255 is used instead when (Cmax - Cmin) is zero. Each block is
+unpacked into 16-bit words, Cmin is subtracted, the words are multiplied by the
+factor, Nmin is added, the absolute value is taken and the words are packed
+back into bytes with unsigned saturation.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Cmin Normalization constant (Cmin).
+\param Cmax Normalization constant (Cmax).
+\param Nmin Normalization constant (Nmin).
+\param Nmax Normalization constant (Nmax).
+
+\note Only the low 16 bits of Cmin, Cmax, Nmin and Nmax take part in the
+factor computation (WORD PTR loads / unsigned short arithmetic).
+\note NOTE(review): the subtraction of Cmin and the addition of Nmin use the
+byte-wise saturating instructions (psubusb/paddusb) on data that has already
+been unpacked into words - verify this is intended.
+\note The inline-assembly loop decrements its counter before testing it, so
+that path presumably expects SrcLength >= 8 - confirm before calling this
+helper from anywhere but the public wrapper.
+
+\return Returns 0 when built with USE_MMX; returns -1 without doing any work otherwise.
+*/
+static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
+									  int Nmin, int Nmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov ax, WORD PTR Nmax   	/* load Nmax in AX */
+			mov bx, WORD PTR Cmax   	/* load Cmax in BX */
+			sub ax, WORD PTR Nmin   	/* AX = Nmax - Nmin */
+			sub bx, WORD PTR Cmin   	/* BX = Cmax - Cmin */
+			jz             L10311    	/* check division by zero */
+			xor dx, dx   	/* prepare for division, zero DX */
+			div               bx    	/* AX = AX/BX */
+			jmp            L10312
+L10311:
+		mov ax, 255   	/* if div by zero, assume result max byte value */
+L10312:                  	/* ** Duplicate AX in 4 words of MM0 ** */
+		mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm0, eax   	/* copy EAX into MM0 */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			punpckldq mm0, mm1   	/* fill higher words of MM0 with AX */
+			/* ** Duplicate Cmin in 4 words of MM1 ** */
+			mov ax, WORD PTR Cmin   	/* load Cmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with Cmin */
+			/* ** Duplicate Nmin in 4 words of MM2 ** */
+			mov ax, WORD PTR Nmin   	/* load Nmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			punpckldq mm2, mm3   	/* fill higher words of MM2 with Nmin */
+			pxor mm7, mm7   	/* zero MM7 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1031:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm7   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm7   	/* unpack high bytes of Src1 into words */
+			psubusb mm3, mm1   	/* S-Cmin, low  bytes */
+			psubusb mm4, mm1   	/* S-Cmin, high bytes */
+			pmullw mm3, mm0   	/* MM0*(S-Cmin), low  bytes */
+			pmullw mm4, mm0   	/* MM0*(S-Cmin), high bytes */
+			paddusb mm3, mm2   	/* MM0*(S-Cmin)+Nmin, low  bytes */
+			paddusb mm4, mm2   	/* MM0*(S-Cmin)+Nmin, high bytes */
+			/* ** Take abs value of the signed words ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1031    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0, mm1, mm2, mm3;
+
+	int i;
+	/* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
+	unsigned short a = Nmax - Nmin;
+	unsigned short b = Cmax - Cmin;
+	if (b == 0) {
+	    a = 255;
+	} else {
+	    a /= b;
+	}
+	i = (a<<16)|a;
+	mm0 = _m_from_int(i);
+	mm1 = _m_from_int(i);
+	mm0 = _m_punpckldq(mm0, mm1);			/* fill higher words of MM0 with the factor */
+	/* Duplicate Cmin in 4 words of MM1 */
+	i = (Cmin<<16)|(short)Cmin;
+	mm1 = _m_from_int(i);
+	mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with Cmin */
+	/* Duplicate Nmin in 4 words of MM2 */
+	i = (Nmin<<16)|(short)Nmin;
+	mm2 = _m_from_int(i);
+	mm3 = _m_from_int(i);
+	mm2 = _m_punpckldq(mm2, mm3);			/* fill higher words of MM2 with Nmin */
+	__m64 mm7 = _m_from_int(0);			/* zero mm7 register */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm3, mm4, mm5, mm6;
+		mm3 = _m_punpcklbw(*mSrc1, mm7);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm7);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psubusb(mm3, mm1);		/* S-Cmin, low	bytes */
+		mm4 = _m_psubusb(mm4, mm1);		/* S-Cmin, high bytes */
+		mm3 = _m_pmullw(mm3, mm0);		/* MM0*(S-Cmin), low  bytes */
+		mm4 = _m_pmullw(mm4, mm0);		/* MM0*(S-Cmin), high bytes */
+		mm3 = _m_paddusb(mm3, mm2);		/* MM0*(S-Cmin)+Nmin, low  bytes */
+		mm4 = _m_paddusb(mm4, mm2);		/* MM0*(S-Cmin)+Nmin, high bytes */
+		/* Take abs value of the signed words */
+		mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+		mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+		mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+		mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+		mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+The scale (Nmax - Nmin)/(Cmax - Cmin) is an integer division. The C loop clamps only the upper bound:
+results above 255 become 255, while negative results wrap modulo 256 through the cast to unsigned char.
+
+When MMX is detected and length > 7, SDL_imageFilterNormalizeLinearMMX is tried first; if it succeeds only the
+trailing (length & 7) bytes go through the C loop, otherwise the C loop processes the whole array.
+
+\param Src Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Cmin Normalization constant.
+\param Cmax Normalization constant. When Cmax == Cmin the C loop is skipped and writes nothing.
+\param Nmin Normalization constant.
+\param Nmax Normalization constant.
+
+\return Returns 0 for success or -1 for error (Src or Dest is NULL).
+*/
+int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
+								   int Nmax)
+{
+	unsigned int i;
+	unsigned int istart = 0;	/* first byte the C loop has to process */
+	int mmxDone = 0;		/* non-zero once the MMX routine handled the 8-byte groups */
+	int dN, dC, factor;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return (-1);
+	if (length == 0)
+		return (0);
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * Trust the MMX routine only when it reports success (0). A non-zero
+		 * return is treated as "nothing was processed", so the C loop below
+		 * starts at byte 0 instead of leaving the front of Dest unwritten.
+		 */
+		if (SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax) == 0)
+			mmxDone = 1;
+	}
+
+	if (mmxDone) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process the unaligned bytes left after the last full group of 8 */
+		istart = length & 0xfffffff8;
+	}
+
+	/* C routine to process image */
+	dC = Cmax - Cmin;
+	if (dC == 0)
+		return (0);	/* avoid dividing by zero */
+	dN = Nmax - Nmin;
+	factor = dN / dC;
+	for (i = istart; i < length; i++) {
+		result = factor * ((int) Src[i] - Cmin) + Nmin;
+		if (result > 255)
+			result = 255;
+		Dest[i] = (unsigned char) result;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. Only interior pixels are written (starting at Dest + columns + 1); the one-pixel border is not touched.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3, read as three 8-byte rows of four 16-bit words (the asm comments show the 4th word of each row as 0).
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function; the MMX code is compiled only when USE_MMX and i386 are defined.
+
+\return Returns 0 if the filter was applied, -1 for invalid parameters or when no MMX implementation is available.
+*/
+int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+		/* NOTE(review): every pixel loads 8 source bytes per row, so the final loads reach past the end of Src - confirm callers pad the buffer. */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10320:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10322:
+			/* ---, */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
+				/* --, */
+				movd mm2, eax   	/* save EAX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm1   	/* copy MM1 into EAX */
+				psraw mm1, 15   	/* spread sign bit of the result */
+				movd edx, mm1   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm1, eax   	/* move result of division into MM1 */
+				packuswb mm1, mm0   	/* pack division result with saturation */
+				movd eax, mm1   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd eax, mm2   	/* restore saved EAX */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax   	/* */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10322    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10320    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else	/* AT&T-syntax extended asm; NOTE(review): Dest is declared "=m" although the pointer is only read, and no clobbers are listed - confirm */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10322:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
+			/* -- */
+			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
+			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
+			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t"	/* */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+		return (0);	/* the MMX code above has written the interior of Dest */
+#endif
+	}
+
+	/* No non-MMX implementation yet: MMX is not detected or its code is not compiled in */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. Only interior pixels are written (starting at Dest + 2*columns + 2); the two-pixel border is not touched.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5, read as five 16-byte rows of eight 16-bit words (80 bytes); all eight words of a row are multiplied with source bytes, so words 5..7 presumably must be 0 - confirm with callers.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function; the MMX code is compiled only when USE_MMX and i386 are defined.
+
+\return Returns 0 if the filter was applied, -1 for invalid parameters or when no MMX implementation is available.
+*/
+int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 5) || (rows < 5) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+		/* NOTE(review): every pixel loads 8 source bytes per row, so the final loads reach past the end of Src - confirm callers pad the buffer. */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* ---, */
+L10330:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10332:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EAX in MM1 */
+				movd mm2, ebx   	/* save EBX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10332    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10330    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else	/* AT&T-syntax extended asm; NOTE(review): Dest is declared "=m" although the pointer is only read, and no clobbers are listed - confirm */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- */
+			".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10332 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10330 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+		return (0);	/* the MMX code above has written the interior of Dest */
+#endif
+	}
+
+	/* No non-MMX implementation yet: MMX is not detected or its code is not compiled in */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 2D convolution kernel of size 7x7.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel  	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10340:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10342:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10342    	/* check loop termination, proceed if required */
+				add esi, 6   	/* move to the next row in Src */
+				add edi, 6   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10340    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10342 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10340 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
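+\return Returns -1 on invalid parameters (NULL pointer, rows or columns < 9, Divisor == 0); otherwise expected to return 0 on success or -1 when MMX is unavailable, as the preceding convolve filter does.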
+*/
+int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 9) || (rows < 9) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* ---, */
+L10350:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10352:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10352    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10350    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- */
+			".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10352 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10350 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. Its one-pixel border (first/last row and column) is not written.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3, read as three rows of four 16-bit words each (the fourth word of a row is expected to be 0).
+\param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied with the kernel. Values above 7 are rejected.
+
+Note: Non-MMX implementation not available for this function. NOTE(review): every source access loads 8 bytes, which looks like it can read a few bytes past the end of Src for the last rows - confirm.
+
+\return Returns 0 when the MMX branch is taken, -1 if a parameter is invalid or MMX is not detected.
+*/
+int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Reject NULL buffers and a NULL kernel */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (NRightShift > 7))	/* need at least one full 3x3 window; shift count limited to 0..7 */
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+// NOTE(review): if the guard below is false, no filter code is compiled in, yet this branch still returns 0 - confirm this is intended.
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha   	/* save all general purpose registers */
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm4, ebx   	/* copy NRightShift into MM4 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns (skip the first Dest row) */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10360:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10362:
+			/* --- one output pixel per iteration: ESI points at the top-left pixel of the 3x3 window */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row (only the low 4 are used) */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				psrlw mm1, mm4   	/* shift each pixel word right by NRightShift bits */
+				psrlw mm2, mm4   	/* shift each pixel word right by NRightShift bits */
+				psrlw mm3, mm4   	/* shift each pixel word right by NRightShift bits */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words; low word now holds the sum of all 4 words */
+				packuswb mm1, mm0   	/* pack the sum into bytes with unsigned saturation */
+				movd ebx, mm1   	/* copy saturated result into EBX */
+				mov [edi], bl   	/* copy a byte result into Dest */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax   	/* (second of the two row steps back to the window top) */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10362    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src (columns-2 steps were taken above) */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10360    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa   	/* restore the registers saved by pusha */
+		}
+#else	/* GCC__ defined: AT&T-syntax extended asm counterpart of the block above */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* save registers, zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns (skip the first Dest row) */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10362:               \n\t"
+			/* --- one output pixel per iteration: ESI points at the top-left pixel of the 3x3 window */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row (only the low 4 are used) */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"psrlw     %%mm4, %%mm1 \n\t"	/* shift each pixel word right by NRightShift bits */
+			"psrlw     %%mm4, %%mm2 \n\t"	/* shift each pixel word right by NRightShift bits */
+			"psrlw     %%mm4, %%mm3 \n\t"	/* shift each pixel word right by NRightShift bits */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words; low word now holds the sum of all 4 words */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack the sum into bytes with unsigned saturation */
+			"movd      %%mm1, %%ebx \n\t"	/* copy saturated result into EBX */
+			"mov      %%bl, (%%edi) \n\t"	/* copy a byte result into Dest */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"	/* finish moving up, then move Src pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10362 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src (columns-2 steps were taken above) */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10360 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0; NOTE(review): read by the asm although declared "=m", and no clobber list is given - confirm */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif	/* !GCC__ */
+#endif	/* USE_MMX && i386 */
+		return (0);	/* reached whether or not an asm block was compiled in above */
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5.
+\param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied with the kernel. Values above 7 are rejected.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* ---, */
+L10370:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10372:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10372    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10370    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- */
+			".L10370:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10372:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 2D convolution kernel of size 7x7. NOTE(review): the MMX code loads 8 words per kernel row (16-byte row stride) - confirm the expected buffer layout.
+\param NRightShift The number of right bit shifts applied to each unpacked source pixel before it is multiplied by the kernel. Must be <=7.
+
+Note: Only an i386 MMX implementation exists; without it (MMX not detected, or the MMX path not compiled in) Dest is left untouched and -1 is returned.
+
+\return Returns 0 if the filter was applied, -1 on invalid parameters or if no implementation is available.
+*/
+int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+		/* The hand-written MMX kernel below is only built for 32-bit x86 with USE_MMX. */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10380:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10382:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack summed words into bytes with unsigned saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* rewind EDX to the Kernel start (6*16 + 8 bytes advanced) */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10382    	/* check loop termination, proceed if required */
+				add esi, 6   	/* skip the 6 edge columns: move to the next row in Src */
+				add edi, 6   	/* skip the 6 edge columns: move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10380    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack summed words into bytes with unsigned saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* rewind EDX to the Kernel start (6*16 + 8 bytes advanced) */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10382 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* skip the 6 edge columns: move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* skip the 6 edge columns: move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10380 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);	/* the MMX kernel above was compiled in and has run */
+#endif
+	}
+	/* Reached when MMX is not detected at runtime or the MMX path was compiled out
+	   (USE_MMX/i386 not defined): no non-MMX implementation yet, Dest is untouched. */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param NRightShift The number of right bit shifts to apply to the convolution sum. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters: NULL pointers, an image smaller than 9x9 or a shift count above 7 are rejected with -1 */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 9) || (rows < 9) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)	/* Intel-syntax __asm block, used when GCC__ is not defined */
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge (EDI += 4 x columns, one add per row) */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* --- start of one output row --- */
+L10390:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10392:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1: window row 1 of 9; Src bytes 0..7 from the load at ESI, then the low 4 bytes of the load at ESI+1; 12 Kernel words (24 bytes) per row */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte for the second load */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- horizontal sum of the 4 accumulator words --- */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack the summed words into bytes with unsigned saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --- advance to the next pixel --- */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* rewind EDX to the Kernel start (8 rows x 24 bytes + 16); NOTE(review): 216 bytes of Kernel are read, more than 9x9 shorts - confirm expected layout */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10392    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10390    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else	/* GCC__ defined: the same routine as extended asm in AT&T syntax */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge (EDI += 4 x columns, one add per row) */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- start of one output row --- */
+			".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1: window row 1 of 9; Src bytes 0..7 from the load at ESI, then the low 4 bytes of the load at ESI+1; 12 Kernel words (24 bytes) per row */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte for the second load */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- horizontal sum of the 4 accumulator words --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack the summed words into bytes with unsigned saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* --- advance to the next pixel --- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* rewind EDX to the Kernel start (8 rows x 24 bytes + 16); NOTE(review): 216 bytes of Kernel are read, more than 9x9 shorts - confirm expected layout */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10392 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10390 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif	/* !defined(GCC__) */
+#endif	/* defined(USE_MMX) && defined(i386) */
+		return (0);	/* NOTE(review): also reached when the asm above is compiled out (USE_MMX or i386 undefined), so 0 is returned although Dest was never written - confirm intended */
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using SobelX: Dij = saturation255( |(S[i-1][j+1] + 2*S[i][j+1] + S[i+1][j+1]) - (S[i-1][j-1] + 2*S[i][j-1] + S[i+1][j-1])| )
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. Output starts at row 1, column 1.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >7. Each row is processed in columns/8 groups of 8 pixels.
+
+Note: Non-MMX implementation not available for this function.
+NOTE(review): every step loads 8 bytes at column offsets c and c+2 of three rows, so the last step of a row reads up to column 8*(columns/8)+1; when columns % 8 < 2 that is past the row end (and past the end of Src on the last row) -- confirm callers pad Src.
+\return Returns 0 when MMX is detected (the assembly only exists in builds with USE_MMX and i386 defined), -1 on invalid parameters or when MMX is not detected.
+*/
+int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+
+	if ((columns < 8) || (rows < 3))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+/* 32-bit x86 assembly: compiled only when both USE_MMX and i386 are defined; otherwise this branch just returns 0 */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last rows */
+				/* ---, */
+L10400:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				shr ecx, 3   	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				movd mm1, edi   	/* save EDI in MM1 */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10402:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src (top row) */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				movq mm6, [esi]   	/* load 8 bytes from Src (top row, 2 columns to the right) */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				add esi, eax   	/* move to the middle row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (middle row) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  words to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
+				paddw mm4, mm2   	/* add them again: the middle row has weight 2 */
+				paddw mm5, mm3   	/* add them again: the middle row has weight 2 */
+				movq mm2, [esi]   	/* load 8 bytes from Src (middle row, 2 columns to the right) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  words to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
+				paddw mm6, mm2   	/* add them again: the middle row has weight 2 */
+				paddw mm7, mm3   	/* add them again: the middle row has weight 2 */
+				add esi, eax   	/* move to the bottom row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (bottom row) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  words to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src (bottom row, 2 columns to the right) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  words to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* move the 2 high words into the low half */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 (low 2 words: column sum k+2 minus column sum k) */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* move the 2 high words into the low half */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine the low 2 words of MM4 and the low 2 words of MM6 */
+				movq mm2, mm5   	/* copy MM5 into MM2 */
+				psrlq mm5, 32   	/* move the 2 high words into the low half */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* move the 2 high words into the low half */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine the low 2 words of MM5 and the low 2 words of MM7 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's complement of only neg words */
+				pxor mm5, mm7   	/* take 1's complement of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* pack MM4 and MM5 into 8 bytes with unsigned saturation */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax   	/* (second of the two rows stepped down above) */
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10402    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				movd edi, mm1   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10400    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else /* GCC__ defined: same algorithm as AT&T-syntax extended asm */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last rows */
+			/* --- */
+			".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"shr          $3, %%ecx \n\t"	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"movd      %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10402:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src (top row) */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src (top row, 2 columns to the right) */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"add       %%eax, %%esi \n\t"	/* move to the middle row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (middle row) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add them again: the middle row has weight 2 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add them again: the middle row has weight 2 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (middle row, 2 columns to the right) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add them again: the middle row has weight 2 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add them again: the middle row has weight 2 */
+			"add       %%eax, %%esi \n\t"	/* move to the bottom row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (bottom row) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (bottom row, 2 columns to the right) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 (low 2 words: column sum k+2 minus column sum k) */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine the low 2 words of MM4 and the low 2 words of MM6 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine the low 2 words of MM5 and the low 2 words of MM7 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* pack MM4 and MM5 into 8 bytes with unsigned saturation */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10402 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"movd      %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10400 \n\t"	/* check loop termination, proceed if required */
+			/* --- NOTE(review): no clobber list follows; the MMX registers and the memory behind Dest are modified without informing the compiler */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0: only read by the asm, although declared as an output */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns)		/* %3 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using SobelXShiftRight: Dij = saturation255( |(P[i-1][j+1] + 2*P[i][j+1] + P[i+1][j+1]) - (P[i-1][j-1] + 2*P[i][j-1] + P[i+1][j-1])| ), where P = S >> NRightShift
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. Output starts at row 1, column 1.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >7. Each row is processed in columns/8 groups of 8 pixels.
+\param NRightShift The number of right bit shifts applied to each source pixel before it is added to the filter sum. Must be <8 (values above 7 are rejected).
+
+Note: Non-MMX implementation not available for this function.
+NOTE(review): the asm uses the rows parameter itself as its loop counter (decremented in memory although declared as an input operand), and each step loads 8 bytes at column offsets c and c+2, so reads run past the row end when columns % 8 < 2 -- confirm both are acceptable.
+\return Returns 0 when MMX is detected (the assembly only exists in builds with USE_MMX and i386 defined), -1 on invalid parameters or when MMX is not detected.
+*/
+int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+									unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+/* 32-bit x86 assembly: compiled only when both USE_MMX and i386 are defined; otherwise this branch just returns 0 */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm1, ebx   	/* copy NRightShift into MM1 */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				/* initialize ROWS counter */
+				sub rows, 2   	/* do not use first and last rows (the rows variable itself is the counter) */
+				/* ---, */
+L10410:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				shr ecx, 3   	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				mov edx, edi   	/* save EDI in EDX */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10412:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src (top row) */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm4, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm5, mm1   	/* shift right each pixel NshiftRight times */
+				movq mm6, [esi]   	/* load 8 bytes from Src (top row, 2 columns to the right) */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm6, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm7, mm1   	/* shift right each pixel NshiftRight times */
+				add esi, eax   	/* move to the middle row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (middle row) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm4, mm2   	/* add 4 low  words to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
+				paddw mm4, mm2   	/* add them again: the middle row has weight 2 */
+				paddw mm5, mm3   	/* add them again: the middle row has weight 2 */
+				movq mm2, [esi]   	/* load 8 bytes from Src (middle row, 2 columns to the right) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm6, mm2   	/* add 4 low  words to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
+				paddw mm6, mm2   	/* add them again: the middle row has weight 2 */
+				paddw mm7, mm3   	/* add them again: the middle row has weight 2 */
+				add esi, eax   	/* move to the bottom row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (bottom row) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm4, mm2   	/* add 4 low  words to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high words to accumulator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src (bottom row, 2 columns to the right) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm6, mm2   	/* add 4 low  words to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high words to accumulator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* move the 2 high words into the low half */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 (low 2 words: column sum k+2 minus column sum k) */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* move the 2 high words into the low half */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine the low 2 words of MM4 and the low 2 words of MM6 */
+				movq mm2, mm5   	/* copy MM5 into MM2 */
+				psrlq mm5, 32   	/* move the 2 high words into the low half */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* move the 2 high words into the low half */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine the low 2 words of MM5 and the low 2 words of MM7 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's complement of only neg words */
+				pxor mm5, mm7   	/* take 1's complement of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* pack MM4 and MM5 into 8 bytes with unsigned saturation */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax   	/* (second of the two rows stepped down above) */
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10412    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				mov edi, edx   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec rows    	/* decrease loop counter ROWS (kept in the rows variable) */
+				jnz            L10410    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else /* GCC__ defined: same algorithm as AT&T-syntax extended asm */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			/* initialize ROWS counter */
+			"subl            $2, %2 \n\t"	/* do not use first and last rows; NOTE(review): writes to %2 (rows), which is declared as an input operand */
+			/* --- */
+			".L10410:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"shr          $3, %%ecx \n\t"	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10412:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src (top row) */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NshiftRight times */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src (top row, 2 columns to the right) */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NshiftRight times */
+			"add       %%eax, %%esi \n\t"	/* move to the middle row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (middle row) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add them again: the middle row has weight 2 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add them again: the middle row has weight 2 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (middle row, 2 columns to the right) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add them again: the middle row has weight 2 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add them again: the middle row has weight 2 */
+			"add       %%eax, %%esi \n\t"	/* move to the bottom row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (bottom row) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  words to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high words to accumulator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (bottom row, 2 columns to the right) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  words to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high words to accumulator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 (low 2 words: column sum k+2 minus column sum k) */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine the low 2 words of MM4 and the low 2 words of MM6 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* move the 2 high words into the low half */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine the low 2 words of MM5 and the low 2 words of MM7 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* pack MM4 and MM5 into 8 bytes with unsigned saturation */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"decl                %2 \n\t"	/* decrease loop counter ROWS (kept in the rows variable, operand %2) */
+			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
+			/* --- NOTE(review): no clobber list follows; the MMX registers and the memory behind Dest are modified without informing the compiler */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0: only read by the asm, although declared as an output */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(NRightShift)	/* %4 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Align stack to 32 byte boundary (32-bit x86 only): the old ESP is stored at the new, aligned stack top so that SDL_imageFilterRestoreStack() can reload it.
+\note Does nothing unless both USE_MMX and i386 are defined (same guard as the MMX filters): the asm works on the 32-bit ESP/EBX registers and is not valid for a 64-bit stack pointer. */
+void SDL_imageFilterAlignStack(void)
+{
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+	__asm
+	{				/* --- stack alignment --- */
+		mov ebx, esp   	/* load ESP into EBX */
+			sub ebx, 4   	/* reserve space on stack for old value of ESP */
+			and ebx, -32   	/* align EBX along a 32 byte boundary */
+			mov [ebx], esp   	/* save old value of ESP in stack, behind the bndry */
+			mov esp, ebx   	/* align ESP along a 32 byte boundary */
+	}
+#else
+	asm volatile
+		(				/* --- stack alignment --- */
+		"mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
+		"sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
+		"and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
+		"mov     %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the bndry */
+		"mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
+		::);	/* NOTE(review): no clobbers are declared although EBX and ESP are modified -- confirm the compiler's prologue/epilogue tolerates this */
+#endif
+#endif
+}
+
+/*!
+\brief Restore previously aligned stack (32-bit x86 only): reloads ESP from the value stored at the current stack top by SDL_imageFilterAlignStack().
+\note Does nothing unless both USE_MMX and i386 are defined, so it always stays paired with SDL_imageFilterAlignStack(). */
+void SDL_imageFilterRestoreStack(void)
+{
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+	__asm
+	{				/* --- restoring old stack --- */
+		mov ebx, [esp]   	/* load old value of ESP */
+		mov esp, ebx   	/* restore old value of ESP */
+	}
+#else
+	asm volatile
+		(				/* --- restoring old stack --- */
+		"mov     (%%esp), %%ebx \n\t"	/* load old value of ESP */
+		"mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
+		::);	/* NOTE(review): no clobbers are declared although EBX and ESP are modified */
+#endif
+#endif
+}
diff --git a/vendor/SDL3_gfx/SDL3_imageFilter.h b/vendor/SDL3_gfx/SDL3_imageFilter.h
new file mode 100644
index 0000000..3a89c37
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_imageFilter.h
@@ -0,0 +1,166 @@
+/*
+
+SDL3_imageFilter.h: byte-image "filter" routines 
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL3_imageFilter_h
+#define _SDL3_imageFilter_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL3_GFX_DLL_IMPORT)
+#    define SDL3_IMAGEFILTER_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL3_GFX_DLL_IMPORT
+#      define SDL3_IMAGEFILTER_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL3_IMAGEFILTER_SCOPE	/* no DLL import/export decoration selected above: fall back to plain extern */
+#  define SDL3_IMAGEFILTER_SCOPE extern
+#endif
+
+	/* Comments:                                                                           */
+	/*  1.) MMX functions work best if all data blocks are aligned on a 32 bytes boundary. */
+	/*  2.) Data that is not within an 8 byte boundary is processed using the C routine.   */
+	/*  3.) Convolution routines do not have C routines at this time.                      */
+
+	// Detect MMX capability in CPU
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMMXdetect(void);
+
+	// Force use of MMX off (or turn possible use back on)
+	SDL3_IMAGEFILTER_SCOPE void SDL_imageFilterMMXoff(void);
+	SDL3_IMAGEFILTER_SCOPE void SDL_imageFilterMMXon(void);
+
+	//
+	// All routines return:
+	//   0   OK
+	//  -1   Error (internal error, parameter error)
+	//
+
+	//  SDL_imageFilterAdd: D = saturation255(S1 + S2)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMean: D = S1/2 + S2/2
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterSub: D = saturation0(S1 - S2)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAbsDiff: D = | S1 - S2 |
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMult: D = saturation(S1 * S2)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultNor: D = S1 * S2   (non-MMX)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultDivby2: D = saturation255(S1/2 * S2)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterMultDivby4: D = saturation255(S1/2 * S2/2)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterBitAnd: D = S1 & S2
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitOr: D = S1 | S2
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterDiv: D = S1 / S2   (non-MMX)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitNegation: D = !S   (NOTE(review): the name suggests bitwise ~S rather than logical !S - confirm against the implementation)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAddByte: D = saturation255(S + C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterAddUint: D = saturation255(S + (uint)C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterAddByteToHalf: D = saturation255(S/2 + C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char C);
+
+	//  SDL_imageFilterSubByte: D = saturation0(S - C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterSubUint: D = saturation0(S - (uint)C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterShiftRight: D = saturation0(S >> N)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterShiftRightUint: D = saturation0((uint)S >> N)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterMultByByte: D = saturation255(S * C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterShiftRightAndMultByByte: D = saturation255((S >> N) * C)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N, unsigned char C);
+
+	//  SDL_imageFilterShiftLeftByte: D = (S << N)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeftUint: D = ((uint)S << N)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeft: D = saturation255(S << N)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterBinarizeUsingThreshold: D = S >= T ? 255:0
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char T);
+
+	//  SDL_imageFilterClipToRange: D = (S >= Tmin) & (S <= Tmax) 255:0   (NOTE(review): formula looks incomplete - no '?' and no clipped value; confirm against the implementation)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char Tmin, unsigned char Tmax);
+
+	//  SDL_imageFilterNormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+	SDL3_IMAGEFILTER_SCOPE int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin,
+		int Cmax, int Nmin, int Nmax);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL3_imageFilter_h */
diff --git a/vendor/SDL3_gfx/SDL3_rotozoom.c b/vendor/SDL3_gfx/SDL3_rotozoom.c
new file mode 100644
index 0000000..0ba72ac
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_rotozoom.c
@@ -0,0 +1,1636 @@
+/*  
+
+SDL3_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "SDL3_rotozoom.h"
+
+/*! 
+\brief Returns maximum of two numbers a and b (function-like macro: each argument may be evaluated twice, so pass side-effect-free expressions).
+*/
+#define MAX(a,b)    (((a) > (b)) ? (a) : (b))
+
+/*! 
+\brief Number of guard rows added to destination surfaces.
+
+This is a simple but effective workaround for observed issues.
+These rows allocate extra memory and are then hidden from the surface.
+Rows are added to the end of destination surfaces when they are allocated. 
+This catches any potential overflows which seem to happen with 
+just the right src image dimensions and scale/rotation and can lead
+to a situation where the program can segfault.
+*/
+#define GUARD_ROWS (2)
+
+/*!
+\brief Lower limit of absolute zoom factor or rotation degrees.
+*/
+#define VALUE_LIMIT	0.001
+
+/*!
+\brief Fetches the color key of 'src' through SDL_GetSurfaceColorKey(); yields 0 when that call leaves the value untouched.
+*/
+Uint32 _colorkey(SDL_Surface *src)
+{
+	Uint32 colorkey = 0;	/* fallback value if SDL writes nothing */
+	(void) SDL_GetSurfaceColorKey(src, &colorkey);
+	return (colorkey);
+}
+
+
+/*! 
+\brief Internal 32 bit integer-factor averaging Shrinker.
+
+Shrinks 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Averages color and alpha values of src pixels to calculate dst pixels.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio (source pixels averaged per destination pixel in x).
+\param factory The vertical shrinking ratio (source rows averaged per destination pixel in y).
+
+\return Always 0; this implementation has no error path.
+*/
+int _shrinkSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, ra, ga, ba, aa;
+	int n_average;
+	SDL_Color *sp, *osp, *oosp;
+	SDL_Color *dp;
+
+	/*
+	* Averaging integer shrink
+	*/
+
+	/* Precalculate division factor (pixels per source box). NOTE(review): a zero factor would divide by zero below - presumably rejected by the caller */
+	n_average = factorx*factory;
+
+	/*
+	* Scan destination
+	*/
+	sp = (SDL_Color *) src->pixels;
+	
+	dp = (SDL_Color *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;	/* padding bytes at the end of each destination row */
+
+	for (y = 0; y < dst->h; y++) {
+
+		osp=sp;	/* start of the current row of source boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp=sp;	/* top-left pixel of the current source box */
+			ra=ga=ba=aa=0;
+			for (dy=0; dy < factory; dy++) {
+				for (dx=0; dx < factorx; dx++) {
+					ra += sp->r;
+					ga += sp->g;
+					ba += sp->b;
+					aa += sp->a;
+
+					sp++;
+				} 
+				/* src dx loop */
+				sp = (SDL_Color *)((Uint8*)sp + (src->pitch - 4*factorx)); // next y
+			}
+			/* src dy loop */
+
+			/* next box-x */
+			sp = (SDL_Color *)((Uint8*)oosp + 4*factorx);
+
+			/* Store result in destination */
+			dp->r = ra/n_average;
+			dp->g = ga/n_average;
+			dp->b = ba/n_average;
+			dp->a = aa/n_average;
+
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		} 
+		/* dst x loop */
+
+		/* next box-y */
+		sp = (SDL_Color *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp = (SDL_Color *) ((Uint8 *) dp + dgap);
+	} 
+	/* dst y loop */
+
+	return (0);
+}
+
+/*! 
+\brief Internal 8 bit integer-factor averaging shrinker.
+
+Shrinks 8bit Y 'src' surface to 'dst' surface.
+Averages color (brightness) values of src pixels to calculate dst pixels.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio (source pixels averaged per destination pixel in x).
+\param factory The vertical shrinking ratio (source rows averaged per destination pixel in y).
+
+\return Always 0; this implementation has no error path.
+*/
+int _shrinkSurfaceY(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, a;
+	int n_average;
+	Uint8 *sp, *osp, *oosp;
+	Uint8 *dp;
+
+	/*
+	* Averaging integer shrink
+	*/
+
+	/* Precalculate division factor (pixels per source box). NOTE(review): a zero factor would divide by zero below - presumably rejected by the caller */
+	n_average = factorx*factory;
+
+	/*
+	* Scan destination
+	*/
+	sp = (Uint8 *) src->pixels;
+
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;	/* padding bytes at the end of each destination row */
+
+	for (y = 0; y < dst->h; y++) {    
+
+		osp=sp;	/* start of the current row of source boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp=sp;	/* top-left pixel of the current source box */
+			a=0;
+			for (dy=0; dy < factory; dy++) {
+				for (dx=0; dx < factorx; dx++) {
+					a += (*sp);
+					/* next x */           
+					sp++;
+				} 
+				/* end src dx loop */         
+				/* next y */
+				sp = (Uint8 *)((Uint8*)sp + (src->pitch - factorx)); 
+			} 
+			/* end src dy loop */
+
+			/* next box-x */
+			sp = (Uint8 *)((Uint8*)oosp + factorx);
+
+			/* Store result in destination */
+			*dp = a/n_average;
+
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		} 
+		/* end dst x loop */
+
+		/* next box-y */
+		sp = (Uint8 *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp = (Uint8 *)((Uint8 *)dp + dgap);
+	} 
+	/* end dst y loop */
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit Zoomer with optional anti-aliasing by bilinear interpolation.
+
+Zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+A destination dimension too small to define a scale (1 when smoothing, 0 otherwise) uses a zero step instead of dividing by zero.
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return 0 for success or -1 if the temporary increment tables cannot be allocated.
+*/
+int _zoomSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy, int smooth)
+{
+	int x, y, sx, sy, ssx, ssy, *sax, *say, *csax, *csay, *salast, csx, csy, ex, ey, cx, cy, sstep, sstepx, sstepy;
+	SDL_Color *c00, *c01, *c10, *c11;
+	SDL_Color *sp, *csp, *dp;
+	int spixelgap, spixelw, spixelh, dgap, t1, t2;
+
+	/*
+	* Allocate memory for row/column increments (one extra entry: the loops read element [i+1])
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(*sax))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(*say))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Precalculate 16.16 fixed point steps; degenerate destination sizes get step 0 (no float division by zero)
+	*/
+	spixelw = (src->w - 1);
+	spixelh = (src->h - 1);
+	if (smooth) {
+		sx = (dst->w > 1) ? (int) (65536.0 * (float) spixelw / (float) (dst->w - 1)) : 0;
+		sy = (dst->h > 1) ? (int) (65536.0 * (float) spixelh / (float) (dst->h - 1)) : 0;
+	} else {
+		sx = (dst->w > 0) ? (int) (65536.0 * (float) (src->w) / (float) (dst->w)) : 0;
+		sy = (dst->h > 0) ? (int) (65536.0 * (float) (src->h) / (float) (dst->h)) : 0;
+	}
+
+	/* Maximum scaled source size */
+	ssx = (src->w << 16) - 1;
+	ssy = (src->h << 16) - 1;
+
+	/* Precalculate horizontal row increments */
+	csx = 0;
+	csax = sax;
+	for (x = 0; x <= dst->w; x++) {
+		*csax = csx;
+		csax++;
+		csx += sx;
+
+		/* Guard from overflows */
+		if (csx > ssx) { 
+			csx = ssx; 
+		}
+	}
+
+	/* Precalculate vertical row increments */
+	csy = 0;
+	csay = say;
+	for (y = 0; y <= dst->h; y++) {
+		*csay = csy;
+		csay++;
+		csy += sy;
+
+		/* Guard from overflows */
+		if (csy > ssy) {
+			csy = ssy;
+		}
+	}
+
+	sp = (SDL_Color *) src->pixels;
+	dp = (SDL_Color *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;
+	spixelgap = src->pitch/4;
+
+	/* When flipping, start at the last source column/row and walk backwards */
+	if (flipx) sp += spixelw;
+	if (flipy) sp += (spixelgap * spixelh);
+
+	/*
+	* Switch between interpolating and non-interpolating code 
+	*/
+	if (smooth) {
+
+		/*
+		* Interpolating Zoom 
+		*/
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Setup color source pointers (ex/ey: fractional part, cx/cy: integer source position)
+				*/
+				ex = (*csax & 0xffff);
+				ey = (*csay & 0xffff);
+				cx = (*csax >> 16);
+				cy = (*csay >> 16);
+				sstepx = cx < spixelw;
+				sstepy = cy < spixelh;
+				c00 = sp;
+				c01 = sp;
+				c10 = sp;
+				if (sstepy) {
+					if (flipy) {
+						c10 -= spixelgap;
+					} else {
+						c10 += spixelgap;
+					}
+				}
+				c11 = c10;
+				if (sstepx) {
+					if (flipx) {
+						c01--;
+						c11--;
+					} else {
+						c01++;
+						c11++;
+					}
+				}
+
+				/*
+				* Draw and interpolate colors 
+				*/
+				t1 = ((((c01->r - c00->r) * ex) >> 16) + c00->r) & 0xff;
+				t2 = ((((c11->r - c10->r) * ex) >> 16) + c10->r) & 0xff;
+				dp->r = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->g - c00->g) * ex) >> 16) + c00->g) & 0xff;
+				t2 = ((((c11->g - c10->g) * ex) >> 16) + c10->g) & 0xff;
+				dp->g = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->b - c00->b) * ex) >> 16) + c00->b) & 0xff;
+				t2 = ((((c11->b - c10->b) * ex) >> 16) + c10->b) & 0xff;
+				dp->b = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->a - c00->a) * ex) >> 16) + c00->a) & 0xff;
+				t2 = ((((c11->a - c10->a) * ex) >> 16) + c10->a) & 0xff;
+				dp->a = (((t2 - t1) * ey) >> 16) + t1;				
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) {
+					sp -= sstep;
+				} else {
+					sp += sstep;
+				}
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) { 
+				sp = csp - sstep;
+			} else {
+				sp = csp + sstep;
+			}
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (SDL_Color *) ((Uint8 *) dp + dgap);
+		}
+	} else {
+		/*
+		* Non-Interpolating Zoom 
+		*/		
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Draw 
+				*/
+				*dp = *sp;
+
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) sstep = -sstep;
+				sp += sstep;
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) sstep = -sstep;			
+			sp = csp + sstep;
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (SDL_Color *) ((Uint8 *) dp + dgap);
+		}
+	}
+
+	/*
+	* Remove temp arrays 
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+
+\brief Internal 8 bit Zoomer without smoothing.
+
+Zooms 8bit palette/Y 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+Per-pixel source steps are stored as signed ints so that flipped (negative) steps move the source pointers backwards on every pointer width.
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+
+\return 0 for success or -1 if the temporary increment tables cannot be allocated.
+*/
+int _zoomSurfaceY(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy)
+{
+	int x, y;
+	int *sax, *say, *csax, *csay;
+	int csx, csy;
+	Uint8 *sp, *dp, *csp;
+	int dgap;
+
+	/*
+	* Allocate memory for row increments 
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(*sax))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(*say))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Pointer setup (when flipping, start at the last source column/row)
+	*/
+	sp = csp = (Uint8 *) src->pixels;
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;
+
+	if (flipx) csp += (src->w-1);
+	if (flipy) csp  = ( (Uint8*)csp + src->pitch*(src->h-1) );
+
+	/*
+	* Precalculate row increments: whole source pixels to skip per destination pixel, negated when flipped
+	*/
+	csx = 0;
+	csax = sax;
+	for (x = 0; x < dst->w; x++) {
+		csx += src->w;
+		*csax = 0;
+		while (csx >= dst->w) {
+			csx -= dst->w;
+			(*csax)++;
+		}
+		(*csax) = (*csax) * (flipx ? -1 : 1);
+		csax++;
+	}
+	csy = 0;
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csy += src->h;
+		*csay = 0;
+		while (csy >= dst->h) {
+			csy -= dst->h;
+			(*csay)++;
+		}
+		(*csay) = (*csay) * (flipy ? -1 : 1);
+		csay++;
+	}
+
+	/*
+	* Draw 
+	*/
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csax = sax;
+		sp = csp;
+		for (x = 0; x < dst->w; x++) {
+			/*
+			* Draw 
+			*/
+			*dp = *sp;
+			/*
+			* Advance source pointers (signed step: may be negative when flipx is set)
+			*/
+			sp += (*csax);
+			csax++;
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		}
+		/*
+		* Advance source pointer (for row; signed step: may be negative when flipy is set)
+		*/
+		csp += ((*csay) * src->pitch);
+		csay++;
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp += dgap;
+	}
+
+	/*
+	* Remove temp arrays 
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit rotozoomer with optional anti-aliasing.
+
+Rotates and zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface and applying optionally anti-aliasing
+by bilinear interpolation.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+Destination pixels whose computed source coordinate fails the bounds test are not written.
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle.
+\param icos Integer version of cosine of angle.
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+\param smooth Flag indicating anti-aliasing should be used.
+*/
+void _transformSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy, int smooth)
+{
+	int x, y, t1, t2, dx, dy, xd, yd, sdx, sdy, ax, ay, ex, ey, sw, sh;
+	SDL_Color c00, c01, c10, c11, cswap;
+	SDL_Color *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup (source positions are tracked with the integer part in the bits above bit 15, see the >> 16 below)
+	*/
+	xd = ((src->w - dst->w) << 15);
+	yd = ((src->h - dst->h) << 15);
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	sw = src->w - 1;
+	sh = src->h - 1;
+	pc = (SDL_Color *) dst->pixels;
+	gap = dst->pitch - dst->w * 4;
+
+	/*
+	* Switch between interpolating and non-interpolating code 
+	*/
+	if (smooth) {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;
+			sdy = (ay - (icos * dy)) + yd;
+			for (x = 0; x < dst->w; x++) {
+				dx = (sdx >> 16);
+				dy = (sdy >> 16);
+				if (flipx) dx = sw - dx;
+				if (flipy) dy = sh - dy;
+				/* the 2x2 neighbourhood read below needs dx+1 and dy+1 to be inside the source */
+				if ((dx > -1) && (dy > -1) && (dx < (src->w-1)) && (dy < (src->h-1))) {
+					sp = (SDL_Color *)src->pixels;;
+					sp += ((src->pitch/4) * dy);
+					sp += dx;
+					c00 = *sp;
+					sp += 1;
+					c01 = *sp;
+					sp += (src->pitch/4);
+					c11 = *sp;
+					sp -= 1;
+					c10 = *sp;
+					if (flipx) {
+						cswap = c00; c00=c01; c01=cswap;
+						cswap = c10; c10=c11; c11=cswap;
+					}
+					if (flipy) {
+						cswap = c00; c00=c10; c10=cswap;
+						cswap = c01; c01=c11; c11=cswap;
+					}
+					/*
+					* Interpolate colors 
+					*/
+					ex = (sdx & 0xffff);
+					ey = (sdy & 0xffff);
+					t1 = ((((c01.r - c00.r) * ex) >> 16) + c00.r) & 0xff;
+					t2 = ((((c11.r - c10.r) * ex) >> 16) + c10.r) & 0xff;
+					pc->r = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.g - c00.g) * ex) >> 16) + c00.g) & 0xff;
+					t2 = ((((c11.g - c10.g) * ex) >> 16) + c10.g) & 0xff;
+					pc->g = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.b - c00.b) * ex) >> 16) + c00.b) & 0xff;
+					t2 = ((((c11.b - c10.b) * ex) >> 16) + c10.b) & 0xff;
+					pc->b = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.a - c00.a) * ex) >> 16) + c00.a) & 0xff;
+					t2 = ((((c11.a - c10.a) * ex) >> 16) + c10.a) & 0xff;
+					pc->a = (((t2 - t1) * ey) >> 16) + t1;
+				}
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (SDL_Color *) ((Uint8 *) pc + gap);
+		}
+	} else {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;
+			sdy = (ay - (icos * dy)) + yd;
+			for (x = 0; x < dst->w; x++) {
+				dx = (short) (sdx >> 16);	/* NOTE(review): coordinate is narrowed to short here, unlike the smooth branch */
+				dy = (short) (sdy >> 16);
+				if (flipx) dx = (src->w-1)-dx;
+				if (flipy) dy = (src->h-1)-dy;
+				if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {
+					sp = (SDL_Color *) ((Uint8 *) src->pixels + src->pitch * dy);
+					sp += dx;
+					*pc = *sp;
+				}
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (SDL_Color *) ((Uint8 *) pc + gap);
+		}
+	}
+}
+
+/*!
+
+\brief Rotates and zooms 8 bit palette/Y 'src' surface to 'dst' surface without smoothing.
+
+Rotates and zooms 8 bit (one byte per pixel) 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+The destination is first filled with the low byte of the source color key; pixels that map outside the source keep that value.
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle.
+\param icos Integer version of cosine of angle.
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+*/
+void transformSurfaceY(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy)
+{
+	int x, y, dx, dy, xd, yd, sdx, sdy, ax, ay;
+	Uint8 *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup 
+	*/
+	xd = ((src->w - dst->w) << 15);
+	yd = ((src->h - dst->h) << 15);
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	pc = (Uint8 *) dst->pixels;
+	gap = dst->pitch - dst->w;	/* padding bytes at the end of each destination row */
+	/*
+	* Clear surface to colorkey (whole pitch * h byte range, padding included)
+	*/ 	
+	memset(pc, (int)(_colorkey(src) & 0xff), dst->pitch * dst->h);
+	/*
+	* Iterate through destination surface 
+	*/
+	for (y = 0; y < dst->h; y++) {
+		dy = cy - y;
+		sdx = (ax + (isin * dy)) + xd;
+		sdy = (ay - (icos * dy)) + yd;
+		for (x = 0; x < dst->w; x++) {
+			dx = (short) (sdx >> 16);	/* integer source column, narrowed to short */
+			dy = (short) (sdy >> 16);	/* integer source row, narrowed to short */
+			if (flipx) dx = (src->w-1)-dx;
+			if (flipy) dy = (src->h-1)-dy;
+			if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {
+				sp = (Uint8 *) (src->pixels);
+				sp += (src->pitch * dy + dx);
+				*pc = *sp;
+			}
+			sdx += icos;
+			sdy += isin;
+			pc++;
+		}
+		pc += gap;
+	}
+}
+
+/*!
+\brief Rotates a 8/16/24/32 bit surface in increments of 90 degrees.
+
+Specialized 90 degree rotator which rotates a 'src' surface in 90 degree 
+increments clockwise returning a new surface. Faster than rotozoomer since
+no scanning or interpolation takes place. Input surface must be 8/16/24/32 bit.
+(code contributed by J. Schiller, improved by C. Allport and A. Schiffler)
+
+\param src Source surface to rotate.
+\param numClockwiseTurns Number of clockwise 90 degree turns to apply to the source (negative values are normalized into 0..3).
+
+\returns The new, rotated surface; or NULL (with the SDL error set) for a NULL/unknown-format source, unavailable format details, a bit depth that is not a multiple of 8, or a failed allocation.
+*/
+SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns) 
+{
+	int row, col, newWidth, newHeight;
+	int bpp, bpr;
+	SDL_Surface* dst;
+	Uint8* srcBuf;
+	Uint8* dstBuf;
+	int normalizedClockwiseTurns;
+	const SDL_PixelFormatDetails* details;
+
+	/* Has to be a valid surface pointer and be a Nbit surface where n is divisible by 8 */
+	if (!src || 
+	    !src->format) {
+		SDL_SetError("NULL source surface or source surface format");
+	    return NULL; 
+	}
+
+	/* SDL_GetPixelFormatDetails() may return NULL; treat that like an unusable bit depth instead of dereferencing it */
+	details = SDL_GetPixelFormatDetails(src->format);
+	if (!details || (details->bits_per_pixel % 8) != 0) {
+		SDL_SetError("Invalid source surface bit depth");
+	    return NULL; 
+	}
+
+	/* normalize numClockwiseTurns */
+	normalizedClockwiseTurns = (numClockwiseTurns % 4);
+	if (normalizedClockwiseTurns < 0) {
+		normalizedClockwiseTurns += 4;
+	}
+
+	/* If turns are even, our new width/height will be the same as the source surface */
+	if (normalizedClockwiseTurns % 2) {
+		newWidth = src->h;
+		newHeight = src->w;
+	} else {
+		newWidth = src->w;
+		newHeight = src->h;
+	}
+
+	dst = SDL_CreateSurface(newWidth, newHeight, src->format);
+	if(!dst) {
+		SDL_SetError("Could not create destination surface"); 
+		return NULL;
+	}
+
+	if (SDL_MUSTLOCK(src)) {
+		SDL_LockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_LockSurface(dst);
+	}
+
+	/* Calculate byte-per-pixel */
+	bpp = details->bits_per_pixel / 8;
+
+	switch(normalizedClockwiseTurns) {
+	case 0: /* Make a copy of the surface */
+		{
+			/* Unfortunately SDL_BlitSurface cannot be used to make a copy of the surface
+			since it does not preserve alpha. */
+
+			if (src->pitch == dst->pitch) {
+				/* If the pitch is the same for both surfaces, the memory can be copied all at once. */
+				memcpy(dst->pixels, src->pixels, (src->h * src->pitch));
+			}
+			else
+			{
+				/* If the pitch differs, copy each row separately */
+				srcBuf = (Uint8*)(src->pixels);
+				dstBuf = (Uint8*)(dst->pixels);
+				bpr = src->w * bpp;
+				for (row = 0; row < src->h; row++) {
+					memcpy(dstBuf, srcBuf, bpr);
+					srcBuf += src->pitch;
+					dstBuf += dst->pitch;
+				}
+			}
+		}
+		break;
+
+		/* rotate clockwise */
+	case 1: /* rotated 90 degrees clockwise: source row 'row' becomes destination column (dst->w - row - 1), written top to bottom */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + (dst->w - row - 1) * bpp;
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf += dst->pitch;
+				} 
+			} 
+		}
+		break;
+
+	case 2: /* rotated 180 degrees clockwise: source row 'row' becomes destination row (dst->h - row - 1), written right to left */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + ((dst->h - row - 1) * dst->pitch) + (dst->w - 1) * bpp;
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf -= bpp;
+				} 
+			} 
+		}
+		break;
+
+	case 3: /* rotated 270 degrees clockwise: source row 'row' becomes destination column 'row', written bottom to top */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + (row * bpp) + ((dst->h - 1) * dst->pitch);
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf -= dst->pitch;
+				} 
+			} 
+		}
+		break;
+	} 
+	/* end switch */
+
+	if (SDL_MUSTLOCK(src)) {
+		SDL_UnlockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return dst;
+}
+
+
+/*!
+\brief Internal target surface sizing function for rotozooms with trig result return. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the destination surface (always even and at least 2).
+\param dstheight The calculated height of the destination surface (always even and at least 2).
+\param canglezoom Receives the cosine of the angle multiplied by zoomy.
+\param sanglezoom Receives the sine of the angle multiplied by zoomx.
+
+*/
+void _rotozoomSurfaceSizeTrig(int width, int height, double angle, double zoomx, double zoomy, 
+	int *dstwidth, int *dstheight, 
+	double *canglezoom, double *sanglezoom)
+{
+	double x, y, cx, cy, sx, sy;
+	double radangle;
+	int dstwidthhalf, dstheighthalf;
+
+	/*
+	* Determine destination width and height by rotating a centered source box 
+	*/
+	radangle = angle * (M_PI / 180.0);
+	*sanglezoom = sin(radangle);
+	*canglezoom = cos(radangle);
+	*sanglezoom *= zoomx;
+	*canglezoom *= zoomy;
+	x = (double)(width / 2);	/* integer division: half extent is truncated before the conversion */
+	y = (double)(height / 2);
+	cx = *canglezoom * x;
+	cy = *canglezoom * y;
+	sx = *sanglezoom * x;
+	sy = *sanglezoom * y;
+
+	/* half extents: largest absolute rotated corner coordinate, rounded up, never below 1 */
+	dstwidthhalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(cx + sy), fabs(cx - sy)), fabs(-cx + sy)), fabs(-cx - sy))), 1);
+	dstheighthalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(sx + cy), fabs(sx - cy)), fabs(-sx + cy)), fabs(-sx - cy))), 1);
+	*dstwidth = 2 * dstwidthhalf;
+	*dstheight = 2 * dstheighthalf;
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurfaceXY() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSizeXY(int width, int height, double angle, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	/* Only the size is wanted here; the scaled trig terms land in throwaway locals */
+	double unused_trig_a, unused_trig_b;
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoomx, zoomy, dstwidth, dstheight, &unused_trig_a, &unused_trig_b);
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurface() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor.
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth, int *dstheight)
+{
+	/* Uniform zoom: the same factor is used for both axes; trig outputs are discarded */
+	double unused_trig_a, unused_trig_b;
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoom, zoom, dstwidth, dstheight, &unused_trig_a, &unused_trig_b);
+}
+
+/*!
+\brief Rotates and zooms a surface with optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to a newly created 'dst' surface.
+'angle' is the rotation in degrees and 'zoom' a scaling factor. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor, used for both the horizontal and the vertical axis.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+\return The new rotozoomed surface, exactly as returned by rotozoomSurfaceXY().
+*/
+SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth)
+{
+	SDL_Surface *rotozoomed = rotozoomSurfaceXY(src, angle, zoom, zoom, smooth);
+	return rotozoomed;
+}
+
+/*!
+\brief Rotates and zooms a surface with different horizontal and vertical scaling factors and optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to a newly created 'dst' surface.
+'angle' is the rotation in degrees, 'zoomx' and 'zoomy' scaling factors. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new rotozoomed surface.
+*/
+SDL_Surface *rotozoomSurfaceXY(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	double zoominv;
+	double sanglezoom, canglezoom, sanglezoominv, canglezoominv;
+	int dstwidthhalf, dstwidth, dstheighthalf, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx,flipy;
+	const SDL_PixelFormatDetails* details;
+	SDL_Palette* pal_dst = NULL;
+	SDL_Palette* pal_src;
+
+	/*
+	* Sanity check 
+	*/
+	if (src == NULL) {
+		return (NULL);
+	}
+
+	/*
+	* Determine if source surface is 32bit or 8bit; fail if the format cannot be described 
+	*/
+	details = SDL_GetPixelFormatDetails(src->format);
+	if (details == NULL) {
+		return (NULL);
+	}
+	is32bit = (details->bits_per_pixel == 32);
+	if ((is32bit) || (details->bits_per_pixel == 8)) {
+		/* Use source surface 'as is' */
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering;
+		* this temporary copy is destroyed again before returning 
+		*/
+		rz_src = SDL_CreateSurface(src->w, src->h, SDL_PIXELFORMAT_RGBA32);
+		if (rz_src == NULL) {
+			return (NULL);
+		}
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/* Sanity check zoom factor; a negative factor selects a flip on that axis */
+	flipx = (zoomx<0.0);
+	if (flipx) zoomx=-zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy=-zoomy;
+	if (zoomx < VALUE_LIMIT) zoomx = VALUE_LIMIT;
+	if (zoomy < VALUE_LIMIT) zoomy = VALUE_LIMIT;
+	zoominv = 65536.0 / (zoomx * zoomx);
+
+	/*
+	* Check if we have a rotozoom or just a zoom 
+	*/
+	if (fabs(angle) > VALUE_LIMIT) {
+
+		/* Angle!=0: full rotozoom */
+
+		/* Determine target size */
+		_rotozoomSurfaceSizeTrig(rz_src->w, rz_src->h, angle, zoomx, zoomy, &dstwidth, &dstheight, &canglezoom, &sanglezoom);
+
+		/*
+		* Calculate target factors from sin/cos and zoom 
+		*/
+		sanglezoominv = sanglezoom;
+		canglezoominv = canglezoom;
+		sanglezoominv *= zoominv;
+		canglezoominv *= zoominv;
+
+		/* Calculate half size */
+		dstwidthhalf = dstwidth / 2;
+		dstheighthalf = dstheight / 2;
+
+		/*
+		* Alloc space to completely contain the rotated surface 
+		*/
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering 
+			*/
+			rz_dst =
+				SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, rz_src->format);
+		} else {
+			/*
+			* Target surface is 8bit 
+			*/
+			rz_dst = SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, SDL_PIXELFORMAT_INDEX8);
+			pal_dst = SDL_CreateSurfacePalette(rz_dst);
+		}
+
+		/* Check target; drop the temporary converted source on failure */
+		if (rz_dst == NULL) {
+			if (src_converted) {
+				SDL_DestroySurface(rz_src);
+			}
+			return NULL;
+		}
+
+		/* Adjust for guard rows */
+		rz_dst->h = dstheight;
+
+		/*
+		* Lock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have 
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the rotation (using alpha) 
+			*/
+			_transformSurfaceRGBA(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv), 
+				flipx, flipy,
+				smooth);
+		} else {
+			/*
+			* Copy palette info; skipped when either surface has no palette attached 
+			*/
+			pal_src = SDL_GetSurfacePalette(rz_src);
+			if ((pal_src != NULL) && (pal_dst != NULL)) {
+				for (i = 0; i < pal_src->ncolors; i++) {
+					pal_dst->colors[i] = pal_src->colors[i];
+				}
+				pal_dst->ncolors = pal_src->ncolors;
+			}
+			/*
+			* Call the 8bit transformation routine to do the rotation 
+			*/
+			transformSurfaceY(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv),
+				flipx, flipy);
+		}
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+	} else {
+
+		/* Angle=0: Just a zoom */
+
+		/*
+		* Calculate target size
+		*/
+		zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+		/*
+		* Alloc space to completely contain the zoomed surface 
+		*/
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering 
+			*/
+			rz_dst =
+				SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, rz_src->format);
+		} else {
+			/*
+			* Target surface is 8bit 
+			*/
+			rz_dst = SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, SDL_PIXELFORMAT_INDEX8);
+			pal_dst = SDL_CreateSurfacePalette(rz_dst);
+		}
+
+		/* Check target; drop the temporary converted source on failure */
+		if (rz_dst == NULL) {
+			if (src_converted) {
+				SDL_DestroySurface(rz_src);
+			}
+			return NULL;
+		}
+
+		/* Adjust for guard rows */
+		rz_dst->h = dstheight;
+
+		/*
+		* Lock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have 
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the zooming (using alpha) 
+			*/
+			_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+
+		} else {
+			/*
+			* Copy palette info; skipped when either surface has no palette attached 
+			*/
+			pal_src = SDL_GetSurfacePalette(rz_src);
+			if ((pal_src != NULL) && (pal_dst != NULL)) {
+				for (i = 0; i < pal_src->ncolors; i++) {
+					pal_dst->colors[i] = pal_src->colors[i];
+				}
+				pal_dst->ncolors = pal_src->ncolors;
+			}
+
+			/*
+			* Call the 8bit transformation routine to do the zooming 
+			*/
+			_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+		}
+
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+	}
+
+	/* Cleanup temp surface (only present when the source had to be converted) */
+	if (src_converted) {
+		SDL_DestroySurface(rz_src);
+	}
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
+
+/*!
+\brief Calculates the size of the target surface for a zoomSurface() call.
+
+The minimum size of the target surface is 1. The input factors can be positive or negative.
+
+\param width The width of the source surface to zoom.
+\param height The height of the source surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param dstwidth Pointer to an integer to store the calculated width of the zoomed target surface.
+\param dstheight Pointer to an integer to store the calculated height of the zoomed target surface.
+*/
+void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	/* Rounded target dimensions before the lower bound is applied */
+	int w, h;
+
+	/*
+	* Work with magnitudes only: the sign of a zoom factor
+	* has no influence on the calculated size
+	*/
+	zoomx = (zoomx < 0.0) ? -zoomx : zoomx;
+	zoomy = (zoomy < 0.0) ? -zoomy : zoomy;
+
+	/*
+	* Factors below VALUE_LIMIT are raised
+	* to VALUE_LIMIT
+	*/
+	zoomx = (zoomx < VALUE_LIMIT) ? VALUE_LIMIT : zoomx;
+	zoomy = (zoomy < VALUE_LIMIT) ? VALUE_LIMIT : zoomy;
+
+	/*
+	* Scale each dimension and round to the nearest
+	* integer via floor(value + 0.5)
+	*/
+	w = (int) floor(((double) width * zoomx) + 0.5);
+	h = (int) floor(((double) height * zoomy) + 0.5);
+
+	/*
+	* Hand back the result; a dimension is never
+	* reported smaller than 1
+	*/
+	*dstwidth = (w < 1) ? 1 : w;
+	*dstheight = (h < 1) ? 1 : h;
+}
+
+/*! 
+\brief Zoom a surface by independent horizontal and vertical factors with optional smoothing.
+
+Zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'zoomx' and 'zoomy' are scaling factors for width and height. If 'smooth' is on
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+If zoom factors are negative, the image is flipped on the axes.
+
+\param src The surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new, zoomed surface.
+*/
+SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx, flipy;
+	const SDL_PixelFormatDetails* details;
+	SDL_Palette* pal_src;
+	SDL_Palette* pal_dst = NULL;
+
+	/*
+	* Sanity check 
+	*/
+	if (src == NULL)
+		return (NULL);
+
+	/*
+	* Determine if source surface is 32bit or 8bit; fail if the format cannot be described 
+	*/
+	details = SDL_GetPixelFormatDetails(src->format);
+	if (details == NULL) {
+		return (NULL);
+	}
+	is32bit = (details->bits_per_pixel == 32);
+	if ((is32bit) || (details->bits_per_pixel == 8)) {
+		/* Use source surface 'as is' */
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering 
+		*/
+		rz_src = SDL_CreateSurface(src->w, src->h, SDL_PIXELFORMAT_RGBA32);
+		if (rz_src == NULL) {
+			return NULL;
+		}
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	flipx = (zoomx<0.0);
+	if (flipx) zoomx = -zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy = -zoomy;
+
+	/* Get size of target */
+	zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+	/*
+	* Alloc space to completely contain the zoomed surface 
+	*/
+	rz_dst = NULL;
+	if (is32bit) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering 
+		*/
+		rz_dst =
+			SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, rz_src->format);
+	} else {
+		/*
+		* Target surface is 8bit 
+		*/
+		rz_dst = SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, SDL_PIXELFORMAT_INDEX8);
+		pal_dst = SDL_CreateSurfacePalette(rz_dst);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		/* Cleanup temp surface */
+		if (src_converted) {
+			SDL_DestroySurface(rz_src);
+		}		
+		return NULL;
+	}
+
+	/* Adjust for guard rows */
+	rz_dst->h = dstheight;
+
+	/*
+	* Lock source surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_LockSurface(rz_src);
+	}
+
+	/*
+	* Check which kind of surface we have 
+	*/
+	if (is32bit) {
+		/*
+		* Call the 32bit transformation routine to do the zooming (using alpha) 
+		*/
+		_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+	} else {
+		/*
+		* Copy palette info; skipped when either surface has no palette attached 
+		*/
+		pal_src = SDL_GetSurfacePalette(rz_src);
+		if ((pal_src != NULL) && (pal_dst != NULL)) {
+			for (i = 0; i < pal_src->ncolors; i++) {
+				pal_dst->colors[i] = pal_src->colors[i];
+			}
+			pal_dst->ncolors = pal_src->ncolors;
+		}
+		/*
+		* Call the 8bit transformation routine to do the zooming 
+		*/
+		_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+	}
+	/*
+	* Unlock source surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_UnlockSurface(rz_src);
+	}
+
+	/*
+	* Cleanup temp surface 
+	*/
+	if (src_converted) {
+		SDL_DestroySurface(rz_src);
+	}
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
+
+/*! 
+\brief Shrink a surface by an integer ratio using averaging.
+
+Shrinks a 32bit or 8bit 'src' surface to a newly created 'dst' surface.
+'factorx' and 'factory' are the shrinking ratios (i.e. 2=1/2 the size,
+3=1/3 the size, etc.) The destination surface is antialiased by averaging
+the source box RGBA or Y information. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+The input surface is not modified. The output surface is newly allocated.
+
+\param src The surface to shrink.
+\param factorx The horizontal shrinking ratio.
+\param factory The vertical shrinking ratio.
+
+\return The new, shrunken surface.
+*/
+/*@null@*/ 
+SDL_Surface *shrinkSurface(SDL_Surface *src, int factorx, int factory)
+{
+	int result;
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst = NULL;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int haveError = 0;
+	const SDL_PixelFormatDetails* details;
+	SDL_Palette* pal_src;
+	SDL_Palette* pal_dst = NULL;
+
+	/*
+	* Sanity check; the shrink ratios are used as divisors below and must be positive 
+	*/
+	if ((src == NULL) || (factorx <= 0) || (factory <= 0)) {
+		return (NULL);
+	}
+
+	/*
+	* Determine if source surface is 32bit or 8bit; fail if the format cannot be described 
+	*/
+	details = SDL_GetPixelFormatDetails(src->format);
+	if (details == NULL) {
+		return (NULL);
+	}
+	is32bit = (details->bits_per_pixel == 32);
+	if ((is32bit) || (details->bits_per_pixel == 8)) {
+		/* Use source surface 'as is' */
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering 
+		*/
+		rz_src = SDL_CreateSurface(src->w, src->h, SDL_PIXELFORMAT_RGBA32);
+		if (rz_src==NULL) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/*
+	* Lock the surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		if (!SDL_LockSurface(rz_src)) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	}
+
+	/* Get size for target */
+	dstwidth=rz_src->w/factorx;
+	while (dstwidth*factorx>rz_src->w) { dstwidth--; }
+	dstheight=rz_src->h/factory;
+	while (dstheight*factory>rz_src->h) { dstheight--; }
+
+	/*
+	* Alloc space to completely contain the shrunken surface
+	* (with added guard rows)
+	*/
+	if (is32bit==1) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering 
+		*/
+		rz_dst =
+			SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, rz_src->format);
+	} else {
+		/*
+		* Target surface is 8bit 
+		*/
+		rz_dst = SDL_CreateSurface(dstwidth, dstheight + GUARD_ROWS, SDL_PIXELFORMAT_INDEX8);
+		pal_dst = SDL_CreateSurfacePalette(rz_dst);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		haveError = 1;
+		goto exitShrinkSurface;
+	}
+
+	/* Adjust for guard rows */
+	rz_dst->h = dstheight;
+
+	/*
+	* Check which kind of surface we have 
+	*/
+	if (is32bit==1) {
+		/*
+		* Call the 32bit transformation routine to do the shrinking (using alpha) 
+		*/
+		result = _shrinkSurfaceRGBA(rz_src, rz_dst, factorx, factory);		
+		if ((result!=0) || (rz_dst==NULL)) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	} else {
+		/* Copy palette info; skipped when either surface has no palette attached */
+		pal_src = SDL_GetSurfacePalette(rz_src);
+		if ((pal_src != NULL) && (pal_dst != NULL)) {
+			for (i = 0; i < pal_src->ncolors; i++) {
+				pal_dst->colors[i] = pal_src->colors[i];
+			}
+			pal_dst->ncolors = pal_src->ncolors;
+		}
+		/*
+		* Call the 8bit transformation routine to do the shrinking 
+		*/
+		result = _shrinkSurfaceY(rz_src, rz_dst, factorx, factory);
+		if (result!=0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	}
+
+exitShrinkSurface:
+	if (rz_src!=NULL) {
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+		/*
+		* Cleanup temp surface 
+		*/
+		if (src_converted==1) {
+			SDL_DestroySurface(rz_src);
+		}
+	}
+
+	/* Check error state; maybe need to cleanup destination */
+	if (haveError==1) {
+		if (rz_dst!=NULL) {
+			SDL_DestroySurface(rz_dst);
+		}
+		rz_dst=NULL;
+	} 
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
diff --git a/vendor/SDL3_gfx/SDL3_rotozoom.h b/vendor/SDL3_gfx/SDL3_rotozoom.h
new file mode 100644
index 0000000..015e119
--- /dev/null
+++ b/vendor/SDL3_gfx/SDL3_rotozoom.h
@@ -0,0 +1,123 @@
+/*  
+
+SDL3_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL3_rotozoom_h
+#define _SDL3_rotozoom_h
+
+#include <math.h>
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Fallback definition of pi, used only when M_PI is not already defined at this point */
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#include <SDL3/SDL.h>
+
+	/* ---- Defines */
+
+	/*!
+	\brief Disable anti-aliasing (no smoothing).
+	*/
+#define SMOOTHING_OFF		0
+
+	/*!
+	\brief Enable anti-aliasing (smoothing).
+	*/
+#define SMOOTHING_ON		1
+
+	/* ---- Function Prototypes */
+	/* SDL3_ROTOZOOM_SCOPE: dllexport/dllimport for MSVC DLL builds, plain 'extern' otherwise */
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL3_GFX_DLL_IMPORT)
+#    define SDL3_ROTOZOOM_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL3_GFX_DLL_IMPORT
+#      define SDL3_ROTOZOOM_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL3_ROTOZOOM_SCOPE
+#  define SDL3_ROTOZOOM_SCOPE extern
+#endif
+
+	/* 
+	Rotozoom functions
+	Rotate (angle in degrees) and zoom 'src' into a newly created surface;
+	the ...Size variants only calculate the dimensions of that surface.
+	*/
+
+	SDL3_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth);
+
+	SDL3_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurfaceXY
+		(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth);
+
+
+	SDL3_ROTOZOOM_SCOPE void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth,
+		int *dstheight);
+
+	SDL3_ROTOZOOM_SCOPE void rotozoomSurfaceSizeXY
+		(int width, int height, double angle, double zoomx, double zoomy, 
+		int *dstwidth, int *dstheight);
+
+	/* 
+	Zooming functions
+	Scale 'src' into a newly created surface; negative factors flip the
+	image on that axis. zoomSurfaceSize only calculates the target size.
+	*/
+
+	SDL3_ROTOZOOM_SCOPE SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth);
+
+	SDL3_ROTOZOOM_SCOPE void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight);
+
+	/* 
+	Shrinking functions
+	Reduce 'src' by integer ratios (2 = 1/2 the size, 3 = 1/3 the size)
+	into a newly allocated surface; the input surface is not modified.
+	*/     
+
+	SDL3_ROTOZOOM_SCOPE SDL_Surface *shrinkSurface(SDL_Surface * src, int factorx, int factory);
+
+	/* 
+	Specialized rotation functions
+	Rotate 'src' in 90 degree steps; 'numClockwiseTurns' presumably counts
+	the clockwise quarter turns to apply (confirm against the .c file).
+	*/
+
+	SDL3_ROTOZOOM_SCOPE SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL3_rotozoom_h */
diff --git a/vendor/clay/clay_renderer_SDL3.c b/vendor/clay/clay_renderer_SDL3.c
index 3739726..5ccec48 100644
--- a/vendor/clay/clay_renderer_SDL3.c
+++ b/vendor/clay/clay_renderer_SDL3.c
@@ -1,247 +1,37 @@
-#include "clay.h"
-#include <SDL3/SDL_main.h>
-#include <SDL3/SDL.h>
-#include <SDL3_ttf/SDL_ttf.h>
-#include <SDL3_image/SDL_image.h>
+#include "clay_renderer_SDL3.h"
 
-typedef struct {
-    SDL_Renderer *renderer;
-    TTF_TextEngine *textEngine;
-    TTF_Font **fonts;
-} Clay_SDL3RendererData;
-
-/* Global for convenience. Even in 4K this is enough for smooth curves (low radius or rect size coupled with
- * no AA or low resolution might make it appear as jagged curves) */
-static int NUM_CIRCLE_SEGMENTS = 16;
-
-//all rendering is performed by a single SDL call, avoiding multiple RenderRect + plumbing choice for circles.
-static void SDL_Clay_RenderFillRoundedRect(Clay_SDL3RendererData *rendererData, const SDL_FRect rect, const float cornerRadius, const Clay_Color _color) {
-    const SDL_FColor color = { _color.r/255, _color.g/255, _color.b/255, _color.a/255 };
-
-    int indexCount = 0, vertexCount = 0;
-
-    const float minRadius = SDL_min(rect.w, rect.h) / 2.0f;
-    const float clampedRadius = SDL_min(cornerRadius, minRadius);
-
-    const int numCircleSegments = SDL_max(NUM_CIRCLE_SEGMENTS, (int) clampedRadius * 0.5f);
-
-    int totalVertices = 4 + (4 * (numCircleSegments * 2)) + 2*4;
-    int totalIndices = 6 + (4 * (numCircleSegments * 3)) + 6*4;
-
-    SDL_Vertex vertices[totalVertices];
-    int indices[totalIndices];
-
-    //define center rectangle
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + clampedRadius, rect.y + clampedRadius}, color, {0, 0} }; //0 center TL
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w - clampedRadius, rect.y + clampedRadius}, color, {1, 0} }; //1 center TR
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w - clampedRadius, rect.y + rect.h - clampedRadius}, color, {1, 1} }; //2 center BR
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + clampedRadius, rect.y + rect.h - clampedRadius}, color, {0, 1} }; //3 center BL
-
-    indices[indexCount++] = 0;
-    indices[indexCount++] = 1;
-    indices[indexCount++] = 3;
-    indices[indexCount++] = 1;
-    indices[indexCount++] = 2;
-    indices[indexCount++] = 3;
-
-    //define rounded corners as triangle fans
-    const float step = (SDL_PI_F/2) / numCircleSegments;
-    for (int i = 0; i < numCircleSegments; i++) {
-        const float angle1 = (float)i * step;
-        const float angle2 = ((float)i + 1.0f) * step;
-
-        for (int j = 0; j < 4; j++) {  // Iterate over four corners
-            float cx, cy, signX, signY;
-
-            switch (j) {
-                case 0: cx = rect.x + clampedRadius; cy = rect.y + clampedRadius; signX = -1; signY = -1; break; // Top-left
-                case 1: cx = rect.x + rect.w - clampedRadius; cy = rect.y + clampedRadius; signX = 1; signY = -1; break; // Top-right
-                case 2: cx = rect.x + rect.w - clampedRadius; cy = rect.y + rect.h - clampedRadius; signX = 1; signY = 1; break; // Bottom-right
-                case 3: cx = rect.x + clampedRadius; cy = rect.y + rect.h - clampedRadius; signX = -1; signY = 1; break; // Bottom-left
-                default: return;
-            }
-
-            vertices[vertexCount++] = (SDL_Vertex){ {cx + SDL_cosf(angle1) * clampedRadius * signX, cy + SDL_sinf(angle1) * clampedRadius * signY}, color, {0, 0} };
-            vertices[vertexCount++] = (SDL_Vertex){ {cx + SDL_cosf(angle2) * clampedRadius * signX, cy + SDL_sinf(angle2) * clampedRadius * signY}, color, {0, 0} };
-
-            indices[indexCount++] = j;  // Connect to corresponding central rectangle vertex
-            indices[indexCount++] = vertexCount - 2;
-            indices[indexCount++] = vertexCount - 1;
-        }
-    }
-
-    //Define edge rectangles
-    // Top edge
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + clampedRadius, rect.y}, color, {0, 0} }; //TL
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w - clampedRadius, rect.y}, color, {1, 0} }; //TR
-
-    indices[indexCount++] = 0;
-    indices[indexCount++] = vertexCount - 2; //TL
-    indices[indexCount++] = vertexCount - 1; //TR
-    indices[indexCount++] = 1;
-    indices[indexCount++] = 0;
-    indices[indexCount++] = vertexCount - 1; //TR
-    // Right edge
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w, rect.y + clampedRadius}, color, {1, 0} }; //RT
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w, rect.y + rect.h - clampedRadius}, color, {1, 1} }; //RB
-
-    indices[indexCount++] = 1;
-    indices[indexCount++] = vertexCount - 2; //RT
-    indices[indexCount++] = vertexCount - 1; //RB
-    indices[indexCount++] = 2;
-    indices[indexCount++] = 1;
-    indices[indexCount++] = vertexCount - 1; //RB
-    // Bottom edge
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + rect.w - clampedRadius, rect.y + rect.h}, color, {1, 1} }; //BR
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x + clampedRadius, rect.y + rect.h}, color, {0, 1} }; //BL
-
-    indices[indexCount++] = 2;
-    indices[indexCount++] = vertexCount - 2; //BR
-    indices[indexCount++] = vertexCount - 1; //BL
-    indices[indexCount++] = 3;
-    indices[indexCount++] = 2;
-    indices[indexCount++] = vertexCount - 1; //BL
-    // Left edge
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x, rect.y + rect.h - clampedRadius}, color, {0, 1} }; //LB
-    vertices[vertexCount++] = (SDL_Vertex){ {rect.x, rect.y + clampedRadius}, color, {0, 0} }; //LT
-
-    indices[indexCount++] = 3;
-    indices[indexCount++] = vertexCount - 2; //LB
-    indices[indexCount++] = vertexCount - 1; //LT
-    indices[indexCount++] = 0;
-    indices[indexCount++] = 3;
-    indices[indexCount++] = vertexCount - 1; //LT
-
-    // Render everything
-    SDL_RenderGeometry(rendererData->renderer, NULL, vertices, vertexCount, indices, indexCount);
-}
-
-static void SDL_Clay_RenderArc(Clay_SDL3RendererData *rendererData, const SDL_FPoint center, const float radius, const float startAngle, const float endAngle, const float thickness, const Clay_Color color) {
-    SDL_SetRenderDrawColor(rendererData->renderer, color.r, color.g, color.b, color.a);
-
-    const float radStart = startAngle * (SDL_PI_F / 180.0f);
-    const float radEnd = endAngle * (SDL_PI_F / 180.0f);
-
-    const int numCircleSegments = SDL_max(NUM_CIRCLE_SEGMENTS, (int)(radius * 1.5f)); //increase circle segments for larger circles, 1.5 is arbitrary.
-
-    const float angleStep = (radEnd - radStart) / (float)numCircleSegments;
-    const float thicknessStep = 0.4f; //arbitrary value to avoid overlapping lines. Changing THICKNESS_STEP or numCircleSegments might cause artifacts.
-
-    for (float t = thicknessStep; t < thickness - thicknessStep; t += thicknessStep) {
-        SDL_FPoint points[numCircleSegments + 1];
-        const float clampedRadius = SDL_max(radius - t, 1.0f);
-
-        for (int i = 0; i <= numCircleSegments; i++) {
-            const float angle = radStart + i * angleStep;
-            points[i] = (SDL_FPoint){
-                    SDL_roundf(center.x + SDL_cosf(angle) * clampedRadius),
-                    SDL_roundf(center.y + SDL_sinf(angle) * clampedRadius) };
-        }
-        SDL_RenderLines(rendererData->renderer, points, numCircleSegments + 1);
-    }
-}
-
-SDL_Rect currentClippingRectangle;
-
-static void SDL_Clay_RenderClayCommands(Clay_SDL3RendererData *rendererData, Clay_RenderCommandArray *rcommands)
-{
+void SDL_Clay_RenderClayCommands(Clay_SDL3RendererData *rendererData, Clay_RenderCommandArray *rcommands) {
     for (size_t i = 0; i < rcommands->length; i++) {
         Clay_RenderCommand *rcmd = Clay_RenderCommandArray_Get(rcommands, i);
         const Clay_BoundingBox bounding_box = rcmd->boundingBox;
-        const SDL_FRect rect = { (int)bounding_box.x, (int)bounding_box.y, (int)bounding_box.width, (int)bounding_box.height };
 
         switch (rcmd->commandType) {
             case CLAY_RENDER_COMMAND_TYPE_RECTANGLE: {
                 Clay_RectangleRenderData *config = &rcmd->renderData.rectangle;
-                SDL_SetRenderDrawBlendMode(rendererData->renderer, SDL_BLENDMODE_BLEND);
-                SDL_SetRenderDrawColor(rendererData->renderer, config->backgroundColor.r, config->backgroundColor.g, config->backgroundColor.b, config->backgroundColor.a);
-                if (config->cornerRadius.topLeft > 0) {
-                    SDL_Clay_RenderFillRoundedRect(rendererData, rect, config->cornerRadius.topLeft, config->backgroundColor);
-                } else {
-                    SDL_RenderFillRect(rendererData->renderer, &rect);
-                }
+                roundedBoxRGBA(rendererData->renderer, bounding_box.x, bounding_box.y,
+                    bounding_box.x + bounding_box.width, bounding_box.y + bounding_box.height, config->cornerRadius.topLeft,
+                    config->backgroundColor.r, config->backgroundColor.g, config->backgroundColor.b, config->backgroundColor.a);
             } break;
             case CLAY_RENDER_COMMAND_TYPE_TEXT: {
                 Clay_TextRenderData *config = &rcmd->renderData.text;
                 TTF_Font *font = rendererData->fonts[config->fontId];
                 TTF_Text *text = TTF_CreateText(rendererData->textEngine, font, config->stringContents.chars, config->stringContents.length);
                 TTF_SetTextColor(text, config->textColor.r, config->textColor.g, config->textColor.b, config->textColor.a);
-                TTF_DrawRendererText(text, rect.x, rect.y);
+                TTF_DrawRendererText(text, bounding_box.x, bounding_box.y);
                 TTF_DestroyText(text);
             } break;
             case CLAY_RENDER_COMMAND_TYPE_BORDER: {
                 Clay_BorderRenderData *config = &rcmd->renderData.border;
-
-                const float minRadius = SDL_min(rect.w, rect.h) / 2.0f;
-                const Clay_CornerRadius clampedRadii = {
-                    .topLeft = SDL_min(config->cornerRadius.topLeft, minRadius),
-                    .topRight = SDL_min(config->cornerRadius.topRight, minRadius),
-                    .bottomLeft = SDL_min(config->cornerRadius.bottomLeft, minRadius),
-                    .bottomRight = SDL_min(config->cornerRadius.bottomRight, minRadius)
-                };
-                //edges
-                SDL_SetRenderDrawColor(rendererData->renderer, config->color.r, config->color.g, config->color.b, config->color.a);
-                if (config->width.left > 0) {
-                    const float starting_y = rect.y + clampedRadii.topLeft;
-                    const float length = rect.h - clampedRadii.topLeft - clampedRadii.bottomLeft;
-                    SDL_FRect line = { rect.x, starting_y, config->width.left, length };
-                    SDL_RenderFillRect(rendererData->renderer, &line);
-                }
-                if (config->width.right > 0) {
-                    const float starting_x = rect.x + rect.w - (float)config->width.right;
-                    const float starting_y = rect.y + clampedRadii.topRight;
-                    const float length = rect.h - clampedRadii.topRight - clampedRadii.bottomRight;
-                    SDL_FRect line = { starting_x, starting_y, config->width.right, length };
-                    SDL_RenderFillRect(rendererData->renderer, &line);
-                }
-                if (config->width.top > 0) {
-                    const float starting_x = rect.x + clampedRadii.topLeft;
-                    const float length = rect.w - clampedRadii.topLeft - clampedRadii.topRight;
-                    SDL_FRect line = { starting_x, rect.y, length, config->width.top };
-                    SDL_RenderFillRect(rendererData->renderer, &line);
-                }
-                if (config->width.bottom > 0) {
-                    const float starting_x = rect.x + clampedRadii.bottomLeft;
-                    const float starting_y = rect.y + rect.h - (float)config->width.bottom;
-                    const float length = rect.w - clampedRadii.bottomLeft - clampedRadii.bottomRight;
-                    SDL_FRect line = { starting_x, starting_y, length, config->width.bottom };
-                    SDL_SetRenderDrawColor(rendererData->renderer, config->color.r, config->color.g, config->color.b, config->color.a);
-                    SDL_RenderFillRect(rendererData->renderer, &line);
-                }
-                //corners
-                if (config->cornerRadius.topLeft > 0) {
-                    const float centerX = rect.x + clampedRadii.topLeft -1;
-                    const float centerY = rect.y + clampedRadii.topLeft;
-                    SDL_Clay_RenderArc(rendererData, (SDL_FPoint){centerX, centerY}, clampedRadii.topLeft,
-                        180.0f, 270.0f, config->width.top, config->color);
-                }
-                if (config->cornerRadius.topRight > 0) {
-                    const float centerX = rect.x + rect.w - clampedRadii.topRight -1;
-                    const float centerY = rect.y + clampedRadii.topRight;
-                    SDL_Clay_RenderArc(rendererData, (SDL_FPoint){centerX, centerY}, clampedRadii.topRight,
-                        270.0f, 360.0f, config->width.top, config->color);
-                }
-                if (config->cornerRadius.bottomLeft > 0) {
-                    const float centerX = rect.x + clampedRadii.bottomLeft -1;
-                    const float centerY = rect.y + rect.h - clampedRadii.bottomLeft -1;
-                    SDL_Clay_RenderArc(rendererData, (SDL_FPoint){centerX, centerY}, clampedRadii.bottomLeft,
-                        90.0f, 180.0f, config->width.bottom, config->color);
-                }
-                if (config->cornerRadius.bottomRight > 0) {
-                    const float centerX = rect.x + rect.w - clampedRadii.bottomRight -1; //TODO: why need to -1 in all calculations???
-                    const float centerY = rect.y + rect.h - clampedRadii.bottomRight -1;
-                    SDL_Clay_RenderArc(rendererData, (SDL_FPoint){centerX, centerY}, clampedRadii.bottomRight,
-                        0.0f, 90.0f, config->width.bottom, config->color);
-                }
-
+                roundedRectangleRGBA(rendererData->renderer, bounding_box.x, bounding_box.y,
+                    bounding_box.x + bounding_box.width, bounding_box.y + bounding_box.height, config->cornerRadius.topLeft,
+                    config->color.r, config->color.g, config->color.b, config->color.a);
             } break;
             case CLAY_RENDER_COMMAND_TYPE_SCISSOR_START: {
-                Clay_BoundingBox boundingBox = rcmd->boundingBox;
-                currentClippingRectangle = (SDL_Rect) {
-                        .x = boundingBox.x,
-                        .y = boundingBox.y,
-                        .w = boundingBox.width,
-                        .h = boundingBox.height,
+                const SDL_Rect currentClippingRectangle = (SDL_Rect) {
+                        .x = bounding_box.x,
+                        .y = bounding_box.y,
+                        .w = bounding_box.width,
+                        .h = bounding_box.height,
                 };
                 SDL_SetRenderClipRect(rendererData->renderer, &currentClippingRectangle);
                 break;
@@ -253,8 +43,12 @@ static void SDL_Clay_RenderClayCommands(Clay_SDL3RendererData *rendererData, Cla
             case CLAY_RENDER_COMMAND_TYPE_IMAGE: {
                 SDL_Surface *image = (SDL_Surface *)rcmd->renderData.image.imageData;
                 SDL_Texture *texture = SDL_CreateTextureFromSurface(rendererData->renderer, image);
-                const SDL_FRect dest = { rect.x, rect.y, rect.w, rect.h };
-
+                const SDL_FRect dest = (SDL_FRect) {
+                        .x = bounding_box.x,
+                        .y = bounding_box.y,
+                        .w = bounding_box.width,
+                        .h = bounding_box.height,
+                };
                 SDL_RenderTexture(rendererData->renderer, texture, NULL, &dest);
                 SDL_DestroyTexture(texture);
                 break;
diff --git a/vendor/clay/clay_renderer_SDL3.h b/vendor/clay/clay_renderer_SDL3.h
new file mode 100644
index 0000000..8bb5366
--- /dev/null
+++ b/vendor/clay/clay_renderer_SDL3.h
@@ -0,0 +1,20 @@
+#ifndef _clay_renderer_SDL3_h
+#define _clay_renderer_SDL3_h
+
+#include <SDL3/SDL_main.h> /* NOTE(review): SDL documents SDL_main.h as a header to include once, in the file that defines main(); this header is pulled into main.c and presumably clay_renderer_SDL3.c too - confirm it is needed here. */
+#include <SDL3/SDL.h>
+#include <SDL3_ttf/SDL_ttf.h>
+#include <SDL3_image/SDL_image.h>
+#include <SDL3_gfx/SDL3_gfxPrimitives.h>
+
+#include "clay.h"
+
+typedef struct { /* Handles the SDL3 Clay renderer reads while drawing render commands. */
+    SDL_Renderer *renderer; /* Renderer used for the clip-rect, texture and border draw calls. */
+    TTF_TextEngine *textEngine; /* Passed to TTF_CreateText() when text is drawn. */
+    TTF_Font **fonts; /* Font table; indexed with the text render data's fontId. */
+} Clay_SDL3RendererData;
+
+void SDL_Clay_RenderClayCommands(Clay_SDL3RendererData *rendererData, Clay_RenderCommandArray *rcommands); /* Draws the Clay render commands in rcommands (text, borders, scissor clips, images, ...) using rendererData. */
+
+#endif