diff --git a/src/driver/video/doublebuffer.pas b/src/driver/video/doublebuffer.pas
index 2fa942bd..dfff1206 100644
--- a/src/driver/video/doublebuffer.pas
+++ b/src/driver/video/doublebuffer.pas
@@ -22,7 +22,7 @@ unit doublebuffer;
 interface
 
 uses
-    lmemorymanager, tracer, videotypes;
+    lmemorymanager, tracer, videotypes, serial, util;
 
 //Init the driver, and register with the video interface in a state ready for execution.
 procedure init(Register : FRegisterDriver);
@@ -54,21 +54,40 @@ end;
 
+//Copy the whole back buffer into the front buffer as a linear run of 64-bit words.
+//  FrontBuffer : destination buffer descriptor (its Location is written to).
+//  BackBuffer  : source buffer descriptor; Width * Height * BitsPerPixel gives the copy length.
+//Does nothing if BackBuffer is not initialized, if FrontBuffer is wider or taller than
+//BackBuffer, or if BackBuffer holds less than one whole 64-bit word.
 procedure Flush(FrontBuffer : PVideoBuffer; BackBuffer : PVideoBuffer);
 var
-    X,Y : uint32;
+    idx : uint32;
     Back,Front : PuInt64;
+    WordCount : uint32;
+
+const
+    COPY_WIDTH = 64; //Bits moved per loop iteration (one uint64).
 
 begin
-    tracer.push_trace('doublebuffer.Flush.enter');
+    //tracer.push_trace('doublebuffer.Flush.enter');
     if not(BackBuffer^.Initialized) then exit;
+    //NOTE(review): this lets through a FrontBuffer SMALLER than BackBuffer, yet the loop below
+    //writes BackBuffer's full size into Front - confirm the comparison is not inverted.
     if ((FrontBuffer^.Width > BackBuffer^.Width) or (FrontBuffer^.Height > BackBuffer^.Height)) then exit;
     Back:= PUint64(BackBuffer^.Location);
     Front:= PuInt64(FrontBuffer^.Location);
-    for X:=0 to (BackBuffer^.Width-1) div 2 do begin
-        for Y:=0 to (BackBuffer^.Height-1) div 2 do begin
-            Front[(Y * BackBuffer^.Width) + X]:= Back[(Y * BackBuffer^.Width) + X];
-        end;
+    //Whole 64-bit words in the back buffer; a remainder of fewer than 64 bits is not copied.
+    WordCount:= (BackBuffer^.Width * BackBuffer^.Height * BackBuffer^.BitsPerPixel) div COPY_WIDTH;
+    //Bail out on an empty buffer: 'WordCount - 1' would otherwise underflow the unsigned
+    //loop bound and the copy would run far past the end of both buffers.
+    if WordCount = 0 then exit;
+    for idx:=0 to WordCount - 1 do begin
+        Front[idx]:= Back[idx];
+        // -- TODO: Get SSE working here for 128bit copies (and halve the iteration count) --
+        // __SSE_128_memcpy takes (source, dest), so Back goes first:
+        // __SSE_128_memcpy(uint32(Back), uint32(Front));
+        // Front:= PUint64(uint32(Front) + 16);
+        // Back:= PUint64(uint32(Back) + 16);
     end;
-    tracer.push_trace('doublebuffer.Flush.exit');
+    //tracer.push_trace('doublebuffer.Flush.exit');
 end;
 
 function enable(VideoInterface : PVideoInterface) : boolean;
diff --git a/src/driver/video/video.pas b/src/driver/video/video.pas
index 62fdce87..1c9b6f46 100644
--- a/src/driver/video/video.pas
+++ b/src/driver/video/video.pas
@@ -51,19 +51,31 @@ end;
 
+//Copy the whole back buffer into the front buffer as a linear run of 32-bit words.
+//  FrontBuffer : destination buffer descriptor (its Location is written to).
+//  BackBuffer  : source buffer descriptor; Width * Height * BitsPerPixel gives the copy length.
+//Does nothing if either buffer is not initialized, if BackBuffer is wider or taller than
+//FrontBuffer, or if BackBuffer holds less than one whole 32-bit word.
 Procedure basicFFlush(FrontBuffer : PVideoBuffer; BackBuffer : PVideoBuffer);
 var
-    x, y : uint32;
+    idx : uint32;
     Back,Front : puint32;
+    WordCount : uint32;
+
+const
+    COPY_WIDTH = 32; //Bits moved per loop iteration (one uint32).
 
 begin
-    tracer.push_trace('video.basicFFlush.enter');
+    //tracer.push_trace('video.basicFFlush.enter');
     If not(FrontBuffer^.Initialized and BackBuffer^.Initialized) then exit;
     if (BackBuffer^.Width > FrontBuffer^.Width) or (BackBuffer^.Height > FrontBuffer^.Height) then exit;
     Back:= puint32(BackBuffer^.Location);
     Front:= puint32(FrontBuffer^.Location);
-    for x:=0 to BackBuffer^.Width-1 do begin
-        for y:=0 to BackBuffer^.Height-1 do begin
-            Front[(Y * BackBuffer^.Width) + X]:= Back[(Y * BackBuffer^.Width) + X];
-        end;
+    //Whole 32-bit words in the back buffer; a remainder of fewer than 32 bits is not copied.
+    WordCount:= (BackBuffer^.Width * BackBuffer^.Height * BackBuffer^.BitsPerPixel) div COPY_WIDTH;
+    //Bail out on an empty buffer: 'WordCount - 1' would otherwise underflow the unsigned
+    //loop bound and the copy would run far past the end of both buffers.
+    if WordCount = 0 then exit;
+    for idx:=0 to WordCount - 1 do begin
+        Front[idx]:= Back[idx];
     end;
 end;
 
diff --git a/src/include/util.pas b/src/include/util.pas
index 39093f8f..ac648a11 100644
--- a/src/include/util.pas
+++ b/src/include/util.pas
@@ -49,6 +49,7 @@ procedure io_wait;
 
 procedure memset(location : uint32; value : uint8; size : uint32);
 procedure memcpy(source : uint32; dest : uint32; size : uint32);
+procedure __SSE_128_memcpy(source : uint32; dest : uint32);
 
 procedure printmemory(source : uint32; length : uint32; col : uint32; delim : PChar; offset_row : boolean);
 procedure printmemoryWND(source : uint32; length : uint32; col : uint32; delim : PChar; offset_row : boolean; WND : HWND);
@@ -127,6 +128,14 @@ begin
     div6432:= (r0 SHL 32) OR r4;
 end;
 
+procedure __SSE_128_memcpy(source : uint32; dest : uint32); assembler; //Copy 16 bytes from address 'source' to address 'dest' through XMM0.
+asm
+    MOV EAX, Source //EAX <- source address
+    MOV ECX, Dest //ECX <- destination address
+    MOVUPS XMM0, [EAX] //Load 128 bits; MOVUPS places no alignment requirement on the address.
+    MOVUPS [ECX], XMM0 //Store the same 128 bits at the destination.
+end; //Overwrites EAX, ECX and XMM0. NOTE(review): MOVUPS needs SSE enabled by the kernel (CR4.OSFXSR) - confirm before calling.
+
 function switchendian16(b : uint16) : uint16;
 begin
     switchendian16:= ((b AND $FF00) SHR 8) OR ((b AND $00FF) SHL 8);