21 files changed, 2980 insertions, 0 deletions
diff --git a/mpeglib/lib/util/render/dither/Makefile.am b/mpeglib/lib/util/render/dither/Makefile.am
new file mode 100644
index 00000000..166d5ca3
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/Makefile.am
@@ -0,0 +1,40 @@
+#  libdither - Makefile.am
+
+INCLUDES                =       $(all_includes)
+
+EXTRA_DIST		=	ditherDef.h ditherMMX.h \
+				ditherer_mmx16.cpp dither32mmx.cpp
+
+noinst_LTLIBRARIES	=	libdither.la
+
+noinst_HEADERS		=	ditherWrapper.h \
+				dither8Bit.h colorTable8Bit.h \
+				colorTableHighBit.h dither16Bit.h \
+				dither32Bit.h ditherRGB_flipped.h \
+				ditherRGB.h
+
+libdither_la_SOURCES	=	ditherWrapper.cpp \
+				dither8Bit.cpp \
+				colorTable8Bit.cpp colorTableHighBit.cpp \
+				dither16Bit.cpp dither32Bit.cpp \
+				ditherRGB_flipped.cpp ditherRGB.cpp \
+				ditherer_mmx16.cpp dither32mmx.cpp
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/mpeglib/lib/util/render/dither/colorTable8Bit.cpp b/mpeglib/lib/util/render/dither/colorTable8Bit.cpp
new file mode 100644
index 00000000..57c533de
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/colorTable8Bit.cpp
@@ -0,0 +1,147 @@
+/*
+  colorTables for 8 Bit depth
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "colorTable8Bit.h"
+
+
+ColorTable8Bit::ColorTable8Bit() {
+  // Quantisation arrays, one entry per luminance / chroma level.
+  this->lum_values = new int[LUM_RANGE];
+  this->cr_values  = new int[CR_RANGE];
+  this->cb_values  = new int[CB_RANGE];
+
+  // The five conversion tables share one contiguous allocation of
+  // 5*256 entries (we can exploit cache by allocating contiguous blocks).
+  TABTYPE* const base = new TABTYPE[5*256];
+  this->colortab = base;
+  this->Cr_r_tab = base;
+  this->Cr_g_tab = base + 256;
+  this->Cb_g_tab = base + 2*256;
+  this->Cb_b_tab = base + 3*256;
+  this->L_tab    = base + 4*256;
+
+  // Populate the arrays and tables allocated above.
+  init8BitColor();
+}
+
+
+ColorTable8Bit::~ColorTable8Bit() {   // releases every array the constructor allocated
+  delete [] lum_values;   // all four were created with new[], so the
+  delete [] cr_values;    // array form of delete is required; plain
+  delete [] cb_values;    // delete on them is undefined behaviour
+  delete [] colortab;     // single block backing L_tab and the Cr/Cb tables
+}
+
+
+void ColorTable8Bit::init8BitColor() {
+  // Fills the quantised luminance/chroma value arrays and the matching
+  // YCrCb->RGB contribution tables.  gammaCorrectFlag and
+  // chromaCorrectFlag decide whether the correction macros are applied.
+  int i;
+
+  // Luminance: centre of each of the LUM_RANGE quantisation steps.
+  // Gamma correction is applied to L_tab only, not to lum_values.
+  for (i=0; i<LUM_RANGE; i++) {
+    lum_values[i]  = ((i * 256) / (LUM_RANGE)) + (256/(LUM_RANGE*2));
+    L_tab[i] = lum_values[i];
+    if (gammaCorrectFlag) {
+      L_tab[i] = GAMMA_CORRECTION(L_tab[i]);
+    }
+  }
+
+  // Cr: the level is computed once, still with integer arithmetic, and
+  // shared by both branches.  The "register" specifier was dropped: it
+  // is deprecated since C++11 and no longer valid in C++17.
+  for (i=0; i<CR_RANGE; i++) {
+    double tmp = ((i * 256) / (CR_RANGE)) + (256/(CR_RANGE*2));
+    if (chromaCorrectFlag) {
+      Cr_r_tab[i]=(TABTYPE) ((0.419/0.299)*CHROMA_CORRECTION128D(tmp-128.0));
+      Cr_g_tab[i]=(TABTYPE) (-(0.299/0.419)*CHROMA_CORRECTION128D(tmp-128.0));
+      cr_values[i] = CHROMA_CORRECTION256(tmp);
+    } else {
+      Cr_r_tab[i] = (TABTYPE)  ((0.419/0.299) * (tmp - 128.0));
+      Cr_g_tab[i] = (TABTYPE) (-(0.299/0.419) * (tmp - 128.0));
+      cr_values[i] = (int) tmp;
+    }
+  }
+
+  // Cb: same scheme as Cr, with the Cb coefficients.
+  for (i=0; i<CB_RANGE; i++) {
+    double tmp = ((i * 256) / (CB_RANGE)) + (256/(CB_RANGE*2));
+    if (chromaCorrectFlag) {
+      Cb_g_tab[i]=(TABTYPE) (-(0.114/0.331)*CHROMA_CORRECTION128D(tmp-128.0));
+      Cb_b_tab[i]=(TABTYPE) ((0.587/0.331)*CHROMA_CORRECTION128D(tmp-128.0));
+      cb_values[i] = CHROMA_CORRECTION256(tmp);
+    } else {
+      Cb_g_tab[i] = (TABTYPE) (-(0.114/0.331) * (tmp - 128.0));
+      Cb_b_tab[i] = (TABTYPE) ((0.587/0.331) * (tmp - 128.0));
+      cb_values[i] = (int) tmp;
+    }
+  }
+}
+
+
+
+/*
+ *--------------------------------------------------------------
+ *
+ * ConvertColor --
+ *
+ *      Given a l, cr, cb tuple, converts it to r,g,b.
+ *
+ * Results:
+ *      r,g,b values returned in pointers passed as parameters.
+ *
+ * Side effects:
+ *      None.
+ *
+ *--------------------------------------------------------------
+ */
+void ColorTable8Bit::ConvertColor(unsigned int l, unsigned int cr, 
+				  unsigned int cb, unsigned char* r, 
+				  unsigned char* g, unsigned char* b) {
+
+  // l, cr and cb are used directly as indices into L_tab, Cr_*_tab
+  // and Cb_*_tab; the clamped result is written through r, g and b.
+  // Nothing else is modified.
+
+  /*
+   * Old method w/o lookup table, kept for reference:
+   *
+   * fl = 1.164*(((double) l)-16.0);
+   * fcr =  ((double) cr) - 128.0;
+   * fcb =  ((double) cb) - 128.0;
+   *
+   * fr = fl + (1.366 * fcr);
+   * fg = fl - (0.700 * fcr) - (0.334 * fcb);
+   * fb = fl + (1.732 * fcb);
+   */
+
+  const double luma = L_tab[l];
+
+  // Luminance plus the chroma contribution of each channel.
+  double red   = luma + Cr_r_tab[cr];
+  double green = luma + Cr_g_tab[cr] + Cb_g_tab[cb];
+  double blue  = luma + Cb_b_tab[cb];
+
+  // Clamp every channel to 0..255 before narrowing.
+  red   = (red   < 0.0) ? 0.0 : ((red   > 255.0) ? 255.0 : red);
+  green = (green < 0.0) ? 0.0 : ((green > 255.0) ? 255.0 : green);
+  blue  = (blue  < 0.0) ? 0.0 : ((blue  > 255.0) ? 255.0 : blue);
+
+  // The conversion to unsigned char drops the fractional part.
+  *r = (unsigned char) red;
+  *g = (unsigned char) green;
+  *b = (unsigned char) blue;
+
+}
diff --git a/mpeglib/lib/util/render/dither/colorTable8Bit.h b/mpeglib/lib/util/render/dither/colorTable8Bit.h
new file mode 100644
index 00000000..6d873d1d
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/colorTable8Bit.h
@@ -0,0 +1,57 @@
+/*
+  colorTables for 8 Bit depth
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#ifndef __COLORTABLE8BIT_H
+#define __COLORTABLE8BIT_H
+
+#include "ditherDef.h"
+
+
+class ColorTable8Bit {
+  // Arrays holding quantized values for lum, cr, and cb (used for
+  // 8 Bit); each is allocated with new[] in the constructor.
+  int* lum_values;
+  int* cr_values;
+  int* cb_values;
+
+  // Conversion tables; all five point into the single colortab block.
+  TABTYPE *L_tab;
+  TABTYPE *Cr_r_tab;
+  TABTYPE *Cr_g_tab;
+  TABTYPE *Cb_g_tab;
+  TABTYPE *Cb_b_tab;
+  TABTYPE *colortab;
+
+  // The object owns raw arrays that the destructor frees, so a shallow
+  // copy would release them twice.  Copying is therefore disabled
+  // (declared private, intentionally left undefined).
+  ColorTable8Bit(const ColorTable8Bit&);
+  ColorTable8Bit& operator=(const ColorTable8Bit&);
+
+ public:
+  ColorTable8Bit();
+  ~ColorTable8Bit();
+
+  // Accessors hand out the internal arrays; ownership stays with this object.
+  inline int* getLumValues() { return lum_values; }
+  inline int* getCrValues()  { return cr_values; }
+  inline int* getCbValues() { return cb_values; }
+  
+  // Converts one (l, cr, cb) triple to r, g, b via the lookup tables.
+  void ConvertColor(unsigned int l, unsigned int cr, unsigned int cb,
+		    unsigned char* r, unsigned char* g, unsigned char* b);
+
+ private:
+  void init8BitColor();   // fills the arrays and tables; called by the constructor
+};
+#endif
diff --git a/mpeglib/lib/util/render/dither/colorTableHighBit.cpp b/mpeglib/lib/util/render/dither/colorTableHighBit.cpp
new file mode 100644
index 00000000..171f4e97
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/colorTableHighBit.cpp
@@ -0,0 +1,248 @@
+/*
+  colorTables for 16,32 Bit depth
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+
+#include "colorTableHighBit.h"
+
+//#define INTERPOLATE
+
+
+/*
+ * Erik Corry's multi-byte dither routines.
+ *
+ * The basic idea is that the Init generates all the necessary tables.
+ * The tables incorporate the information about the layout of pixels
+ * in the XImage, so that it should be able to cope with 15-bit, 16-bit
+ * 24-bit (non-packed) and 32-bit (10-11 bits per color!) screens.
+ * At present it cannot cope with 24-bit packed mode, since this involves
+ * getting down to byte level again. It is assumed that the bits for each
+ * color are contiguous in the longword.
+ * 
+ * Writing to memory is done in shorts or ints. (Unfortunately, short is not
+ * very fast on Alpha, so there is room for improvement here). There is no
+ * dither time check for overflow - instead the tables have slack at
+ * each end. This is likely to be faster than an 'if' test as many modern
+ * architectures are really bad at ifs. Potentially, each '&&' causes a 
+ * pipeline flush!
+ *
+ * There is no shifting and fixed point arithmetic, as I really doubt you
+ * can see the difference, and it costs. This may be just my bias, since I
+ * heard that Intel is really bad at shifting.
+ */
+
+
+/*
+ * How many 1 bits are there in the PIXVALword.
+ * Low performance, do not call often.
+ */
+static int number_of_bits_set(unsigned PIXVAL a) {
+    int count = 0;                       /* running total of 1 bits seen */
+    for (; a; a >>= 1) count += (int)(a & 1);
+    return count;
+}
+
+
+
+/*
+ * How many 0 bits are there at most significant end of PIXVALword.
+ * Low performance, do not call often.
+ */
+static int free_bits_at_top(unsigned PIXVAL a) {
+    /* assumes char is 8 bits and twos complement */
+    if (a == 0) return sizeof(unsigned PIXVAL) * 8;
+    int zeros = 0;
+    for (; ((PIXVAL)a) >= 0l; a <<= 1) ++zeros;   /* until the top bit is set */
+    return zeros;
+}
+
+/*
+ * How many 0 bits are there at least significant end of PIXVALword.
+ * Low performance, do not call often.
+ */
+static int free_bits_at_bottom(unsigned PIXVAL a) {
+    if (a == 0) return sizeof(unsigned PIXVAL) * 8;   /* assume char is 8 bits */
+    int zeros = 0;
+    for (; !(((PIXVAL)a) & 1l); a >>= 1) ++zeros;     /* until bit 0 is set */
+    return zeros;
+}
+
+
+
+ColorTableHighBit::ColorTableHighBit(int bpp,unsigned int redMask,
+				     unsigned int greenMask,
+				     unsigned int blueMask) {
+  // Remember the display description handed in by the caller.
+  this->bpp=bpp;
+  this->redMask=redMask;
+  this->greenMask=greenMask;
+  this->blueMask=blueMask;
+
+  // One allocation backs the five 256-entry conversion tables.
+  TABTYPE* const tables = new TABTYPE[5*256];
+  colortab = tables;
+  Cr_r_tab = tables;
+  Cr_g_tab = tables + 256;
+  Cb_g_tab = tables + 2*256;
+  Cb_b_tab = tables + 3*256;
+  L_tab    = tables + 4*256;
+  // Likewise one allocation backs the three 768-entry pixel tables.
+  PIXVAL* const pixels = new PIXVAL [3*768];
+  rgb_2_pix = pixels;
+  r_2_pix_alloc = pixels;
+  g_2_pix_alloc = pixels + 768;
+  b_2_pix_alloc = pixels + 2*768;
+  initHighColor(bpp>=24,redMask,greenMask,blueMask);  // thirty2 is set for depths >= 24
+}
+
+
+ColorTableHighBit::~ColorTableHighBit() {   // frees both table blocks
+  delete [] colortab;    // both blocks were allocated with new[], so the
+  delete [] rgb_2_pix;   // array form of delete is required here
+}
+
+/*
+ *--------------------------------------------------------------
+ *
+ * InitColor16Dither --
+ *
+ *	To get rid of the multiply and other conversions in color
+ *	dither, we use a lookup table.
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *	The lookup tables are initialized.
+ *
+ *--------------------------------------------------------------
+ */
+
+void ColorTableHighBit::initHighColor(int thirty2,unsigned int redMask,
+				      unsigned int greenMask,
+				      unsigned int blueMask) {
+  // Fills L_tab and the Cr/Cb contribution tables, then the r/g/b to
+  // pixel tables for the given channel masks.  A zero thirty2 selects the
+  // 16 bit layout, where each table value is duplicated into bits 16..31.
+  unsigned PIXVAL red_mask = redMask;
+  unsigned PIXVAL green_mask =greenMask;
+  unsigned PIXVAL blue_mask = blueMask;
+
+  int CR, CB, i;
+
+  for (i=0; i<256; i++) {
+    L_tab[i] = i;
+    if (gammaCorrectFlag) {
+      L_tab[i] = (TABTYPE)GAMMA_CORRECTION(i);
+    }
+    CB = CR = i;
+    if (chromaCorrectFlag) {
+      CB -= 128; 
+      CB = CHROMA_CORRECTION128(CB);
+      CR -= 128;
+      CR = CHROMA_CORRECTION128(CR);
+    } else {
+      CB -= 128; CR -= 128;
+    }
+/* was
+      Cr_r_tab[i] =  1.596 * CR;
+      Cr_g_tab[i] = -0.813 * CR;
+      Cb_g_tab[i] = -0.391 * CB;   
+      Cb_b_tab[i] =  2.018 * CB;
+  but they were just messed up.
+  Then was (_Video Demystified_):
+      Cr_r_tab[i] =  1.366 * CR;
+      Cr_g_tab[i] = -0.700 * CR;
+      Cb_g_tab[i] = -0.334 * CB;   
+      Cb_b_tab[i] =  1.732 * CB;
+  but really should be:
+   (from ITU-R BT.470-2 System B, G and SMPTE 170M )
+*/
+      Cr_r_tab[i] = (TABTYPE) ( (0.419/0.299) * CR  );
+      Cr_g_tab[i] = (TABTYPE) ( -(0.299/0.419) * CR );
+      Cb_g_tab[i] = (TABTYPE) ( -(0.114/0.331) * CB ); 
+      Cb_b_tab[i] = (TABTYPE) (  (0.587/0.331) * CB );
+
+/*
+  though you could argue for:
+    SMPTE 240M
+      Cr_r_tab[i] =  (0.445/0.212) * CR;
+      Cr_g_tab[i] = -(0.212/0.445) * CR;
+      Cb_g_tab[i] = -(0.087/0.384) * CB; 
+      Cb_b_tab[i] =  (0.701/0.384) * CB;
+    FCC 
+      Cr_r_tab[i] =  (0.421/0.30) * CR;
+      Cr_g_tab[i] = -(0.30/0.421) * CR;
+      Cb_g_tab[i] = -(0.11/0.331) * CB; 
+      Cb_b_tab[i] =  (0.59/0.331) * CB;
+    ITU-R BT.709 
+      Cr_r_tab[i] =  (0.454/0.2125) * CR;
+      Cr_g_tab[i] = -(0.2125/0.454) * CR;
+      Cb_g_tab[i] = -(0.0721/0.386) * CB; 
+      Cb_b_tab[i] =  (0.7154/0.386) * CB;
+*/
+    }
+    /*
+     * Set up entries 0-255 in rgb-to-pixel value tables.  The shift
+     * counts depend only on the masks, so they are computed once.  A
+     * channel wider than 8 bits is scaled up and an empty mask yields 0;
+     * the old code shifted by a negative or full-width count there.
+     */
+    const int r_bits = number_of_bits_set(red_mask);
+    const int g_bits = number_of_bits_set(green_mask);
+    const int b_bits = number_of_bits_set(blue_mask);
+    const int r_up = (red_mask ? free_bits_at_bottom(red_mask) : 0) + (r_bits > 8 ? r_bits - 8 : 0);
+    const int g_up = (green_mask ? free_bits_at_bottom(green_mask) : 0) + (g_bits > 8 ? g_bits - 8 : 0);
+    const int b_up = (blue_mask ? free_bits_at_bottom(blue_mask) : 0) + (b_bits > 8 ? b_bits - 8 : 0);
+    for (i = 0; i < 256; i++) {
+      r_2_pix_alloc[i + 256] = (unsigned PIXVAL)(i >> (r_bits < 8 ? 8 - r_bits : 0)) << r_up;
+      g_2_pix_alloc[i + 256] = (unsigned PIXVAL)(i >> (g_bits < 8 ? 8 - g_bits : 0)) << g_up;
+      b_2_pix_alloc[i + 256] = (unsigned PIXVAL)(i >> (b_bits < 8 ? 8 - b_bits : 0)) << b_up;
+      /*
+       * If we have 16-bit output depth, then we double the value
+       * in the top word. This means that we can write out both
+       * pixels in the pixel doubling mode with one op. It is 
+       * harmless in the normal case as storing a 32-bit value
+       * through a short pointer will lose the top bits anyway.
+       * A similar optimisation for Alpha for 64 bit has been
+       * prepared for, but is not yet implemented.
+       */
+      if(!thirty2) {
+	r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 16;
+	g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 16;
+	b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 16;
+      }
+#ifdef SIXTYFOUR_BIT
+      if(thirty2) {
+	r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 32;
+	g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 32;
+	b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 32;
+      }
+#endif
+    }
+    /*
+     * Spread out the values we have to the rest of the array so that
+     * we do not need to check for overflow.
+     */
+    for (i = 0; i < 256; i++) {
+      r_2_pix_alloc[i] = r_2_pix_alloc[256];
+      r_2_pix_alloc[i+ 512] = r_2_pix_alloc[511];
+      g_2_pix_alloc[i] = g_2_pix_alloc[256];
+      g_2_pix_alloc[i+ 512] = g_2_pix_alloc[511];
+      b_2_pix_alloc[i] = b_2_pix_alloc[256];
+      b_2_pix_alloc[i+ 512] = b_2_pix_alloc[511];
+    }
+
+    r_2_pix = r_2_pix_alloc + 256;
+    g_2_pix = g_2_pix_alloc + 256;
+    b_2_pix = b_2_pix_alloc + 256;
+}
diff --git a/mpeglib/lib/util/render/dither/colorTableHighBit.h b/mpeglib/lib/util/render/dither/colorTableHighBit.h
new file mode 100644
index 00000000..9945414d
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/colorTableHighBit.h
@@ -0,0 +1,73 @@
+/*
+  colorTables for 16,32 Bit depth
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#ifndef __COLORTABLEHIGHBIT_H
+#define __COLORTABLEHIGHBIT_H
+
+#include "ditherDef.h"
+
+
+
+
+class ColorTableHighBit {
+  // YCrCb contribution tables; all five point into the colortab block.
+  TABTYPE *L_tab;
+  TABTYPE *Cr_r_tab;
+  TABTYPE *Cr_g_tab;
+  TABTYPE *Cb_g_tab;
+  TABTYPE *Cb_b_tab;
+  TABTYPE *colortab;
+
+  // Channel value -> pixel value tables as handed out by the getters
+  // (set by initHighColor()), and the rgb_2_pix block that backs them.
+  PIXVAL *r_2_pix;
+  PIXVAL *g_2_pix;
+  PIXVAL *b_2_pix;
+  PIXVAL *rgb_2_pix;
+  // Start of each channel's 768-entry table inside rgb_2_pix.
+  PIXVAL *r_2_pix_alloc;
+  PIXVAL *g_2_pix_alloc;
+  PIXVAL *b_2_pix_alloc;
+
+  // init stuff
+  int bpp;
+  // colorMask
+  unsigned int redMask;
+  unsigned int greenMask;
+  unsigned int blueMask;
+
+  // colortab and rgb_2_pix are owned and freed by the destructor, so a
+  // shallow copy would free them twice.  Copying is disabled (declared
+  // private, intentionally left undefined).
+  ColorTableHighBit(const ColorTableHighBit&);
+  ColorTableHighBit& operator=(const ColorTableHighBit&);
+
+ public:
+  ColorTableHighBit(int bpp,unsigned int redMask,
+		    unsigned int greenMask,unsigned int blueMask);
+  ~ColorTableHighBit();
+
+  inline TABTYPE* getL_tab()        { return L_tab     ; }
+  inline TABTYPE* getCr_r_tab()     { return Cr_r_tab  ; }
+  inline TABTYPE* getCr_g_tab()     { return Cr_g_tab  ; }
+  inline TABTYPE* getCb_g_tab()     { return Cb_g_tab  ; }
+  inline TABTYPE* getCb_b_tab()     { return Cb_b_tab  ; }
+  inline PIXVAL* getr_2_pix()       { return r_2_pix   ; }
+  inline PIXVAL* getg_2_pix()       { return g_2_pix   ; }
+  inline PIXVAL* getb_2_pix()       { return b_2_pix   ; }
+
+ private:
+  void initHighColor(int thirty2,unsigned int redMask,
+		     unsigned int greenMask,unsigned int blueMask);
+};
+#endif
diff --git a/mpeglib/lib/util/render/dither/dither16Bit.cpp b/mpeglib/lib/util/render/dither/dither16Bit.cpp
new file mode 100644
index 00000000..0a843ee9
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither16Bit.cpp
@@ -0,0 +1,300 @@
+/*
+  dither 16 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "dither16Bit.h"
+
+
+Dither16Bit::Dither16Bit(unsigned int redMask,
+			 unsigned int greenMask,unsigned int blueMask) {
+  // Build the tables for a 16 bit display with the given channel masks.
+  ColorTableHighBit* const tables =
+    new ColorTableHighBit(16,redMask,greenMask,blueMask);
+  colorTableHighBit = tables;
+  // Keep local copies of the table pointers for the dither loops.
+  r_2_pix  = tables->getr_2_pix();
+  g_2_pix  = tables->getg_2_pix();
+  b_2_pix  = tables->getb_2_pix();
+  L_tab    = tables->getL_tab();
+  Cr_r_tab = tables->getCr_r_tab();
+  Cr_g_tab = tables->getCr_g_tab();
+  Cb_g_tab = tables->getCb_g_tab();
+  Cb_b_tab = tables->getCb_b_tab();
+}
+
+
+Dither16Bit::~Dither16Bit() {
+  delete this->colorTableHighBit;   // the table set created in the constructor
+}
+
+
+/*
+ *--------------------------------------------------------------
+ *
+ * Color16DitherImage --
+ *
+ *	Converts image into 16 bit color.
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *	None.
+ *
+ *--------------------------------------------------------------
+ */
+
+void Dither16Bit::ditherImageColor16(unsigned char* lum, 
+				     unsigned char* cr, 
+				     unsigned char* cb,
+				     unsigned char* out,
+				     int rows,
+				     int cols,
+				     int offset) {
+  // Converts two image rows per pass; every cr/cb sample colours a 2x2 block of lum samples.
+  int L, CR, CB;
+  unsigned short *row1, *row2;
+  unsigned char *lum2;
+  int x, y;
+  int cr_r;
+  int cr_g;
+  int cb_g;
+  int cb_b;
+  int cols_2 = cols/2;
+  // The output is addressed as 16 bit pixels; offset counts the extra pixels between rows.
+  row1 = (unsigned short *)out;
+  row2=row1+cols_2+cols_2+offset;                   // start of second row 
+  // Re-purpose offset: distance from the end of a row to the start of the row after next.
+  offset=2*offset+cols_2+cols_2;
+  // lum2 reads the luminance row directly below lum.
+  lum2 = lum + cols_2 + cols_2;
+
+  
+  for (y=0; y<rows; y+=2) {
+    for (x=0; x<cols_2; x++) {
+      int R, G, B;
+      
+      CR = *cr++;
+      CB = *cb++;
+      cr_r = Cr_r_tab[CR];
+      cr_g = Cr_g_tab[CR];
+      cb_g = Cb_g_tab[CB];
+      cb_b = Cb_b_tab[CB];
+      // The chroma terms above are reused for the pixels of this block.
+      L = L_tab[(int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      *row1++ = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      
+      
+#ifdef INTERPOLATE
+      if(x != cols_2 - 1) {
+	CR = (CR + *cr) >> 1;
+	CB = (CB + *cb) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+#endif
+      
+      L = L_tab[(int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      *row1++ = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      
+      /*
+       * Now, do second row.
+       */
+#ifdef INTERPOLATE
+      if(y != rows - 2) {
+	CR = (CR + *(cr + cols_2 - 1)) >> 1;
+	CB = (CB + *(cb + cols_2 - 1)) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+#endif
+      
+      L = L_tab[(int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      *row2++ = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      
+      L = L_tab[(int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      *row2++ = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+    }
+    /*
+     * These values are at the start of the next line, (due
+     * to the ++'s above),but they need to be at the start
+     * of the line after that.
+     */
+    lum += cols_2 + cols_2;
+    lum2 += cols_2 + cols_2;
+    row1 += offset;
+    row2 += offset;
+  }
+}
+
+
+/*
+ * Erik Corry's pixel doubling routines for 15/16/24/32 bit screens.
+ */
+
+
+/*
+ *--------------------------------------------------------------
+ *
+ * Twox2Color16DitherImage --
+ *
+ *	Converts image into 16 bit color at double size.
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *	None.
+ *
+ *--------------------------------------------------------------
+ */
+
+/*
+ * In this function I make use of a nasty trick. The tables have the lower
+ * 16 bits replicated in the upper 16. This means I can write ints and get
+ * the horizontal doubling for free (almost).
+ */
+
+void Dither16Bit::ditherImageTwox2Color16(unsigned char* lum,
+					  unsigned char* cr,
+					  unsigned char* cb,
+					  unsigned char* out,
+					  int rows,
+					  int cols,
+					  int mod) {
+  int L, CR, CB;
+  unsigned int *row1 = (unsigned int *)out;
+  unsigned int *row2 = row1 + cols + mod/2;
+  unsigned int *row3 = row2 + cols + mod/2;
+  unsigned int *row4 = row3 + cols + mod/2;
+  unsigned char *lum2;
+  int x, y;
+  int cr_r;
+  int cr_g;
+  int cb_g;
+  int cb_b;
+  int cols_2 = cols/2;
+  // Each source pixel is stored as one unsigned int in two consecutive output rows.
+  lum2 = lum + cols_2 + cols_2;
+  for (y=0; y<rows; y+=2) {
+    for (x=0; x<cols_2; x++) {
+      int R, G, B;
+      int t;
+      
+      CR = *cr++;
+      CB = *cb++;
+      cr_r = Cr_r_tab[CR];
+      cr_g = Cr_g_tab[CR];
+      cb_g = Cb_g_tab[CB];
+      cb_b = Cb_b_tab[CB];
+      
+      L = L_tab[(int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row1[0] = t;
+      row1++;
+      row2[0] = t;
+      row2++;
+      
+      // INTERPOLATE
+      if(x != cols_2 - 1) {
+	CR = (CR + *cr) >> 1;
+	CB = (CB + *cb) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+      // end
+      
+      L = L_tab[(int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row1[0] = t;
+      row1++;
+      row2[0] = t;
+      row2++;
+      
+      /*
+       * Now, do second row.
+       */
+      // INTERPOLATE
+      if(y != rows - 2) {
+	CR = (CR + *(cr + cols_2 - 1)) >> 1;
+	CB = (CB + *(cb + cols_2 - 1)) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+      // end
+      
+      L = L_tab[(int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row3[0] = t;
+      row3++;
+      row4[0] = t;
+      row4++;
+      
+      L = L_tab[(int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row3[0] = t;
+      row3++;
+      row4[0] = t;
+      row4++;
+    }
+    lum += cols_2 + cols_2;
+    lum2 += cols_2 + cols_2;
+    row1 += 6 * cols_2 + 2*mod;   // advance to the next group of four output rows
+    row3 += 6 * cols_2 + 2*mod;
+    row2 += 6 * cols_2 + 2*mod;
+    row4 += 6 * cols_2 + 2*mod;
+  }
+}
diff --git a/mpeglib/lib/util/render/dither/dither16Bit.h b/mpeglib/lib/util/render/dither/dither16Bit.h
new file mode 100644
index 00000000..2e47c01c
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither16Bit.h
@@ -0,0 +1,55 @@
+/*
+  dither 16 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+#ifndef __DITHER16Bit_H
+#define __DITHER16Bit_H
+
+#include "colorTableHighBit.h"
+
+class Dither16Bit {
+  // Owned table set: created in the constructor, deleted in the destructor.
+  ColorTableHighBit* colorTableHighBit;
+  // Table pointers obtained from colorTableHighBit.
+  TABTYPE *L_tab;
+  TABTYPE *Cr_r_tab;
+  TABTYPE *Cr_g_tab;
+  TABTYPE *Cb_g_tab;
+  TABTYPE *Cb_b_tab;
+  PIXVAL *r_2_pix;
+  PIXVAL *g_2_pix;
+  PIXVAL *b_2_pix;
+  // Not copyable: a copy would delete colorTableHighBit a second time.
+  Dither16Bit(const Dither16Bit&);             // intentionally undefined
+  Dither16Bit& operator=(const Dither16Bit&);  // intentionally undefined
+ public:
+  Dither16Bit(unsigned int redMask,
+	      unsigned int greenMask,unsigned int blueMask);
+  ~Dither16Bit();
+
+  void ditherImageColor16(unsigned char* lum, 
+			  unsigned char* cr, 
+			  unsigned char* cb,
+			  unsigned char* out,
+			  int rows,
+			  int cols,
+			  int offset);
+
+  void ditherImageTwox2Color16(unsigned char* lum,
+			       unsigned char* cr,
+			       unsigned char* cb,
+			       unsigned char* out,
+			       int rows,
+			       int cols,
+			       int mod);
+};
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/dither32Bit.cpp b/mpeglib/lib/util/render/dither/dither32Bit.cpp
new file mode 100644
index 00000000..61a1d2dc
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither32Bit.cpp
@@ -0,0 +1,253 @@
+/*
+  dither 32 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "dither32Bit.h"
+
+// doRow(row,Lum): stores the pixel for Lum through row and advances row; needs local_r/g/b_2_pix in scope.
+#define doRow(row,Lum)  *row++=(local_r_2_pix[Lum] | \
+ local_g_2_pix[Lum] | local_b_2_pix[Lum])
+
+
+Dither32Bit::Dither32Bit(unsigned int redMask,
+			 unsigned int greenMask,unsigned int blueMask) {
+  // Build the tables for a 32 bit display with the given channel masks.
+  ColorTableHighBit* const tables =
+    new ColorTableHighBit(32,redMask,greenMask,blueMask);
+  colorTableHighBit = tables;
+  // Keep local copies of the table pointers for the dither loops.
+  r_2_pix  = tables->getr_2_pix();
+  g_2_pix  = tables->getg_2_pix();
+  b_2_pix  = tables->getb_2_pix();
+  L_tab    = tables->getL_tab();
+  Cr_r_tab = tables->getCr_r_tab();
+  Cr_g_tab = tables->getCr_g_tab();
+  Cb_g_tab = tables->getCb_g_tab();
+  Cb_b_tab = tables->getCb_b_tab();
+}
+
+
+Dither32Bit::~Dither32Bit() {
+  delete this->colorTableHighBit;   // the table set created in the constructor
+}
+
+
+void Dither32Bit::ditherImageColor32(unsigned char* lum, 
+				     unsigned char* cr, 
+				     unsigned char* cb,
+				     unsigned char* out,
+				     int rows,
+				     int cols,
+				     int mod) {
+  // Converts two image rows per pass; every cr/cb sample colours a 2x2 block of lum samples.
+  int L;
+  int n;
+  int rowWork;
+  int colWork;
+  
+  unsigned int *row1, *row2;
+  unsigned char *lum2;
+  PIXVAL* local_r_2_pix;
+  PIXVAL* local_g_2_pix;
+  PIXVAL* local_b_2_pix;
+  
+  row1 = (unsigned int *)out;
+  // row2 starts one output row (cols + mod pixels) after row1; lum2 is the lum row below.
+  row2 = row1+cols+mod;
+  lum2 = lum+cols;
+  
+  // because the width/height are a multiple of the macroblock size,
+  // cols/rows are always even
+  colWork=cols>>1;
+  rowWork=rows>>1;
+  mod=cols+2*mod;   // now: from the end of a row to the start of the row after next
+  
+  while(rowWork--) {
+    n=colWork;
+    while(n--) {
+      // Fold the chroma terms into the table pointers so that only L indexes them in doRow.
+      local_r_2_pix=r_2_pix+Cr_r_tab[*cr];
+      local_g_2_pix=g_2_pix+Cr_g_tab[*cr++] + Cb_g_tab[*cb];
+      local_b_2_pix=b_2_pix+Cb_b_tab[*cb++];
+      
+      L = L_tab[*lum++];
+      doRow(row1,L);
+	
+      L = L_tab[*lum++];
+      doRow(row1,L);
+	
+      L = L_tab [*lum2++];
+      doRow(row2,L);
+	
+      L = L_tab [*lum2++];
+      doRow(row2,L);
+	
+      
+    }
+    row2 += mod;
+    lum += cols;
+    lum2 += cols;
+    row1 += mod;
+    
+  }
+
+}
+
+/*
+ *--------------------------------------------------------------
+ *
+ * Twox2Color32 --
+ *
+ *	Converts image into 24/32 bit color at double size.
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *	None.
+ *
+ *--------------------------------------------------------------
+ */
+
+void Dither32Bit::ditherImageTwox2Color32(unsigned char* lum,
+					  unsigned char* cr,
+					  unsigned char* cb,
+					  unsigned char* out,
+					  int rows,
+					  int cols,
+					  int mod) {
+  int L, CR, CB;
+  unsigned PIXVAL *row1 = (unsigned PIXVAL *)out;
+  unsigned PIXVAL *row2 = row1 + cols * ONE_TWO + mod;
+  unsigned PIXVAL *row3 = row2 + cols * ONE_TWO + mod;
+  unsigned PIXVAL *row4 = row3 + cols * ONE_TWO + mod;
+  unsigned char *lum2;
+  int x, y;
+  int cr_r;
+  int cr_g;
+  int cb_g;
+  int cb_b;
+  int cols_2 = cols/2;
+  int loffset = ONE_TWO * 6 *cols_2 + 4*mod ;   // end of a row -> start of the row four output rows below
+  // ONE_TWO is defined outside this file; presumably 1 with SIXTYFOUR_BIT and 2 otherwise - TODO confirm.
+  lum2 = lum + cols_2 + cols_2;
+  for (y=0; y<rows; y+=2) {
+    for (x=0; x<cols_2; x++) {
+      int R, G, B;
+      PIXVAL t; 
+      
+      CR = *cr++;
+      CB = *cb++;
+      cr_r = Cr_r_tab[CR];
+      cr_g = Cr_g_tab[CR];
+      cb_g = Cb_g_tab[CB];
+      cb_b = Cb_b_tab[CB];
+      
+      L = L_tab[ (int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row1[0] = t;
+      row2[0] = t;
+#ifndef SIXTYFOUR_BIT
+      row1[1] = t;
+      row2[1] = t;
+#endif
+      row1 += ONE_TWO;
+      row2 += ONE_TWO;
+      
+      /* INTERPOLATE is now standard */
+      // INTERPOLATE
+      if(x != cols_2 - 1) {
+	CR = (CR + *cr) >> 1;
+	CB = (CB + *cb) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+      // end
+      /* end INTERPOLATE */
+      
+      L = L_tab[ (int) *lum++];
+      
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row1[0] = t;
+      row2[0] = t;
+#ifndef SIXTYFOUR_BIT
+      row1[1] = t;
+      row2[1] = t;
+#endif
+      row1 += ONE_TWO;
+      row2 += ONE_TWO;
+      
+      /*
+       * Now, do second row.
+       */
+      /* INTERPOLATE is now standard */
+      // INTERPOLATE
+      if(y != rows - 2) {
+	CR = (unsigned int) (CR + *(cr + cols_2 - 1)) >> 1;
+	CB = (unsigned int) (CB + *(cb + cols_2 - 1)) >> 1;
+	cr_r = Cr_r_tab[CR];
+	cr_g = Cr_g_tab[CR];
+	cb_g = Cb_g_tab[CB];
+	cb_b = Cb_b_tab[CB];
+      }
+      // end
+      /* endif */
+      L = L_tab[ (int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row3[0] = t;
+      row4[0] = t;
+#ifndef SIXTYFOUR_BIT
+      row3[1] = t;
+      row4[1] = t;
+#endif
+      row3 += ONE_TWO;
+      row4 += ONE_TWO;
+      
+      L = L_tab[(int) *lum2++];
+      R = L + cr_r;
+      G = L + cr_g + cb_g;
+      B = L + cb_b;
+      
+      t = (r_2_pix[R] | g_2_pix[G] | b_2_pix[B]);
+      row3[0] = t;
+      row4[0] = t;
+#ifndef SIXTYFOUR_BIT
+      row3[1] = t;
+      row4[1] = t;
+#endif
+      row3 += ONE_TWO;
+      row4 += ONE_TWO;
+    }
+    lum += cols_2 + cols_2;
+    lum2 += cols_2 + cols_2;
+    
+    row1 += loffset;
+    row3 += loffset;
+    row2 += loffset;
+    row4 += loffset;
+  }
+}
diff --git a/mpeglib/lib/util/render/dither/dither32Bit.h b/mpeglib/lib/util/render/dither/dither32Bit.h
new file mode 100644
index 00000000..440d021a
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither32Bit.h
@@ -0,0 +1,55 @@
+/*
+  dither 32 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+#ifndef __DITHER32Bit_H
+#define __DITHER32Bit_H
+
+#include "colorTableHighBit.h"
+
+class Dither32Bit {
+  // Owned table set: created in the constructor, deleted in the destructor.
+  ColorTableHighBit* colorTableHighBit;
+  // Table pointers obtained from colorTableHighBit.
+  TABTYPE *L_tab;
+  TABTYPE *Cr_r_tab;
+  TABTYPE *Cr_g_tab;
+  TABTYPE *Cb_g_tab;
+  TABTYPE *Cb_b_tab;
+  PIXVAL *r_2_pix;
+  PIXVAL *g_2_pix;
+  PIXVAL *b_2_pix;
+  // Not copyable: a copy would delete colorTableHighBit a second time.
+  Dither32Bit(const Dither32Bit&);             // intentionally undefined
+  Dither32Bit& operator=(const Dither32Bit&);  // intentionally undefined
+ public:
+  Dither32Bit(unsigned int redMask,
+	      unsigned int greenMask,unsigned int blueMask);
+  ~Dither32Bit();
+
+  void ditherImageColor32(unsigned char* lum, 
+			  unsigned char* cr, 
+			  unsigned char* cb,
+			  unsigned char* out,
+			  int rows,
+			  int cols,
+			  int offset);
+
+  void ditherImageTwox2Color32(unsigned char* lum,
+			       unsigned char* cr,
+			       unsigned char* cb,
+			       unsigned char* out,
+			       int rows,
+			       int cols,
+			       int mod);
+};
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/dither32mmx.cpp b/mpeglib/lib/util/render/dither/dither32mmx.cpp
new file mode 100644
index 00000000..b5fa4807
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither32mmx.cpp
@@ -0,0 +1,272 @@
+/*
+  MMX ditherer for 32 bit displays
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+
+#include "ditherMMX.h"
+
+#include <iostream>
+
+using namespace std;
+
+
+#ifndef INTEL
+  // Stand-in used when INTEL is not defined, i.e. when the MMX
+  // implementation further down in this file is compiled out.
+  // Reaching it means a caller selected the MMX path on a build that
+  // has none, so the process is terminated.  The message goes to stderr
+  // and the exit status signals failure (the previous exit(0) reported
+  // success for what is an internal error).
+  // All parameters are ignored.
+  void dither32_mmx(unsigned char* /*lum*/,
+		  unsigned char* /*cr*/,
+		  unsigned char* /*cb*/,
+		  unsigned char* /*out*/,
+		  int /*rows*/,
+		  int /*cols*/,
+		  int /*mod*/) {
+  fprintf(stderr,"urgs! dither32_mmx \n");
+  fprintf(stderr,"never should happen!\n");
+  exit(EXIT_FAILURE);
+}
+
+#else
+
+
+static unsigned long  MMX32_80w[]         = {0x00800080, 0x00800080}; // 128 in every 16 bit word; subtracted from cr/cb. NOTE(review): the unsigned long tables span one 64 bit MMX operand only where long is 32 bits
+static unsigned long  MMX32_10w[]         = {0x00100010, 0x00100010};   // 16 in every 16 bit word; in this file only referenced by dummy_dithermmx32
+static unsigned long  MMX32_00FFw[]       = {0x00ff00ff, 0x00ff00ff}; // mask: keeps the low byte of every word (even luminance samples)
+static unsigned long  MMX32_FF00w[]       = {0xff00ff00, 0xff00ff00}; // mask: keeps the high byte of every word (odd luminance samples)
+static unsigned short MMX32_Ycoeff[]      = {0x4a, 0x4a, 0x4a, 0x4a}; // 74 per word; in this file only referenced by dummy_dithermmx32
+static unsigned short MMX32_Vredcoeff[]   = {0x59, 0x59, 0x59, 0x59};  // 89 per word: Cr contribution to red, scaled by 64
+static unsigned short MMX32_Ubluecoeff[]  = {0x72, 0x72, 0x72, 0x72};    // 114 per word: Cb contribution to blue, scaled by 64
+static unsigned short MMX32_Ugrncoeff[]   = {0xffea,0xffea,0xffea,0xffea}; // -22 as signed 16 bit: Cb contribution to green, scaled by 64
+static unsigned short MMX32_Vgrncoeff[]   = {0xffd2,0xffd2,0xffd2,0xffd2};  // -46 as signed 16 bit: Cr contribution to green, scaled by 64
+
+// Writes the address of every MMX32_* table to cout, one per line.
+// NOTE(review): presumably this exists only so that the compiler sees a
+// use of the static tables, which are otherwise referenced by name from
+// the inline assembler alone -- confirm before removing.
+void dummy_dithermmx32() {
+  cout << "MMX32_10w:"        << MMX32_10w        << endl
+       << "MMX32_80w:"        << MMX32_80w        << endl
+       << "MMX32_Ubluecoeff:" << MMX32_Ubluecoeff << endl
+       << "MMX32_Vredcoeff:"  << MMX32_Vredcoeff  << endl
+       << "MMX32_Ugrncoeff:"  << MMX32_Ugrncoeff  << endl
+       << "MMX32_Vgrncoeff:"  << MMX32_Vgrncoeff  << endl
+       << "MMX32_Ycoeff:"     << MMX32_Ycoeff     << endl
+       << "MMX32_00FFw:"      << MMX32_00FFw      << endl
+       << "MMX32_FF00w:"      << MMX32_FF00w      << endl;
+}
+
+
+/**
+   dither32_mmx: MMX conversion of planar lum/cr/cb data to 32 bit
+   RGB pixels.
+
+   Parameters:
+     lum   luminance plane, cols*rows bytes
+     cr    Cr plane; two bytes are consumed per four luminance columns
+           and the same samples serve both rows of a row pair
+     cb    Cb plane, consumed exactly like cr
+     out   destination; one 32 bit word per pixel, laid out as
+           0 R G B with blue in the least significant byte
+     rows  number of luminance rows; two rows are converted per pass
+     cols  number of luminance columns; the column counter is
+           decremented by 4 and the inner loop only ends when it is
+           exactly 0
+     mod   additional destination pixels between the end of one
+           output row and the start of the next
+
+   This MMX assembler is my first assembler/MMX program ever.
+   Thus it may be buggy.
+   Send patches to:
+   mvogt@rhrk.uni-kl.de
+
+   After it worked fine I have "obfuscated" the code a bit to have
+   more parallelism in the MMX units. This means I moved
+   initialisation around and delayed other instructions.
+   Performance measurement did not show that this brought any advantage
+   but in theory it _should_ be faster this way.
+
+   The overall performance gain compared to the C based dither was
+   30%-40%.
+   The MMX routine calculates 256bit=8RGB values in each cycle
+   (4 for row1 & 4 for row2)
+
+   The red/green/blue.. coefficients are taken from the mpeg_play 
+   player. They look nice, but I don't know if you can have
+   better values, to avoid integer rounding errors.
+   
+
+   IMPORTANT:
+   ==========
+
+   It is a requirement that the cr/cb/lum are 8 byte aligned and
+   the out are 16byte aligned or you will/may get segfaults
+
+   NOTE(review): pointers are stored in an int array (buf), which only
+   works where a pointer fits into an int.
+   NOTE(review): the asm statement changes its input registers and the
+   buf memory operand but declares no outputs or clobbers -- check this
+   against the GCC extended asm rules before touching the code.
+
+*/
+
+void dither32_mmx(unsigned char* lum,
+		  unsigned char* cr,
+		  unsigned char* cb,
+		  unsigned char* out,
+		  int rows,
+		  int cols,
+		  int mod) {
+
+
+    
+    unsigned int *row1;
+    unsigned int *row2;
+    row1 = (unsigned int *)out;           // 32 bit target
+
+    unsigned char* end = lum +cols*rows;    // one past the last luminance byte
+    int x=cols;
+    row2=row1+cols+mod;                   // start of second row 
+    mod=4*cols+8*mod;                     // bytes to skip after a row pair: rest of this row + the whole next row
+
+    // scratch block for the asm; addressed there as N%5 (N = byte offset)
+    int buf[6];
+    buf[0]=(int)(lum+cols);   // lum2 pointer (second luminance row), %5
+    buf[1]=(int)end;          // 4%5: loop end
+    buf[2]=x;                 // 8%5: columns left in the current row pair
+    buf[3]=mod;               // 12%5: row increment in bytes
+    buf[4]=0; //tmp0;  16%5: spill slot for the lum register
+    buf[5]=cols;              // 20%5
+
+
+    __asm__ __volatile__ (
+	         ".align 32\n"
+		 "1:\n"
+		
+		 // create Cr (result in mm1)
+		 "movd (%0), %%mm1\n"      //         0  0  0  0  v3 v2 v1 v0
+		 "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
+		 "movd (%2), %%mm2\n"           //    0  0  0  0 l3 l2 l1 l0
+		 "punpcklbw %%mm7,%%mm1\n" //         0  v3 0  v2 00 v1 00 v0
+		 "punpckldq %%mm1,%%mm1\n" //         00 v1 00 v0 00 v1 00 v0
+		 "psubw MMX32_80w,%%mm1\n"   // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 
+
+		 // create Cr_g (result in mm0)
+		 "movq %%mm1,%%mm0\n"           // r1 r1 r0 r0 r1 r1 r0 r0
+		 "pmullw MMX32_Vgrncoeff,%%mm0\n" // red*-46dec = -(0.7136*64)
+		 "pmullw MMX32_Vredcoeff,%%mm1\n" // red*89dec=1.4013*64
+		 "psraw  $6, %%mm0\n"           // red=red/64
+		 "psraw  $6, %%mm1\n"           // red=red/64
+
+		 
+		 // create L1 L2 (result in mm2,mm4)
+		 // L2=lum2
+		 "movl %2,16%5\n"               // store register in tmp0
+		 "movl %5,%2\n"                 // lum2->register
+		 "movd (%2),%%mm3\n"            //    0  0  0  0 L3 L2 L1 L0
+		 "movl 16%5,%2\n"               // tmp0->register
+		 "punpckldq %%mm3,%%mm2\n"      //   L3 L2 L1 L0 l3 l2 l1 l0
+		 "movq %%mm2,%%mm4\n"           //   L3 L2 L1 L0 l3 l2 l1 l0
+		 "pand MMX32_FF00w, %%mm2\n"      //   L3 0  L1  0 l3  0 l1  0
+		 "pand MMX32_00FFw, %%mm4\n"      //   0  L2  0 L0  0 l2  0 l0
+		 "psrlw $8,%%mm2\n"             //   0  L3  0 L1  0 l3  0 l1
+
+
+
+		 // create R (result in mm6)
+		 "movq %%mm2,%%mm5\n"           //   0 L3  0 L1  0 l3  0 l1
+		 "movq %%mm4,%%mm6\n"           //   0 L2  0 L0  0 l2  0 l0
+		 "paddsw  %%mm1, %%mm5\n"       // lum1+red:x R3 x R1 x r3 x r1
+		 "paddsw  %%mm1, %%mm6\n"       // lum1+red:x R2 x R0 x r2 x r0
+		 "packuswb %%mm5,%%mm5\n"       //  R3 R1 r3 r1 R3 R1 r3 r1
+		 "packuswb %%mm6,%%mm6\n"       //  R2 R0 r2 r0 R2 R0 r2 r0
+		 "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
+		 "punpcklbw %%mm5,%%mm6\n"      //  R3 R2 R1 R0 r3 r2 r1 r0
+
+
+		 // create Cb (result in mm1)
+		 "movd (%1), %%mm1\n"      //         0  0  0  0  u3 u2 u1 u0
+		 "punpcklbw %%mm7,%%mm1\n" //         0  u3 0  u2 00 u1 00 u0
+		 "punpckldq %%mm1,%%mm1\n" //         00 u1 00 u0 00 u1 00 u0
+		 "psubw MMX32_80w,%%mm1\n"   // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 
+		 // create Cb_g (result in mm5)
+		 "movq %%mm1,%%mm5\n"            // u1 u1 u0 u0 u1 u1 u0 u0
+		 "pmullw MMX32_Ugrncoeff,%%mm5\n"  // blue*-22dec (0xffea) = -(0.34*64)
+		 "pmullw MMX32_Ubluecoeff,%%mm1\n" // blue*114dec=1.78125*64
+		 "psraw  $6, %%mm5\n"            // blue=blue/64
+		 "psraw  $6, %%mm1\n"            // blue=blue/64
+
+
+		 // create G (result in mm7)
+		 "movq %%mm2,%%mm3\n"      //   0  L3  0 L1  0 l3  0 l1
+		 "movq %%mm4,%%mm7\n"      //   0  L2  0 L0  0 l2  0 l0
+		 "paddsw  %%mm5, %%mm3\n"  // lum1+Cb_g:x G3t x G1t x g3t x g1t
+		 "paddsw  %%mm5, %%mm7\n"  // lum1+Cb_g:x G2t x G0t x g2t x g0t
+		 "paddsw  %%mm0, %%mm3\n"  // lum1+Cr_g:x G3  x G1  x g3  x g1
+		 "paddsw  %%mm0, %%mm7\n"  // lum1+Cr_g:x G2  x G0  x g2  x g0
+		 "packuswb %%mm3,%%mm3\n"  // G3 G1 g3 g1 G3 G1 g3 g1
+		 "packuswb %%mm7,%%mm7\n"  // G2 G0 g2 g0 G2 G0 g2 g0
+		 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
+		 
+
+		 // create B (result in mm5)
+		 "movq %%mm2,%%mm3\n"         //   0  L3  0 L1  0 l3  0 l1
+		 "movq %%mm4,%%mm5\n"         //   0  L2  0 L0  0 l2  0 l0
+		 "paddsw  %%mm1, %%mm3\n"     // lum1+blue:x B3 x B1 x b3 x b1
+		 "paddsw  %%mm1, %%mm5\n"     // lum1+blue:x B2 x B0 x b2 x b0
+		 "packuswb %%mm3,%%mm3\n"     // B3 B1 b3 b1 B3 B1 b3 b1
+		 "packuswb %%mm5,%%mm5\n"     // B2 B0 b2 b0 B2 B0 b2 b0
+		 "punpcklbw %%mm3,%%mm5\n"    // B3 B2 B1 B0 b3 b2 b1 b0
+
+
+		 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
+
+		 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
+		 "pxor %%mm4,%%mm4\n"           //  0  0  0  0  0  0  0  0
+		 "movq %%mm6,%%mm1\n"           // R3 R2 R1 R0 r3 r2 r1 r0
+		 "movq %%mm5,%%mm3\n"           // B3 B2 B1 B0 b3 b2 b1 b0
+		 // process lower lum
+		 "punpcklbw %%mm4,%%mm1\n"      //  0 r3  0 r2  0 r1  0 r0
+		 "punpcklbw %%mm4,%%mm3\n"      //  0 b3  0 b2  0 b1  0 b0
+		 "movq %%mm1,%%mm2\n"           //  0 r3  0 r2  0 r1  0 r0
+		 "movq %%mm3,%%mm0\n"           //  0 b3  0 b2  0 b1  0 b0
+		 "punpcklwd %%mm1,%%mm3\n"      //  0 r1  0 b1  0 r0  0 b0
+		 "punpckhwd %%mm2,%%mm0\n"      //  0 r3  0 b3  0 r2  0 b2
+
+		 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
+		 "movq %%mm7,%%mm1\n"           // G3 G2 G1 G0 g3 g2 g1 g0
+		 "punpcklbw %%mm1,%%mm2\n"      // g3  0 g2  0 g1  0 g0  0
+		 "punpcklwd %%mm4,%%mm2\n"      //  0  0 g1  0  0  0 g0  0 
+		 "por  %%mm3, %%mm2\n"      //  0 r1 g1 b1  0 r0 g0 b0
+		 "movq   %%mm2,(%3)\n"          // wrote out ! row1
+
+		 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
+		 "punpcklbw %%mm1,%%mm4\n"      // g3  0 g2  0 g1  0 g0  0
+		 "punpckhwd %%mm2,%%mm4\n"      //  0  0 g3  0  0  0 g2  0 
+		 "por  %%mm0, %%mm4\n"      //  0 r3 g3 b3  0 r2 g2 b2
+		 "movq   %%mm4,8(%3)\n"         // wrote out ! row1
+		 
+		 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
+		 // this can be done "destructive"
+		 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
+		 "punpckhbw %%mm2,%%mm6\n"      //  0 R3  0 R2  0 R1  0 R0
+		 "punpckhbw %%mm1,%%mm5\n"      // G3 B3 G2 B2 G1 B1 G0 B0
+		 "movq %%mm5,%%mm1\n"           // G3 B3 G2 B2 G1 B1 G0 B0
+		 "punpcklwd %%mm6,%%mm1\n"      //  0 R1 G1 B1  0 R0 G0 B0
+		 "movq   %%mm1,(%4)\n"          // wrote out ! row2
+		 "punpckhwd %%mm6,%%mm5\n"      //  0 R3 G3 B3  0 R2 G2 B2
+		 "movq   %%mm5,8(%4)\n"         // wrote out ! row2
+		 
+		 "addl  $4,%2\n"            // lum+4
+		 "addl  $4,%5\n"            // lum2+4
+		 "leal  16(%3),%3\n"        // row1+16
+		 "leal  16(%4),%4\n"        // row2+16
+		 "addl  $2, %0\n"           // cr+2
+		 "addl  $2, %1\n"           // cb+2
+
+		 "subl  $4,8%5\n"           // x-=4, x is buf[2]
+		 "cmpl  $0,8%5\n"
+
+		 "jne   1b\n"
+		 "addl           20%5,   %2\n" // lum  += cols 
+		 "movl %2,16%5\n"              // store register in tmp0
+		 "movl 20%5,%2\n"              // cols->register
+
+		 "addl           %2,     %5\n" // lum2 += cols 
+		 "addl           12%5,   %3\n" // row1+= mod, mod is buf[3]
+		 "addl           12%5,   %4\n" // row2+= mod, mod is buf[3]
+
+		 "movl %2, 8%5\n"              // x=cols
+		 "movl 16%5,%2\n"              // restore register from tmp0
+
+		 "cmpl           4%5,    %2\n"  // buf[1] is end
+		 "jl             1b\n"
+		 "emms\n"
+		 :
+		 : "r" (cr), "r"(cb),"r"(lum),
+		 "r"(row1),"r"(row2),"m"(buf[0])
+		 );
+
+
+
+}
+
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/dither8Bit.cpp b/mpeglib/lib/util/render/dither/dither8Bit.cpp
new file mode 100644
index 00000000..4f85d3fb
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither8Bit.cpp
@@ -0,0 +1,306 @@
+/*
+  dither 8 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "dither8Bit.h"
+
+
+// Copies the caller's 256 entry colormap, creates the 8 bit color table
+// and derives the ordered dither lookup tables from its quantisation
+// levels.
+Dither8Bit::Dither8Bit(unsigned char pixel[256]) {
+
+  // private copy of the colormap; the parameter shadows the member,
+  // hence the explicit this->
+  for (int idx = 0; idx < 256; ++idx) {
+    this->pixel[idx] = pixel[idx];
+  }
+
+  colorTable8Bit = new ColorTable8Bit();
+
+  // quantisation levels are read by initOrderedDither() below
+  lum_values = colorTable8Bit->getLumValues();
+  cr_values  = colorTable8Bit->getCrValues();
+  cb_values  = colorTable8Bit->getCbValues();
+
+  initOrderedDither();
+}
+
+
+// Releases everything the constructor set up: the DITH_SIZE lookup
+// tables per channel built by initOrderedDither() and the color table.
+Dither8Bit::~Dither8Bit() {
+  for (int i=0; i<DITH_SIZE; i++) {
+    // the tables are allocated with new unsigned char[256], so they have
+    // to be released with the array form of delete
+    delete [] cb_darrays[i];
+    delete [] l_darrays[i];
+    delete [] cr_darrays[i];
+  }
+  // created with new in the constructor and never handed out; it was
+  // leaked before.  lum_values/cr_values/cb_values were obtained from
+  // this object and are therefore not freed separately.
+  delete colorTable8Bit;
+}
+
+
+
+
+
+/*
+ *--------------------------------------------------------------
+ *
+ *  initOrderedDither--
+ *
+ *	Builds the lookup tables used for ordered dithering: for each
+ *	of the DITH_SIZE dither cells one 256 entry table per channel
+ *	(l_darrays, cr_darrays, cb_darrays).
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *      Allocates and fills l_darrays[], cr_darrays[] and
+ *      cb_darrays[]; reads lum_values, cr_values and cb_values.
+ *
+ *--------------------------------------------------------------
+ */
+
+// Builds one 256 entry table for dither cell ditherIdx.
+// levels[0..levelCount-1] are the ascending quantisation levels of the
+// channel; step is the weight of one quantisation level in the combined
+// table index.  Samples below the first level map to 0, samples from
+// the last level upwards map to the last level; in between a sample
+// maps to the upper neighbour once it exceeds the cell's threshold.
+static unsigned char* makeDitherRow(const int* levels, int levelCount,
+				    int step, int ditherIdx) {
+  unsigned char* row = new unsigned char[256];
+  unsigned char* mark = row;
+  int v;
+
+  for (v=0; v<levels[0]; v++) {
+    *mark++ = 0;
+  }
+
+  for (int lvl=0; lvl<(levelCount-1); lvl++) {
+    const int lower = levels[lvl];
+    const int upper = levels[lvl+1];
+    const int threshval = ((ditherIdx * (upper-lower)) / DITH_SIZE)+lower;
+
+    for (v=lower; v<upper; v++) {
+      const int chosen = (v > threshval) ? (lvl+1) : lvl;
+      *mark++ = static_cast<unsigned char>(chosen * step);
+    }
+  }
+
+  for (v=levels[levelCount-1]; v<256; v++) {
+    *mark++ = static_cast<unsigned char>((levelCount-1) * step);
+  }
+  return row;
+}
+
+void Dither8Bit::initOrderedDither() {
+  int cell;
+
+  // luminance: one level weighs CR_RANGE*CB_RANGE in the table index
+  for (cell=0; cell<DITH_SIZE; cell++) {
+    l_darrays[cell] = makeDitherRow(lum_values, LUM_RANGE,
+				    CR_RANGE * CB_RANGE, cell);
+  }
+
+  // cr: one level weighs CB_RANGE
+  for (cell=0; cell<DITH_SIZE; cell++) {
+    cr_darrays[cell] = makeDitherRow(cr_values, CR_RANGE, CB_RANGE, cell);
+  }
+
+  // cb: one level weighs 1
+  for (cell=0; cell<DITH_SIZE; cell++) {
+    cb_darrays[cell] = makeDitherRow(cb_values, CB_RANGE, 1, cell);
+  }
+}
+
+
+
+/*
+ *--------------------------------------------------------------
+ *
+ * ditherImageOrdered --
+ *
+ *	Dithers an image using an ordered dither.
+ *	Assumptions made:
+ *	  1) The color space is allocated y:cr:cb = 8:4:4
+ *	  2) One cr/cb sample is shared by a 2x2 block of luminance
+ *	     samples.
+ *	  3) w is a multiple of 8 and h a multiple of 4: every pass
+ *	     of the inner loops handles 8 columns and every pass of
+ *	     the outer loop 4 rows, without checking for a remainder.
+ *	  4) Output rows are w bytes apart (one byte per pixel).
+ *      The channels are dithered based on the standard
+ *      ordered dither pattern for a 4x4 area.  The dither cell
+ *      used per position is
+ *           0  8  2 10
+ *          12  4 14  6
+ *           3 11  1  9
+ *          15  7 13  5
+ *      Each output byte is pixel[l + cr + cb], where the three
+ *      summands come from the per cell tables built by
+ *      initOrderedDither().
+ *
+ * Parameters:
+ *	lum, cr, cb	source planes
+ *	out		destination, h rows of w bytes
+ *	h, w		height and width of the luminance plane
+ *
+ * Results:
+ *	None.
+ *
+ * Side effects:
+ *	Writes h*w bytes to out.
+ *
+ *--------------------------------------------------------------
+ */
+
+void  Dither8Bit::ditherImageOrdered (unsigned char* lum,
+				      unsigned char* cr,
+				      unsigned char* cb,
+				      unsigned char* out,
+				      int h,
+				      int w) {
+  unsigned char *l, *r, *b, *o1, *o2;
+  unsigned char *l2;
+  unsigned char L, R, B;
+  int i, j;
+
+  l = lum;
+  l2 = lum+w;
+  r = cr;
+  b = cb;
+  o1 = out;
+  o2 = out+w;
+
+
+  for (i=0; i<h; i+=4) {   // four luminance rows per pass
+
+    for (j=0; j<w; j+=8) {   // rows 0 and 1 of the 4x4 dither pattern
+
+      R = r[0]; B = b[0];
+
+      L = l[0];
+      o1[0] = pixel[(l_darrays[0][L] + cr_darrays[0][R] + cb_darrays[0][B])];
+      L = l[1];
+      o1[1] = pixel[(l_darrays[8][L] + cr_darrays[8][R] + cb_darrays[8][B])];
+      L = l2[0];
+      o2[0] = pixel[(l_darrays[12][L] + cr_darrays[12][R] + cb_darrays[12][B])];
+      L = l2[1];
+      o2[1] = pixel[(l_darrays[4][L] + cr_darrays[4][R] + cb_darrays[4][B])];
+
+      R = r[1]; B = b[1];
+
+      L = l[2];
+      o1[2] = pixel[(l_darrays[2][L] + cr_darrays[2][R] + cb_darrays[2][B])];
+      L = l[3];
+      o1[3] = pixel[(l_darrays[10][L] + cr_darrays[10][R] + cb_darrays[10][B])];
+      L = l2[2];
+      o2[2] = pixel[(l_darrays[14][L] + cr_darrays[14][R] + cb_darrays[14][B])];
+      L = l2[3];
+      o2[3] = pixel[(l_darrays[6][L] + cr_darrays[6][R] + cb_darrays[6][B])];
+
+      R = r[2]; B = b[2];
+
+      L = l[4];
+      o1[4] = pixel[(l_darrays[0][L] + cr_darrays[0][R] + cb_darrays[0][B])];
+      L = l[5];
+      o1[5] = pixel[(l_darrays[8][L] + cr_darrays[8][R] + cb_darrays[8][B])];
+      L = l2[4];
+      o2[4] = pixel[(l_darrays[12][L] + cr_darrays[12][R] + cb_darrays[12][B])];
+      L = l2[5];
+      o2[5] = pixel[(l_darrays[4][L] + cr_darrays[4][R] + cb_darrays[4][B])];
+
+      R = r[3]; B = b[3];
+
+      L = l[6];
+      o1[6] = pixel[(l_darrays[2][L] + cr_darrays[2][R] + cb_darrays[2][B])];
+      L = l[7];
+      o1[7] = pixel[(l_darrays[10][L] + cr_darrays[10][R] + cb_darrays[10][B])];
+      L = l2[6];
+      o2[6] = pixel[(l_darrays[14][L] + cr_darrays[14][R] + cb_darrays[14][B])];
+      L = l2[7];
+      o2[7] = pixel[(l_darrays[6][L] + cr_darrays[6][R] + cb_darrays[6][B])];
+
+      l += 8;
+      l2 += 8;
+      r += 4;
+      b += 4;
+      o1 += 8;
+      o2 += 8;
+    }
+
+    l += w;   // skip the row that was just handled through l2/o2
+	l2 += w;
+    o1 += w; 
+	o2 += w;
+
+    for (j=0; j<w; j+=8) {   // rows 2 and 3 of the 4x4 dither pattern
+
+      R = r[0]; B = b[0];
+
+      L = l[0];
+      o1[0] = pixel[(l_darrays[3][L] + cr_darrays[3][R] + cb_darrays[3][B])];
+      L = l[1];
+      o1[1] = pixel[(l_darrays[11][L] + cr_darrays[11][R] + cb_darrays[11][B])];
+      L = l2[0];
+      o2[0] = pixel[(l_darrays[15][L] + cr_darrays[15][R] + cb_darrays[15][B])];
+      L = l2[1];
+      o2[1] = pixel[(l_darrays[7][L] + cr_darrays[7][R] + cb_darrays[7][B])];
+
+      R = r[1]; B = b[1];
+
+      L = l[2];
+      o1[2] = pixel[(l_darrays[1][L] + cr_darrays[1][R] + cb_darrays[1][B])];
+      L = l[3];
+      o1[3] = pixel[(l_darrays[9][L] + cr_darrays[9][R] + cb_darrays[9][B])];
+      L = l2[2];
+      o2[2] = pixel[(l_darrays[13][L] + cr_darrays[13][R] + cb_darrays[13][B])];
+      L = l2[3];
+      o2[3] = pixel[(l_darrays[5][L] + cr_darrays[5][R] + cb_darrays[5][B])];
+
+      R = r[2]; B = b[2];
+
+      L = l[4];
+      o1[4] = pixel[(l_darrays[3][L] + cr_darrays[3][R] + cb_darrays[3][B])];
+      L = l[5];
+      o1[5] = pixel[(l_darrays[11][L] + cr_darrays[11][R] + cb_darrays[11][B])];
+      L = l2[4];
+      o2[4] = pixel[(l_darrays[15][L] + cr_darrays[15][R] + cb_darrays[15][B])];
+      L = l2[5];
+      o2[5] = pixel[(l_darrays[7][L] + cr_darrays[7][R] + cb_darrays[7][B])];
+
+      R = r[3]; B = b[3];
+
+      L = l[6];
+      o1[6] = pixel[(l_darrays[1][L] + cr_darrays[1][R] + cb_darrays[1][B])];
+      L = l[7];
+      o1[7] = pixel[(l_darrays[9][L] + cr_darrays[9][R] + cb_darrays[9][B])];
+      L = l2[6];
+      o2[6] = pixel[(l_darrays[13][L] + cr_darrays[13][R] + cb_darrays[13][B])];
+      L = l2[7];
+      o2[7] = pixel[(l_darrays[5][L] + cr_darrays[5][R] + cb_darrays[5][B])];
+
+      l += 8;
+      l2 += 8;
+      r += 4;
+      b += 4;
+      o1 += 8;
+      o2 += 8;
+    }
+
+    l += w;   // skip the row that was just handled through l2/o2
+	l2 += w;
+    o1 += w; 
+	o2 += w;
+  }
+}
+
diff --git a/mpeglib/lib/util/render/dither/dither8Bit.h b/mpeglib/lib/util/render/dither/dither8Bit.h
new file mode 100644
index 00000000..7bdd4d8f
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/dither8Bit.h
@@ -0,0 +1,63 @@
+/*
+  dither 8 bit depth yuv images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+
+#ifndef __DITHER_8BIT_H
+#define __DITHER_8BIT_H
+
+
+#include "colorTable8Bit.h"
+
+#define DITH_SIZE 16
+
+
+class Dither8Bit {
+
+ private:
+  // Lookup tables for the ordered dither: one 256 entry table per
+  // dither cell (DITH_SIZE cells) and per channel.
+  // NOTE(review): the class holds raw owning pointers and is copyable by
+  // default -- confirm that no caller copies instances.
+  unsigned char *l_darrays[DITH_SIZE];
+  unsigned char *cr_darrays[DITH_SIZE];
+  unsigned char *cb_darrays[DITH_SIZE];
+
+  // private colormap: translates a combined table index to the pixel
+  // value written to the output
+  unsigned char pixel[256];
+
+  ColorTable8Bit* colorTable8Bit;
+
+  // Arrays holding the quantized value ranges for lum, cr, and cb
+  // (used for 8 Bit); obtained from colorTable8Bit.
+  int* lum_values;
+  int* cr_values;
+  int* cb_values;
+
+ public:
+  // pixel: the 256 entry colormap; it is copied
+  Dither8Bit(unsigned char pixel[256]);
+  ~Dither8Bit();
+
+  // Dithers the lum/cr/cb planes of a w x h picture into one byte per
+  // pixel at out.
+  void ditherImageOrdered(unsigned char* lum, unsigned char* cr,
+                          unsigned char* cb, unsigned char* out,
+                          int h, int w);
+
+ private:
+  // builds l_darrays/cr_darrays/cb_darrays
+  void initOrderedDither();
+};
+
+#endif
+
diff --git a/mpeglib/lib/util/render/dither/ditherDef.h b/mpeglib/lib/util/render/dither/ditherDef.h
new file mode 100644
index 00000000..2e8d7d0e
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherDef.h
@@ -0,0 +1,100 @@
+/*
+  global definitions for dithering
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+
+#ifndef __DITHERDEF_H
+#define __DITHERDEF_H
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+extern "C" {
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+}
+
+
+#ifdef __GNUC__
+#if (__GNUC__ < 2 || ( __GNUC__ == 2 && __GNUC_MINOR__ < 91 ) )
+#ifndef _AIX
+#warning "inline code disabled! (buggy egcs version)"
+#undef __NO_MATH_INLINES
+#define __NO_MATH_INLINES 1
+#endif
+#endif
+#endif
+#include <math.h>
+
+
+
+/* Gamma correction settings.
+   gammaCorrect is read by the GAMMA_CORRECTION macro below. */
+extern int gammaCorrectFlag;
+extern double gammaCorrect;
+
+/* Chroma correction settings.
+   chromaCorrect is the factor applied by the CHROMA_CORRECTION*
+   macros below. */
+extern int chromaCorrectFlag;
+extern double chromaCorrect;
+
+
+#define CB_BASE 1                    // weight of one cb level in a combined lum/cr/cb index
+#define CR_BASE (CB_BASE*CB_RANGE)   // weight of one cr level
+#define LUM_BASE (CR_BASE*CR_RANGE)  // weight of one lum level
+
+#define TABTYPE short   // element type of the color conversion tables
+
+#ifdef SIXTYFOUR_BIT
+#define PIXVAL long   // element type of the value->pixel tables
+#else
+#define PIXVAL int
+#endif
+
+#ifdef SIXTYFOUR_BIT
+#define ONE_TWO 1   // PIXVAL slots written (and advanced over) per doubled pixel
+#else
+#define ONE_TWO 2
+#endif
+
+
+
+#define Min(x,y) (((x) < (y)) ? (x) : (y))   // NOTE: macro, arguments may be evaluated twice
+#define Max(x,y) (((x) > (y)) ? (x) : (y))   // NOTE: macro, arguments may be evaluated twice
+
+#define CHROMA_CORRECTION128(x) ((x) >= 0 \
+                        ? Min(127,  (int)(((x) * chromaCorrect))) \
+                        : Max(-128, (int)(((x) * chromaCorrect))))   // x*chromaCorrect as int, limited to 127 / -128
+#define CHROMA_CORRECTION256D(x) ((x) >= 128 \
+                        ? 128.0 + Min(127.0, (((x)-128.0) * chromaCorrect)) \
+                        : 128.0 - Min(128.0, (((128.0-(x))* chromaCorrect))))   // double: distance of x from 128 scaled by chromaCorrect, limited to 127 above / 128 below
+
+
+
+#define GAMMA_CORRECTION(x) ((int)(pow((x) / 255.0, 1.0/gammaCorrect)* 255.0))   // 255*(x/255)^(1/gammaCorrect), truncated to int
+
+#define CHROMA_CORRECTION128D(x) ((x) >= 0 \
+                        ? Min(127.0,  ((x) * chromaCorrect)) \
+                        : Max(-128.0, ((x) * chromaCorrect)))   // double variant of CHROMA_CORRECTION128
+
+#define CHROMA_CORRECTION256(x) ((x) >= 128 \
+                        ? 128 + Min(127, (int)(((x)-128.0) * chromaCorrect)) \
+                        : 128 - Min(128, (int)((128.0-(x)) * chromaCorrect)))   // int variant of CHROMA_CORRECTION256D
+
+// Number of quantisation levels per channel for the 8 bit ditherer.
+#define  LUM_RANGE  8
+#define  CR_RANGE   4
+#define  CB_RANGE   4
+
+ 
+#endif
diff --git a/mpeglib/lib/util/render/dither/ditherMMX.h b/mpeglib/lib/util/render/dither/ditherMMX.h
new file mode 100644
index 00000000..2f08b689
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherMMX.h
@@ -0,0 +1,38 @@
+/*
+  mmx ditherer
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#ifndef __DITHERMMX_H
+#define __DITHERMMX_H
+
+#include "ditherDef.h"
+
+
+// The MMX dither routine comes from NIST (NOTE(review): presumably this
+// refers to ditherBlock only -- confirm). NIST is an mpeg2/dvd player,
+// more: http://home.germany.net/100-5083/
+extern void  ditherBlock(unsigned char *lum, 
+			 unsigned char *cr, 
+			 unsigned char *cb,
+			 unsigned char *out,
+			 int rows, int cols, int mod);   // defined outside this header
+
+extern void dither32_mmx(unsigned char* lum,
+			 unsigned char* cr,
+			 unsigned char* cb,
+			 unsigned char* out,
+			 int rows,
+			 int cols,
+			 int mod);   // 32 bit ditherer, defined in dither32mmx.cpp
+
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/ditherRGB.cpp b/mpeglib/lib/util/render/dither/ditherRGB.cpp
new file mode 100644
index 00000000..1bcdb2ff
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherRGB.cpp
@@ -0,0 +1,230 @@
+/*
+  copys RGB images to a destination
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "ditherRGB.h"
+
+#include <iostream>
+
+using namespace std;
+
+// Nothing to set up for copying; the two scratch buffer members declared
+// in the class are not used by any method in this file, but they are
+// given a defined value instead of being left uninitialised.
+DitherRGB::DitherRGB() {
+  flipSpace=0;
+  flipSize=0;
+}
+
+
+// No method in this file allocates anything, so there is nothing to free.
+DitherRGB::~DitherRGB() {}
+
+
+// Maps a pixel depth in bits to the number of bytes stored per pixel:
+// 8 -> 1, 15/16 -> 2, 24/32 -> 4.
+// Any other depth is reported on cout and yields 0, which the callers
+// in this file treat as "do nothing".
+int DitherRGB::getDepth(int pixel) {
+  int byteDepth=0;
+
+  switch(pixel) {
+  case 8:
+    byteDepth=1;
+    break;
+  case 15:
+  case 16:
+    byteDepth=2;
+    break;
+  case 24:
+  case 32:
+    byteDepth=4;
+    break;
+  default:
+    // the message used to name DitherRGB_flipped::flipRGBImage, which
+    // pointed at the wrong class when tracking down a bad depth
+    cout << "unknown byteDepth:"<<pixel
+         << " in DitherRGB::getDepth"<<endl;
+  }
+  return byteDepth;
+   
+}
+
+// Copies a width x height RGB image from src to dest without scaling.
+// depth is the pixel depth in bits; offset is the number of additional
+// destination pixels per line.  Unknown depths copy nothing.
+void DitherRGB::ditherRGBImage(unsigned char* dest,unsigned char* src,
+			       int depth,int width,int height,int offset) {
+  const int bytesPerPixel=getDepth(depth);
+  if (bytesPerPixel == 0) {
+    return;
+  }
+
+  // identical line length on both sides: one block copy is enough
+  if (offset==0) {
+    memcpy(dest,src,height*width*bytesPerPixel);
+    return;
+  }
+
+  // otherwise copy line by line, the destination lines being longer
+  const int srcStride=width*bytesPerPixel;
+  const int destStride=offset*bytesPerPixel+srcStride;
+
+  for (int line=height;line>0;line--) {
+    memcpy(dest,src,srcStride);
+    src+=srcStride;
+    dest+=destStride;
+  }
+}
+
+// Copies a width x height RGB image from src to dest, doubling it in
+// both directions.  depth is the pixel depth in bits and selects the
+// copy routine for 1, 2 or 4 bytes per pixel; unknown depths copy
+// nothing.
+void DitherRGB::ditherRGBImage_x2(unsigned char* dest,unsigned char* src,
+				  int depth,int width,int height,int offset) {
+
+  const int bytesPerPixel=getDepth(depth);
+  if (bytesPerPixel == 0) {
+    return;
+  }
+
+  if (bytesPerPixel == 1) {
+    ditherRGB1Byte_x2(dest,src,1,width, height,offset);
+  } else if (bytesPerPixel == 2) {
+    ditherRGB2Byte_x2(dest,src,2,width, height,offset);
+  } else if (bytesPerPixel == 4) {
+    ditherRGB4Byte_x2(dest,src,4,width, height,offset);
+  } else {
+    cout <<"ditherRGBImage_x2 byteDepth:"<<bytesPerPixel
+	 <<" not supported"<<endl;
+  }
+}
+ 
+
+// Doubles an image with one byte per pixel: every source pixel becomes
+// a 2x2 block in the destination.
+//   dest    destination; lines are 2*width+offset pixels apart
+//   src     source, width*height pixels without padding
+//   depth   unused (bytes per pixel is fixed by the routine)
+//   offset  additional destination pixels per line
+//
+// The destination pointer now advances by two full destination lines
+// per source row.  Previously it advanced by 2*width+lineInc, which is
+// one 'offset' short and made the output drift whenever offset != 0.
+void DitherRGB::ditherRGB1Byte_x2(unsigned char* dest,unsigned char* src,
+				  int depth,int width,int height,int offset) {
+  (void)depth;
+
+  const int lineInc=2*width+offset;
+
+  for(int row=0;row<height;row++) {
+    unsigned char* upper=dest;           // first destination line
+    unsigned char* lower=dest+lineInc;   // line directly below
+
+    for(int col=0;col<width;col++) {
+      const unsigned char px=*src++;
+      upper[0]=px;
+      upper[1]=px;
+      lower[0]=px;
+      lower[1]=px;
+      upper+=2;
+      lower+=2;
+    }
+    dest+=2*lineInc;
+  }
+}
+
+
+// Doubles an image with two bytes per pixel: every source pixel becomes
+// a 2x2 block in the destination.
+//   destination  destination; lines are 2*width+offset pixels apart
+//   source       source, width*height pixels without padding
+//   depth        unused (bytes per pixel is fixed by the routine)
+//   offset       additional destination pixels per line
+//
+// The destination pointer now advances by two full destination lines
+// per source row.  Previously it advanced by 2*width+lineInc, which is
+// one 'offset' short and made the output drift whenever offset != 0.
+void DitherRGB::ditherRGB2Byte_x2(unsigned char* destination,
+				  unsigned char* source,
+				  int depth,int width,int height,int offset) {
+  (void)depth;
+
+  const unsigned short int* src=(const unsigned short int*) source;
+  unsigned short int* dest=(unsigned short int*) destination;
+
+  const int lineInc=2*width+offset;   // in pixels
+
+  for(int row=0;row<height;row++) {
+    unsigned short int* upper=dest;           // first destination line
+    unsigned short int* lower=dest+lineInc;   // line directly below
+
+    for(int col=0;col<width;col++) {
+      const unsigned short int px=*src++;
+      upper[0]=px;
+      upper[1]=px;
+      lower[0]=px;
+      lower[1]=px;
+      upper+=2;
+      lower+=2;
+    }
+    dest+=2*lineInc;
+  }
+}
+
+
+// Doubles an image with four bytes per pixel: every source pixel becomes
+// a 2x2 block in the destination.
+//   destination  destination; lines are 2*width+offset pixels apart
+//   source       source, width*height pixels without padding
+//   depth        unused (bytes per pixel is fixed by the routine)
+//   offset       additional destination pixels per line
+//
+// The destination pointer now advances by two full destination lines
+// per source row.  Previously it advanced by 2*width+lineInc, which is
+// one 'offset' short and made the output drift whenever offset != 0.
+void DitherRGB::ditherRGB4Byte_x2(unsigned char* destination,
+				  unsigned char* source,
+				  int depth,int width,int height,int offset) {
+  (void)depth;
+
+  const unsigned int* src=(const unsigned int*) source;
+  unsigned int* dest=(unsigned int*) destination;
+
+  const int lineInc=2*width+offset;   // in pixels
+
+  for(int row=0;row<height;row++) {
+    unsigned int* upper=dest;           // first destination line
+    unsigned int* lower=dest+lineInc;   // line directly below
+
+    for(int col=0;col<width;col++) {
+      const unsigned int px=*src++;
+      upper[0]=px;
+      upper[1]=px;
+      lower[0]=px;
+      lower[1]=px;
+      upper+=2;
+      lower+=2;
+    }
+    dest+=2*lineInc;
+  }
+
+}
+ 
diff --git a/mpeglib/lib/util/render/dither/ditherRGB.h b/mpeglib/lib/util/render/dither/ditherRGB.h
new file mode 100644
index 00000000..6f24cd8c
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherRGB.h
@@ -0,0 +1,45 @@
+/*
+  copys RGB images to a destination
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+#ifndef __DITHERRGB_H
+#define __DITHERRGB_H
+
+#include "colorTableHighBit.h"
+
+class DitherRGB {
+
+ private:
+  // Not used by any method in ditherRGB.cpp.
+  int flipSize;
+  unsigned char* flipSpace;
+
+ public:
+  DitherRGB();
+  ~DitherRGB();
+
+  // Both methods copy the image from src to dest; src is only read.
+  // depth is the pixel depth in bits, offset the number of additional
+  // destination pixels per line.
+
+  // plain copy
+  void ditherRGBImage(unsigned char* dest, unsigned char* src,
+                      int depth, int width, int height, int offset);
+  // copy doubled in both directions
+  void ditherRGBImage_x2(unsigned char* dest, unsigned char* src,
+                         int depth, int width, int height, int offset);
+
+ private:
+  // bits per pixel -> bytes per pixel, 0 for an unknown depth
+  int getDepth(int pixel);
+
+  // depth is here in byte!
+  void ditherRGB1Byte_x2(unsigned char* dest, unsigned char* src,
+                         int depth, int width, int height, int offset);
+  void ditherRGB2Byte_x2(unsigned char* dest, unsigned char* src,
+                         int depth, int width, int height, int offset);
+  void ditherRGB4Byte_x2(unsigned char* dest, unsigned char* src,
+                         int depth, int width, int height, int offset);
+
+};
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/ditherRGB_flipped.cpp b/mpeglib/lib/util/render/dither/ditherRGB_flipped.cpp
new file mode 100644
index 00000000..ba177675
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherRGB_flipped.cpp
@@ -0,0 +1,82 @@
+/*
+  flips RGB images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "ditherRGB_flipped.h"
+
+#include <iostream>
+
+using namespace std;
+
+
+// Starts without a scratch buffer; flipRGBImage allocates it on demand.
+DitherRGB_flipped::DitherRGB_flipped()
+  : flipSize(0),
+    flipSpace(NULL) {
+}
+
+// Releases the scratch buffer, if flipRGBImage ever allocated one.
+DitherRGB_flipped::~DitherRGB_flipped() {
+  // the buffer comes from new unsigned char[...], so the array form of
+  // delete is required; deleting a null pointer is a no-op
+  delete [] flipSpace;
+}
+
+
+
+
+// Copies a width x height RGB image from src to dest upside down: the
+// first source line becomes the last destination line.
+//   depth   pixel depth in bits (8, 15/16, 24/32); any other value is
+//           reported on cout and nothing is copied
+//   (last)  unnamed and ignored
+// Both images are expected without padding between lines.
+// NOTE(review): the scratch buffer flipSpace is (re)allocated here but
+// not used by the copy itself -- confirm whether it is still needed.
+void DitherRGB_flipped::flipRGBImage(unsigned char* dest,unsigned char* src,
+				     int depth,int width,int height,int ) {
+
+  int byteDepth;
+
+  switch(depth) {
+  case 8:
+    byteDepth=1;
+    break;
+  case 15:
+  case 16:
+    byteDepth=2;
+    break;
+  case 24:
+  case 32:
+    byteDepth=4;
+    break;
+  default:
+    cout << "unknown byteDepth:"<<depth
+         << " in DitherRGB_flipped::flipRGBImage"<<endl;
+    return;
+  }
+    
+
+  int spaceNeeded=width*height*byteDepth;
+
+  if (spaceNeeded > flipSize) {
+    // allocated with new[], so it must be released with delete[];
+    // reset the members first so that they stay consistent should the
+    // allocation below throw
+    delete [] flipSpace;
+    flipSpace=NULL;
+    flipSize=0;
+    cout << "flipSpace:"<<spaceNeeded<<endl;
+    flipSpace=new unsigned char[spaceNeeded+64];
+    flipSize=spaceNeeded;
+  }
+
+  int i;
+  int lineSize=width*byteDepth;
+  // start of the last destination line; filled from the bottom upwards
+  unsigned char* end=dest+lineSize*(height-1);
+
+  for (i=0;i<height;i++) {
+    memcpy(end,src,lineSize);
+    src+=lineSize;
+    end-=lineSize;
+  }
+
+}
+
+
diff --git a/mpeglib/lib/util/render/dither/ditherRGB_flipped.h b/mpeglib/lib/util/render/dither/ditherRGB_flipped.h
new file mode 100644
index 00000000..1d99f7f6
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherRGB_flipped.h
@@ -0,0 +1,34 @@
+/*
+  flips RGB images
+  Copyright (C) 2000  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+#ifndef __DITHERRGB_FLIPPED_H
+#define __DITHERRGB_FLIPPED_H
+
+#include "colorTableHighBit.h"
+
+class DitherRGB_flipped {
+  // Copies RGB images with the row order reversed.
+  int flipSize;              // size in bytes last requested for flipSpace
+  unsigned char* flipSpace;  // heap buffer owned by this object; NULL until sized
+  // NOTE(review): owns a raw buffer but is copyable -- confirm it is never copied.
+ public:
+  DitherRGB_flipped();
+  ~DitherRGB_flipped();
+
+  // Writes src into dest with its rows in reverse order. depth is the
+  // colour depth in bits (8, 15, 16, 24 or 32); offset is ignored.
+  void flipRGBImage(unsigned char* dest,unsigned char* src,
+		    int depth,int width,int height,int offset);
+
+};
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/ditherWrapper.cpp b/mpeglib/lib/util/render/dither/ditherWrapper.cpp
new file mode 100644
index 00000000..c6c37a79
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherWrapper.cpp
@@ -0,0 +1,246 @@
+/*
+  wrapper for X11 Window
+  Copyright (C) 1999  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "ditherWrapper.h"
+
+#include <iostream>
+
+using namespace std;
+
+
+/*
+   Gamma correction: makes images brighter/darker.
+   gammaCorrectFlag is the switch, gammaCorrect the accompanying value.
+   It's in the source but not activated (for now).
+*/
+int gammaCorrectFlag = 0;
+double gammaCorrect = 1.0;
+
+/*
+   Chroma correction: reduces the color intensity.
+   chromaCorrectFlag is the switch, chromaCorrect the accompanying value.
+   It's in the source but not activated (for now).
+*/
+int chromaCorrectFlag = 0;
+double chromaCorrect = 1.0;
+
+
+
+DitherWrapper::DitherWrapper(int bpp,unsigned int redMask,
+			     unsigned int greenMask,unsigned int blueMask,
+			     unsigned char pixel[256]) {
+  // Keep copies of bpp and the three channel masks.
+  this->bpp       = bpp;
+  this->redMask   = redMask;
+  this->greenMask = greenMask;
+  this->blueMask  = blueMask;
+
+  // All ditherers are created up front and released again in the
+  // destructor; the 8 bit one gets the pixel table, 16/32 bit the masks.
+  dither8Bit        = new Dither8Bit(pixel);
+  dither16Bit       = new Dither16Bit(redMask, greenMask, blueMask);
+  dither32Bit       = new Dither32Bit(redMask, greenMask, blueMask);
+  ditherRGB_flipped = new DitherRGB_flipped();
+  ditherRGB         = new DitherRGB();
+
+  // lmmx takes the result of mm_support() on INTEL builds and stays
+  // off everywhere else.
+#ifdef INTEL
+  lmmx = mm_support();
+#else
+  lmmx = false;
+#endif
+}
+
+
+DitherWrapper::~DitherWrapper() {   // frees what the constructor created
+  delete dither16Bit;
+  delete dither8Bit;
+  delete dither32Bit;
+  delete ditherRGB_flipped;
+  delete ditherRGB;
+}
+
+
+
+
+
+void  DitherWrapper::doDither(YUVPicture* pic,int depth,int imageMode,
+			      unsigned char* dest,int offset) {
+  //
+  // Public entry point. The image type stored in the picture decides
+  // which code path renders it into dest:
+  //   PICTURE_YUVMODE_CR_CB / PICTURE_YUVMODE_CB_CR -> doDitherYUV
+  //   PICTURE_RGB / PICTURE_RGB_FLIPPED             -> doDitherRGB
+  // All other arguments are passed through unchanged.
+  //
+  const int inputType=pic->getImageType();
+
+  const bool yuvInput = (inputType == PICTURE_YUVMODE_CR_CB) ||
+                        (inputType == PICTURE_YUVMODE_CB_CR);
+  const bool rgbInput = (inputType == PICTURE_RGB) ||
+                        (inputType == PICTURE_RGB_FLIPPED);
+
+  if (yuvInput) {
+    doDitherYUV(pic,depth,imageMode,dest,offset);
+  } else if (rgbInput) {
+    doDitherRGB(pic,depth,imageMode,dest,offset);
+  } else {
+    // any other type is only reported; nothing is drawn
+    cout << "unknown inputType:"<<inputType
+         << " in DitherWrapper::doDither"<<endl;
+  }
+}
+
+
+void DitherWrapper::doDitherRGB(YUVPicture* pic,int depth,int imageMode,
+				unsigned char* dest,int offset) {
+  // Routes an RGB picture to the handler matching its image type.
+  int inputType=pic->getImageType();
+  switch(inputType) {
+  case PICTURE_RGB:
+    doDitherRGB_NORMAL(pic,depth,imageMode,dest,offset);
+    break;
+  case  PICTURE_RGB_FLIPPED:
+    doDitherRGB_FLIPPED(pic,depth,imageMode,dest,offset);
+    break;
+  default:
+    // Report the problem and skip the picture, as doDither() does for an
+    // unknown type, instead of ending the whole process via exit(0).
+    cout << "unknown RGB type:"<<inputType<<" in DitherWrapper"<<endl;
+  }
+}
+
+
+void DitherWrapper::doDitherRGB_NORMAL(YUVPicture* pic,
+				       int depth,int imageMode,
+				       unsigned char* dest,int offset) {
+  // Handles PICTURE_RGB input with the RGB ditherer; its x2 variant is
+  // used when _IMAGE_DOUBLE is set in imageMode.
+  const int width=pic->getWidth();
+  const int height=pic->getHeight();
+  unsigned char* const image=pic->getImagePtr();
+
+  if ((imageMode & _IMAGE_DOUBLE) == 0) {
+    ditherRGB->ditherRGBImage(dest,image,depth,width,height,offset);
+    return;
+  }
+  ditherRGB->ditherRGBImage_x2(dest,image,depth,width,height,offset);
+}
+
+void DitherWrapper::doDitherRGB_FLIPPED(YUVPicture* pic,
+					int depth,int imageMode,
+					unsigned char* dest,int offset) {
+  // PICTURE_RGB_FLIPPED input goes through the row-reversing copier;
+  // imageMode is not consulted here, so there is no double-size variant.
+  const int width=pic->getWidth();
+  const int height=pic->getHeight();
+  unsigned char* const image=pic->getImagePtr();
+
+  ditherRGB_flipped->flipRGBImage(dest,image,depth,width,height,offset);
+}
+
+
+
+void DitherWrapper::doDitherYUV(YUVPicture* pic,int depth,int imageMode,
+				 unsigned char* dest,int offset) {
+  // _IMAGE_DOUBLE in imageMode selects the x2 ditherer.
+  if ((imageMode & _IMAGE_DOUBLE) == 0) {
+    doDither_std(pic,depth,dest,offset);
+    return;
+  }
+  doDither_x2(pic,depth,dest,offset);
+}
+
+
+void DitherWrapper::doDither_std(YUVPicture* pic,int depth,
+				 unsigned char* dest,int offset){
+  // Normal-size YUV dithering: the routine is chosen by output depth and,
+  // for 16, 24 and 32 bit, by whether lmmx is set.
+  const int rows=pic->getHeight();
+  const int cols=pic->getWidth();
+  unsigned char* const lumData=pic->getLuminancePtr();
+  unsigned char* const crData=pic->getCrPtr();
+  unsigned char* const cbData=pic->getCbPtr();
+
+  if (depth == 8) {
+    // the 8 bit ditherer is the only one that takes no offset
+    dither8Bit->ditherImageOrdered(lumData,crData,cbData,dest,rows,cols);
+    return;
+  }
+  if (depth == 16) {
+    if (!lmmx) {
+      dither16Bit->ditherImageColor16(lumData,crData,cbData,dest,
+                                      rows,cols,offset);
+    } else {
+      ditherBlock(lumData,crData,cbData,dest,rows,cols,offset);
+    }
+    return;
+  }
+  if (depth == 24 || depth == 32) {
+    if (!lmmx) {
+      dither32Bit->ditherImageColor32(lumData,crData,cbData,dest,
+                                      rows,cols,offset);
+    } else {
+      dither32_mmx(lumData,crData,cbData,dest,rows,cols,offset);
+    }
+    return;
+  }
+
+  // every other depth is unsupported
+  cout << "cannot dither depth:"<<depth<<endl;
+}
+  
+
+void DitherWrapper::doDither_x2(YUVPicture* pic,int depth,
+				unsigned char* dest,int offset){
+  // Double-size YUV dithering, the x2 counterpart of doDither_std.
+  // The routine is chosen by the output depth.
+  const int rows=pic->getHeight();
+  const int cols=pic->getWidth();
+  unsigned char* const lumData=pic->getLuminancePtr();
+  unsigned char* const crData=pic->getCrPtr();
+  unsigned char* const cbData=pic->getCbPtr();
+
+  switch (depth) {
+  case 8: {
+    // There is no dedicated 8 bit x2 ditherer. The picture is first
+    // dithered at normal size with the ordered 8 bit ditherer and then
+    // enlarged by the RGB ditherer. This is slower, but it works.
+    // To avoid a second buffer the normal-size image is parked inside
+    // dest itself, 3*rows*cols bytes from its start (dest is known to
+    // have room for the double-size image), and enlarged from there.
+    // The last line may get overwritten while doing so.
+    // The enlarging pass is given 0 instead of offset.
+    unsigned char* const staged=dest+3*rows*cols;
+    dither8Bit->ditherImageOrdered(lumData,crData,cbData,staged,rows,cols);
+    ditherRGB->ditherRGBImage_x2(dest,staged,depth,cols,rows,0);
+    break;
+  }
+  case 16:
+    // 16 bit has its own x2 routine; no MMX variant is called here
+    dither16Bit->ditherImageTwox2Color16(lumData,crData,cbData,dest,
+                                         rows,cols,offset);
+    break;
+  case 24:
+  case 32:
+    // The call to dither32x2_mmx is disabled, so the same Dither32Bit
+    // routine runs whether or not lmmx is set.
+    dither32Bit->ditherImageTwox2Color32(lumData,crData,cbData,dest,
+                                         rows,cols,offset);
+    break;
+  default:
+    // every other depth is unsupported
+    cout << "cannot dither depth:" << depth << endl;
+  }
+}
diff --git a/mpeglib/lib/util/render/dither/ditherWrapper.h b/mpeglib/lib/util/render/dither/ditherWrapper.h
new file mode 100644
index 00000000..b01abff8
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherWrapper.h
@@ -0,0 +1,80 @@
+/*
+  wrapper for X11 Window
+  Copyright (C) 1999  Martin Vogt
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation.
+
+  For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#ifndef __DITHERWRAPPER_H
+#define __DITHERWRAPPER_H
+
+ 
+#include "../../mmx/mmx.h"
+
+#include "../yuvPicture.h"
+#include "../imageBase.h"
+#include <stdlib.h>
+#include "ditherMMX.h"
+#include "dither8Bit.h"
+#include "dither16Bit.h"
+#include "dither32Bit.h"
+#include "ditherRGB_flipped.h"
+#include "ditherRGB.h"
+
+
+/**
+   Wraps all calls to software ditherer and the different 
+   resolutions,mmx enhancements, and doublesize ditherers.
+*/
+
+
+class DitherWrapper {
+  // nonzero when the MMX routines are used (set in the constructor)
+  int lmmx;
+  // bpp and the colour masks are stored copies of the constructor arguments
+  int bpp;
+  // colorMask
+  unsigned int redMask;
+  unsigned int greenMask;
+  unsigned int blueMask;
+  // ditherers created in the constructor and deleted in the destructor
+  Dither8Bit* dither8Bit;
+  Dither16Bit* dither16Bit;
+  Dither32Bit* dither32Bit;
+  DitherRGB_flipped* ditherRGB_flipped;
+  DitherRGB* ditherRGB;
+  // NOTE(review): owns raw pointers but is copyable -- confirm it is never copied
+ public:
+  DitherWrapper(int bpp,unsigned int redMask,
+		unsigned int greenMask,unsigned int blueMask,
+		unsigned char pixel[256]);
+  ~DitherWrapper();
+  
+/*    int getDitherSize(); */
+/*    void setDitherSize(int ditherMode); */
+  // Renders pic into dest; the picture's image type selects YUV or RGB handling.
+  void doDither(YUVPicture* pic,int depth,int imageMode,
+		unsigned char* dest,int offset);
+  
+
+ private:
+  void doDitherYUV(YUVPicture* pic,int depth,int imageMode,
+		   unsigned char* dest,int offset);
+  void doDitherRGB(YUVPicture* pic,int depth,int imageMode,
+		   unsigned char* dest,int offset);
+  void doDitherRGB_NORMAL(YUVPicture* pic,int depth,int imageMode,
+			  unsigned char* dest,int offset);
+  void doDitherRGB_FLIPPED(YUVPicture* pic,int depth,int imageMode,
+			   unsigned char* dest,int offset);
+  // YUV helpers: normal size (_std) and double size (_x2)
+  void doDither_std(YUVPicture* pic,int depth,unsigned char* dest,int offset);
+  void doDither_x2(YUVPicture* pic,int depth,unsigned char* dest,int offset);
+};
+
+#endif
diff --git a/mpeglib/lib/util/render/dither/ditherer_mmx16.cpp b/mpeglib/lib/util/render/dither/ditherer_mmx16.cpp
new file mode 100644
index 00000000..757f0676
--- /dev/null
+++ b/mpeglib/lib/util/render/dither/ditherer_mmx16.cpp
@@ -0,0 +1,256 @@
+
+#include "ditherMMX.h"
+
+#include <iostream>
+
+using namespace std;
+		
+#ifndef INTEL
+// nothing
+// Stub for builds without INTEL; parameters are named as in the MMX version.
+void  ditherBlock(unsigned char *lum, unsigned char *cr, unsigned char *cb,
+                  unsigned char *out, int rows, int cols, int mod) {
+  printf("call to ditherBlock. this should never happen\n");
+  printf("check mmx detection routine.\n");
+  exit(1);  // was exit(0): an internal error must not report success
+}
+#else	
+	
+
+static long long MMX16_0 = 0L;                                              // only read by dummy_dithermmx16()
+static unsigned long  MMX16_10w[]         = {0x00100010, 0x00100010};       // used only in commented-out psubw lines
+static unsigned long  MMX16_80w[]         = {0x00800080, 0x00800080};       // subtracted from the unpacked Cb/Cr words
+static unsigned long  MMX16_00FFw[]       = {0x00ff00ff, 0x00ff00ff};       // keeps the low byte of every 16 bit word
+static unsigned short MMX16_Ublucoeff[]   = {0x81, 0x81, 0x81, 0x81};       // Cb -> blue multiplier
+static unsigned short MMX16_Vredcoeff[]   = {0x66, 0x66, 0x66, 0x66};       // Cr -> red multiplier
+static unsigned short MMX16_Ugrncoeff[]   = {0xffe8, 0xffe8, 0xffe8, 0xffe8}; // Cb -> green multiplier
+static unsigned short MMX16_Vgrncoeff[]   = {0xffcd, 0xffcd, 0xffcd, 0xffcd}; // Cr -> green multiplier
+static unsigned short MMX16_Ycoeff[]      = {0x4a, 0x4a, 0x4a, 0x4a};       // luminance multiplier
+static unsigned short MMX16_redmask[]     = {0xf800, 0xf800, 0xf800, 0xf800}; // top 5 bits of each 16 bit word
+static unsigned short MMX16_grnmask[]     = {0x7e0, 0x7e0, 0x7e0, 0x7e0};   // bits 5..10 of each 16 bit word
+
+void dummy_dithermmx16() {   // dumps MMX16_0 and the MMX16_* table addresses
+  cout << "MMX16_0"<<MMX16_0<<endl
+       << "MMX16_10w:"<<MMX16_10w<<endl
+       << "MMX16_80w:"<<MMX16_80w<<endl
+       << "MMX16_Ublucoeff:"<<MMX16_Ublucoeff<<endl
+       << "MMX16_Vredcoeff:"<<MMX16_Vredcoeff<<endl
+       << "MMX16_Ugrncoeff:"<<MMX16_Ugrncoeff<<endl
+       << "MMX16_Vgrncoeff:"<<MMX16_Vgrncoeff<<endl
+       << "MMX16_Ycoeff:"<<MMX16_Ycoeff<<endl
+       << "MMX16_redmask:"<<MMX16_redmask<<endl
+       << "MMX16_grnmask:"<<MMX16_grnmask<<endl
+       << "MMX16_00FFw:"<<MMX16_00FFw<<endl;
+}
+
+
+void  ditherBlock(unsigned char *lum, 
+		  unsigned char *cr, 
+		  unsigned char *cb,
+		  unsigned char *out,
+		  int rows, 
+		  int cols, 
+		  int mod) {
+    // MMX version: converts lum/cr/cb into 16 bit pixels written to out.
+    unsigned short *row1;
+    unsigned short *row2;
+    row1 = (unsigned short* )out;         // 16 bit target
+    // Each inner pass reads 8 luminance bytes from two rows plus 4 Cr and 4 Cb.
+    unsigned char* end = lum +cols*rows;    // Pointer to the end
+    int x=cols;
+    row2=row1+mod+cols;                   // start of second row 
+    mod=2*cols+4*mod;                     // increment for row1 in byte
+
+    // buffer for asm function
+    int buf[6];
+    buf[0]=(int)(lum+cols);   // lum2 pointer
+    buf[1]=(int)end;     
+    buf[2]=x;
+    buf[3]=mod;     
+    buf[4]=0; //tmp0;
+    buf[5]=cols;
+    // The asm addresses buf as 0%5 (lum2), 4%5 (end), 8%5 (x), 12%5 (mod),
+    // 16%5 (tmp0) and 20%5 (cols).
+    // NOTE(review): pointers kept in int and input operands %0-%4 modified -- confirm.
+    __asm__ __volatile__(
+         ".align 32\n"
+         "1:\n"
+         "movd           (%1),                   %%mm0\n"        // 4 Cb         0  0  0  0 u3 u2 u1 u0
+         "pxor           %%mm7,                  %%mm7\n"
+         "movd           (%0),                   %%mm1\n" // 4 Cr                0  0  0  0 v3 v2 v1 v0
+         "punpcklbw      %%mm7,                  %%mm0\n" // 4 W cb   0 u3  0 u2  0 u1  0 u0
+         "punpcklbw      %%mm7,                  %%mm1\n" // 4 W cr   0 v3  0 v2  0 v1  0 v0
+         "psubw          MMX16_80w,                %%mm0\n"
+         "psubw          MMX16_80w,                %%mm1\n"
+         "movq           %%mm0,                  %%mm2\n"        // Cb                   0 u3  0 u2  0 u1  0 u0
+         "movq           %%mm1,                  %%mm3\n" // Cr
+         "pmullw         MMX16_Ugrncoeff,          %%mm2\n" // Cb2green 0 R3  0 R2  0 R1  0 R0
+         "movq           (%2),                   %%mm6\n"        // L1      l7 L6 L5 L4 L3 L2 L1 L0
+         "pmullw         MMX16_Ublucoeff,          %%mm0\n" // Cb2blue
+         "pand           MMX16_00FFw,              %%mm6\n" // L1      00 L6 00 L4 00 L2 00 L0
+         "pmullw         MMX16_Vgrncoeff,          %%mm3\n" // Cr2green
+         "movq           (%2),                   %%mm7\n" // L2
+         "pmullw         MMX16_Vredcoeff,          %%mm1\n" // Cr2red
+         //                      "psubw          MMX16_10w,                %%mm6\n"
+         "psrlw          $8,                     %%mm7\n"        // L2           00 L7 00 L5 00 L3 00 L1
+         "pmullw         MMX16_Ycoeff,             %%mm6\n" // lum1
+         //                      "psubw          MMX16_10w,                %%mm7\n" // L2
+         "paddw          %%mm3,                  %%mm2\n" // Cb2green + Cr2green == green
+         "pmullw         MMX16_Ycoeff,             %%mm7\n"  // lum2
+
+         "movq           %%mm6,                  %%mm4\n"  // lum1
+         "paddw          %%mm0,                  %%mm6\n"  // lum1 +blue 00 B6 00 B4 00 B2 00 B0
+         "movq           %%mm4,                  %%mm5\n"  // lum1
+         "paddw          %%mm1,                  %%mm4\n"  // lum1 +red  00 R6 00 R4 00 R2 00 R0
+         "paddw          %%mm2,                  %%mm5\n"  // lum1 +green 00 G6 00 G4 00 G2 00 G0
+         "psraw          $6,                     %%mm4\n"  // R1 0 .. 64
+         "movq           %%mm7,                  %%mm3\n"  // lum2                       00 L7 00 L5 00 L3 00 L1
+         "psraw          $6,                     %%mm5\n"  // G1  - .. +
+         "paddw          %%mm0,                  %%mm7\n"  // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
+         "psraw          $6,                     %%mm6\n"  // B1         0 .. 64
+         "packuswb       %%mm4,                  %%mm4\n"  // R1 R1
+         "packuswb       %%mm5,                  %%mm5\n"  // G1 G1
+         "packuswb       %%mm6,                  %%mm6\n"  // B1 B1
+         "punpcklbw      %%mm4,                  %%mm4\n"
+         "punpcklbw      %%mm5,                  %%mm5\n"
+
+         "pand           MMX16_redmask,            %%mm4\n"
+         "psllw          $3,                     %%mm5\n"  // GREEN       1
+         "punpcklbw      %%mm6,                  %%mm6\n"
+         "pand           MMX16_grnmask,            %%mm5\n"
+         "pand           MMX16_redmask,            %%mm6\n"
+         "por            %%mm5,                  %%mm4\n" //
+         "psrlw          $11,                    %%mm6\n"                // BLUE        1
+         "movq           %%mm3,                  %%mm5\n" // lum2
+         "paddw          %%mm1,                  %%mm3\n"        // lum2 +red      00 R7 00 R5 00 R3 00 R1
+         "paddw          %%mm2,                  %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
+         "psraw          $6,                     %%mm3\n" // R2
+         "por            %%mm6,                  %%mm4\n" // MM4
+         "psraw          $6,                     %%mm5\n" // G2
+
+	 "movl %2,16%5\n"               // store register in tmp0
+	 "movl %5,%2\n"                 // lum2->register
+	 "movq (%2),%%mm6\n"            // 0  0  0  0 L3 L2 L1 L0 (load lum2)
+
+
+         //"movq           (%2, %5),               %%mm6\n" // L3 load lum2
+         "psraw          $6,                     %%mm7\n"
+         "packuswb       %%mm3,                  %%mm3\n"
+         "packuswb       %%mm5,                  %%mm5\n"
+         "packuswb       %%mm7,                  %%mm7\n"
+         "pand                   MMX16_00FFw,              %%mm6\n"  // L3
+         "punpcklbw      %%mm3,                  %%mm3\n"
+         //                              "psubw          MMX16_10w,                        %%mm6\n"  // L3
+         "punpcklbw      %%mm5,                  %%mm5\n"
+         "pmullw         MMX16_Ycoeff,             %%mm6\n"  // lum3
+         "punpcklbw      %%mm7,                  %%mm7\n"
+         "psllw          $3,                             %%mm5\n"  // GREEN 2
+         "pand                   MMX16_redmask,    %%mm7\n"
+         "pand                   MMX16_redmask,    %%mm3\n"
+         "psrlw          $11,                            %%mm7\n"  // BLUE  2
+         "pand                   MMX16_grnmask,    %%mm5\n"
+         "por                    %%mm7,                  %%mm3\n"
+	 
+         "movq                   (%2),        %%mm7\n"  // L4 load lum2
+	 "movl 16%5,%2\n"               // tmp0->register
+
+         "por                    %%mm5,                  %%mm3\n"     //
+         "psrlw          $8,                             %%mm7\n"    // L4
+         "movq                   %%mm4,                  %%mm5\n"
+         //                              "psubw          MMX16_10w,                        %%mm7\n"                // L4
+         "punpcklwd      %%mm3,                  %%mm4\n"
+         "pmullw         MMX16_Ycoeff,             %%mm7\n"    // lum4
+         "punpckhwd      %%mm3,                  %%mm5\n"
+
+         "movq                   %%mm4,                  (%3)\n" // write row1
+         "movq                   %%mm5,                  8(%3)\n" // write row1
+
+         "movq                   %%mm6,                  %%mm4\n"        // Lum3
+         "paddw          %%mm0,                  %%mm6\n"                // Lum3 +blue
+
+         "movq                   %%mm4,                  %%mm5\n"                        // Lum3
+         "paddw          %%mm1,                  %%mm4\n"       // Lum3 +red
+         "paddw          %%mm2,                  %%mm5\n"                        // Lum3 +green
+         "psraw          $6,                             %%mm4\n"
+         "movq                   %%mm7,                  %%mm3\n"                        // Lum4
+         "psraw          $6,                             %%mm5\n"
+         "paddw          %%mm0,                  %%mm7\n"                   // Lum4 +blue
+         "psraw          $6,                             %%mm6\n"                        // Lum3 +blue
+         "movq                   %%mm3,                  %%mm0\n"  // Lum4
+         "packuswb       %%mm4,                  %%mm4\n"
+         "paddw          %%mm1,                  %%mm3\n"  // Lum4 +red
+         "packuswb       %%mm5,                  %%mm5\n"
+         "paddw          %%mm2,                  %%mm0\n"         // Lum4 +green
+         "packuswb       %%mm6,                  %%mm6\n"
+         "punpcklbw      %%mm4,                  %%mm4\n"
+         "punpcklbw      %%mm5,                  %%mm5\n"
+         "punpcklbw      %%mm6,                  %%mm6\n"
+         "psllw          $3,                             %%mm5\n" // GREEN 3
+         "pand                   MMX16_redmask,    %%mm4\n"
+         "psraw          $6,                             %%mm3\n" // psr 6
+         "psraw          $6,                             %%mm0\n"
+         "pand                   MMX16_redmask,    %%mm6\n" // BLUE
+         "pand                   MMX16_grnmask,    %%mm5\n"
+         "psrlw          $11,                            %%mm6\n"  // BLUE  3
+         "por                    %%mm5,                  %%mm4\n"
+         "psraw          $6,                             %%mm7\n"
+         "por                    %%mm6,                  %%mm4\n"
+         "packuswb       %%mm3,                  %%mm3\n"
+         "packuswb       %%mm0,                  %%mm0\n"
+         "packuswb       %%mm7,                  %%mm7\n"
+         "punpcklbw      %%mm3,                  %%mm3\n"
+         "punpcklbw      %%mm0,                  %%mm0\n"
+         "punpcklbw      %%mm7,                  %%mm7\n"
+         "pand                   MMX16_redmask,    %%mm3\n"
+         "pand                   MMX16_redmask,    %%mm7\n" // BLUE
+         "psllw          $3,                             %%mm0\n" // GREEN 4
+         "psrlw          $11,                            %%mm7\n"
+         "pand                   MMX16_grnmask,    %%mm0\n"
+         "por                    %%mm7,                  %%mm3\n"
+         "por                    %%mm0,                  %%mm3\n"
+
+         "movq                   %%mm4,                  %%mm5\n"
+
+         "punpcklwd      %%mm3,                  %%mm4\n"
+         "punpckhwd      %%mm3,                  %%mm5\n"
+
+         "movq                   %%mm4,                  (%4)\n"
+	 "movq                   %%mm5,                  8(%4)\n"
+
+         "subl      $8, 8%5\n" // x-=8
+	 "addl      $8, %5\n"            // lum2+8
+         "addl      $8, %2\n"
+         "addl      $4, %0\n"
+         "addl      $4, %1\n"
+         "cmpl      $0, 8%5\n"
+         "leal  16(%3), %3\n"
+	 "leal  16(%4), %4\n"        // row2+16
+
+
+         "jne            1b\n"
+	 "addl           20%5,   %2\n" // lum += cols 
+
+	 "movl %2,16%5\n"              // store register in tmp0
+	 "movl 20%5,%2\n"              // cols->register
+
+	 "addl           %2,     %5\n" // lum2 += cols 
+	 "addl           12%5,   %3\n" // row1+= mod
+	 "addl           12%5,   %4\n" // row2+= mod
+	 "movl           %2,     8%5\n" // x=cols
+	 "movl 16%5,%2\n"              // store tmp0 in register
+
+	 "cmpl           4%5,    %2\n"
+	 "jl             1b\n"
+
+         :
+         :"r" (cr), "r"(cb),"r"(lum),
+	 "r"(row1),"r"(row2),"m"(buf[0])
+
+         );
+      __asm__ (
+         "emms\n"
+         );
+
+   }
+
+#endif