ADVANCE_RING();
        }
 
+       /* flus cache and wait idle clean after cliprect change */
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
+       OUT_RING(R300_RB3D_DC_FLUSH);
+       ADVANCE_RING();
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+       OUT_RING(RADEON_WAIT_3D_IDLECLEAN);
+       ADVANCE_RING();
+       /* set flush flag */
+       dev_priv->track_flush |= RADEON_FLUSH_EMITED;
+
        return 0;
 }
 
        ADD_RANGE(0x21DC, 1);
        ADD_RANGE(R300_VAP_UNKNOWN_221C, 1);
        ADD_RANGE(R300_VAP_CLIP_X_0, 4);
-       ADD_RANGE(R300_VAP_PVS_WAITIDLE, 1);
+       ADD_RANGE(R300_VAP_PVS_STATE_FLUSH_REG, 1);
        ADD_RANGE(R300_VAP_UNKNOWN_2288, 1);
        ADD_RANGE(R300_VAP_OUTPUT_VTX_FMT_0, 2);
        ADD_RANGE(R300_VAP_PVS_CNTL_1, 3);
        ADD_RANGE(R300_GB_ENABLE, 1);
        ADD_RANGE(R300_GB_MSPOS0, 5);
-       ADD_RANGE(R300_TX_CNTL, 1);
+       ADD_RANGE(R300_TX_INVALTAGS, 1);
        ADD_RANGE(R300_TX_ENABLE, 1);
        ADD_RANGE(0x4200, 4);
        ADD_RANGE(0x4214, 1);
        if (sz * 16 > cmdbuf->bufsz)
                return -EINVAL;
 
-       BEGIN_RING(5 + sz * 4);
-       /* Wait for VAP to come to senses.. */
-       /* there is no need to emit it multiple times, (only once before VAP is programmed,
-          but this optimization is for later */
-       OUT_RING_REG(R300_VAP_PVS_WAITIDLE, 0);
+       /* VAP is very sensitive so we purge cache before we program it
+        * and we also flush its state before & after */
+       BEGIN_RING(6);
+       OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
+       OUT_RING(R300_RB3D_DC_FLUSH);
+       OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+       OUT_RING(RADEON_WAIT_3D_IDLECLEAN);
+       OUT_RING(CP_PACKET0(R300_VAP_PVS_STATE_FLUSH_REG, 0));
+       OUT_RING(0);
+       ADVANCE_RING();
+       /* set flush flag */
+       dev_priv->track_flush |= RADEON_FLUSH_EMITED;
+
+       BEGIN_RING(3 + sz * 4);
        OUT_RING_REG(R300_VAP_PVS_UPLOAD_ADDRESS, addr);
        OUT_RING(CP_PACKET0_TABLE(R300_VAP_PVS_UPLOAD_DATA, sz * 4 - 1));
        OUT_RING_TABLE((int *)cmdbuf->buf, sz * 4);
+       ADVANCE_RING();
 
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(R300_VAP_PVS_STATE_FLUSH_REG, 0));
+       OUT_RING(0);
        ADVANCE_RING();
 
        cmdbuf->buf += sz * 16;
        OUT_RING_TABLE((int *)cmdbuf->buf, 8);
        ADVANCE_RING();
 
+       BEGIN_RING(4);
+       OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
+       OUT_RING(R300_RB3D_DC_FLUSH);
+       OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+       OUT_RING(RADEON_WAIT_3D_IDLECLEAN);
+       ADVANCE_RING();
+       /* set flush flag */
+       dev_priv->track_flush |= RADEON_FLUSH_EMITED;
+
        cmdbuf->buf += 8 * 4;
        cmdbuf->bufsz -= 8 * 4;
 
        case RADEON_CNTL_BITBLT_MULTI:
                return r300_emit_bitblt_multi(dev_priv, cmdbuf);
 
-       case RADEON_CP_INDX_BUFFER:     /* DRAW_INDX_2 without INDX_BUFFER seems to lock up the gpu */
+       case RADEON_CP_INDX_BUFFER:
+               /* DRAW_INDX_2 without INDX_BUFFER seems to lock up the gpu */
                return r300_emit_indx_buffer(dev_priv, cmdbuf);
-       case RADEON_CP_3D_DRAW_IMMD_2:  /* triggers drawing using in-packet vertex data */
-       case RADEON_CP_3D_DRAW_VBUF_2:  /* triggers drawing of vertex buffers setup elsewhere */
-       case RADEON_CP_3D_DRAW_INDX_2:  /* triggers drawing using indices to vertex buffer */
+       case RADEON_CP_3D_DRAW_IMMD_2:
+               /* triggers drawing using in-packet vertex data */
+       case RADEON_CP_3D_DRAW_VBUF_2:
+               /* triggers drawing of vertex buffers setup elsewhere */
+       case RADEON_CP_3D_DRAW_INDX_2:
+               /* triggers drawing using indices to vertex buffer */
+               /* whenever we send vertex we clear flush & purge */
+               dev_priv->track_flush &= ~(RADEON_FLUSH_EMITED |
+                                          RADEON_PURGE_EMITED);
+               break;
        case RADEON_WAIT_FOR_IDLE:
        case RADEON_CP_NOP:
                /* these packets are safe */
  */
 static __inline__ void r300_pacify(drm_radeon_private_t *dev_priv)
 {
+       uint32_t cache_z, cache_3d, cache_2d;
        RING_LOCALS;
+       
+       cache_z = R300_ZC_FLUSH;
+       cache_2d = R300_RB2D_DC_FLUSH;
+       cache_3d = R300_RB3D_DC_FLUSH;
+       if (!(dev_priv->track_flush & RADEON_PURGE_EMITED)) {
+               /* we can purge, primitive where draw since last purge */
+               cache_z |= R300_ZC_FREE;
+               cache_2d |= R300_RB2D_DC_FREE;
+               cache_3d |= R300_RB3D_DC_FREE;
+       }
 
-       BEGIN_RING(6);
-       OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
-       OUT_RING(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+       /* flush & purge zbuffer */
+       BEGIN_RING(2);
        OUT_RING(CP_PACKET0(R300_ZB_ZCACHE_CTLSTAT, 0));
-       OUT_RING(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE|
-                R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
-       OUT_RING(CP_PACKET3(RADEON_CP_NOP, 0));
-       OUT_RING(0x0);
+       OUT_RING(cache_z);
+       ADVANCE_RING();
+       /* flush & purge 3d */
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
+       OUT_RING(cache_3d);
+       ADVANCE_RING();
+       /* flush & purge texture */
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(R300_TX_INVALTAGS, 0));
+       OUT_RING(0);
+       ADVANCE_RING();
+       /* FIXME: is this one really needed ? */
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(R300_RB3D_AARESOLVE_CTL, 0));
+       OUT_RING(0);
+       ADVANCE_RING();
+       BEGIN_RING(2);
+       OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+       OUT_RING(RADEON_WAIT_3D_IDLECLEAN);
+       ADVANCE_RING();
+       /* flush & purge 2d through E2 as RB2D will trigger lockup */
+       BEGIN_RING(4);
+       OUT_RING(CP_PACKET0(R300_DSTCACHE_CTLSTAT, 0));
+       OUT_RING(cache_2d);
+       OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+       OUT_RING(RADEON_WAIT_2D_IDLECLEAN |
+                RADEON_WAIT_HOST_IDLECLEAN);
        ADVANCE_RING();
+       /* set flush & purge flags */
+       dev_priv->track_flush |= RADEON_FLUSH_EMITED | RADEON_PURGE_EMITED;
 }
 
 /**
 
        DRM_DEBUG("\n");
 
-       /* See the comment above r300_emit_begin3d for why this call must be here,
-        * and what the cleanup gotos are for. */
+       /* pacify */
        r300_pacify(dev_priv);
 
        if (cmdbuf->nbox <= R300_SIMULTANEOUS_CLIPRECTS) {
 
 #define RADEON_FIFO_DEBUG      0
 
 static int radeon_do_cleanup_cp(struct drm_device * dev);
+static void radeon_do_cp_start(drm_radeon_private_t * dev_priv);
 
 static u32 R500_READ_MCIND(drm_radeon_private_t *dev_priv, int addr)
 {
                        DRM_UDELAY(1);
                }
        } else {
-               /* 3D */
-               tmp = RADEON_READ(R300_RB3D_DSTCACHE_CTLSTAT);
-               tmp |= RADEON_RB3D_DC_FLUSH_ALL;
-               RADEON_WRITE(R300_RB3D_DSTCACHE_CTLSTAT, tmp);
-
-               /* 2D */
-               tmp = RADEON_READ(R300_DSTCACHE_CTLSTAT);
-               tmp |= RADEON_RB3D_DC_FLUSH_ALL;
-               RADEON_WRITE(R300_DSTCACHE_CTLSTAT, tmp);
-
-               for (i = 0; i < dev_priv->usec_timeout; i++) {
-                       if (!(RADEON_READ(R300_DSTCACHE_CTLSTAT)
-                         & RADEON_RB3D_DC_BUSY)) {
-                               return 0;
-                       }
-                       DRM_UDELAY(1);
-               }
+               /* don't flush or purge cache here or lockup */
+               return 0;
        }
 
 #if RADEON_FIFO_DEBUG
                        return 0;
                DRM_UDELAY(1);
        }
+       DRM_INFO("wait for fifo failed status : 0x%08X 0x%08X\n",
+                RADEON_READ(RADEON_RBBM_STATUS),
+                RADEON_READ(R300_VAP_CNTL_STATUS));
 
 #if RADEON_FIFO_DEBUG
        DRM_ERROR("failed!\n");
                }
                DRM_UDELAY(1);
        }
+       DRM_INFO("wait idle failed status : 0x%08X 0x%08X\n",
+                RADEON_READ(RADEON_RBBM_STATUS),
+                RADEON_READ(R300_VAP_CNTL_STATUS));
 
 #if RADEON_FIFO_DEBUG
        DRM_ERROR("failed!\n");
 
        dev_priv->cp_running = 1;
 
-       BEGIN_RING(6);
-
+       BEGIN_RING(8);
+       /* isync can only be written through cp on r5xx write it here */
+       OUT_RING(CP_PACKET0(RADEON_ISYNC_CNTL, 0));
+       OUT_RING(RADEON_ISYNC_ANY2D_IDLE3D |
+                RADEON_ISYNC_ANY3D_IDLE2D |
+                RADEON_ISYNC_WAIT_IDLEGUI |
+                RADEON_ISYNC_CPSCRATCH_IDLEGUI);
        RADEON_PURGE_CACHE();
        RADEON_PURGE_ZCACHE();
        RADEON_WAIT_UNTIL_IDLE();
-
        ADVANCE_RING();
        COMMIT_RING();
+
+       dev_priv->track_flush |= RADEON_FLUSH_EMITED | RADEON_PURGE_EMITED;
 }
 
 /* Reset the Command Processor.  This will not flush any pending
 
        struct drm_file *file_priv;
 };
 
+#define RADEON_FLUSH_EMITED    (1 < 0)
+#define RADEON_PURGE_EMITED    (1 < 1)
+
 typedef struct drm_radeon_private {
        drm_radeon_ring_buffer_t ring;
        drm_radeon_sarea_t *sarea_priv;
        unsigned long fb_aper_offset;
 
        int num_gb_pipes;
+       int track_flush;
 } drm_radeon_private_t;
 
 typedef struct drm_radeon_buf_priv {
 #define R300_ZB_ZCACHE_CTLSTAT                  0x4f18
 #      define R300_ZC_FLUSH                    (1 << 0)
 #      define R300_ZC_FREE                     (1 << 1)
-#      define R300_ZC_FLUSH_ALL                0x3
 #      define R300_ZC_BUSY                     (1 << 31)
 #define RADEON_RB3D_DSTCACHE_CTLSTAT   0x325c
 #      define RADEON_RB3D_DC_FLUSH             (3 << 0)
 #      define RADEON_RB3D_DC_FLUSH_ALL         0xf
 #      define RADEON_RB3D_DC_BUSY              (1 << 31)
 #define R300_RB3D_DSTCACHE_CTLSTAT              0x4e4c
+#      define R300_RB3D_DC_FLUSH               (2 << 0)
+#      define R300_RB3D_DC_FREE                (2 << 2)
 #      define R300_RB3D_DC_FINISH              (1 << 4)
 #define RADEON_RB3D_ZSTENCILCNTL       0x1c2c
 #      define RADEON_Z_TEST_MASK               (7 << 4)
                OUT_RING(RADEON_RB3D_DC_FLUSH);                         \
        } else {                                                        \
                OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));    \
-               OUT_RING(RADEON_RB3D_DC_FLUSH);                         \
+               OUT_RING(R300_RB3D_DC_FLUSH);                           \
        }                                                               \
 } while (0)
 
 #define RADEON_PURGE_CACHE() do {                                      \
        if ((dev_priv->flags & RADEON_FAMILY_MASK) <= CHIP_RV280) {     \
                OUT_RING(CP_PACKET0(RADEON_RB3D_DSTCACHE_CTLSTAT, 0));  \
-               OUT_RING(RADEON_RB3D_DC_FLUSH_ALL);                     \
+               OUT_RING(RADEON_RB3D_DC_FLUSH | RADEON_RB3D_DC_FREE);   \
        } else {                                                        \
                OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));    \
-               OUT_RING(RADEON_RB3D_DC_FLUSH_ALL);                     \
+               OUT_RING(R300_RB3D_DC_FLUSH | R300_RB3D_DC_FREE);       \
        }                                                               \
 } while (0)
 
 #define RADEON_PURGE_ZCACHE() do {                                     \
        if ((dev_priv->flags & RADEON_FAMILY_MASK) <= CHIP_RV280) {     \
                OUT_RING(CP_PACKET0(RADEON_RB3D_ZCACHE_CTLSTAT, 0));    \
-               OUT_RING(RADEON_RB3D_ZC_FLUSH_ALL);                     \
+               OUT_RING(RADEON_RB3D_ZC_FLUSH | RADEON_RB3D_ZC_FREE);                   \
        } else {                                                        \
-               OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));    \
-               OUT_RING(R300_ZC_FLUSH_ALL);                            \
+               OUT_RING(CP_PACKET0(R300_ZB_ZCACHE_CTLSTAT, 0));        \
+               OUT_RING(R300_ZC_FLUSH | R300_ZC_FREE);                         \
        }                                                               \
 } while (0)