#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
 #include <linux/kernel.h>
+#include <linux/math64.h>
 #include <linux/media-bus-format.h>
 #include <linux/minmax.h>
 #include <linux/module.h>
        u32 frs;        /* PLL Freqency range for HSCK (post divider) */
 
        u32 dsiclk;     /* pll_clk / 2 */
+       u32 pclk;       /* incoming pclk rate */
 };
 
 static inline struct tc358768_priv *dsi_host_to_tc358768(struct mipi_dsi_host
        priv->prd = best_prd;
        priv->frs = frs;
        priv->dsiclk = best_pll / 2;
+       priv->pclk = mode->clock * 1000;
 
        return 0;
 }
        return ps / 1000;
 }
 
+static u32 tc358768_dpi_to_ns(u32 val, u32 pclk)
+{
+       return (u32)div_u64((u64)val * NANO, pclk);
+}
+
+/* Convert value in DPI pixel clock units to DSI byte count */
+static u32 tc358768_dpi_to_dsi_bytes(struct tc358768_priv *priv, u32 val)
+{
+       u64 m = (u64)val * priv->dsiclk / 4 * priv->dsi_lanes;
+       u64 n = priv->pclk;
+
+       return (u32)div_u64(m + n - 1, n);
+}
+
+static u32 tc358768_dsi_bytes_to_ns(struct tc358768_priv *priv, u32 val)
+{
+       u64 m = (u64)val * NANO;
+       u64 n = priv->dsiclk / 4 * priv->dsi_lanes;
+
+       return (u32)div_u64(m, n);
+}
+
 static void tc358768_bridge_pre_enable(struct drm_bridge *bridge)
 {
        struct tc358768_priv *priv = bridge_to_tc358768(bridge);
        s32 raw_val;
        const struct drm_display_mode *mode;
        u32 hsbyteclk_ps, dsiclk_ps, ui_ps;
-       u32 dsiclk, hsbyteclk, video_start;
-       const u32 internal_delay = 40;
+       u32 dsiclk, hsbyteclk;
        int ret, i;
        struct videomode vm;
        struct device *dev = priv->dev;
+       /* In pixelclock units */
+       u32 dpi_htot, dpi_data_start;
+       /* In byte units */
+       u32 dsi_dpi_htot, dsi_dpi_data_start;
+       u32 dsi_hsw, dsi_hbp, dsi_hact, dsi_hfp;
+       const u32 dsi_hss = 4; /* HSS is a short packet (4 bytes) */
+       /* In hsbyteclk units */
+       u32 dsi_vsdly;
+       const u32 internal_dly = 40;
 
        if (mode_flags & MIPI_DSI_CLOCK_NON_CONTINUOUS) {
                dev_warn_once(dev, "Non-continuous mode unimplemented, falling back to continuous\n");
        case MIPI_DSI_FMT_RGB888:
                val |= (0x3 << 4);
                hact = vm.hactive * 3;
-               video_start = (vm.hsync_len + vm.hback_porch) * 3;
                data_type = MIPI_DSI_PACKED_PIXEL_STREAM_24;
                break;
        case MIPI_DSI_FMT_RGB666:
                val |= (0x4 << 4);
                hact = vm.hactive * 3;
-               video_start = (vm.hsync_len + vm.hback_porch) * 3;
                data_type = MIPI_DSI_PACKED_PIXEL_STREAM_18;
                break;
 
        case MIPI_DSI_FMT_RGB666_PACKED:
                val |= (0x4 << 4) | BIT(3);
                hact = vm.hactive * 18 / 8;
-               video_start = (vm.hsync_len + vm.hback_porch) * 18 / 8;
                data_type = MIPI_DSI_PIXEL_STREAM_3BYTE_18;
                break;
 
        case MIPI_DSI_FMT_RGB565:
                val |= (0x5 << 4);
                hact = vm.hactive * 2;
-               video_start = (vm.hsync_len + vm.hback_porch) * 2;
                data_type = MIPI_DSI_PACKED_PIXEL_STREAM_16;
                break;
        default:
                return;
        }
 
+       /*
+        * There are three important things to make TC358768 work correctly,
+        * which are not trivial to manage:
+        *
+        * 1. Keep the DPI line-time and the DSI line-time as close to each
+        *    other as possible.
+        * 2. TC358768 goes to LP mode after each line's active area. The DSI
+        *    HFP period has to be long enough for entering and exiting LP mode.
+        *    But it is not clear how to calculate this.
+        * 3. VSDly (video start delay) has to be long enough to ensure that the
+        *    DSI TX does not start transmitting until we have started receiving
+        *    pixel data from the DPI input. It is not clear how to calculate
+        *    this either.
+        */
+
+       dpi_htot = vm.hactive + vm.hfront_porch + vm.hsync_len + vm.hback_porch;
+       dpi_data_start = vm.hsync_len + vm.hback_porch;
+
+       dev_dbg(dev, "dpi horiz timing (pclk): %u + %u + %u + %u = %u\n",
+               vm.hsync_len, vm.hback_porch, vm.hactive, vm.hfront_porch,
+               dpi_htot);
+
+       dev_dbg(dev, "dpi horiz timing (ns): %u + %u + %u + %u = %u\n",
+               tc358768_dpi_to_ns(vm.hsync_len, vm.pixelclock),
+               tc358768_dpi_to_ns(vm.hback_porch, vm.pixelclock),
+               tc358768_dpi_to_ns(vm.hactive, vm.pixelclock),
+               tc358768_dpi_to_ns(vm.hfront_porch, vm.pixelclock),
+               tc358768_dpi_to_ns(dpi_htot, vm.pixelclock));
+
+       dev_dbg(dev, "dpi data start (ns): %u + %u = %u\n",
+               tc358768_dpi_to_ns(vm.hsync_len, vm.pixelclock),
+               tc358768_dpi_to_ns(vm.hback_porch, vm.pixelclock),
+               tc358768_dpi_to_ns(dpi_data_start, vm.pixelclock));
+
+       dsi_dpi_htot = tc358768_dpi_to_dsi_bytes(priv, dpi_htot);
+       dsi_dpi_data_start = tc358768_dpi_to_dsi_bytes(priv, dpi_data_start);
+
+       if (dsi_dev->mode_flags & MIPI_DSI_MODE_VIDEO_SYNC_PULSE) {
+               dsi_hsw = tc358768_dpi_to_dsi_bytes(priv, vm.hsync_len);
+               dsi_hbp = tc358768_dpi_to_dsi_bytes(priv, vm.hback_porch);
+       } else {
+               /* HBP is included in HSW in event mode */
+               dsi_hbp = 0;
+               dsi_hsw = tc358768_dpi_to_dsi_bytes(priv,
+                                                   vm.hsync_len +
+                                                   vm.hback_porch);
+
+               /*
+                * The pixel packet includes the actual pixel data, and:
+                * DSI packet header = 4 bytes
+                * DCS code = 1 byte
+                * DSI packet footer = 2 bytes
+                */
+               dsi_hact = hact + 4 + 1 + 2;
+
+               dsi_hfp = dsi_dpi_htot - dsi_hact - dsi_hsw - dsi_hss;
+
+               /*
+                * Here we should check if HFP is long enough for entering LP
+                * and exiting LP, but it's not clear how to calculate that.
+                * Instead, this is a naive algorithm that just adjusts the HFP
+                * and HSW so that HFP is (at least) roughly 2/3 of the total
+                * blanking time.
+                */
+               if (dsi_hfp < (dsi_hfp + dsi_hsw + dsi_hss) * 2 / 3) {
+                       u32 old_hfp = dsi_hfp;
+                       u32 old_hsw = dsi_hsw;
+                       u32 tot = dsi_hfp + dsi_hsw + dsi_hss;
+
+                       dsi_hsw = tot / 3;
+
+                       /*
+                        * Seems like sometimes HSW has to be divisible by num-lanes, but
+                        * not always...
+                        */
+                       dsi_hsw = roundup(dsi_hsw, priv->dsi_lanes);
+
+                       dsi_hfp = dsi_dpi_htot - dsi_hact - dsi_hsw - dsi_hss;
+
+                       dev_dbg(dev,
+                               "hfp too short, adjusting dsi hfp and dsi hsw from %u, %u to %u, %u\n",
+                               old_hfp, old_hsw, dsi_hfp, dsi_hsw);
+               }
+
+               dev_dbg(dev,
+                       "dsi horiz timing (bytes): %u, %u + %u + %u + %u = %u\n",
+                       dsi_hss, dsi_hsw, dsi_hbp, dsi_hact, dsi_hfp,
+                       dsi_hss + dsi_hsw + dsi_hbp + dsi_hact + dsi_hfp);
+
+               dev_dbg(dev, "dsi horiz timing (ns): %u + %u + %u + %u + %u = %u\n",
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hss),
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hsw),
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hbp),
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hact),
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hfp),
+                       tc358768_dsi_bytes_to_ns(priv, dsi_hss + dsi_hsw +
+                                                dsi_hbp + dsi_hact + dsi_hfp));
+       }
+
+       /* VSDly calculation */
+
+       /* Start with the HW internal delay */
+       dsi_vsdly = internal_dly;
+
+       /* Convert to byte units as the other variables are in byte units */
+       dsi_vsdly *= priv->dsi_lanes;
+
+       /* Do we need more delay, in addition to the internal? */
+       if (dsi_dpi_data_start > dsi_vsdly + dsi_hss + dsi_hsw + dsi_hbp) {
+               dsi_vsdly = dsi_dpi_data_start - dsi_hss - dsi_hsw - dsi_hbp;
+               dsi_vsdly = roundup(dsi_vsdly, priv->dsi_lanes);
+       }
+
+       dev_dbg(dev, "dsi data start (bytes) %u + %u + %u + %u = %u\n",
+               dsi_vsdly, dsi_hss, dsi_hsw, dsi_hbp,
+               dsi_vsdly + dsi_hss + dsi_hsw + dsi_hbp);
+
+       dev_dbg(dev, "dsi data start (ns) %u + %u + %u + %u = %u\n",
+               tc358768_dsi_bytes_to_ns(priv, dsi_vsdly),
+               tc358768_dsi_bytes_to_ns(priv, dsi_hss),
+               tc358768_dsi_bytes_to_ns(priv, dsi_hsw),
+               tc358768_dsi_bytes_to_ns(priv, dsi_hbp),
+               tc358768_dsi_bytes_to_ns(priv, dsi_vsdly + dsi_hss + dsi_hsw + dsi_hbp));
+
+       /* Convert back to hsbyteclk */
+       dsi_vsdly /= priv->dsi_lanes;
+
+       /*
+        * The docs say that there is an internal delay of 40 cycles.
+        * However, we get underflows if we follow that rule. If we
+        * instead ignore the internal delay, things work. So either
+        * the docs are wrong or the calculations are wrong.
+        *
+        * As a temporary fix, add the internal delay here, to counter
+        * the subtraction when writing the register.
+        */
+       dsi_vsdly += internal_dly;
+
+       /* Clamp to the register max */
+       if (dsi_vsdly - internal_dly > 0x3ff) {
+               dev_warn(dev, "VSDly too high, underflows likely\n");
+               dsi_vsdly = 0x3ff + internal_dly;
+       }
+
        /* VSDly[9:0] */
-       video_start = max(video_start, internal_delay + 1) - internal_delay;
-       tc358768_write(priv, TC358768_VSDLY, video_start);
+       tc358768_write(priv, TC358768_VSDLY, dsi_vsdly - internal_dly);
 
        tc358768_write(priv, TC358768_DATAFMT, val);
        tc358768_write(priv, TC358768_DSITX_DT, data_type);
 
                /* vbp */
                tc358768_write(priv, TC358768_DSI_VBPR, vm.vback_porch);
-
-               /* hsw * byteclk * ndl / pclk */
-               val = (u32)div_u64(vm.hsync_len *
-                                  (u64)hsbyteclk * priv->dsi_lanes,
-                                  vm.pixelclock);
-               tc358768_write(priv, TC358768_DSI_HSW, val);
-
-               /* hbp * byteclk * ndl / pclk */
-               val = (u32)div_u64(vm.hback_porch *
-                                  (u64)hsbyteclk * priv->dsi_lanes,
-                                  vm.pixelclock);
-               tc358768_write(priv, TC358768_DSI_HBPR, val);
        } else {
                /* Set event mode */
                tc358768_write(priv, TC358768_DSI_EVENT, 1);
 
                /* vbp (not used in event mode) */
                tc358768_write(priv, TC358768_DSI_VBPR, 0);
+       }
 
-               /* (hsw + hbp) * byteclk * ndl / pclk */
-               val = (u32)div_u64((vm.hsync_len + vm.hback_porch) *
-                                  (u64)hsbyteclk * priv->dsi_lanes,
-                                  vm.pixelclock);
-               tc358768_write(priv, TC358768_DSI_HSW, val);
+       /* hsw (bytes) */
+       tc358768_write(priv, TC358768_DSI_HSW, dsi_hsw);
 
-               /* hbp (not used in event mode) */
-               tc358768_write(priv, TC358768_DSI_HBPR, 0);
-       }
+       /* hbp (bytes) */
+       tc358768_write(priv, TC358768_DSI_HBPR, dsi_hbp);
 
        /* hact (bytes) */
        tc358768_write(priv, TC358768_DSI_HACT, hact);