#include <linux/tick.h>
 #include <linux/cpuidle.h>
 #include <linux/cpu_pm.h>
+#include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
 }
 #endif
 
+/*
+ * Log the state of every possible CPU at error level.
+ *
+ * For each CPU in cpu_possible_mask this prints the CPU number returned
+ * by cpu_logical_map(), whether that CPU is currently online and the
+ * value obtained from flowctrl_read_cpu_csr() for it.
+ */
+static void tegra20_report_cpus_state(void)
+{
+       unsigned long logical;
+
+       for_each_cpu(logical, cpu_possible_mask) {
+               /* The CSR is looked up by the mapped CPU number. */
+               unsigned long mapped = cpu_logical_map(logical);
+               unsigned long csr = flowctrl_read_cpu_csr(mapped);
+
+               pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
+                      mapped, cpu_online(logical), csr);
+       }
+}
+
+/*
+ * Busy-wait until tegra_cpu_rail_off_ready() reports true.
+ *
+ * The readiness check is polled with a 10us udelay() between polls. One
+ * attempt spans roughly 500ms of delays; when an attempt expires an error
+ * is logged together with the state of all CPUs, and the wait is retried.
+ * Three attempts are made in total.
+ *
+ * Returns 0 as soon as the check succeeds, or -ETIMEDOUT once all
+ * attempts have expired.
+ */
+static int tegra20_wait_for_secondary_cpu_parking(void)
+{
+       const unsigned int poll_interval_us = 10;
+       /* Number of polls per attempt, not a duration in microseconds. */
+       const unsigned int max_polls = 500 * 1000 / poll_interval_us;
+       unsigned int attempt, poll;
+
+       for (attempt = 0; attempt < 3; attempt++) {
+               /*
+                * The primary CPU0 core shall wait for the secondaries to
+                * shut down in order to power off the CPU cluster safely.
+                * How long that takes depends on the current CPU frequency:
+                * about 40-150us on average and over 1000us in the worst
+                * case.
+                */
+               for (poll = 0; poll <= max_polls; poll++) {
+                       if (tegra_cpu_rail_off_ready())
+                               return 0;
+
+                       udelay(poll_interval_us);
+               }
+
+               pr_err("secondary CPU taking too long to park\n");
+
+               tegra20_report_cpus_state();
+       }
+
+       pr_err("timed out waiting secondaries to park\n");
+
+       return -ETIMEDOUT;
+}
+
 static bool tegra20_cpu_cluster_power_down(struct cpuidle_device *dev,
                                           struct cpuidle_driver *drv,
                                           int index)
 {
        bool ret;
 
-       while (!tegra_cpu_rail_off_ready())
-               cpu_relax();
+       if (tegra20_wait_for_secondary_cpu_parking())
+               return false;
 
        ret = !tegra_pm_enter_lp2();