#ifndef __ASM_ARM_DELAY_H
 #define __ASM_ARM_DELAY_H
 
+#include <asm/memory.h>
 #include <asm/param.h> /* HZ */
 
-extern void __delay(int loops);
+#define MAX_UDELAY_MS  2
+#define UDELAY_MULT    ((UL(2199023) * HZ) >> 11)
+#define UDELAY_SHIFT   30
+
+#ifndef __ASSEMBLY__
+
+extern struct arm_delay_ops {  /*
+        * Table of delay routines.  The __delay(), __udelay() and
+        * __const_udelay() macros in this header call through it.
+        */
+       void (*delay)(unsigned long);           /* used by __delay(n) */
+       void (*const_udelay)(unsigned long);    /* used by __const_udelay(n) */
+       void (*udelay)(unsigned long);          /* used by __udelay(n) */
+} arm_delay_ops;
+
+#define __delay(n)             arm_delay_ops.delay(n)
 
 /*
  * This function intentionally does not exist; if you see references to
  * division by multiplication: you don't have to worry about
  * loss of precision.
  *
- * Use only for very small delays ( < 1 msec).  Should probably use a
+ * Use only for very small delays ( < 2 msec).  Should probably use a
  * lookup table, really, as the multiplications take much too long with
  * short delays.  This is a "reasonable" implementation, though (and the
  * first constant multiplications gets optimized away if the delay is
  * a constant)
  */
-extern void __udelay(unsigned long usecs);
-extern void __const_udelay(unsigned long);
-
-#define MAX_UDELAY_MS 2
+#define __udelay(n)            arm_delay_ops.udelay(n)
+#define __const_udelay(n)      arm_delay_ops.const_udelay(n)
 
 #define udelay(n)                                                      \
        (__builtin_constant_p(n) ?                                      \
          ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() :              \
-                       __const_udelay((n) * ((2199023U*HZ)>>11))) :    \
+                       __const_udelay((n) * UDELAY_MULT)) :            \
          __udelay(n))
 
+/* Loop-based definitions for assembly code. */
+extern void __loop_delay(unsigned long loops);
+extern void __loop_udelay(unsigned long usecs);
+extern void __loop_const_udelay(unsigned long);
+
+#endif /* __ASSEMBLY__ */
+
 #endif /* defined(_ARM_DELAY_H) */
 
 
 
 static struct clock_event_device __percpu **arch_timer_evt;
 
+extern void init_current_timer_delay(unsigned long freq);
+
 /*
  * Architected system timer support.
  */
        if (err)
                goto out_free_irq;
 
+       init_current_timer_delay(arch_timer_rate);
        return 0;
 
 out_free_irq:
 
 extern void fpundefinstr(void);
 
        /* platform dependent support */
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__const_udelay);
+EXPORT_SYMBOL(arm_delay_ops);
 
        /* networking */
 EXPORT_SYMBOL(csum_partial);
 
 
 lib-y          := backtrace.o changebit.o csumipv6.o csumpartial.o   \
                   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
-                  delay.o findbit.o memchr.o memcpy.o                \
+                  delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
                   memmove.o memset.o memzero.o setbit.o              \
                   strncpy_from_user.o strnlen_user.o                 \
                   strchr.o strrchr.o                                 \
 
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/param.h>
+#include <asm/delay.h>
                .text
 
 .LC0:          .word   loops_per_jiffy
-.LC1:          .word   (2199023*HZ)>>11
+.LC1:          .word   UDELAY_MULT
 
 /*
  * r0  <= 2000
  * HZ  <= 1000
  */
 
-ENTRY(__udelay)
+ENTRY(__loop_udelay)
                ldr     r2, .LC1
                mul     r0, r2, r0
-ENTRY(__const_udelay)                          @ 0 <= r0 <= 0x7fffff06
+ENTRY(__loop_const_udelay)                     @ 0 <= r0 <= 0x7fffff06
                mov     r1, #-1
                ldr     r2, .LC0
                ldr     r2, [r2]                @ max = 0x01ffffff
 
 /*
  * loops = r0 * HZ * loops_per_jiffy / 1000000
- *
- * Oh, if only we had a cycle counter...
  */
 
 @ Delay routine
-ENTRY(__delay)
+ENTRY(__loop_delay)
                subs    r0, r0, #1
 #if 0
                movls   pc, lr
                movls   pc, lr
                subs    r0, r0, #1
 #endif
-               bhi     __delay
+               bhi     __loop_delay
                mov     pc, lr
-ENDPROC(__udelay)
-ENDPROC(__const_udelay)
-ENDPROC(__delay)
+ENDPROC(__loop_udelay)
+ENDPROC(__loop_const_udelay)
+ENDPROC(__loop_delay)
 
--- /dev/null
+/*
+ * Delay loops based on the OpenRISC implementation.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timex.h>
+
+/*
+ * Default to the loop-based delay implementation.
+ *
+ * When ARCH_HAS_READ_CURRENT_TIMER is defined, init_current_timer_delay()
+ * below overwrites all three pointers with the timer-based routines.
+ */
+struct arm_delay_ops arm_delay_ops = {
+       .delay          = __loop_delay,
+       .const_udelay   = __loop_const_udelay,
+       .udelay         = __loop_udelay,
+};
+
+#ifdef ARCH_HAS_READ_CURRENT_TIMER
+static void __timer_delay(unsigned long cycles)
+{      /*
+        * Spin until get_cycles() has advanced by at least @cycles from
+        * the value sampled on entry, calling cpu_relax() between reads.
+        * NOTE(review): relies on the cycles_t subtraction wrapping
+        * modulo the counter width -- confirm cycles_t is unsigned.
+        */
+       cycles_t start = get_cycles();
+
+       while ((get_cycles() - start) < cycles)
+               cpu_relax();
+}
+
+static void __timer_const_udelay(unsigned long xloops)
+{      /*
+        * Delay for (xloops * loops_per_jiffy) >> UDELAY_SHIFT timer cycles.
+        * @xloops is a microsecond count already multiplied by UDELAY_MULT
+        * (see __timer_udelay() and the udelay() macro).
+        */
+       unsigned long long loops = xloops;      /* product is formed in unsigned long long */
+       loops *= loops_per_jiffy;
+       __timer_delay(loops >> UDELAY_SHIFT);
+}
+
+static void __timer_udelay(unsigned long usecs)
+{      /*
+        * Delay for @usecs microseconds: scale by UDELAY_MULT and hand the
+        * result to __timer_const_udelay().
+        * NOTE(review): the multiply is not widened; presumably callers
+        * keep @usecs within MAX_UDELAY_MS milliseconds -- confirm.
+        */
+       __timer_const_udelay(usecs * UDELAY_MULT);
+}
+
+void __init init_current_timer_delay(unsigned long freq)
+{      /*
+        * Switch arm_delay_ops from its defaults to the timer-based
+        * routines and set lpj_fine to @freq / HZ.
+        * NOTE(review): @freq is presumably the timer rate in Hz (the
+        * arch timer code passes arch_timer_rate) -- confirm units.
+        */
+       pr_info("Switching to timer-based delay loop\n");
+       lpj_fine                        = freq / HZ;
+       arm_delay_ops.delay             = __timer_delay;
+       arm_delay_ops.const_udelay      = __timer_const_udelay;
+       arm_delay_ops.udelay            = __timer_udelay;
+}
+
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{      /* Report lpj_fine, which init_current_timer_delay() sets to freq / HZ. */
+       return lpj_fine;
+}
+#endif
 
        orr     r4, r4, #MDREFR_K1DB2
        ldr     r5, =PPCR
 
-       @ Pre-load __udelay into the I-cache
+       @ Pre-load __loop_udelay into the I-cache
        mov     r0, #1
-       bl      __udelay
+       bl      __loop_udelay
        mov     r0, r0
 
        @ The following must all exist in a single cache line to
        @ delay 90us and set CPU PLL to lowest speed
        @ fixes resume problem on high speed SA1110
        mov     r0, #90
-       bl      __udelay
+       bl      __loop_udelay
        mov     r1, #0
        str     r1, [r5]
        mov     r0, #90
-       bl      __udelay
+       bl      __loop_udelay
 
        /*
         * SA1110 SDRAM controller workaround.  register values: