mov ar.rsc=0            // place RSE in enforced lazy mode
        ;;
        loadrs                  // clear the dirty partition
-       mov IA64_KR(PER_CPU_DATA)=r0    // clear physical per-CPU base
+       movl r19=__phys_per_cpu_start
+       mov r18=PERCPU_PAGE_SIZE
+       ;;
+#ifndef CONFIG_SMP
+       add r19=r19,r18         // UP: per-cpu area used in place; point past it
+       ;;
+#else
+(isAP) br.few 2f               // APs don't copy; C code sets up their areas
+       mov r20=r19             // r20 = source: initial .data.percpu values
+       sub r19=r19,r18         // r19 = destination: cpu0's page just below
+       ;;
+       shr.u r18=r18,3         // loop count: page size in 8-byte words
+1:                             // copy initial per-cpu data down for cpu0
+       ld8 r21=[r20],8;;
+       st8 [r19]=r21,8         // r19 ends back at __phys_per_cpu_start
+       adds r18=-1,r18;;
+       cmp4.lt p7,p6=0,r18
+(p7)   br.cond.dptk.few 1b
+2:
+#endif
+       tpa r19=r19             // physical address (end of cpu0's area) for ar.k3
+       ;;
+       .pred.rel.mutex isBP,isAP
+(isBP) mov IA64_KR(PER_CPU_DATA)=r19   // per-CPU base for cpu0
+(isAP) mov IA64_KR(PER_CPU_DATA)=r0    // clear physical per-CPU base
        ;;
        mov ar.bspstore=r2      // establish the new RSE stack
        ;;
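
In C terms, the new SMP path above copies the initial per-cpu data into a
page reserved just below __phys_per_cpu_start (the page the linker script
hunk further down carves out). A sketch only, with src/dst playing the
roles of r20/r19:

	/* Sketch of the BP copy loop above -- not the real code. */
	extern char __phys_per_cpu_start[];     /* linker symbol */

	u64 *src = (u64 *) __phys_per_cpu_start;                      /* r20 */
	u64 *dst = (u64 *) (__phys_per_cpu_start - PERCPU_PAGE_SIZE); /* r19 */
	long n = PERCPU_PAGE_SIZE / 8;                                /* r18 */

	while (n-- > 0)
		*dst++ = *src++;
	/* dst has advanced back up to __phys_per_cpu_start; tpa of that
	 * value (the end of cpu0's page) is what goes into ar.k3. */

Because per-cpu variables occupy the last PERCPU_PAGE_SIZE bytes of the
virtual address space on ia64, setting ar.k3 to the physical end of cpu0's
page makes phys = ar.k3 + &per_cpu_var wrap around to the right physical
byte.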
 
        if (smp_processor_id() == 0) {
                cpu_set(0, per_cpu(cpu_sibling_map, 0));
                cpu_set(0, cpu_core_map[0]);
+       } else {
+               /*
+                * Set ar.k3 so that assembly code in MCA handler can compute
+                * physical addresses of per cpu variables with a simple:
+                *   phys = ar.k3 + &per_cpu_var
+                * and the alt-dtlb-miss handler can insert the per-cpu mapping
+                * into the TLB when needed. head.S already did this for cpu0.
+                */
+               ia64_set_kr(IA64_KR_PER_CPU_DATA,
+                           ia64_tpa(cpu_data) - (long) __per_cpu_start);
        }
 #endif
 
-       /*
-        * We set ar.k3 so that assembly code in MCA handler can compute
-        * physical addresses of per cpu variables with a simple:
-        *   phys = ar.k3 + &per_cpu_var
-        */
-       ia64_set_kr(IA64_KR_PER_CPU_DATA,
-                   ia64_tpa(cpu_data) - (long) __per_cpu_start);
-
        get_max_cacheline_size();
 
        /*
 
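The value stored for an AP is ia64_tpa(cpu_data) - __per_cpu_start, taken
modulo 2^64. A worked example with invented addresses, assuming
__per_cpu_start = 0xffffffffffff0000 (the last 64KB page of the address
space) and this AP's cpu_data at physical 0x0000000004000000:

	ar.k3 = 0x0000000004000000 - 0xffffffffffff0000   (mod 2^64)
	      = 0x0000000004010000
	phys  = ar.k3 + 0xffffffffffff0008                /* &per_cpu_var */
	      = 0x0000000004000008                        /* cpu_data + 8 */
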
 {
        /* Early console may use I/O ports */
        ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
+#ifndef CONFIG_PRINTK_TIME
        Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
+#endif
        efi_map_pal_code();
        cpu_init();
        preempt_disable();
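
The new #ifndef guard is presumably needed because CONFIG_PRINTK_TIME makes
printk() fetch a timestamp from sched_clock(), which on ia64 scales the ITC
cycle counter by a per-cpu conversion factor; at this point the AP has not
yet run cpu_init(), so its per-cpu area is not usable. A rough sketch of
the dependency (sched_clock_sketch is a hypothetical stand-in, not the
kernel's function):

	unsigned long long sched_clock_sketch(void)
	{
		/* local_cpu_data resolves through the per-cpu area, which
		 * is not mapped for this AP until cpu_init() runs above. */
		return ia64_get_itc() * local_cpu_data->nsec_per_cyc;
	}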
 
   /* Per-cpu data: */
   percpu : { } :percpu
   . = ALIGN(PERCPU_PAGE_SIZE);
+#ifdef CONFIG_SMP
+  . = . + PERCPU_PAGE_SIZE;    /* cpu0 per-cpu space */
+#endif
   __phys_per_cpu_start = .;
   .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
        {
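
Schematically, the SMP layout this hunk produces (the reserved page sits
immediately below the per-cpu load area):

	. = ALIGN(PERCPU_PAGE_SIZE)
	[ cpu0 per-cpu page       ]   <- __phys_per_cpu_start - PERCPU_PAGE_SIZE
	[ .data.percpu init image ]   <- __phys_per_cpu_start

head.S copies the init image into cpu0's page at boot; per_cpu_init() below
copies it into each AP's bootmem page.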
 
         * get_zeroed_page().
         */
        if (first_time) {
+               void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+
                first_time=0;
-               for (cpu = 0; cpu < NR_CPUS; cpu++) {
+
+               __per_cpu_offset[0] = (char *) cpu0_data - __per_cpu_start;
+               per_cpu(local_per_cpu_offset, 0) = __per_cpu_offset[0];
+
+               for (cpu = 1; cpu < NR_CPUS; cpu++) {
                        memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
                        __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
                        cpu_data += PERCPU_PAGE_SIZE;
 static inline void
 alloc_per_cpu_data(void)
 {
-       cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+       cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * (NR_CPUS - 1),
                                   PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 }
 #else
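
With cpu0's page now carved out of the kernel image, the bootmem allocation
in alloc_per_cpu_data() shrinks by one page. Illustratively, with 64KB
per-cpu pages and NR_CPUS = 4 (numbers invented for the example):

	old: PERCPU_PAGE_SIZE * NR_CPUS       = 64KB * 4 = 256KB
	new: PERCPU_PAGE_SIZE * (NR_CPUS - 1) = 64KB * 3 = 192KB
	     (cpu0 uses the page at __phys_per_cpu_start - PERCPU_PAGE_SIZE)

Note that the parentheses matter: PERCPU_PAGE_SIZE * NR_CPUS - 1 would
subtract one byte, not one page.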
 
        int cpu;
 
        for_each_possible_early_cpu(cpu) {
-               if (node == node_cpuid[cpu].nid) {
+               if (cpu == 0) {
+                       void *cpu0_data = __phys_per_cpu_start - PERCPU_PAGE_SIZE;
+                       __per_cpu_offset[cpu] = (char*)cpu0_data -
+                               __per_cpu_start;
+               } else if (node == node_cpuid[cpu].nid) {
                        memcpy(__va(cpu_data), __phys_per_cpu_start,
                               __per_cpu_end - __per_cpu_start);
                        __per_cpu_offset[cpu] = (char*)__va(cpu_data) -