www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
SPARC64: PORT LDOMS TO UEK4
author Aaron Young <Aaron.Young@oracle.com>
Tue, 18 Aug 2015 19:10:23 +0000 (12:10 -0700)
committer Allen Pais <allen.pais@oracle.com>
Fri, 11 Sep 2015 15:58:54 +0000 (21:28 +0530)
    Initial port of LDoms code to UEK4.

    NOTE: due to UEK4 kernel issue(s) encountered during testing,
    this port has NOT been fully tested.

Signed-off-by: Aaron Young <aaron.young@oracle.com>
    Orabug: 21644721
(cherry picked from commit 6dfe4cf1cc02dbea298480804d030850bfef1ab3)

Conflicts:
arch/sparc/kernel/ds.c
drivers/tty/Kconfig
drivers/tty/Makefile
(cherry picked from commit c398fd2a3c18f6385eb4db80305ab693027a58d5)

Conflicts:
drivers/tty/Kconfig
drivers/tty/Makefile
Signed-off-by: Allen Pais <allen.pais@oracle.com>
38 files changed:
arch/sparc/configs/sparc32_defconfig
arch/sparc/configs/sparc64_defconfig
arch/sparc/include/asm/ldc.h
arch/sparc/include/asm/mdesc.h
arch/sparc/include/asm/vio.h
arch/sparc/kernel/ds.c
arch/sparc/kernel/ldc.c
arch/sparc/kernel/mdesc.c
arch/sparc/kernel/vio.c
arch/sparc/kernel/viohs.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/sunvdc.c
drivers/block/vds/Makefile [new file with mode: 0644]
drivers/block/vds/vds.h [new file with mode: 0644]
drivers/block/vds/vds_blk.c [new file with mode: 0644]
drivers/block/vds/vds_efi.c [new file with mode: 0644]
drivers/block/vds/vds_io.c [new file with mode: 0644]
drivers/block/vds/vds_io.h [new file with mode: 0644]
drivers/block/vds/vds_label.c [new file with mode: 0644]
drivers/block/vds/vds_main.c [new file with mode: 0644]
drivers/block/vds/vds_reg.c [new file with mode: 0644]
drivers/block/vds/vds_vtoc.c [new file with mode: 0644]
drivers/block/vds/vds_vtoc.h [new file with mode: 0644]
drivers/char/Kconfig
drivers/char/Makefile
drivers/char/vldc.c [new file with mode: 0644]
drivers/char/vlds.c [new file with mode: 0644]
drivers/net/ethernet/sun/sunvnet.c
drivers/tty/Kconfig
drivers/tty/Makefile
drivers/tty/vcc.c [new file with mode: 0644]
include/linux/ds.h [new file with mode: 0644]
include/linux/vldc.h [new file with mode: 0644]
include/linux/vlds.h [new file with mode: 0644]
include/uapi/linux/ds.h [new file with mode: 0644]
include/uapi/linux/vldc.h [new file with mode: 0644]
include/uapi/linux/vlds.h [new file with mode: 0644]

index fb23fd6b186a1b0ffeeace7cca9e9a5a136fbfa6..d8b26c882bdb08c0c5ac6dba58119239f32c769a 100644 (file)
@@ -103,3 +103,7 @@ CONFIG_CRYPTO_TWOFISH=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
 CONFIG_LIBCRC32C=m
+CONFIG_VCC=m
+CONFIG_VLDC=m
+CONFIG_VLDS=m
+CONFIG_VDS=m
index 6b68f12f29db4615fe36569b0e4d9d9fe30229be..98bb934751de03c3778431a712f9bcefb0a308f3 100644 (file)
@@ -241,3 +241,7 @@ CONFIG_CRYPTO_TWOFISH=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRC16=m
 CONFIG_LIBCRC32C=m
+CONFIG_VCC=m
+CONFIG_VLDC=m
+CONFIG_VLDS=m
+CONFIG_VDS=m
index 6e9004aa6f25d5af47c3a2e95a74a1bdb6008c6f..37b61235fe8a0b098cd87bba5a6c4a8b814c0680 100644 (file)
@@ -24,6 +24,9 @@ struct ldc_channel_config {
        u32                     mtu;
        unsigned int            rx_irq;
        unsigned int            tx_irq;
+       u64                     rx_ino;
+       u64                     tx_ino;
+       u64                     dev_handle;
        u8                      mode;
 #define LDC_MODE_RAW           0x00
 #define LDC_MODE_UNRELIABLE    0x01
@@ -48,6 +51,8 @@ struct ldc_channel_config {
 #define LDC_STATE_READY                0x03
 #define LDC_STATE_CONNECTED    0x04
 
+#define LDC_PACKET_SIZE                64
+
 struct ldc_channel;
 
 /* Allocate state for a channel.  */
@@ -72,6 +77,11 @@ int ldc_connect(struct ldc_channel *lp);
 int ldc_disconnect(struct ldc_channel *lp);
 
 int ldc_state(struct ldc_channel *lp);
+void ldc_set_state(struct ldc_channel *lp, u8 state);
+int ldc_mode(struct ldc_channel *lp);
+void ldc_print(struct ldc_channel *lp);
+int ldc_rx_reset(struct ldc_channel *lp);
+void ldc_clr_reset(struct ldc_channel *lp);
 
 /* Read and write operations.  Only valid when the link is up.  */
 int ldc_write(struct ldc_channel *lp, const void *buf,
@@ -137,4 +147,12 @@ void ldc_free_exp_dring(struct ldc_channel *lp, void *buf,
                        unsigned int len,
                        struct ldc_trans_cookie *cookies, int ncookies);
 
+int ldc_tx_space_available(struct ldc_channel *lp, unsigned long size);
+
+int ldc_rx_data_available(struct ldc_channel *lp);
+
+void ldc_enable_hv_intr(struct ldc_channel *lp);
+
+void ldc_disable_hv_intr(struct ldc_channel *lp);
+
 #endif /* _SPARC64_LDC_H */
index aebeb88f70db908db355aeeffb9d519f4871994e..d4821f062fda624358d13cd282698d0286664793 100644 (file)
@@ -16,6 +16,7 @@ struct mdesc_handle *mdesc_grab(void);
 void mdesc_release(struct mdesc_handle *);
 
 #define MDESC_NODE_NULL                (~(u64)0)
+#define MDESC_MAX_STR_LEN      256
 
 u64 mdesc_node_by_name(struct mdesc_handle *handle,
                       u64 from_node, const char *name);
@@ -62,8 +63,11 @@ u64 mdesc_arc_target(struct mdesc_handle *hp, u64 arc);
 void mdesc_update(void);
 
 struct mdesc_notifier_client {
-       void (*add)(struct mdesc_handle *handle, u64 node);
-       void (*remove)(struct mdesc_handle *handle, u64 node);
+       void (*add)(struct mdesc_handle *handle, u64 node,
+           const char *node_name);
+       void (*remove)(struct mdesc_handle *handle, u64 node,
+           const char *node_name);
+
 
        const char                      *node_name;
        struct mdesc_notifier_client    *next;
@@ -71,6 +75,22 @@ struct mdesc_notifier_client {
 
 void mdesc_register_notifier(struct mdesc_notifier_client *client);
 
+union md_node_info {
+       struct vdev_port {
+               char name[MDESC_MAX_STR_LEN];   /* name (property) */
+               u64 id;                         /* id */
+               u64 parent_cfg_hdl;             /* parent config handle */
+       } vdev_port;
+       struct ds_port {
+               u64 id;                         /* id */
+       } ds_port;
+};
+u64 mdesc_get_node(struct mdesc_handle *hp, char *node_name,
+       union md_node_info *node_info);
+int mdesc_get_node_info(struct mdesc_handle *hp, u64 node,
+       char *node_name, union md_node_info *node_info);
+
+
 void mdesc_fill_in_cpu_data(cpumask_t *mask);
 void mdesc_populate_present_mask(cpumask_t *mask);
 void mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask);
index 8174f6cdbbbbd87af5bdbcb923352ddadb4e237e..e990d29b64a18d37806d0805f0763df44f65b130 100644 (file)
@@ -52,6 +52,8 @@ struct vio_ver_info {
 #define VDEV_NETWORK_SWITCH    0x02
 #define VDEV_DISK              0x03
 #define VDEV_DISK_SERVER       0x04
+#define VDEV_CONSOLE_CON       0x05
+#define VDEV_VLDC              0x06
 
        u8                      resv1[3];
        u64                     resv2[5];
@@ -100,6 +102,17 @@ struct vio_dring_data {
        u64                     __par4[2];
 };
 
+/*
+ * VIO Common header for inband descriptor messages.
+ *
+ * Clients will then combine this header with a device specific payload.
+ */
+struct vio_desc_data {
+       struct vio_msg_tag      tag;
+       u64                     seq;
+       u64                     desc_handle;
+};
+
 struct vio_dring_hdr {
        u8                      state;
 #define VIO_DESC_FREE          0x01
@@ -162,6 +175,30 @@ struct vio_disk_desc {
        struct ldc_trans_cookie cookies[0];
 };
 
+struct vio_disk_dring_payload {
+       u64                     req_id;
+       u8                      operation;
+       u8                      slice;
+       u16                     resv1;
+       u32                     status;
+       u64                     offset;
+       u64                     size;
+       u32                     ncookies;
+       u32                     resv2;
+       struct ldc_trans_cookie cookies[0];
+};
+
+/*
+ * VIO disk inband descriptor message.
+ *
+ * For clients that do not use descriptor rings, the descriptor contents
+ * are sent as part of an inband message.
+ */
+struct vio_disk_desc_inband {
+       struct vio_desc_data            hdr;
+       struct vio_disk_dring_payload   payload;
+};
+
 #define VIO_DISK_VNAME_LEN     8
 #define VIO_DISK_ALABEL_LEN    128
 #define VIO_DISK_NUM_PART      8
@@ -282,6 +319,14 @@ struct vio_dring_state {
        struct ldc_trans_cookie cookies[VIO_MAX_RING_COOKIES];
 };
 
+#define        VIO_TAG_SIZE            (sizeof(struct vio_msg_tag))
+#define        VIO_VCC_MTU_SIZE        (LDC_PACKET_SIZE - 8)
+
+struct vio_vcc {
+       struct vio_msg_tag      tag;
+       char                    data[VIO_VCC_MTU_SIZE];
+};
+
 static inline void *vio_dring_cur(struct vio_dring_state *dr)
 {
        return dr->base + (dr->entry_size * dr->prod);
@@ -316,24 +361,32 @@ static inline u32 vio_dring_prev(struct vio_dring_state *dr, u32 index)
 }
 
 #define VIO_MAX_TYPE_LEN       32
+#define VIO_MAX_NAME_LEN       32
 #define VIO_MAX_COMPAT_LEN     64
 
 struct vio_dev {
-       u64                     mp;
        struct device_node      *dp;
 
+       char                    node_name[VIO_MAX_NAME_LEN];
        char                    type[VIO_MAX_TYPE_LEN];
        char                    compat[VIO_MAX_COMPAT_LEN];
        int                     compat_len;
 
        u64                     dev_no;
 
+       unsigned long           port_id;
        unsigned long           channel_id;
 
        unsigned int            tx_irq;
        unsigned int            rx_irq;
        u64                     rx_ino;
 
+       u64                     dev_handle;
+       u64                     tx_ino;
+
+       /* MD specific data used to match the vdev in the MD */
+       union md_node_info      md_node_info;
+
        struct device           dev;
 };
 
@@ -346,6 +399,7 @@ struct vio_driver {
        void (*shutdown)(struct vio_dev *dev);
        unsigned long                   driver_data;
        struct device_driver            driver;
+       bool                            no_irq;
 };
 
 struct vio_version {
@@ -489,5 +543,6 @@ int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
 
 void vio_port_up(struct vio_driver_state *vio);
 int vio_set_intr(unsigned long dev_ino, int state);
+u64 vio_vdev_node(struct mdesc_handle *hp, struct vio_dev *vdev);
 
 #endif /* _SPARC64_VIO_H */
index 973c04531cec85ade1c45ecdfed153b06a1da443..944620d1e07e3d0e84f3c03412ad27f2e52ceedd 100644 (file)
@@ -1,8 +1,11 @@
-/* ds.c: Domain Services driver for Logical Domains
+/*
+ * ds.c: Sun4v LDOMs Domain Services Driver
  *
  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
+ * Copyright (C) 2015 Oracle. All rights reserved.
  */
-
+#include <linux/ds.h>
+#include <linux/ioctl.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
+#include <linux/spinlock.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
+#include <linux/completion.h>
 #include <linux/reboot.h>
 #include <linux/cpu.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 
 #include <asm/hypervisor.h>
 #include <asm/ldc.h>
 
 #include "kernel.h"
 
+/*
+ * Def to enable kernel timer bug workaround.
+ * See additional comments below.
+ */
+#define DS_KERNEL_TIMER_BUG_WAR 1
+
+/*
+ * Theory of operation:
+ *
+ * Domain Services provide a protocol for a logical domain (ldom) to provide
+ * or use a service to/from another ldom or the SP. For a given service there is
+ * a provider and a client. The provider and client can share a service across
+ * a LDC or directly in the case of a "loopback" service on the same local
+ * domain. For example, a guest ldom can provide a shutdown service to the
+ * control domain (the client) to allow the control domain to use the service
+ * to shutdown the guest. On the control domain, the kernel can provide
+ * the shutdown service to the domain manager software in loopback mode to
+ * allow the domain manager to shutdown the local control domain.
+ * Several software entities can provide or use domain services: OBP, SP,
+ * user-level logical domain manager and kernel driver (this module).
+ * After establishing a domain service protocol link between two entities,
+ * many services can be shared on the link. Services advertise
+ * their availability by sending a service registration request containing
+ * a service id (a string identifying the service) and a generated numerical
+ * handle (a value to use to identify the service connection after the
+ * connection has been established). A service request is acknowledged
+ * (ACK'd) by the other end of the link if the service is supported.
+ * Once the service registration is ACK'd, the service connection is
+ * established and service protocol packets can be exchanged by
+ * both entities (client and provider) on either side of the link.
+ * This driver can execute in the control domain, guest domains or both.
+ * It contains a set of builtin services associated with the "primary" (or
+ * control) domain. The driver also contains an API which allows external
+ * domain services to be registered with the driver. This API can be utilized by
+ * another kernel driver to provide/use services. The API can also be used by
+ * another kernel driver (i.e. vlds) to provide user-level domain services.
+ *
+ */
+
+static unsigned int dsdbg_level;
+module_param(dsdbg_level, uint, S_IRUGO|S_IWUSR);
+
 #define DRV_MODULE_NAME                "ds"
 #define PFX DRV_MODULE_NAME    ": "
-#define DRV_MODULE_VERSION     "1.0"
-#define DRV_MODULE_RELDATE     "Jul 11, 2007"
 
-static char version[] =
-       DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
-MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
-MODULE_DESCRIPTION("Sun LDOM domain services driver");
+#define XSTR(s) STR(s)
+#define STR(s) #s
+#define DRV_MODULE_VERSION XSTR(DS_MAJOR_VERSION) "." XSTR(DS_MINOR_VERSION)
+
+static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION "\n";
+
+#define dprintk(fmt, args...) do {\
+if (dsdbg_level > 0)\
+       printk(KERN_ERR "%s: %s: " fmt, DRV_MODULE_NAME, __func__, ##args);\
+} while (0)
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("Sun4v LDOM domain services driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
+#define LDC_IRQ_NAME_MAX       32
+
+#define        DS_DEFAULT_BUF_SIZE     4096
+#define        DS_DEFAULT_MTU          4096
+
+#define        DS_PRIMARY_ID           0
+
+#define DS_INVALID_HANDLE      0xFFFFFFFFFFFFFFFFUL
+
+/*
+ * The DS spec mentions that a DS handle is just any random number.
+ * However, the Solaris code uses some conventions to identify server
+ * and consumer handles, based on the setting of some bits in the
+ * handle. We have to use the same convention to be compatible with
+ * services from Solaris.
+ */
+#define        DS_HDL_ISCLIENT_BIT             0x80000000ull
+#define        DS_HDL_ISCNTRLD_BIT             0x40000000ull
+
+/* Globals to identify the local ldom handle */
+u64 ds_local_ldom_handle;
+bool ds_local_ldom_handle_set;
+
+/* Global driver data struct for data common to all ds devices. */
+struct ds_driver_data {
+
+       /* list of all ds devices */
+       struct list_head        ds_dev_list;
+       int                     num_ds_dev_list;
+
+};
+struct ds_driver_data ds_data;
+static DEFINE_SPINLOCK(ds_data_lock); /* protect ds_data */
+
+/*
+ * For each DS port, a timer fires every DS_REG_TIMER_FREQ
+ * milliseconds to attempt to register services on that DS port.
+ */
+#define        DS_REG_TIMER_FREQ       100     /* in ms */
+
+/* Timeout to wait for responses for sp-token and var-config DS requests */
+#define        DS_RESPONSE_TIMEOUT     10      /* in seconds */
+
+#ifdef DS_KERNEL_TIMER_BUG_WAR
+/*
+ * Define a partial type for ldc_channel so the compiler knows
+ * how to indirect ds->lp->lock. This must match the definition in ldc.c
+ * (which should probably be moved to ldc.h).
+ */
+struct ldc_channel {
+       /* Protects all operations that depend upon channel state.  */
+       spinlock_t                      lock;
+};
+#endif /* DS_KERNEL_TIMER_BUG_WAR */
+
+/*
+ * DS device structure. There is one of these probed/created per
+ * domain-services-port node in the MD.
+ * On a guest ldom, there is typically just one primary ds device
+ * for services provided from/to the "primary".
+ * On the primary ldom, there can be several ds devices - typically
+ * one for the SP, primary and each guest ldom.
+ */
+struct ds_dev {
+       /* link into the global driver data dev list */
+       struct list_head        list;
+
+       /* protect this ds_dev */
+       spinlock_t              ds_lock;
+
+       /* number of references to this ds_dev on the callout queue */
+       u64                     co_ref_cnt;
+
+       /* flag to indicate if this ds_dev is active */
+       bool                    active;
+
+       /* flag to indicate if this is a domain DS (versus the SP DS) */
+       bool                    is_domain;
+
+       /* LDC connection info for this ds_dev */
+       struct ldc_channel      *lp;
+       u8                      hs_state;
+       u64                     id;
+       u64                     handle;
+
+       /* negotiated DS version */
+       ds_ver_t                neg_vers;
+
+       /* LDC receive data buffer for this ds_dev */
+       u8                      *rcv_buf;
+       int                     rcv_buf_len;
+
+       /* service registration timer */
+       struct timer_list       ds_reg_tmr;
+
+       u32                     next_service_handle;
+
+       /* list of local service providers registered with this ds_dev */
+       struct list_head        service_provider_list;
+
+       /* list of local service clients registered with this ds_dev */
+       struct list_head        service_client_list;
+
+       /* list of work items queued for processing (by callout thread) */
+       struct list_head        callout_list;
+
+};
+
+/* ds_dev hs_state values */
+#define DS_HS_LDC_DOWN         0x00
+#define DS_HS_START            0x01
+#define DS_HS_COMPLETE         0x02
+
+/*
+ * LDC interrupts are not blocked by spin_lock_irqsave(). So, for any
+ * lock which the LDC interrupt handler (ds_event) obtains, we must
+ * explicitly disable the LDC interrupt before grabbing the lock
+ * throughout the driver (and re-enable the interrupt after releasing
+ * the lock). This is to prevent a deadlock where the interrupt handler
+ * waits indefinitely for a lock which is held by another thread on the
+ * same CPU.
+ *
+ * The reason behind this is as follows:
+ * spin_lock_irqsave() raises the PIL to level 14 which effectively
+ * blocks interrupt_level_n traps (for n < 15). However, LDC
+ * interrupts are not interrupt_level_n traps. They are dev_mondo traps,
+ * so they are not impacted by the PIL.
+ */
+
+#define LOCK_DS_DEV(ds, flags) do {\
+       ldc_disable_hv_intr((ds)->lp); \
+       spin_lock_irqsave(&((ds)->ds_lock), (flags)); \
+} while (0);
+
+#define UNLOCK_DS_DEV(ds, flags)  do {\
+       spin_unlock_irqrestore(&((ds)->ds_lock), flags); \
+       ldc_enable_hv_intr((ds)->lp); \
+} while (0);
+
+/*
+ * Generic service info structure used to describe
+ * a provider service or local client service.
+ */
+struct ds_service_info {
+       /* link into a ds_dev service list */
+       struct list_head        list;
+
+       /* id of the service */
+       char                    *id;
+
+       /* supported max version */
+       ds_ver_t                vers;
+
+       /* callback ops for reg/unreg and data */
+       ds_ops_t                ops;
+
+       /* registration state */
+       u64                     reg_state;
+
+       /* registration timeout */
+       u64                     svc_reg_timeout;
+
+       /* connection negotiated version */
+       ds_ver_t                neg_vers;
+
+       /*
+        * Flag to indicate if the service is a
+        * a client or provider. This flag should always
+        * correspond to the list this service_info
+        * it is in (i.e. in the client or provider service
+        * list in the ds_dev).
+        */
+       bool                    is_client;
+
+       /* Flag to indicate if the service is a builtin service */
+       bool                    is_builtin;
+
+       /*
+        * Service is in loopback mode.
+        * Loopback mode allows a service provider and client
+        * which reside on the same/local host to connect directly
+        * (without using a LDC).
+        */
+       bool                    is_loopback;
+
+       /* flag to indicate if this service is connected */
+       bool                    is_connected;
+
+       /* Unique handle associated with this service */
+       u64                     handle;
+
+       /* Handle used for service connection. */
+       u64                     con_handle;
+
+};
+
+/* service_info reg_states */
+#define DS_REG_STATE_UNREG                     0x00
+#define DS_REG_STATE_REG_SENT                  0x01
+#define DS_REG_STATE_REGISTERED_LDC            0x02
+#define DS_REG_STATE_REGISTERED_LOOPBACK       0x03
+
+/*
+ * DS service data structures
+ */
+
 struct ds_msg_tag {
-       __u32                   type;
+       u32                     type;
 #define DS_INIT_REQ            0x00
 #define DS_INIT_ACK            0x01
 #define DS_INIT_NACK           0x02
@@ -50,7 +314,12 @@ struct ds_msg_tag {
 #define DS_DATA                        0x09
 #define DS_NACK                        0x0a
 
-       __u32                   len;
+       u32                     len;
+};
+
+struct ds_msg {
+       struct ds_msg_tag       tag;
+       char                    payload[0];
 };
 
 /* Result codes */
@@ -61,397 +330,1242 @@ struct ds_msg_tag {
 #define DS_TYPE_UNKNOWN                0x04
 
 struct ds_version {
-       __u16                   major;
-       __u16                   minor;
+       u16                             major;
+       u16                             minor;
+};
+
+struct ds_ver_req_payload {
+       struct ds_version               ver;
 };
 
 struct ds_ver_req {
-       struct ds_msg_tag       tag;
-       struct ds_version       ver;
+       struct ds_msg_tag               tag;
+       struct ds_ver_req_payload       payload;
+};
+
+struct ds_ver_ack_payload {
+       u16                             minor;
 };
 
 struct ds_ver_ack {
-       struct ds_msg_tag       tag;
-       __u16                   minor;
+       struct ds_msg_tag               tag;
+       struct ds_ver_ack_payload       payload;
+};
+
+struct ds_ver_nack_payload {
+       u16                             major;
 };
 
 struct ds_ver_nack {
-       struct ds_msg_tag       tag;
-       __u16                   major;
+       struct ds_msg_tag               tag;
+       struct ds_ver_nack_payload      payload;
+};
+
+struct ds_reg_req_payload {
+       u64                             handle;
+       u16                             major;
+       u16                             minor;
+       char                            svc_id[0];
 };
 
 struct ds_reg_req {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
-       __u16                   major;
-       __u16                   minor;
-       char                    svc_id[0];
+       struct ds_msg_tag               tag;
+       struct ds_reg_req_payload       payload;
+};
+
+struct ds_reg_ack_payload {
+       u64                             handle;
+       u16                             minor;
 };
 
 struct ds_reg_ack {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
-       __u16                   minor;
+       struct ds_msg_tag               tag;
+       struct ds_reg_ack_payload       payload;
+};
+
+struct ds_reg_nack_payload {
+       u64                             handle;
+       u64                             result;
+       u16                             major;
 };
 
 struct ds_reg_nack {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
-       __u16                   major;
+       struct ds_msg_tag               tag;
+       struct ds_reg_nack_payload      payload;
+};
+
+struct ds_unreg_req_payload {
+       u64                             handle;
 };
 
 struct ds_unreg_req {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
+       struct ds_msg_tag               tag;
+       struct ds_unreg_req_payload     payload;
+};
+
+struct ds_unreg_ack_payload {
+       u64                             handle;
 };
 
 struct ds_unreg_ack {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
+       struct ds_msg_tag               tag;
+       struct ds_unreg_ack_payload     payload;
+};
+
+struct ds_unreg_nack_payload {
+       u64                             handle;
 };
 
 struct ds_unreg_nack {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
+       struct ds_msg_tag               tag;
+       struct ds_unreg_nack_payload    payload;
 };
 
-struct ds_data {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
+struct ds_data_req_payload {
+       u64                             handle;
+       char                            data[0];
+};
+
+struct ds_data_req {
+       struct ds_msg_tag               tag;
+       struct ds_data_req_payload      payload;
+};
+
+#define        DS_DATA_REQ_DSIZE(req) \
+       ((req)->tag.len - sizeof(struct ds_data_req_payload))
+
+struct ds_data_nack_payload {
+       u64                             handle;
+       u64                             result;
 };
 
 struct ds_data_nack {
-       struct ds_msg_tag       tag;
-       __u64                   handle;
-       __u64                   result;
+       struct ds_msg_tag               tag;
+       struct ds_data_nack_payload     payload;
+};
+
+struct ds_unknown_msg_payload {
+       u64                             handle; /* ??? */
+};
+
+struct ds_unknown_msg {
+       struct ds_msg_tag               tag;
+       struct ds_unknown_msg_payload   payload;
+};
+
+struct ds_md_update_req {
+       u64                             req_num;
+};
+
+struct ds_md_update_res {
+       u64                             req_num;
+       u32                             result;
+};
+
+struct ds_shutdown_req {
+       u64                             req_num;
+       u32                             ms_delay;
+};
+
+struct ds_shutdown_res {
+       u64                             req_num;
+       u32                             result;
+       char                            reason[1];
+};
+
+struct ds_panic_req {
+       u64                             req_num;
+};
+
+struct ds_panic_res {
+       u64                             req_num;
+       u32                             result;
+       char                            reason[1];
+};
+
+struct ds_pri_msg {
+       u64                             req_num;
+       u64                             type;
+#define DS_PRI_REQUEST                 0x00
+#define DS_PRI_DATA                    0x01
+#define DS_PRI_UPDATE                  0x02
+};
+
+struct ds_var_hdr {
+       u32                             type;
+#define DS_VAR_SET_REQ                 0x00
+#define DS_VAR_DELETE_REQ              0x01
+#define DS_VAR_SET_RESP                        0x02
+#define DS_VAR_DELETE_RESP             0x03
+};
+
+struct ds_var_set_msg {
+       struct ds_var_hdr               hdr;
+       char                            name_and_value[0];
+};
+
+struct ds_var_delete_msg {
+       struct ds_var_hdr               hdr;
+       char                            name[0];
+};
+
+struct ds_var_resp {
+       struct ds_var_hdr               hdr;
+       u32                             result;
+#define DS_VAR_SUCCESS                 0x00
+#define DS_VAR_NO_SPACE                        0x01
+#define DS_VAR_INVALID_VAR             0x02
+#define DS_VAR_INVALID_VAL             0x03
+#define DS_VAR_NOT_PRESENT             0x04
+};
+
+struct ds_sp_token_msg {
+       u64                             req_num;
+       u64                             type;
+       __u8                            service[];
+#define DS_SPTOK_REQUEST               0x01
+};
+
+struct ds_sp_token_resp {
+       u64                             req_num;
+       u32                             result;
+       u32                             ip_addr;
+       u32                             portid;
+       __u8                            token[DS_SPTOK_TOKEN_LEN];
+#define DS_SP_TOKEN_RES_OK             0x00
+#define DS_SP_TOKEN_RES_SVC_UNKNOWN    0x01
+#define DS_SP_TOKEN_RES_SVC_UNAVAIL    0x02
+#define DS_SP_TOKEN_RES_DOWN           0x03
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+struct dr_cpu_tag {
+       u64                             req_num;
+       u32                             type;
+#define DR_CPU_CONFIGURE               0x43
+#define DR_CPU_UNCONFIGURE             0x55
+#define DR_CPU_FORCE_UNCONFIGURE       0x46
+#define DR_CPU_STATUS                  0x53
+
+/* Responses */
+#define DR_CPU_OK                      0x6f
+#define DR_CPU_ERROR                   0x65
+
+       u32                             num_records;
+};
+
+struct dr_cpu_resp_entry {
+       u32                             cpu;
+       u32                             result;
+#define DR_CPU_RES_OK                  0x00
+#define DR_CPU_RES_FAILURE             0x01
+#define DR_CPU_RES_BLOCKED             0x02
+#define DR_CPU_RES_CPU_NOT_RESPONDING  0x03
+#define DR_CPU_RES_NOT_IN_MD           0x04
+
+       u32                             stat;
+#define DR_CPU_STAT_NOT_PRESENT                0x00
+#define DR_CPU_STAT_UNCONFIGURED       0x01
+#define DR_CPU_STAT_CONFIGURED         0x02
+
+       u32                             str_off;
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
-struct ds_info;
-struct ds_cap_state {
-       __u64                   handle;
 
-       void                    (*data)(struct ds_info *dp,
-                                       struct ds_cap_state *cp,
-                                       void *buf, int len);
+/*
+ * Builtin services provided directly by this module.
+ */
+struct ds_builtin_service {
+       /* service id */
+       char            *id;
 
-       const char              *service_id;
+       /* supported max version */
+       ds_ver_t        vers;
 
-       u8                      state;
-#define CAP_STATE_UNKNOWN      0x00
-#define CAP_STATE_REG_SENT     0x01
-#define CAP_STATE_REGISTERED   0x02
+       /* callback ops for this service */
+       ds_ops_t        ops;
 };
 
-static void md_update_data(struct ds_info *dp, struct ds_cap_state *cp,
-                          void *buf, int len);
-static void domain_shutdown_data(struct ds_info *dp,
-                                struct ds_cap_state *cp,
-                                void *buf, int len);
-static void domain_panic_data(struct ds_info *dp,
-                             struct ds_cap_state *cp,
-                             void *buf, int len);
+/* Prototypes for the builtin service callbacks */
+static void ds_md_update_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
+static void ds_dom_shutdown_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
+static void ds_dom_panic_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
 #ifdef CONFIG_HOTPLUG_CPU
-static void dr_cpu_data(struct ds_info *dp,
-                       struct ds_cap_state *cp,
-                       void *buf, int len);
+static void ds_dr_cpu_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
 #endif
-static void ds_pri_data(struct ds_info *dp,
-                       struct ds_cap_state *cp,
-                       void *buf, int len);
-static void ds_var_data(struct ds_info *dp,
-                       struct ds_cap_state *cp,
-                       void *buf, int len);
-
-static struct ds_cap_state ds_states_template[] = {
+static void ds_var_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
+static void ds_sp_token_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t hdl, void *buf, size_t len);
+/*
+ * Each service can have a unique supported maj/min version, but for
+ * now we set them all to the same supported maj/min value below.
+ */
+#define        DS_CAP_MAJOR    1
+#define        DS_CAP_MINOR    0
+
+/*
+ * Builtin service providers connected to the primary domain. These
+ * service providers are started on any domain, and they are connected
+ * and consumed by the primary domain.
+ */
+static struct ds_builtin_service ds_primary_builtin_template[] = {
+
        {
-               .service_id     = "md-update",
-               .data           = md_update_data,
+               .id             = "md-update",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_md_update_data_cb},
        },
        {
-               .service_id     = "domain-shutdown",
-               .data           = domain_shutdown_data,
+               .id             = "domain-shutdown",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_dom_shutdown_data_cb},
        },
        {
-               .service_id     = "domain-panic",
-               .data           = domain_panic_data,
+               .id             = "domain-panic",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_dom_panic_data_cb},
        },
+
 #ifdef CONFIG_HOTPLUG_CPU
        {
-               .service_id     = "dr-cpu",
-               .data           = dr_cpu_data,
+               .id             = "dr-cpu",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_dr_cpu_data_cb},
        },
 #endif
+
+       /*
+        * var-config effectively behaves as a service client. But all kernel
+        * ds services are defined as providers, no matter if they actually
+        * behave as a server or as client.
+        */
        {
-               .service_id     = "pri",
-               .data           = ds_pri_data,
+               .id             = "var-config",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_var_data_cb},
        },
+};
+
+/*
+ * Builtin services connected to the SP. These services are started
+ * only on the primary domain (which is the only domain connected
+ * to the SP). The SP is the consumer of
+ * these services.
+ */
+static struct ds_builtin_service ds_sp_builtin_template[] = {
+
        {
-               .service_id     = "var-config",
-               .data           = ds_var_data,
+               .id             = "var-config-backup",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_var_data_cb},
        },
        {
-               .service_id     = "var-config-backup",
-               .data           = ds_var_data,
+               .id             = "sp-token",
+               .vers           = {DS_CAP_MAJOR, DS_CAP_MINOR},
+               .ops            = {NULL,
+                                  NULL,
+                                  ds_sp_token_data_cb},
        },
 };
 
-static DEFINE_SPINLOCK(ds_lock);
-
-struct ds_info {
-       struct ldc_channel      *lp;
-       u8                      hs_state;
-#define DS_HS_START            0x01
-#define DS_HS_DONE             0x02
+/* prototypes for local functions */
+static void ds_unregister_ldc_services(struct ds_dev *ds);
+static struct ds_service_info *ds_find_service_client_handle(
+       struct ds_dev *ds, u64 handle);
+static struct ds_service_info *ds_find_service_provider_handle(
+       struct ds_dev *ds, u64 handle);
+static struct ds_service_info *ds_find_service_client_con_handle(
+       struct ds_dev *ds, u64 handle);
+static struct ds_service_info *ds_find_service_provider_con_handle(
+       struct ds_dev *ds, u64 handle);
+static struct ds_service_info *ds_find_service_provider_id(struct ds_dev *ds,
+       char *svc_id);
+static void ds_remove_service_provider(struct ds_dev *ds,
+       struct ds_service_info *provider_svc_info);
+static struct ds_service_info *ds_add_service_provider(struct ds_dev *ds,
+       char *id, ds_ver_t vers, ds_ops_t *ops, bool is_builtin);
+static struct ds_service_info *ds_find_service_client_id(struct ds_dev *ds,
+       char *svc_id);
+static struct ds_service_info *ds_add_service_client(struct ds_dev *ds,
+       char *id, ds_ver_t vers, ds_ops_t *ops, bool is_builtin);
+static void ds_remove_service_client(struct ds_dev *ds,
+       struct ds_service_info *client_svc_info);
+static int ds_service_unreg(struct ds_dev *ds, u64 handle);
+static void ds_disconnect_service_client(struct ds_dev *ds,
+       struct ds_service_info *client_svc_info);
+static void ds_disconnect_service_provider(struct ds_dev *ds,
+       struct ds_service_info *provider_svc_info);
+
+#define        LDOMS_DEBUG_LEVEL_SETUP         "ldoms_debug_level="
+#define        LDOMS_MAX_DEBUG_LEVEL           7
+unsigned int ldoms_debug_level;
+EXPORT_SYMBOL(ldoms_debug_level);
+
+module_param(ldoms_debug_level, uint, S_IRUGO|S_IWUSR);
+
+static int __init ldoms_debug_level_setup(char *level_str)
+{
+       unsigned long level;
 
-       u64                     id;
+       if (!level_str)
+               return -EINVAL;
 
-       void                    *rcv_buf;
-       int                     rcv_buf_len;
+       level = simple_strtoul(level_str, NULL, 0);
 
-       struct ds_cap_state     *ds_states;
-       int                     num_ds_states;
+       if (level < LDOMS_MAX_DEBUG_LEVEL)
+               ldoms_debug_level = level;
 
-       struct ds_info          *next;
-};
+       return 1;
 
-static struct ds_info *ds_info_list;
+}
+__setup(LDOMS_DEBUG_LEVEL_SETUP, ldoms_debug_level_setup);
 
-static struct ds_cap_state *find_cap(struct ds_info *dp, u64 handle)
+static void ds_reset(struct ds_dev *ds)
 {
-       unsigned int index = handle >> 32;
+       dprintk("entered.\n");
 
-       if (index >= dp->num_ds_states)
-               return NULL;
-       return &dp->ds_states[index];
-}
+       ds->hs_state = DS_HS_LDC_DOWN;
 
-static struct ds_cap_state *find_cap_by_string(struct ds_info *dp,
-                                              const char *name)
-{
-       int i;
+       ds_unregister_ldc_services(ds);
 
-       for (i = 0; i < dp->num_ds_states; i++) {
-               if (strcmp(dp->ds_states[i].service_id, name))
-                       continue;
+       /* Disconnect the LDC */
+       ldc_disconnect(ds->lp);
 
-               return &dp->ds_states[i];
-       }
-       return NULL;
+       /* clear the LDC RESET flag so that the LDC can reconnect */
+       ldc_clr_reset(ds->lp);
 }
 
-static int __ds_send(struct ldc_channel *lp, void *data, int len)
+static int ds_ldc_send_msg(struct ldc_channel *lp, void *data, int len)
 {
-       int err, limit = 1000;
+       int rv, limit = 1000;
 
-       err = -EINVAL;
+       rv = -EINVAL;
        while (limit-- > 0) {
-               err = ldc_write(lp, data, len);
-               if (!err || (err != -EAGAIN))
+               rv = ldc_write(lp, data, len);
+               if (rv != -EAGAIN)
                        break;
                udelay(1);
        }
 
-       return err;
+       return rv;
 }
 
-static int ds_send(struct ldc_channel *lp, void *data, int len)
+static int ds_ldc_send_payload(struct ldc_channel *lp, u32 type,
+       void *data, int len)
 {
-       unsigned long flags;
-       int err;
+       struct ds_msg *msg;
+       size_t msglen;
+       gfp_t alloc_flags;
+       int rv;
+
+       /* This function can be called in either process or atomic mode */
+       if (in_atomic())
+               alloc_flags = GFP_ATOMIC;
+       else
+               alloc_flags = GFP_KERNEL;
+       msglen = sizeof(struct ds_msg) + len;
+       msg = kzalloc(msglen, alloc_flags);
+       if (msg == NULL)
+               return -ENOMEM;
 
-       spin_lock_irqsave(&ds_lock, flags);
-       err = __ds_send(lp, data, len);
-       spin_unlock_irqrestore(&ds_lock, flags);
+       msg->tag.type = type;
+       msg->tag.len = len;
+       memcpy(msg->payload, data, len);
 
-       return err;
-}
+       rv = ds_ldc_send_msg(lp, msg, msglen);
 
-struct ds_md_update_req {
-       __u64                           req_num;
-};
+       kfree(msg);
 
-struct ds_md_update_res {
-       __u64                           req_num;
-       __u32                           result;
-};
+       return rv;
+}
 
-static void md_update_data(struct ds_info *dp,
-                          struct ds_cap_state *cp,
-                          void *buf, int len)
+static void ds_send_data_nack(struct ds_dev *ds, u64 handle, u64 result)
 {
-       struct ldc_channel *lp = dp->lp;
-       struct ds_data *dpkt = buf;
-       struct ds_md_update_req *rp;
-       struct {
-               struct ds_data          data;
-               struct ds_md_update_res res;
-       } pkt;
-
-       rp = (struct ds_md_update_req *) (dpkt + 1);
+       struct ds_data_nack_payload req;
+       int rv;
 
-       pr_info("ds-%llu: Machine description update.\n", dp->id);
-
-       mdesc_update();
+       dprintk("entered.\n");
 
-       memset(&pkt, 0, sizeof(pkt));
-       pkt.data.tag.type = DS_DATA;
-       pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);
-       pkt.data.handle = cp->handle;
-       pkt.res.req_num = rp->req_num;
-       pkt.res.result = DS_OK;
+       req.handle = handle;
+       req.result = result;
 
-       ds_send(lp, &pkt, sizeof(pkt));
+       rv = ds_ldc_send_payload(ds->lp, DS_NACK, &req, sizeof(req));
+       if (rv <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, rv);
 }
 
-struct ds_shutdown_req {
-       __u64                           req_num;
-       __u32                           ms_delay;
+struct ds_callout_entry_hdr {
+       struct list_head                list;
+       u8                              type;
+       struct ds_dev                   *ds;
 };
-
-struct ds_shutdown_res {
-       __u64                           req_num;
-       __u32                           result;
-       char                            reason[1];
+/* callout queue entry types */
+#define        DS_QTYPE_DATA           0x1
+#define        DS_QTYPE_REG            0x2
+#define        DS_QTYPE_UNREG          0x3
+
+/* callout queue entry for data cb */
+struct ds_callout_data_entry {
+       struct ds_callout_entry_hdr     hdr;
+       u8                              data_req_type;
+       u64                             req[0];
+};
+/* data_req_type field types */
+#define        DS_DTYPE_CLIENT_REQ             0x1
+#define        DS_DTYPE_PROVIDER_REQ           0x2
+#define        DS_DTYPE_LDC_REQ                0x3
+
+/* callout queue entry for reg or unreg cb */
+struct ds_callout_reg_entry {
+       struct ds_callout_entry_hdr     hdr;
+       u64                             hdl;
 };
 
-static void domain_shutdown_data(struct ds_info *dp,
-                                struct ds_cap_state *cp,
-                                void *buf, int len)
+static struct ds_service_info *ds_callout_data_get_service(
+       struct ds_dev *ds, u8 data_req_type, u64 hdl)
 {
-       struct ldc_channel *lp = dp->lp;
-       struct ds_data *dpkt = buf;
-       struct ds_shutdown_req *rp;
-       struct {
-               struct ds_data          data;
-               struct ds_shutdown_res  res;
-       } pkt;
+       struct ds_service_info *svc_info;
 
-       rp = (struct ds_shutdown_req *) (dpkt + 1);
+       /*
+        * Find the provider or client service to which
+        * a data message is intended to be sent.
+        * If the original request was from a client, find
+        * a provider handle. If the original request was
+        * from a provider, find a client handle. If the
+        * original request was from an LDC, look for either.
+        * This check is required to support a loopback
+        * connection where both a client and provider
+        * connected in loopback mode have the same con_handle.
+        */
 
-       pr_info("ds-%llu: Shutdown request from LDOM manager received.\n",
-               dp->id);
+       svc_info = NULL;
 
-       memset(&pkt, 0, sizeof(pkt));
-       pkt.data.tag.type = DS_DATA;
-       pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);
-       pkt.data.handle = cp->handle;
-       pkt.res.req_num = rp->req_num;
-       pkt.res.result = DS_OK;
-       pkt.res.reason[0] = 0;
+       if (data_req_type == DS_DTYPE_CLIENT_REQ ||
+           data_req_type == DS_DTYPE_LDC_REQ)
+               svc_info = ds_find_service_provider_con_handle(ds, hdl);
 
-       ds_send(lp, &pkt, sizeof(pkt));
+       if (!svc_info &&
+           (data_req_type == DS_DTYPE_PROVIDER_REQ ||
+           data_req_type == DS_DTYPE_LDC_REQ))
+               svc_info = ds_find_service_client_con_handle(ds, hdl);
+
+       if (!svc_info || !svc_info->is_connected) {
+
+               if (!svc_info)
+                       dprintk("ds-%llu: Data received for "
+                           "unknown handle %llu\n", ds->id, hdl);
+               else
+                       dprintk("ds-%llu: Data received for "
+                           "unconnected handle %llu\n", ds->id, hdl);
+
+               /*
+                * If this was an LDC data packet, nack it.
+                * NOTE: If this was a loopback data packet,
+                * we should always find a connected target
+                * service and never execute this code. In
+                * the unlikely event that the loopback
+                * connection has been disconnected while the
+                * data packet is "in-flight", the packet will
+                * just be ignored and ignoring the packet is
+                * probably appropriate in that case.
+                */
+               if (data_req_type == DS_DTYPE_LDC_REQ)
+                       ds_send_data_nack(ds, hdl, DS_INV_HDL);
+
+               return NULL;
+       }
+
+       return svc_info;
 
-       orderly_poweroff(true);
 }
 
-struct ds_panic_req {
-       __u64                           req_num;
-};
+static struct ds_service_info *ds_callout_reg_get_service(
+       struct ds_dev *ds, u8 type, u64 hdl)
+{
+       struct ds_service_info *svc_info;
 
-struct ds_panic_res {
-       __u64                           req_num;
-       __u32                           result;
-       char                            reason[1];
-};
+       svc_info = ds_find_service_provider_handle(ds, hdl);
+       if (svc_info == NULL) {
+
+               svc_info = ds_find_service_client_handle(ds, hdl);
+               if (svc_info == NULL) {
+                       dprintk("ds-%llu: %s cb request received for "
+                           "unknown handle %llu\n", ds->id,
+                           (type == DS_QTYPE_REG) ? "Reg" : "Unreg", hdl);
+                       return NULL;
+               }
+       }
 
-static void domain_panic_data(struct ds_info *dp,
-                             struct ds_cap_state *cp,
-                             void *buf, int len)
+       return svc_info;
+
+}
+
+static void ds_do_callout_processing(void)
 {
-       struct ldc_channel *lp = dp->lp;
-       struct ds_data *dpkt = buf;
-       struct ds_panic_req *rp;
-       struct {
-               struct ds_data          data;
-               struct ds_panic_res     res;
-       } pkt;
+       unsigned long flags;
+       unsigned long ds_flags;
+       struct ds_dev *ds;
+       struct ds_callout_entry_hdr *qhdrp;
+       struct ds_callout_entry_hdr *tmp;
+       struct ds_callout_reg_entry *rentry;
+       struct ds_callout_data_entry *dentry;
+       struct ds_service_info *svc_info;
+       struct ds_data_req *data_req;
+       void (*reg_cb)(ds_cb_arg_t, ds_svc_hdl_t, ds_ver_t *);
+       void (*unreg_cb)(ds_cb_arg_t, ds_svc_hdl_t);
+       void (*data_cb)(ds_cb_arg_t, ds_svc_hdl_t, void *, size_t);
+       ds_cb_arg_t cb_arg;
+       ds_ver_t neg_vers;
+       u64 hdl;
+       LIST_HEAD(todo);
+
+       dprintk("ds: CPU[%d]: callout processing START\n", smp_processor_id());
+
+       /*
+        * Merge all the ds_dev callout lists into a
+        * single local todo list for processing. The
+        * ds_dev callout lists are re-initialized to empty.
+        * We do this because we cannot hold any driver locks
+        * while we process the entries (and make callbacks)
+        * because it's possible that the callbacks could
+        * call back into this driver and attempt to re-acquire
+        * the lock(s) resulting in deadlock.
+        */
+       spin_lock_irqsave(&ds_data_lock, flags);
+       list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+               LOCK_DS_DEV(ds, ds_flags)
+               list_splice_tail_init(&ds->callout_list, &todo);
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
+       list_for_each_entry_safe(qhdrp, tmp, &todo, list) {
+
+               LOCK_DS_DEV(qhdrp->ds, ds_flags)
+               /*
+                * If the ds this entry references
+                * has been deactivated, skip it.
+                * If this is the last reference to it,
+                * free the ds.
+                */
+               qhdrp->ds->co_ref_cnt--;
+
+               if (unlikely(!qhdrp->ds->active)) {
+
+                       UNLOCK_DS_DEV(qhdrp->ds, ds_flags)
 
-       rp = (struct ds_panic_req *) (dpkt + 1);
+                       if (qhdrp->ds->co_ref_cnt == 0)
+                               kfree(qhdrp->ds);
 
-       pr_info("ds-%llu: Panic request from LDOM manager received.\n",
-               dp->id);
+                       list_del(&qhdrp->list);
+                       kfree(qhdrp);
 
-       memset(&pkt, 0, sizeof(pkt));
-       pkt.data.tag.type = DS_DATA;
-       pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);
-       pkt.data.handle = cp->handle;
-       pkt.res.req_num = rp->req_num;
-       pkt.res.result = DS_OK;
-       pkt.res.reason[0] = 0;
+                       continue;
+               }
+
+               if (qhdrp->type == DS_QTYPE_DATA) {
+                       /* process data entry */
+                       dentry = (struct ds_callout_data_entry *)qhdrp;
+                       data_req = (struct ds_data_req *) dentry->req;
+                       ds = dentry->hdr.ds;
+
+                       svc_info = ds_callout_data_get_service(ds,
+                           dentry->data_req_type, data_req->payload.handle);
+
+                       if (unlikely(svc_info == NULL)) {
+                               UNLOCK_DS_DEV(ds, ds_flags)
+                               list_del(&qhdrp->list);
+                               kfree(qhdrp);
+                               continue;
+                       }
+
+                       /*
+                        * We unlock the ds_dev before we make the data
+                        * callback to enforce the rule that no locks be held
+                        * when making callbacks. However, this opens a timing
+                        * hole where a service unregistration could come in
+                        * between releasing the lock and making the callback
+                        * rendering the svc_info * stale/freed. So, copy
+                        * over the svc_info fields into locals before we
+                        * free the lock to close this very unlikely but
+                        * possible hole.
+                        */
+                       hdl = svc_info->handle;
+                       data_cb = svc_info->ops.ds_data_cb;
+                       cb_arg = svc_info->ops.cb_arg;
+
+                       UNLOCK_DS_DEV(ds, ds_flags)
+
+                       /*
+                        * We strip off the DS protocol header (ds_data_req)
+                        * portion of the data for the callback to receive.
+                        * Since tag->len includes the handle (a u64) of the
+                        * ds_data_req + the payload, we must subtract an extra
+                        * u64 from the len. This is per spec.
+                        */
+                       data_cb(cb_arg, hdl, data_req->payload.data,
+                           DS_DATA_REQ_DSIZE(data_req));
+
+               } else {
+                       /* process reg/unreg entry */
+                       rentry = (struct ds_callout_reg_entry *)qhdrp;
+                       ds = rentry->hdr.ds;
+
+                       svc_info = ds_callout_reg_get_service(ds,
+                           rentry->hdr.type, rentry->hdl);
+
+                       if (unlikely(svc_info == NULL)) {
+                               UNLOCK_DS_DEV(ds, ds_flags)
+                               list_del(&qhdrp->list);
+                               kfree(qhdrp);
+                               continue;
+                       }
+
+                       /*
+                        * We unlock the ds_dev before we make the reg/unreg
+                        * callback to enforce the rule that no locks be held
+                        * when making callbacks. However, this opens a timing
+                        * hole where a service unregistration could come in
+                        * between releasing the lock and making the callback
+                        * rendering the svc_info * stale/freed. So, copy
+                        * over the svc_info fields into locals before we
+                        * release the lock to close this very unlikely but
+                        * possible hole.
+                        */
+                       hdl = svc_info->handle;
+                       reg_cb = svc_info->ops.ds_reg_cb;
+                       unreg_cb = svc_info->ops.ds_unreg_cb;
+                       cb_arg = svc_info->ops.cb_arg;
+                       neg_vers = svc_info->neg_vers;
+
+                       UNLOCK_DS_DEV(ds, ds_flags)
+
+                       if (rentry->hdr.type == DS_QTYPE_REG) {
+                               if (reg_cb != NULL)
+                                       reg_cb(cb_arg, hdl, &neg_vers);
+                       } else {
+                               if (unreg_cb != NULL)
+                                       unreg_cb(cb_arg, hdl);
+                       }
 
-       ds_send(lp, &pkt, sizeof(pkt));
+               }
+
+               /* done processing the entry, remove it from the list */
+               list_del(&qhdrp->list);
+               kfree(qhdrp);
+       }
 
-       panic("PANIC requested by LDOM manager.");
+       dprintk("ds: CPU[%d]: callout processing END\n", smp_processor_id());
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-struct dr_cpu_tag {
-       __u64                           req_num;
-       __u32                           type;
-#define DR_CPU_CONFIGURE               0x43
-#define DR_CPU_UNCONFIGURE             0x55
-#define DR_CPU_FORCE_UNCONFIGURE       0x46
-#define DR_CPU_STATUS                  0x53
+static DECLARE_WAIT_QUEUE_HEAD(ds_wait);
 
-/* Responses */
-#define DR_CPU_OK                      0x6f
-#define DR_CPU_ERROR                   0x65
+static int ds_callout_thread(void *__unused)
+{
+       DEFINE_WAIT(wait);
+       unsigned long flags;
+       struct ds_dev *ds;
+       bool work_to_do;
 
-       __u32                           num_records;
-};
+       while (1) {
+               prepare_to_wait(&ds_wait, &wait, TASK_INTERRUPTIBLE);
 
-struct dr_cpu_resp_entry {
-       __u32                           cpu;
-       __u32                           result;
-#define DR_CPU_RES_OK                  0x00
-#define DR_CPU_RES_FAILURE             0x01
-#define DR_CPU_RES_BLOCKED             0x02
-#define DR_CPU_RES_CPU_NOT_RESPONDING  0x03
-#define DR_CPU_RES_NOT_IN_MD           0x04
+               work_to_do = false;
+               spin_lock_irqsave(&ds_data_lock, flags);
+               list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+                       if (!list_empty(&ds->callout_list)) {
+                               work_to_do = true;
+                               break;
+                       }
+               }
+               spin_unlock_irqrestore(&ds_data_lock, flags);
 
-       __u32                           stat;
-#define DR_CPU_STAT_NOT_PRESENT                0x00
-#define DR_CPU_STAT_UNCONFIGURED       0x01
-#define DR_CPU_STAT_CONFIGURED         0x02
+               if (!work_to_do)
+                       schedule();
 
-       __u32                           str_off;
-};
+               finish_wait(&ds_wait, &wait);
+
+               if (kthread_should_stop())
+                       break;
+
+               ds_do_callout_processing();
+       }
+
+       return 0;
+}
 
-static void __dr_cpu_send_error(struct ds_info *dp,
-                               struct ds_cap_state *cp,
-                               struct ds_data *data)
+static int ds_submit_reg_cb(struct ds_dev *ds, u64 hdl, u8 type)
 {
-       struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1);
-       struct {
-               struct ds_data          data;
-               struct dr_cpu_tag       tag;
-       } pkt;
-       int msg_len;
+       struct ds_callout_reg_entry *rentry;
+       gfp_t alloc_flags;
 
-       memset(&pkt, 0, sizeof(pkt));
-       pkt.data.tag.type = DS_DATA;
-       pkt.data.handle = cp->handle;
-       pkt.tag.req_num = tag->req_num;
-       pkt.tag.type = DR_CPU_ERROR;
-       pkt.tag.num_records = 0;
+       /* This function can be called in either process or atomic mode */
+       if (in_atomic())
+               alloc_flags = GFP_ATOMIC;
+       else
+               alloc_flags = GFP_KERNEL;
+
+       rentry = kzalloc(sizeof(struct ds_callout_reg_entry), alloc_flags);
+       if (!rentry)
+               return -ENOMEM;
+
+       rentry->hdr.type = type;
+       rentry->hdr.ds = ds;
+       rentry->hdl = hdl;
 
-       msg_len = (sizeof(struct ds_data) +
-                  sizeof(struct dr_cpu_tag));
+       list_add_tail(&rentry->hdr.list, &ds->callout_list);
+       ds->co_ref_cnt++;
 
-       pkt.data.tag.len = msg_len - sizeof(struct ds_msg_tag);
+       dprintk("ds-%llu: Added %s item to work queue "
+           "(co_ref_cnt=%llu)\n", ds->id,
+           (rentry->hdr.type == DS_QTYPE_REG) ? "Reg" : "Unreg",
+           ds->co_ref_cnt);
 
-       __ds_send(dp->lp, &pkt, msg_len);
+       wake_up(&ds_wait);
+
+       return 0;
 }
 
-static void dr_cpu_send_error(struct ds_info *dp,
-                             struct ds_cap_state *cp,
-                             struct ds_data *data)
+static int ds_submit_data_cb(struct ds_dev *ds, struct ds_msg_tag *pkt,
+       u8 data_type)
 {
-       unsigned long flags;
+       struct ds_callout_data_entry *dentry;
+       u64 pktlen;
+       gfp_t alloc_flags;
+
+       pktlen = (sizeof(struct ds_msg_tag) + pkt->len);
+
+       /*
+        * Data packets are added to our data thread's
+        * data work queue for later processing.
+        */
+
+       /* This function can be called in either process or atomic mode */
+       if (in_atomic())
+               alloc_flags = GFP_ATOMIC;
+       else
+               alloc_flags = GFP_KERNEL;
+
+       dentry = kzalloc(sizeof(struct ds_callout_data_entry) + pktlen,
+           alloc_flags);
+       if (!dentry)
+               return -ENOMEM;
+
+       dentry->hdr.type = DS_QTYPE_DATA;
+       dentry->hdr.ds = ds;
+       dentry->data_req_type = data_type;
+       memcpy(&dentry->req, pkt, pktlen);
+
+       list_add_tail(&dentry->hdr.list, &ds->callout_list);
+       ds->co_ref_cnt++;
+
+       dprintk("ds-%llu: Added data item (type=%u) to work queue "
+           "(co_ref_cnt=%llu)\n", ds->id, pkt->type, ds->co_ref_cnt);
+
+       wake_up(&ds_wait);
+
+       return 0;
+}
+
+/*
+ * External service registration interface functions
+ */
+int ds_cap_init(ds_capability_t *cap, ds_ops_t *ops, u32 flags,
+       u64 domain_hdl,  ds_svc_hdl_t *hdlp)
+{
+       struct ds_dev *ds;
+       struct ds_service_info *svc_info = NULL;
+       unsigned long data_flags = 0;
+       unsigned long ds_flags = 0;
+       bool is_domain;
+
+       dprintk("entered.\n");
+
+       /* validate args */
+       if (cap == NULL || ops == NULL) {
+               pr_err("%s: Error: NULL argument(s) received\n", __func__);
+               return -EINVAL;
+       }
+
+       /* flags must be set to PROVIDER or CLIENT but not both. */
+       if (!(flags & DS_CAP_IS_PROVIDER || flags & DS_CAP_IS_CLIENT) ||
+           (flags & DS_CAP_IS_PROVIDER && flags & DS_CAP_IS_CLIENT)) {
+               pr_err("%s: Error: Invalid flags argument received %u\n",
+                   __func__, flags);
+               return -EINVAL;
+       }
+
+       /* data callback must be specified, other ops callbacks can be NULL */
+       if (ops->ds_data_cb == NULL) {
+               pr_err("%s: Error: data callback op must be present\n",
+                   __func__);
+               return -EINVAL;
+       }
+
+       is_domain = ((flags & DS_TARGET_IS_DOMAIN) != 0);
+
+       /* Find the ds_dev associated with domain_hdl. */
+       spin_lock_irqsave(&ds_data_lock, data_flags);
+       ds = NULL;
+       list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+
+               LOCK_DS_DEV(ds, ds_flags)
+
+               if ((is_domain && ds->is_domain && ds->handle == domain_hdl) ||
+                   (!is_domain && !ds->is_domain))
+                       break;
+
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+       spin_unlock_irqrestore(&ds_data_lock, data_flags);
+
+       if (ds == NULL) {
+               pr_err("%s: Error: dom_hdl %llu (domain=%d) DS "
+                   "port not found\n", __func__, domain_hdl,
+                   ((flags & DS_TARGET_IS_DOMAIN) != 0));
+               return -ENODEV;
+       }
+
+       if (flags & DS_CAP_IS_PROVIDER) {
+
+               /* Check if there is already a registered service provider */
+               svc_info = ds_find_service_provider_id(ds, cap->svc_id);
+               if (svc_info != NULL) {
+                       if (svc_info->is_connected && !svc_info->is_builtin) {
+                               pr_err("%s: Error: service provider %s "
+                                   "already registered\n", __func__,
+                                   cap->svc_id);
+                               UNLOCK_DS_DEV(ds, ds_flags)
+                               return -EBUSY;
+                       } else {
+                               /*
+                                * Existing service is not connected or is
+                                * a builtin (i.e. allow external to override
+                                * builtin). Remove the service.
+                                */
+                               ds_remove_service_provider(ds, svc_info);
+                       }
+               }
+
+               svc_info = ds_add_service_provider(ds, cap->svc_id, cap->vers,
+                   ops, false);
+
+               if (svc_info == NULL) {
+                       pr_err("ds-%llu: %s: Failed to add service "
+                           "provider %s", ds->id, __func__, cap->svc_id);
+                       UNLOCK_DS_DEV(ds, ds_flags)
+                       return -ENOMEM;
+               }
+
+       } else if (flags & DS_CAP_IS_CLIENT) {
+
+               /* Check if there is already a registered service client */
+               svc_info = ds_find_service_client_id(ds, cap->svc_id);
+               if (svc_info != NULL) {
+                       if (svc_info->is_connected && !svc_info->is_builtin) {
+                               pr_err("%s: Error: service client %s "
+                                   "already registered\n", __func__,
+                                   cap->svc_id);
+                               UNLOCK_DS_DEV(ds, ds_flags)
+                               return -EBUSY;
+                       } else {
+                               /*
+                                * Existing service is not connected or is
+                                * a builtin (i.e. allow external to override
+                                * builtin). Remove the service.
+                                */
+                               ds_remove_service_client(ds, svc_info);
+                       }
+               }
+
+               svc_info = ds_add_service_client(ds, cap->svc_id, cap->vers,
+                   ops, false);
+
+               if (svc_info == NULL) {
+                       pr_err("ds-%llu: %s: Failed to add service "
+                           "client %s", ds->id, __func__, cap->svc_id);
+                       UNLOCK_DS_DEV(ds, ds_flags)
+                       return -ENOMEM;
+               }
+       }
+
+       /* populate the unique handle to passed in hdlp argument */
+       *hdlp = (ds_svc_hdl_t)svc_info->handle;
+
+       dprintk("ds-%llu: Registered %s service (%llx), client=%d\n",
+           ds->id, svc_info->id, svc_info->handle, svc_info->is_client);
+
+       UNLOCK_DS_DEV(ds, ds_flags)
+
+       return 0;
+
+}
+EXPORT_SYMBOL(ds_cap_init);
+
+int ds_cap_fini(ds_svc_hdl_t hdl)
+{
+       struct ds_dev *ds;
+       struct ds_service_info *svc_info, *tmp;
+       unsigned long flags = 0;
+       unsigned long ds_flags = 0;
+
+       dprintk("entered.\n");
+
+       /* validate args */
+       if (hdl == 0) {
+               pr_err("%s: Error: hdl argument received is 0\n", __func__);
+               return -EINVAL;
+       }
+
+       /* Find and remove all services associated with hdl. */
+
+       spin_lock_irqsave(&ds_data_lock, flags);
+
+       list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+
+               LOCK_DS_DEV(ds, ds_flags)
+
+               list_for_each_entry_safe(svc_info, tmp,
+                   &ds->service_provider_list, list) {
+                       if (svc_info->handle == (u64)hdl)
+                               ds_remove_service_provider(ds, svc_info);
+               }
+
+               list_for_each_entry_safe(svc_info, tmp,
+                   &ds->service_client_list, list) {
+                       if (svc_info->handle == (u64)hdl)
+                               ds_remove_service_client(ds, svc_info);
+               }
+
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
+       return 0;
+
+}
+EXPORT_SYMBOL(ds_cap_fini);
+
+int ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t buflen)
+{
+       struct ds_dev *ds;
+       struct ds_service_info *svc_info;
+       unsigned long flags = 0;
+       unsigned long ds_flags = 0;
+       struct ds_data_req *hdr;
+       int msglen;
+       u8 type;
+       int rv;
+
+       dprintk("entered.\n");
+
+       /* validate args */
+       if (hdl == 0) {
+               pr_err("%s: Error: hdl argument received is 0\n", __func__);
+               return -EINVAL;
+       }
+
+       if (buf == NULL) {
+               pr_err("%s: Error: Invalid NULL buffer argument\n", __func__);
+               return -EINVAL;
+       }
+
+       if (buflen == 0)
+               return 0;
+
+       /* Find the service uniquely identified by hdl */
+
+       svc_info = NULL;
+
+       spin_lock_irqsave(&ds_data_lock, flags);
+       list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+
+               LOCK_DS_DEV(ds, ds_flags)
+
+               svc_info = ds_find_service_provider_handle(ds, (u64)hdl);
+               if (svc_info == NULL)
+                       svc_info = ds_find_service_client_handle(ds,
+                           (u64)hdl);
+
+               /* if we found the hdl, break but do not release the ds_lock */
+               if (svc_info != NULL)
+                       break;
+
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
+       if (svc_info == NULL) {
+               pr_err("%s: Error: no service found "
+                   "for handle %llx\n", __func__, hdl);
+               return -ENODEV;
+       }
+
+       if (!svc_info->is_connected) {
+               pr_err("%s: Error: Service %s not connected.\n", __func__,
+                   svc_info->id);
+               UNLOCK_DS_DEV(ds, ds_flags)
+               return -ENODEV;
+       }
+
+       /* build the data packet containing the data */
+       msglen = sizeof(struct ds_data_req) + buflen;
+       hdr = kzalloc(msglen, GFP_KERNEL);
+       if (hdr == NULL) {
+               pr_err("ds-%llu: %s: failed to alloc mem for data msg.\n",
+                   ds->id, __func__);
+               UNLOCK_DS_DEV(ds, ds_flags)
+               return -ENOMEM;
+       }
+       hdr->tag.type = DS_DATA;
+       hdr->tag.len = sizeof(struct ds_data_req_payload) + buflen;
+       hdr->payload.handle = svc_info->con_handle;
+       (void) memcpy(hdr->payload.data, buf, buflen);
+
+       if (svc_info->is_loopback) {
+               /*
+                * If the service is connected via loopback, submit the
+                * packet to our local work queue.
+                */
+               type = (svc_info->is_client) ? DS_DTYPE_CLIENT_REQ
+                   : DS_DTYPE_PROVIDER_REQ;
+               rv = ds_submit_data_cb(ds, (struct ds_msg_tag *)hdr, type);
+               if (rv < 0)
+                       pr_err("ds-%llu: %s: ds_submit_data_cb failed.\n ",
+                           ds->id, __func__);
+       } else {
+               /* send the data out to the LDC */
+               rv = ds_ldc_send_msg(ds->lp, (void *)hdr, msglen);
+               if (rv <= 0) {
+                       pr_err("ds-%llu: %s: ldc_send failed.(%d)\n ",
+                           ds->id, __func__, rv);
+                       rv = -EIO;
+               } else {
+                       rv = 0;
+               }
+       }
+
+       kfree(hdr);
+
+       UNLOCK_DS_DEV(ds, ds_flags)
+
+       return rv;
+}
+EXPORT_SYMBOL(ds_cap_send);
+
+/*
+ * Builtin service callback routines
+ */
+
+static void ds_md_update_data_cb(ds_cb_arg_t arg,
+                  ds_svc_hdl_t handle, void *buf, size_t len)
+{
+       struct ds_dev *ds = (struct ds_dev *)arg;
+       struct ds_md_update_req *rp;
+       struct ds_md_update_res res;
+
+       dprintk("entered.\n");
+
+       rp = (struct ds_md_update_req *)buf;
+
+       pr_alert("ds-%llu: Machine description update.\n", ds->id);
+
+       mdesc_update();
+
+       res.req_num = rp->req_num;
+       res.result = DS_OK;
+
+       ds_cap_send(handle, &res, sizeof(struct ds_md_update_res));
+}
+
+static void ds_dom_shutdown_data_cb(ds_cb_arg_t arg,
+               ds_svc_hdl_t handle, void *buf, size_t len)
+{
+       struct ds_dev *ds = (struct ds_dev *)arg;
+       struct ds_shutdown_req *rp;
+       struct ds_shutdown_res res;
+
+       dprintk("entered.\n");
+
+       rp = (struct ds_shutdown_req *)buf;
+
+       pr_alert("ds-%llu: Shutdown request received.\n", ds->id);
+
+       res.req_num = rp->req_num;
+       res.result = DS_OK;
+       res.reason[0] = 0;
+
+       ds_cap_send(handle, &res, sizeof(struct ds_shutdown_res));
+
+       /* give a message to the console if the delay is greater than 1 sec. */
+       if (rp->ms_delay > 1000) {
+               pr_alert("ds-%llu: Shutting down in %u seconds.\n",
+                   ds->id, rp->ms_delay/1000);
+               /* delay for specified ms before shutdown */
+               mdelay(rp->ms_delay);
+       }
+
+
+       orderly_poweroff(true);
+}
+
+static void ds_dom_panic_data_cb(ds_cb_arg_t arg,
+               ds_svc_hdl_t handle, void *buf, size_t len)
+{
+       struct ds_dev *ds = (struct ds_dev *)arg;
+       struct ds_panic_req *rp;
+       struct ds_panic_res res;
+
+       dprintk("entered.\n");
+
+       rp = (struct ds_panic_req *)buf;
+
+       pr_alert("ds-%llu: Panic request received.\n", ds->id);
+
+       res.req_num = rp->req_num;
+       res.result = DS_OK;
+       res.reason[0] = 0;
+
+       ds_cap_send(handle, &res, sizeof(struct ds_panic_res));
+
+       panic("PANIC requested.\n");
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void __dr_cpu_send_error(struct ds_dev *ds,
+       u64 handle, struct dr_cpu_tag *tag)
+{
+       struct dr_cpu_tag       resp_tag;
+
+       dprintk("entered.\n");
 
-       spin_lock_irqsave(&ds_lock, flags);
-       __dr_cpu_send_error(dp, cp, data);
-       spin_unlock_irqrestore(&ds_lock, flags);
+       resp_tag.req_num = tag->req_num;
+       resp_tag.type = DR_CPU_ERROR;
+       resp_tag.num_records = 0;
+
+       ds_cap_send(handle, &resp_tag, sizeof(struct dr_cpu_tag));
 }
 
 #define CPU_SENTINEL   0xffffffff
@@ -460,6 +1574,8 @@ static void purge_dups(u32 *list, u32 num_ents)
 {
        unsigned int i;
 
+       dprintk("entered.\n");
+
        for (i = 0; i < num_ents; i++) {
                u32 cpu = list[i];
                unsigned int j;
@@ -476,25 +1592,19 @@ static void purge_dups(u32 *list, u32 num_ents)
 
 static int dr_cpu_size_response(int ncpus)
 {
-       return sizeof(struct ds_data) +
-               sizeof(struct dr_cpu_tag) +
+       return sizeof(struct dr_cpu_tag) +
                (sizeof(struct dr_cpu_resp_entry) * ncpus);
 }
 
-static void dr_cpu_init_response(struct ds_data *resp, u64 req_num,
+static void dr_cpu_init_response(struct dr_cpu_tag *tag, u64 req_num,
                                 u64 handle, int resp_len, int ncpus,
                                 cpumask_t *mask, u32 default_stat)
 {
        struct dr_cpu_resp_entry *ent;
-       struct dr_cpu_tag *tag;
        int i, cpu;
 
-       tag = (struct dr_cpu_tag *) (resp + 1);
        ent = (struct dr_cpu_resp_entry *) (tag + 1);
 
-       resp->tag.type = DS_DATA;
-       resp->tag.len = resp_len - sizeof(struct ds_msg_tag);
-       resp->handle = handle;
        tag->req_num = req_num;
        tag->type = DR_CPU_OK;
        tag->num_records = ncpus;
@@ -509,14 +1619,12 @@ static void dr_cpu_init_response(struct ds_data *resp, u64 req_num,
        BUG_ON(i != ncpus);
 }
 
-static void dr_cpu_mark(struct ds_data *resp, int cpu, int ncpus,
+static void dr_cpu_mark(struct dr_cpu_tag *tag, int cpu, int ncpus,
                        u32 res, u32 stat)
 {
        struct dr_cpu_resp_entry *ent;
-       struct dr_cpu_tag *tag;
        int i;
 
-       tag = (struct dr_cpu_tag *) (resp + 1);
        ent = (struct dr_cpu_resp_entry *) (tag + 1);
 
        for (i = 0; i < ncpus; i++) {
@@ -528,12 +1636,13 @@ static void dr_cpu_mark(struct ds_data *resp, int cpu, int ncpus,
        }
 }
 
-static int dr_cpu_configure(struct ds_info *dp, struct ds_cap_state *cp,
-                           u64 req_num, cpumask_t *mask)
+static int __cpuinit dr_cpu_configure(struct ds_dev *ds,
+       u64 handle, u64 req_num, cpumask_t *mask)
 {
-       struct ds_data *resp;
+       struct dr_cpu_tag *resp;
        int resp_len, ncpus, cpu;
-       unsigned long flags;
+
+       dprintk("entered.\n");
 
        ncpus = cpumask_weight(mask);
        resp_len = dr_cpu_size_response(ncpus);
@@ -541,7 +1650,7 @@ static int dr_cpu_configure(struct ds_info *dp, struct ds_cap_state *cp,
        if (!resp)
                return -ENOMEM;
 
-       dr_cpu_init_response(resp, req_num, cp->handle,
+       dr_cpu_init_response(resp, req_num, handle,
                             resp_len, ncpus, mask,
                             DR_CPU_STAT_CONFIGURED);
 
@@ -551,29 +1660,27 @@ static int dr_cpu_configure(struct ds_info *dp, struct ds_cap_state *cp,
        for_each_cpu(cpu, mask) {
                int err;
 
-               pr_info("ds-%llu: Starting cpu %d...\n", dp->id, cpu);
+               dprintk("ds-%llu: Starting cpu %d...\n", ds->id, cpu);
                err = cpu_up(cpu);
                if (err) {
-                       __u32 res = DR_CPU_RES_FAILURE;
-                       __u32 stat = DR_CPU_STAT_UNCONFIGURED;
+                       u32 res = DR_CPU_RES_FAILURE;
+                       u32 stat = DR_CPU_STAT_UNCONFIGURED;
 
                        if (!cpu_present(cpu)) {
                                /* CPU not present in MD */
-                               res = DR_CPU_RES_NOT_IN_MD;
                                stat = DR_CPU_STAT_NOT_PRESENT;
                        } else if (err == -ENODEV) {
                                /* CPU did not call in successfully */
                                res = DR_CPU_RES_CPU_NOT_RESPONDING;
                        }
 
-                       pr_info("ds-%llu: CPU startup failed err=%d\n", dp->id, err);
+                       pr_err("ds-%llu: CPU startup failed err=%d\n", ds->id,
+                               err);
                        dr_cpu_mark(resp, cpu, ncpus, res, stat);
                }
        }
 
-       spin_lock_irqsave(&ds_lock, flags);
-       __ds_send(dp->lp, resp, resp_len);
-       spin_unlock_irqrestore(&ds_lock, flags);
+       ds_cap_send(handle, resp, resp_len);
 
        kfree(resp);
 
@@ -583,14 +1690,13 @@ static int dr_cpu_configure(struct ds_info *dp, struct ds_cap_state *cp,
        return 0;
 }
 
-static int dr_cpu_unconfigure(struct ds_info *dp,
-                             struct ds_cap_state *cp,
-                             u64 req_num,
-                             cpumask_t *mask)
+static int dr_cpu_unconfigure(struct ds_dev *ds,
+       u64 handle, u64 req_num, cpumask_t *mask)
 {
-       struct ds_data *resp;
+       struct dr_cpu_tag *resp;
        int resp_len, ncpus, cpu;
-       unsigned long flags;
+
+       dprintk("entered.\n");
 
        ncpus = cpumask_weight(mask);
        resp_len = dr_cpu_size_response(ncpus);
@@ -598,14 +1704,14 @@ static int dr_cpu_unconfigure(struct ds_info *dp,
        if (!resp)
                return -ENOMEM;
 
-       dr_cpu_init_response(resp, req_num, cp->handle,
+       dr_cpu_init_response(resp, req_num, handle,
                             resp_len, ncpus, mask,
                             DR_CPU_STAT_UNCONFIGURED);
 
        for_each_cpu(cpu, mask) {
                int err;
 
-               pr_info("ds-%llu: Shutting down cpu %d...\n", dp->id, cpu);
+               pr_info("ds-%llu: Shutting down cpu %d...\n", ds->id, cpu);
                err = cpu_down(cpu);
                if (err)
                        dr_cpu_mark(resp, cpu, ncpus,
@@ -613,26 +1719,26 @@ static int dr_cpu_unconfigure(struct ds_info *dp,
                                    DR_CPU_STAT_CONFIGURED);
        }
 
-       spin_lock_irqsave(&ds_lock, flags);
-       __ds_send(dp->lp, resp, resp_len);
-       spin_unlock_irqrestore(&ds_lock, flags);
+       ds_cap_send(handle, resp, resp_len);
 
        kfree(resp);
 
        return 0;
 }
 
-static void dr_cpu_data(struct ds_info *dp, struct ds_cap_state *cp, void *buf,
-                       int len)
+static void __cpuinit ds_dr_cpu_data_cb(ds_cb_arg_t arg,
+               ds_svc_hdl_t handle, void *buf, size_t len)
 {
-       struct ds_data *data = buf;
-       struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1);
+       struct ds_dev *ds = (struct ds_dev *)arg;
+       struct dr_cpu_tag *tag = (struct dr_cpu_tag *)buf;
        u32 *cpu_list = (u32 *) (tag + 1);
        u64 req_num = tag->req_num;
        cpumask_t mask;
        unsigned int i;
        int err;
 
+       dprintk("entered.\n");
+
        switch (tag->type) {
        case DR_CPU_CONFIGURE:
        case DR_CPU_UNCONFIGURE:
@@ -640,7 +1746,7 @@ static void dr_cpu_data(struct ds_info *dp, struct ds_cap_state *cp, void *buf,
                break;
 
        default:
-               dr_cpu_send_error(dp, cp, data);
+               __dr_cpu_send_error(ds, handle, tag);
                return;
        }
 
@@ -656,630 +1762,2226 @@ static void dr_cpu_data(struct ds_info *dp, struct ds_cap_state *cp, void *buf,
        }
 
        if (tag->type == DR_CPU_CONFIGURE)
-               err = dr_cpu_configure(dp, cp, req_num, &mask);
+               err = dr_cpu_configure(ds, handle, req_num, &mask);
        else
-               err = dr_cpu_unconfigure(dp, cp, req_num, &mask);
+               err = dr_cpu_unconfigure(ds, handle, req_num, &mask);
 
        if (err)
-               dr_cpu_send_error(dp, cp, data);
+               __dr_cpu_send_error(ds, handle, tag);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-struct ds_pri_msg {
-       __u64                           req_num;
-       __u64                           type;
-#define DS_PRI_REQUEST                 0x00
-#define DS_PRI_DATA                    0x01
-#define DS_PRI_UPDATE                  0x02
-};
+static DEFINE_MUTEX(ds_var_mutex);
+static DECLARE_COMPLETION(ds_var_config_cb_complete);
+static DEFINE_MUTEX(ds_var_complete_mutex);
+static int ds_var_response;
 
-static void ds_pri_data(struct ds_info *dp,
-                       struct ds_cap_state *cp,
-                       void *buf, int len)
+static void ds_var_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t handle, void *buf, size_t len)
 {
-       struct ds_data *dpkt = buf;
-       struct ds_pri_msg *rp;
+       struct ds_var_resp *rp;
 
-       rp = (struct ds_pri_msg *) (dpkt + 1);
+       dprintk("entered.\n");
 
-       pr_info("ds-%llu: PRI REQ [%llx:%llx], len=%d\n", dp->id, rp->req_num,
-               rp->type, len);
-}
+       rp = (struct ds_var_resp *)buf;
 
-struct ds_var_hdr {
-       __u32                           type;
-#define DS_VAR_SET_REQ                 0x00
-#define DS_VAR_DELETE_REQ              0x01
-#define DS_VAR_SET_RESP                        0x02
-#define DS_VAR_DELETE_RESP             0x03
-};
+       dprintk("hdr.type = %u\n", rp->hdr.type);
+       dprintk("result = %u\n", rp->result);
 
-struct ds_var_set_msg {
-       struct ds_var_hdr               hdr;
-       char                            name_and_value[0];
-};
+       if (rp->hdr.type != DS_VAR_SET_RESP &&
+           rp->hdr.type != DS_VAR_DELETE_RESP)
+               return;
 
-struct ds_var_delete_msg {
-       struct ds_var_hdr               hdr;
-       char                            name[0];
-};
+       ds_var_response = rp->result;
+       wmb();
 
-struct ds_var_resp {
-       struct ds_var_hdr               hdr;
-       __u32                           result;
-#define DS_VAR_SUCCESS                 0x00
-#define DS_VAR_NO_SPACE                        0x01
-#define DS_VAR_INVALID_VAR             0x02
-#define DS_VAR_INVALID_VAL             0x03
-#define DS_VAR_NOT_PRESENT             0x04
-};
+       mutex_lock(&ds_var_complete_mutex);
+       complete(&ds_var_config_cb_complete);
+       mutex_unlock(&ds_var_complete_mutex);
+}
 
-static DEFINE_MUTEX(ds_var_mutex);
-static int ds_var_doorbell;
-static int ds_var_response;
+static DEFINE_MUTEX(ds_sp_token_mutex);
+static DECLARE_COMPLETION(ds_sp_token_cb_complete);
+static DEFINE_MUTEX(ds_sp_token_complete_mutex);
+static u32             ds_sp_token_resp_result;
+static u64             ds_sp_token_resp_req_num;
+static u64             ds_sp_token_next_req_num;
+static ds_sptok_t      ds_sp_token_data;
 
-static void ds_var_data(struct ds_info *dp,
-                       struct ds_cap_state *cp,
-                       void *buf, int len)
+static void ds_sp_token_data_cb(ds_cb_arg_t arg,
+       ds_svc_hdl_t handle, void *buf, size_t len)
 {
-       struct ds_data *dpkt = buf;
-       struct ds_var_resp *rp;
+       struct ds_dev *ds = (struct ds_dev *)arg;
+       struct ds_sp_token_resp *rp;
+
+       dprintk("entered.\n");
+
+       rp = (struct ds_sp_token_resp *)buf;
+
+       dprintk("ds-%llu: SP TOKEN REQ [%llx:%x], len=%lu ip_addr=%x (%d.%d)"
+           "portid=%d\n", ds->id, rp->req_num, rp->result, len, rp->ip_addr,
+           (rp->ip_addr & 0xFF00) >> 8, rp->ip_addr & 0xFF, rp->portid);
+
+       dprintk("[%x:%x...0x%x...:%x].\n", (__u8)rp->token[0],
+           (__u8)rp->token[1], (__u8)rp->token[11], (__u8)rp->token[19]);
+
+       (void) memcpy(&ds_sp_token_data, &(rp->ip_addr), sizeof(ds_sptok_t));
+       ds_sp_token_resp_result = rp->result;
+       ds_sp_token_resp_req_num = rp->req_num;
+       wmb();
+
+       mutex_lock(&ds_sp_token_complete_mutex);
+       complete(&ds_sp_token_cb_complete);
+       mutex_unlock(&ds_sp_token_complete_mutex);
+
+}
+
+/*
+ * Helper functions
+ */
+
+static u64 ds_get_service_timeout(void)
+{
+       u8 random_byte;
+       u64 timeout_cnt;
+
+       /*
+        * Return a random number of jiffies that is
+        * between 3000 and 9000ms in the future.
+        * XXX - make these values configurable.
+        */
+       get_random_bytes(&random_byte, 1);
+       timeout_cnt = (((random_byte % 7) + 3));
+
+       return jiffies + msecs_to_jiffies(timeout_cnt * 1000);
+
+}
+
+static struct ds_service_info *ds_find_connected_prov_service(char *svc_id)
+{
+       struct ds_dev *ds;
+       unsigned long flags;
+       unsigned long ds_flags = 0;
+       struct ds_service_info *svc_info;
+
+       spin_lock_irqsave(&ds_data_lock, flags);
+
+       list_for_each_entry(ds, &ds_data.ds_dev_list, list) {
+
+               LOCK_DS_DEV(ds, ds_flags)
+
+               svc_info = ds_find_service_provider_id(ds, svc_id);
+               if (svc_info != NULL && svc_info->is_connected) {
+                       UNLOCK_DS_DEV(ds, ds_flags)
+                       spin_unlock_irqrestore(&ds_data_lock, flags);
+                       return svc_info;
+               }
+
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
+       return NULL;
+
+}
+
+static struct ds_service_info *ds_find_service_provider_id(struct ds_dev *ds,
+       char *svc_id)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_provider_list, list) {
+               if (strncmp(svc_info->id, svc_id, DS_MAX_SVC_NAME_LEN) == 0)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_service_provider_handle(
+       struct ds_dev *ds, u64 handle)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_provider_list, list) {
+               if (svc_info->handle == handle)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_service_provider_con_handle(
+       struct ds_dev *ds, u64 handle)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_provider_list, list) {
+               if (svc_info->con_handle == handle)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_service_client_id(struct ds_dev *ds,
+       char *svc_id)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_client_list, list) {
+               if (strncmp(svc_info->id, svc_id, DS_MAX_SVC_NAME_LEN) == 0)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_service_client_handle(
+       struct ds_dev *ds, u64 handle)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_client_list, list) {
+               if (svc_info->handle == handle)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_service_client_con_handle(
+       struct ds_dev *ds, u64 handle)
+{
+       struct ds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &ds->service_client_list, list) {
+               if (svc_info->con_handle == handle)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+static struct ds_service_info *ds_find_lb_service_peer(struct ds_dev *ds,
+       struct ds_service_info *svc_info)
+{
+       struct ds_service_info *peer_svc_info;
+
+       /* if the service is a client, find a provider with the same id */
+       if (svc_info->is_client) {
+               peer_svc_info = ds_find_service_provider_id(ds, svc_info->id);
+               if (peer_svc_info && peer_svc_info->reg_state ==
+                   DS_REG_STATE_REGISTERED_LOOPBACK)
+                       return peer_svc_info;
+       } else {
+               peer_svc_info = ds_find_service_client_id(ds, svc_info->id);
+               if (peer_svc_info && peer_svc_info->reg_state ==
+                   DS_REG_STATE_REGISTERED_LOOPBACK)
+                       return peer_svc_info;
+       }
+
+       return NULL;
+}
+
+
+static u64 ds_get_new_service_handle(struct ds_dev *ds, bool is_client)
+{
+
+       u64 handle;
+
+       /*
+        * Solaris uses a couple of bits in the handle as flags.
+        * See, DS_HDL_ISCLIENT_BIT, DS_HDL_ISCNTRLD_BIT.
+        * So, to avoid using these bits in a handle we only use the
+        * bottom 30 bits. This will help avoid issues on mixed
+        * systems running both Linux and Solaris domains.
+        */
+
+       /* handle wrap at DS_HDL_ISCNTRLD_BIT - don't use 0 */
+       if (ds->next_service_handle == DS_HDL_ISCNTRLD_BIT)
+               ds->next_service_handle = 1;
+
+       handle = (ds->id << 32) | ds->next_service_handle++;
+
+       /*
+        * If the service is a client service, set the ISLCLIENT
+        * bit which is an indication (or "ping") to the other end
+        * to send a REG_REQ for the provider service.
+        */
+       if (is_client)
+               handle |= DS_HDL_ISCLIENT_BIT;
+
+       return handle;
+
+}
+
+static struct ds_service_info *ds_add_service_provider(struct ds_dev *ds,
+       char *id, ds_ver_t vers, ds_ops_t *ops, bool is_builtin)
+{
+       struct ds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       svc_info = kzalloc(sizeof(struct ds_service_info), GFP_KERNEL);
+       if (unlikely(svc_info == NULL))
+               return NULL;
+
+       svc_info->id = kmemdup(id, (strlen(id) + 1), GFP_KERNEL);
+       svc_info->vers = vers;
+       svc_info->ops = *ops;
+       svc_info->is_client = false;
+       svc_info->is_builtin = is_builtin;
+       svc_info->is_loopback = false;
+       svc_info->is_connected = false;
+       svc_info->reg_state = DS_REG_STATE_UNREG;
+       svc_info->svc_reg_timeout = ds_get_service_timeout();
+
+       /*
+        * Get a service handle to use to reference this svc_info.
+        * This handle is also used to send a REG_REQ for this service.
+        */
+       svc_info->handle = ds_get_new_service_handle(ds, false);
+       svc_info->con_handle = 0;
+
+       /* init the ops arg for builtin services to the ds */
+       if (is_builtin)
+               svc_info->ops.cb_arg = ds;
+
+       list_add_tail(&svc_info->list, &ds->service_provider_list);
+
+       return svc_info;
+}
+
+static void ds_remove_service_provider(struct ds_dev *ds,
+       struct ds_service_info *provider_svc_info)
+{
+       dprintk("entered.\n");
+
+       if (provider_svc_info->is_connected)
+               ds_disconnect_service_provider(ds, provider_svc_info);
+
+       kfree(provider_svc_info->id);
+       list_del(&provider_svc_info->list);
+       kfree(provider_svc_info);
+
+}
+
+static struct ds_service_info *ds_add_service_client(struct ds_dev *ds,
+       char *id, ds_ver_t vers, ds_ops_t *ops, bool is_builtin)
+{
+       struct ds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       svc_info = kzalloc(sizeof(struct ds_service_info), GFP_KERNEL);
+       if (unlikely(svc_info == NULL))
+               return NULL;
+
+       svc_info->id = kmemdup(id, (strlen(id) + 1), GFP_KERNEL);
+       svc_info->vers = vers;
+       svc_info->ops = *ops;
+       svc_info->is_client = true;
+       svc_info->is_builtin = is_builtin;
+       svc_info->is_loopback = false;
+       svc_info->is_connected = false;
+       svc_info->reg_state = DS_REG_STATE_UNREG;
+       svc_info->svc_reg_timeout = ds_get_service_timeout();
+
+       /* Get a service handle to use to reference this svc_info. */
+       svc_info->handle = ds_get_new_service_handle(ds, true);
+       svc_info->con_handle = 0;
+
+       /* init the ops arg for builtin services to the ds */
+       if (is_builtin)
+               svc_info->ops.cb_arg = ds;
+
+       list_add_tail(&svc_info->list, &ds->service_client_list);
+
+       return svc_info;
+}
+
+static void ds_remove_service_client(struct ds_dev *ds,
+       struct ds_service_info *client_svc_info)
+{
+       dprintk("entered.\n");
+
+        /* If the service is connected, send an unreg message */
+       if (client_svc_info->is_connected)
+               ds_disconnect_service_client(ds, client_svc_info);
+
+       kfree(client_svc_info->id);
+       list_del(&client_svc_info->list);
+       kfree(client_svc_info);
+
+}
+
+static void ds_connect_service_client(struct ds_dev *ds, u64 handle,
+       u16 major, u16 minor, struct ds_service_info *client_svc_info)
+{
+       dprintk("entered.\n");
+
+       /* assign the client to the service */
+       client_svc_info->is_loopback = false;
+       client_svc_info->con_handle = handle;
+       client_svc_info->neg_vers.major = major;
+       client_svc_info->neg_vers.minor = minor;
+       client_svc_info->reg_state = DS_REG_STATE_REGISTERED_LDC;
+       client_svc_info->is_connected = true;
+
+       /* submit the register callback */
+       (void) ds_submit_reg_cb(ds, client_svc_info->handle, DS_QTYPE_REG);
+}
+
+static void ds_disconnect_service_client(struct ds_dev *ds,
+       struct ds_service_info *client_svc_info)
+{
+       struct ds_service_info *peer_svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       peer_svc_info = NULL;
+
+       if (client_svc_info->reg_state == DS_REG_STATE_REGISTERED_LOOPBACK) {
+               peer_svc_info = ds_find_lb_service_peer(ds, client_svc_info);
+       } else if (client_svc_info->reg_state == DS_REG_STATE_REGISTERED_LDC) {
+               rv = ds_service_unreg(ds, client_svc_info->con_handle);
+               if (rv != 0) {
+                       pr_err("ds-%llu: %s: failed to send UNREG_REQ for "
+                           "handle %llx (%d)\n", ds->id, __func__,
+                           client_svc_info->con_handle, rv);
+               }
+       }
+       client_svc_info->is_loopback = false;
+       client_svc_info->con_handle = 0;
+       client_svc_info->neg_vers.major = 0;
+       client_svc_info->neg_vers.minor = 0;
+       client_svc_info->reg_state = DS_REG_STATE_UNREG;
+       client_svc_info->is_connected = false;
+       client_svc_info->svc_reg_timeout = ds_get_service_timeout();
+
+       /* submit the unregister callback */
+       (void) ds_submit_reg_cb(ds, client_svc_info->handle, DS_QTYPE_UNREG);
+
+       /* if it was a loopback connection, disconnect the peer */
+       if (peer_svc_info)
+               ds_disconnect_service_provider(ds, peer_svc_info);
+}
+
+/*
+ * Mark a provider service as connected over LDC, recording the peer's
+ * connection handle and the negotiated (major, minor) protocol version,
+ * then submit the register callback for the service's own handle.
+ */
+static void ds_connect_service_provider(struct ds_dev *ds, u64 handle,
+       u16 major, u16 minor, struct ds_service_info *provider_svc_info)
+{
+       dprintk("entered.\n");
+
+       /* register the provider */
+       provider_svc_info->is_loopback = false;
+       provider_svc_info->con_handle = handle;
+       provider_svc_info->neg_vers.major = major;
+       provider_svc_info->neg_vers.minor = minor;
+       provider_svc_info->reg_state = DS_REG_STATE_REGISTERED_LDC;
+       provider_svc_info->is_connected = true;
+
+       /* submit the register callback */
+       (void) ds_submit_reg_cb(ds, provider_svc_info->handle, DS_QTYPE_REG);
+
+}
+
+/*
+ * Tear down a connected provider service (mirror image of
+ * ds_disconnect_service_client()).
+ *
+ * For an LDC registration an UNREG_REQ is first sent to the peer (failure
+ * to send is logged but not fatal); for a loopback registration the local
+ * peer client is looked up so it can be disconnected afterwards.  The
+ * service is reset to the UNREG state, its registration timeout is
+ * re-armed, and an unregister callback is submitted.
+ */
+static void ds_disconnect_service_provider(struct ds_dev *ds,
+       struct ds_service_info *provider_svc_info)
+{
+       struct ds_service_info *peer_svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       peer_svc_info = NULL;
+       if (provider_svc_info->reg_state == DS_REG_STATE_REGISTERED_LOOPBACK) {
+               peer_svc_info = ds_find_lb_service_peer(ds, provider_svc_info);
+       } else if (provider_svc_info->reg_state ==
+           DS_REG_STATE_REGISTERED_LDC) {
+               rv = ds_service_unreg(ds, provider_svc_info->con_handle);
+               if (rv != 0) {
+                       pr_err("ds-%llu: %s: failed to send UNREG_REQ for "
+                           "handle %llx (%d)\n", ds->id, __func__,
+                           provider_svc_info->con_handle, rv);
+               }
+       }
+       /* reset the service back to the unconnected/unregistered state */
+       provider_svc_info->is_loopback = false;
+       provider_svc_info->con_handle = 0;
+       provider_svc_info->neg_vers.major = 0;
+       provider_svc_info->neg_vers.minor = 0;
+       provider_svc_info->reg_state = DS_REG_STATE_UNREG;
+       provider_svc_info->is_connected = false;
+       provider_svc_info->svc_reg_timeout = ds_get_service_timeout();
+
+       /* submit the unregister callback */
+       (void) ds_submit_reg_cb(ds, provider_svc_info->handle, DS_QTYPE_UNREG);
+
+       /* if it was a loopback connection, disconnect the peer */
+       if (peer_svc_info)
+               ds_disconnect_service_client(ds, peer_svc_info);
+}
+
+/*
+ * Connect a local service pair in loopback (no LDC traffic involved).
+ *
+ * The two services' major versions must match exactly; the negotiated
+ * minor is the lower of the two minors.  On success both services move
+ * to the REGISTERED_LOOPBACK state, both use svc_info's handle as the
+ * connection handle, and a register callback is submitted for each.
+ *
+ * Returns 0 on success, -EINVAL on a major version mismatch.
+ *
+ * Fix vs. original: the failure message read "due version incompatibilty";
+ * corrected to "due to version incompatibility".
+ */
+static int ds_connect_loopback_service(struct ds_dev *ds,
+       struct ds_service_info *svc_info,
+       struct ds_service_info *peer_svc_info)
+{
+       ds_ver_t neg_vers;
+
+       dprintk("entered.\n");
+
+       /* First check to make sure the versions are compatible */
+       if (svc_info->vers.major != peer_svc_info->vers.major) {
+               pr_err("ds-%llu: failed to connect loopback service %s due "
+                   "to version incompatibility (%llu, %llu)\n", ds->id,
+                   svc_info->id, svc_info->vers.major,
+                   peer_svc_info->vers.major);
+               return -EINVAL;
+       }
+
+       /* create the negotiated version */
+       neg_vers.minor = min_t(u64, svc_info->vers.minor,
+           peer_svc_info->vers.minor);
+       neg_vers.major = svc_info->vers.major;
+
+       /* establish the loopback connection */
+       svc_info->is_loopback = true;
+       svc_info->neg_vers = neg_vers;
+       svc_info->reg_state = DS_REG_STATE_REGISTERED_LOOPBACK;
+       svc_info->con_handle = svc_info->handle;
+       svc_info->is_connected = true;
+       peer_svc_info->is_loopback = true;
+       peer_svc_info->neg_vers = neg_vers;
+       peer_svc_info->reg_state = DS_REG_STATE_REGISTERED_LOOPBACK;
+       peer_svc_info->con_handle = svc_info->handle;
+       peer_svc_info->is_connected = true;
+
+       /* submit the register callbacks */
+       (void) ds_submit_reg_cb(ds, svc_info->handle, DS_QTYPE_REG);
+       (void) ds_submit_reg_cb(ds, peer_svc_info->handle, DS_QTYPE_REG);
+
+       return 0;
+}
+
+/*
+ * Disconnect every provider and client service on this ds_dev that is
+ * currently registered over LDC; loopback registrations are left alone.
+ * The disconnect helpers above do not unlink entries from these lists,
+ * so the plain (non-_safe) list walk is sufficient here.
+ */
+static void ds_unregister_ldc_services(struct ds_dev *ds)
+{
+       struct ds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       list_for_each_entry(svc_info, &ds->service_provider_list, list) {
+               if (svc_info->reg_state == DS_REG_STATE_REGISTERED_LDC)
+                       ds_disconnect_service_provider(ds, svc_info);
+       }
+
+       list_for_each_entry(svc_info, &ds->service_client_list, list) {
+               if (svc_info->reg_state == DS_REG_STATE_REGISTERED_LDC)
+                       ds_disconnect_service_client(ds, svc_info);
+       }
+
+}
+
+/*
+ * Requeue registration for services whose DS_REG_REQ was sent but never
+ * answered: any service stuck in REG_SENT is moved back to UNREG and its
+ * registration timeout re-armed so registration will be retried later
+ * (per comments elsewhere in this file, by the timer thread).
+ */
+static void ds_reregister_ldc_services(struct ds_dev *ds)
+{
+       struct ds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       list_for_each_entry(svc_info, &ds->service_provider_list, list) {
+               if (svc_info->reg_state == DS_REG_STATE_REG_SENT) {
+                       svc_info->reg_state = DS_REG_STATE_UNREG;
+                       svc_info->svc_reg_timeout = ds_get_service_timeout();
+               }
+       }
+
+       list_for_each_entry(svc_info, &ds->service_client_list, list) {
+               if (svc_info->reg_state == DS_REG_STATE_REG_SENT) {
+                       svc_info->reg_state = DS_REG_STATE_UNREG;
+                       svc_info->svc_reg_timeout = ds_get_service_timeout();
+               }
+       }
+
+}
+
+/*
+ * Remove every provider and client service from this ds_dev.
+ * The _safe iterator is used because ds_remove_service_provider()/
+ * ds_remove_service_client() presumably unlink the current entry from
+ * the list (their bodies are not in this hunk - NOTE(review): verify).
+ */
+static void ds_remove_services(struct ds_dev *ds)
+{
+       struct ds_service_info *svc_info, *tmp;
+
+       dprintk("entered.\n");
+
+       list_for_each_entry_safe(svc_info, tmp,
+           &ds->service_provider_list, list) {
+               ds_remove_service_provider(ds, svc_info);
+       }
+
+       list_for_each_entry_safe(svc_info, tmp,
+           &ds->service_client_list, list) {
+               ds_remove_service_client(ds, svc_info);
+       }
+
+}
+
+/*
+ * DS Kernel Interface functions
+ */
+/*
+ * Set an LDOM variable through the "var-config" provider service
+ * (falling back to "var-config-backup" if var-config is not connected).
+ *
+ * Variable name and value are each limited to 254 characters and are
+ * packed as consecutive NUL-terminated strings after the ds_var_set_msg
+ * header; the message length is rounded up to a multiple of 4 bytes.
+ * The call then waits up to DS_RESPONSE_TIMEOUT seconds for the
+ * var-config callback to post the result in ds_var_response.
+ *
+ * Failures are logged but not reported to the caller (void return).
+ */
+void ldom_set_var(const char *var, const char *value)
+{
+       struct ds_service_info *svc_info;
+       union {
+               struct ds_var_set_msg   msg;
+               char                    all[512];
+       } payload;
+       char  *base, *p;
+       int msg_len;
+       int rv;
+
+       dprintk("entered.\n");
+
+       if (var == NULL) {
+               pr_err("%s: Invalid NULL variable name argument.\n", __func__);
+               return;
+       }
+
+       if (value == NULL) {
+               pr_err("%s: Invalid NULL variable value argument.\n", __func__);
+               return;
+       }
+
+       if (strlen(var) > 254) {
+               pr_err("%s: Variable name too long.\n", __func__);
+               return;
+       }
+
+       if (strlen(value) > 254) {
+               pr_err("%s: Variable value too long.\n", __func__);
+               return;
+       }
+
+       svc_info = ds_find_connected_prov_service("var-config");
+       if (svc_info == NULL)
+               svc_info = ds_find_connected_prov_service("var-config-backup");
+
+       if (svc_info == NULL) {
+               pr_err("%s: var-config and var-config-backup service "
+                   "not registered. Failed to set (%s) variable "
+                   "to (%s).\n", __func__, var, value);
+               return;
+       }
+
+       dprintk("%s: found %s client service\n", __func__, svc_info->id);
+
+       /* build "name\0value\0" after the message header */
+       memset(&payload, 0, sizeof(payload));
+       payload.msg.hdr.type = DS_VAR_SET_REQ;
+       base = p = &payload.msg.name_and_value[0];
+       strcpy(p, var);
+       p += strlen(var) + 1;
+       strcpy(p, value);
+       p += strlen(value) + 1;
+
+       msg_len = (sizeof(struct ds_var_set_msg) + (p - base));
+       /* round the message length up to a multiple of 4 */
+       msg_len = (msg_len + 3) & ~3;
+
+       mutex_lock(&ds_var_mutex);
+
+       /* publish the "no response yet" marker before sending the request */
+       ds_var_response = -1;
+       wmb();
+
+       /*
+        * (re)init the completion var to help guarantee
+        * responses are for this request (and not an older
+        * request which came in late). Use a mutex to protect
+        * against the possibility of re-initializing at the same time
+        * as the callout thread calling complete() in the callback.
+        */
+       mutex_lock(&ds_var_complete_mutex);
+       init_completion(&ds_var_config_cb_complete);
+       mutex_unlock(&ds_var_complete_mutex);
+
+       rv = ds_cap_send(svc_info->handle, &payload, msg_len);
+
+       if (!rv) {
+               /* wait for response here */
+               wait_for_completion_timeout(&ds_var_config_cb_complete,
+                   (DS_RESPONSE_TIMEOUT * HZ));
+       }
+
+       if (ds_var_response != DS_VAR_SUCCESS)
+               pr_err("%s: var-config [%s:%s] failed, response(%d).\n",
+                   __func__, var, value, ds_var_response);
+
+       mutex_unlock(&ds_var_mutex);
+
+       return;
+
+}
+
+/*
+ * Request an SP token for @service_name via the "sp-token" provider
+ * service.
+ *
+ * The request carries a per-call sequence number (ds_sp_token_next_req_num,
+ * serialized by ds_sp_token_mutex); after sending, the function waits up
+ * to DS_RESPONSE_TIMEOUT seconds for the callback to complete
+ * ds_sp_token_cb_complete.  Replies whose sequence number does not match
+ * are treated as stale and the wait is retried.  On a valid, successful
+ * reply the result and token data are copied to the caller.
+ *
+ * Returns 0 on success, -EINVAL for a bad service name, -EIO if the
+ * sp-token service is not connected or the SP reported an error,
+ * -ETIMEDOUT if no reply arrived, -ENOMEM on allocation failure, or a
+ * non-zero ds_cap_send() result if the send itself failed.
+ */
+static int ldom_req_sp_token(const char *service_name, u32 *sp_token_result,
+       ds_sptok_t *sp_token_data)
+{
+       struct ds_service_info *svc_info;
+       struct ds_sp_token_msg  *payload;
+       int     svc_len;        /* length of service_name string */
+       int     payload_len;    /* length of ds_sp_token_msg payload */
+       int rv;
+
+       dprintk("entered.\n");
+
+       if (service_name == NULL) {
+               pr_err("%s: Invalid NULL service name argument.\n", __func__);
+               return -EINVAL;
+       }
+
+       svc_info = ds_find_connected_prov_service("sp-token");
+       if (svc_info == NULL) {
+               pr_err("%s: sp-token service not registered.\n", __func__);
+               return -EIO;
+       }
+
+       /* NOTE(review): service_name cannot be NULL here (checked above) */
+       svc_len = (service_name == NULL || *service_name == '\0') ? 0 :
+           strlen(service_name) + 1;
+       if (svc_len > DS_MAX_SVC_NAME_LEN) {
+               pr_err("%s: service name '%s' too long.\n",
+                   __func__, service_name);
+               return -EINVAL;
+       }
+
+       payload_len = sizeof(struct ds_sp_token_msg) + svc_len;
+       payload = kzalloc(payload_len, GFP_KERNEL);
+       if (payload == NULL) {
+               pr_err("%s: failed to alloc mem for msg.\n", __func__);
+               return -ENOMEM;
+       }
+
+       payload->type = DS_SPTOK_REQUEST;
+       (void) memcpy(payload->service, service_name, svc_len);
+
+       mutex_lock(&ds_sp_token_mutex);
+
+       /* tag the request with this call's sequence number */
+       payload->req_num = ds_sp_token_next_req_num;
+
+       dprintk("%s: sizeof ds_sp_token_msg=%lu svclen=%d.\n",
+           __func__, sizeof(struct ds_sp_token_msg), svc_len);
+       dprintk("req_num %llu: payload(%p): type[0x%llx] svc[%s].\n",
+           payload->req_num, payload, payload->type, payload->service);
+
+       /* set init values */
+       ds_sp_token_resp_req_num = ~0;
+       ds_sp_token_resp_result = ~0;
+       wmb();
+
+       /*
+        * (re)init the completion var to help guarantee
+        * responses are for this request (and not an older
+        * request which came in late). Use a mutex to protect
+        * against the possibility of re-initializing at the same time
+        * as the callout thread calling complete() in the callback.
+        */
+       mutex_lock(&ds_sp_token_complete_mutex);
+       init_completion(&ds_sp_token_cb_complete);
+       mutex_unlock(&ds_sp_token_complete_mutex);
+
+       rv = ds_cap_send(svc_info->handle, payload, payload_len);
+
+       kfree(payload);
+
+       if (!rv) {
+
+               while (1) {
+                       /* wait for response here */
+                       rv = wait_for_completion_timeout(
+                           &ds_sp_token_cb_complete,
+                           (DS_RESPONSE_TIMEOUT * HZ));
+
+                       if (!rv) {
+                               pr_err("%s: set-token failed: no reply.\n",
+                                   __func__);
+                               rv = -ETIMEDOUT;
+                               break;
+                       }
+
+                       /* got a reply, validate it */
+
+                       /* If the response wasn't for this request, try again */
+                       if (ds_sp_token_resp_req_num !=
+                           ds_sp_token_next_req_num) {
+                               continue;
+                       }
+
+                       /* if we didn't get a valid reply, abort */
+                       if (ds_sp_token_resp_result != DS_SP_TOKEN_RES_OK) {
+                               pr_err("%s: set-token failed [%d].\n", __func__,
+                                   ds_sp_token_resp_result);
+                               rv = -EIO;
+                               break;
+                       } else {
+                               /*
+                                * Got a valid response.
+                                * Copy the response/result to caller.
+                                */
+                               *sp_token_result = ds_sp_token_resp_result;
+                               *sp_token_data = ds_sp_token_data;
+                               rv = 0;
+                               break;
+                       }
+               }
+       }
+
+       /* increment sequence number for next caller - wrap at ~0 */
+       if (++ds_sp_token_next_req_num == ~0)
+               ds_sp_token_next_req_num = 0;
+
+       mutex_unlock(&ds_sp_token_mutex);
+
+       return rv;
+}
+
+/* Buffer holding the full "boot <command>" string passed to the HV/SP. */
+static char full_boot_str[256] __aligned(32);
+/* Non-zero if the sun4v reboot-data hypervisor API is usable (set elsewhere). */
+static int reboot_data_supported;
+
+/*
+ * Reboot the domain.  If @boot_command is non-empty, "boot <boot_command>"
+ * is recorded first - via the sun4v reboot-data hypervisor API when
+ * supported, otherwise via the "reboot-command" LDOM variable - and then
+ * a machine SIR (soft reset) is issued through sun4v_mach_sir().
+ *
+ * Fix vs. original: the boot command was copied into the fixed 256-byte
+ * full_boot_str with unchecked strcpy(), overflowing the buffer for long
+ * commands; snprintf() now bounds (and if necessary truncates) the copy.
+ */
+void ldom_reboot(const char *boot_command)
+{
+       dprintk("entered.\n");
+
+       /*
+        * Don't bother with any of this if the boot_command
+        * is empty.
+        */
+       if (boot_command && strlen(boot_command)) {
+               unsigned long len;
+
+               /* bounded copy: "boot " + command, truncated to fit */
+               snprintf(full_boot_str, sizeof(full_boot_str), "boot %s",
+                   boot_command);
+               len = strlen(full_boot_str);
+
+               if (reboot_data_supported) {
+                       unsigned long ra = kimage_addr_to_ra(full_boot_str);
+                       unsigned long hv_ret;
+
+                       hv_ret = sun4v_reboot_data_set(ra, len);
+                       if (hv_ret != HV_EOK)
+                               pr_err("%s: Unable to set reboot "
+                                   "data hv_ret=%lu\n", __func__, hv_ret);
+               } else {
+                       ldom_set_var("reboot-command", full_boot_str);
+               }
+       }
+       sun4v_mach_sir();
+}
+
+/*
+ * Power off the domain via the sun4v mach_exit hypervisor call with
+ * exit code 0 (presumably does not return on success).
+ */
+void ldom_power_off(void)
+{
+       dprintk("entered.\n");
+
+       sun4v_mach_exit(0);
+}
+
+/*
+ * Handle a data-message NACK received from the peer.
+ *
+ * DS_INV_HDL means the peer has no service bound to the handle we used;
+ * the peer's client is assumed gone and an UNREG_REQ is sent to clean up.
+ * DS_TYPE_UNKNOWN means the peer did not understand the msg_type of a
+ * packet we sent; it is logged and otherwise ignored.
+ *
+ * Returns 0, or the non-zero result of ds_service_unreg() if sending
+ * the UNREG_REQ failed.
+ *
+ * Fix vs. original: 'rv' was returned uninitialized (undefined behavior)
+ * when payload.result matched neither known code; it now defaults to 0
+ * and unknown result codes are logged via a default case.  The stray ';'
+ * after the switch was also removed.
+ */
+static int ds_handle_data_nack(struct ds_dev *ds, struct ds_msg_tag *pkt)
+{
+       int rv = 0;
+       struct ds_data_nack *data_nack;
+
+       dprintk("entered.\n");
+
+       data_nack = (struct ds_data_nack *)pkt;
+
+       switch (data_nack->payload.result) {
+       case DS_INV_HDL:
+
+               pr_err("ds-%llu: received INV_HDL data NACK for "
+                       "handle %llx\n", ds->id, data_nack->payload.handle);
+
+               /*
+                * If we got back an DS_INV_HDL data nack, it means
+                * the other side could not find a handle associated
+                * with a data pack we sent to it. So, we interpret this
+                * to mean the other side's client has gone away, so we
+                * send an unregister request to clean things up.
+                */
+               rv = ds_service_unreg(ds, data_nack->payload.handle);
+               if (rv != 0) {
+                       pr_err("ds-%llu: failed to send UNREG_REQ for "
+                           "handle %llx on data NACK (%d)\n", ds->id,
+                           data_nack->payload.handle, rv);
+               }
+
+               break;
+
+       case DS_TYPE_UNKNOWN:
+
+               /*
+                * If we got back a TYPE_UNKNOWN, it means the other side
+                * got an unknown msg_type from a pkt we sent to it. Maybe
+                * it's an older/buggy driver? What to do?
+                */
+               pr_err("ds-%llu: received UNKNOWN data NACK for "
+                       "handle %llx\n", ds->id, data_nack->payload.handle);
+
+               break;
+
+       default:
+               /* unrecognized NACK result code - log it and carry on */
+               pr_err("ds-%llu: received data NACK with unknown result "
+                   "for handle %llx\n", ds->id, data_nack->payload.handle);
+               break;
+       }
+
+       return rv;
+}
+
+/*
+ * Dispatch an incoming data-channel packet: DS_DATA packets are handed
+ * to the data callback machinery, DS_NACK packets to the NACK handler,
+ * and anything else is answered with a TYPE_UNKNOWN data NACK.
+ */
+static int ds_data_msg(struct ds_dev *ds, struct ds_msg_tag *pkt)
+{
+       struct ds_unknown_msg *umsg;
+       int ret;
+
+       dprintk("entered.\n");
+
+       if (pkt->type == DS_DATA) {
+               ret = ds_submit_data_cb(ds, pkt, DS_DTYPE_LDC_REQ);
+       } else if (pkt->type == DS_NACK) {
+               ret = ds_handle_data_nack(ds, pkt);
+       } else {
+               /*
+                * XXX - If we receive an unknown msg_type, per spec,
+                * we are supposed to send back a nack with the handle
+                * However, since this is an unknown msg_type,
+                * we don't know how to retrieve the handle from the msg!
+                * (a deficiency with the protocol). Let's just hope
+                * the handle is the first 8 bytes of the payload...?
+                */
+               umsg = (struct ds_unknown_msg *)pkt;
+               ds_send_data_nack(ds, umsg->payload.handle,
+                   DS_TYPE_UNKNOWN);
+               ret = 0;
+       }
+
+       return ret;
+}
+
+/*
+ * Send a DS_REG_REQ for a local service over this ds_dev's LDC.
+ * The payload carries the service's unique handle, its supported
+ * version and the NUL-terminated service id string appended to
+ * ds_reg_req_payload.
+ *
+ * NOTE(review): assumes svc_info->id fits in the 256-byte id_buf -
+ * service ids are presumably bounded elsewhere; verify.
+ *
+ * Returns 0 if the payload was sent (ds_ldc_send_payload returned a
+ * positive byte count), non-zero otherwise.
+ */
+static int ds_service_reg(struct ds_dev *ds, struct ds_service_info *svc_info)
+{
+       int rv;
+       int payload_len;
+       struct {
+               struct ds_reg_req_payload req;
+               u8 id_buf[256];
+       } pbuf;
+
+       dprintk("entered.\n");
+
+       payload_len = (sizeof(struct ds_reg_req_payload) +
+                  strlen(svc_info->id) + 1);
+
+       /* adjust for 4 bytes of default padding of ds_reg_req_payload */
+       payload_len -= 4;
+
+       memset(&pbuf, 0, sizeof(pbuf));
+       pbuf.req.handle = svc_info->handle; /* use the unique handle */
+       pbuf.req.major = svc_info->vers.major;
+       pbuf.req.minor = svc_info->vers.minor;
+       strcpy(pbuf.req.svc_id, svc_info->id);
+
+       rv = ds_ldc_send_payload(ds->lp, DS_REG_REQ, &pbuf, payload_len);
+
+       if (rv > 0)
+               dprintk("ds-%llu: DS_REG_REQ sent for %s service (%llu.%llu), "
+                   "hdl=(%llx)\n", ds->id, svc_info->id, svc_info->vers.major,
+                   svc_info->vers.minor, svc_info->handle);
+
+       return (rv <= 0);
+}
+
+/*
+ * Send a DS_UNREG_REQ for the given connection handle over the LDC.
+ * Returns 0 when the payload was sent, non-zero on send failure.
+ */
+static int ds_service_unreg(struct ds_dev *ds, u64 handle)
+{
+       struct ds_unreg_req_payload payload;
+       int nbytes;
+
+       dprintk("entered.\n");
+
+       payload.handle = handle;
+
+       nbytes = ds_ldc_send_payload(ds->lp, DS_UNREG_REQ, &payload,
+           sizeof(payload));
+
+       /* success means a positive byte count came back from the send */
+       return (nbytes <= 0);
+}
+
+/*
+ * ACK a peer's DS_REG_REQ for @handle, advertising the minor version
+ * we support.  A failed LDC send is logged but not propagated.
+ */
+static void ds_service_ack(struct ds_dev *ds, u64 handle, u16 minor)
+{
+       struct ds_reg_ack_payload ack;
+       int nbytes;
+
+       dprintk("entered.\n");
+
+       ack.handle = handle;
+       ack.minor = minor;
+
+       nbytes = ds_ldc_send_payload(ds->lp, DS_REG_ACK, &ack, sizeof(ack));
+       if (nbytes <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, nbytes);
+}
+
+/*
+ * NACK a peer's DS_REG_REQ for @handle, carrying the rejection @result
+ * code and the major version we would accept.  A failed LDC send is
+ * logged but not propagated.
+ */
+static void ds_service_nack(struct ds_dev *ds, u64 handle, u64 result,
+       u16 major)
+{
+       struct ds_reg_nack_payload req;
+       int rv;
+
+       dprintk("entered.\n");
+
+       req.handle = handle;
+       req.result = result;
+       req.major = major;
+
+       rv = ds_ldc_send_payload(ds->lp, DS_REG_NACK, &req, sizeof(req));
+       if (rv <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, rv);
+
+}
+
+/*
+ * ACK a peer's DS_UNREG_REQ for @handle.  A failed LDC send is logged
+ * but not propagated.
+ */
+static void ds_service_unreg_ack(struct ds_dev *ds, u64 handle)
+{
+       struct ds_unreg_ack_payload req;
+       int rv;
+
+       dprintk("entered.\n");
+
+       req.handle = handle;
+
+       rv = ds_ldc_send_payload(ds->lp, DS_UNREG_ACK, &req, sizeof(req));
+       if (rv <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, rv);
+
+}
+
+/*
+ * Process DS service registration packets received from LDC.
+ *
+ * Only valid once the basic handshake is complete (ds->hs_state ==
+ * DS_HS_COMPLETE); a registration packet seen earlier, or a packet of
+ * unexpected type, forces a connection reset via ds_reset().
+ *
+ * Handles DS_REG_REQ/ACK/NACK and DS_UNREG_REQ/ACK/NACK; the per-type
+ * actions are described in the large comment inside the function.
+ *
+ * Returns 0 on normal processing, -ECONNRESET after a reset.
+ */
+static int ds_handshake_reg(struct ds_dev *ds, struct ds_msg_tag *pkt)
+{
+       int rv;
+       u16 neg_svc_minor;
+       struct ds_reg_req *reg_req = NULL;
+       struct ds_reg_ack *reg_ack = NULL;
+       struct ds_reg_nack *reg_nack = NULL;
+       struct ds_unreg_req *unreg_req = NULL;
+       struct ds_unreg_ack *unreg_ack = NULL;
+       struct ds_unreg_nack *unreg_nack = NULL;
+       struct ds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       rv = 0;
+
+       if (ds->hs_state != DS_HS_COMPLETE) {
+               /*
+                * We should not be getting service registration type
+                * packets unless the HS has been established, so reset
+                * to get back to a sane state.
+                */
+               pr_err("ds-%llu: ds_handshake_reg: received REG packet "
+                   "but HS is not complete!\n", ds->id);
+               goto conn_reset;
+       }
+
+       /*
+        * In HS_COMPLETE state, we expect only the following service
+        * registration packets:
+        * DS_REG_REQ: The other end of the LDC is requesting registration
+        *             of a service.
+        *             Action:
+        *             If we have a provider or client registered for
+        *             this service, ACK with the supported minor and
+        *             connect the service.
+        *             Use major sent in request and lowest minor.
+        *             If we don't have a registered service, NACK it.
+        * DS_REG_ACK: The other end of the LDC has ACK'd our request to
+        *             register a service.
+        *             Action:
+        *             Use the handle sent in the ACK.
+        *             Use the major sent with the original request and
+        *             lowest minor.
+        * DS_REG_NACK: The other end of the LDC has NACK'd our request
+        *             to register a service.
+        *
+        * DS_UNREG_REQ:
+        * DS_UNREG_ACK:
+        * DS_UNREG_NACK: Behave according to the spec.
+        */
+
+       if (pkt->type == DS_REG_REQ) {
+
+               /* Other end has sent a register request */
+
+               reg_req = (struct ds_reg_req *)pkt;
+
+               /*
+                * For compatibility with Solaris ldoms on mixed
+                * systems, if we receive a REG_REQ with the
+                * DS_HDL_ISCLIENT_BIT, it is an indication (or "ping")
+                * to send a REG_REQ for any provider services for this
+                * svc_id.
+                */
+
+               if (reg_req->payload.handle & DS_HDL_ISCLIENT_BIT) {
+
+                       dprintk("ds-%llu: Received REG_REQ 'ping' "
+                           "for %s service", ds->id,
+                           reg_req->payload.svc_id);
+
+                       /*
+                        * If there is a provider service in SENT
+                        * state (which means the service never got
+                        * connected), put it back into UNREG state
+                        * so it will be registered again.
+                        */
+                       svc_info = ds_find_service_provider_id(ds,
+                           reg_req->payload.svc_id);
+                       if (svc_info != NULL &&
+                           svc_info->reg_state == DS_REG_STATE_REG_SENT) {
+                               svc_info->reg_state = DS_REG_STATE_UNREG;
+                               svc_info->svc_reg_timeout =
+                                   ds_get_service_timeout();
+                       }
+
+                       goto done;
+
+               }
+
+               /* check if there is a registered service for this request */
+               svc_info = ds_find_service_client_id(ds,
+                   reg_req->payload.svc_id);
+               if (svc_info == NULL) {
+                       svc_info = ds_find_service_provider_id(ds,
+                           reg_req->payload.svc_id);
+                       if (svc_info == NULL) {
+                               /* There is no registered service */
+                               dprintk("ds-%llu: no service registered for "
+                                   "REG_REQ service %s (%llx)\n", ds->id,
+                                   reg_req->payload.svc_id,
+                                   reg_req->payload.handle);
+
+                               /* NACK it */
+                               ds_service_nack(ds, reg_req->payload.handle,
+                                   DS_INV_HDL, 0);
+
+                               goto done;
+                       }
+               }
+
+               /* Found a registered service */
+
+               if (svc_info->is_connected) {
+                       /* service is already registered */
+                       ds_service_nack(ds, reg_req->payload.handle,
+                           DS_REG_DUP, 0);
+                       goto done;
+               }
+
+               if (reg_req->payload.major != svc_info->vers.major) {
+                       /* service version is incompatible */
+                       ds_service_nack(ds, reg_req->payload.handle,
+                           DS_REG_VER_NACK, 0);
+                       goto done;
+               }
+
+               /* negotiate the lowest common minor version */
+               neg_svc_minor = min_t(u16, (u16)svc_info->vers.minor,
+                   reg_req->payload.minor);
+
+               if (svc_info->is_client)
+                       ds_connect_service_client(ds, reg_req->payload.handle,
+                           reg_req->payload.major, neg_svc_minor, svc_info);
+               else
+                       ds_connect_service_provider(ds, reg_req->payload.handle,
+                           reg_req->payload.major, neg_svc_minor, svc_info);
+
+               /* ACK the init request */
+               ds_service_ack(ds, reg_req->payload.handle,
+                   (u16)svc_info->vers.minor);
+
+               dprintk("ds-%llu: Registered %s %s service (%llx) "
+                   "version %llu.%llu  to (%llx).\n", ds->id,
+                   (svc_info->is_client ? "Client" : "Provider"),
+                   svc_info->id, svc_info->handle,
+                   svc_info->neg_vers.major,
+                   svc_info->neg_vers.minor,
+                   svc_info->con_handle);
+
+       } else if (pkt->type == DS_REG_ACK) {
+
+               /* other end has ACK'd our reg request */
+
+               reg_ack = (struct ds_reg_ack *)pkt;
+
+               svc_info = ds_find_service_provider_handle(ds,
+                   reg_ack->payload.handle);
+               if (svc_info == NULL) {
+                       svc_info = ds_find_service_client_handle(ds,
+                           reg_ack->payload.handle);
+
+                       if (svc_info == NULL) {
+                               /* no service for this handle */
+                               pr_err("ds-%llu: REG ACK for unknown "
+                                   "handle %llx\n", ds->id,
+                                   reg_ack->payload.handle);
+                               goto done;
+                       }
+               }
+
+               if (svc_info->reg_state != DS_REG_STATE_REG_SENT) {
+                       pr_err("ds-%llu: REG ACK for %s service in "
+                           "%llu state (%llx)\n", ds->id, svc_info->id,
+                           svc_info->reg_state, reg_ack->payload.handle);
+                       goto done;
+               }
+
+               /* Use the lowest negotiated DS minor version */
+               neg_svc_minor = min_t(u16, reg_ack->payload.minor,
+                   svc_info->vers.minor);
+
+               if (svc_info->is_client)
+                       ds_connect_service_client(ds, reg_ack->payload.handle,
+                           svc_info->vers.major, neg_svc_minor, svc_info);
+               else
+                       ds_connect_service_provider(ds, reg_ack->payload.handle,
+                           svc_info->vers.major, neg_svc_minor, svc_info);
+
+
+               dprintk("ds-%llu: Registered %s service "
+                   "version %llu.%llu (%llx).\n", ds->id,
+                   svc_info->id, svc_info->neg_vers.major,
+                   svc_info->neg_vers.minor, svc_info->handle);
+
+       } else if (pkt->type == DS_REG_NACK) {
+
+               /* other end has NACK'd our reg request */
+
+               reg_nack = (struct ds_reg_nack *)pkt;
+
+               svc_info = ds_find_service_provider_handle(ds,
+                   reg_nack->payload.handle);
+               if (svc_info == NULL) {
+                       svc_info = ds_find_service_client_handle(ds,
+                           reg_nack->payload.handle);
+                       if (svc_info == NULL) {
+                               /* No service for this handle */
+                               pr_err("ds-%llu: REG NACK for "
+                                   "unknown handle %llx\n",
+                                   ds->id, reg_nack->payload.handle);
+                               goto done;
+                       }
+               }
+
+               if (svc_info->reg_state != DS_REG_STATE_REG_SENT) {
+                       pr_err("ds-%llu: REG NACK for %s service in "
+                           "%llu state (%llx)\n", ds->id, svc_info->id,
+                           svc_info->reg_state, reg_nack->payload.handle);
+                       goto done;
+               }
+
+               /*
+                * If a service is NACK'd for any reason we simply put
+                * the service into UNREG state. At some point in the
+                * future, the service registration will be re-tried
+                * by the timer thread.
+                */
+               svc_info->reg_state = DS_REG_STATE_UNREG;
+               svc_info->svc_reg_timeout = ds_get_service_timeout();
+
+               dprintk("ds-%llu: Registration nack'd for %s service "
+                   "(%llx). Result=%llu. Major=%u\n", ds->id, svc_info->id,
+                   reg_nack->payload.handle, reg_nack->payload.result,
+                   reg_nack->payload.major);
+
+       } else if (pkt->type == DS_UNREG_REQ) {
+
+               /* other end has sent a unregister request */
+
+               unreg_req = (struct ds_unreg_req *)pkt;
+
+               /* unregister any service associated with the handle */
+
+               /* see if service registered */
+               svc_info = ds_find_service_client_con_handle(ds,
+                   unreg_req->payload.handle);
+               if (svc_info == NULL) {
+                       svc_info = ds_find_service_provider_con_handle(ds,
+                           unreg_req->payload.handle);
+
+                       if (svc_info == NULL) {
+                               /* There is no service */
+
+                               pr_err("ds-%llu: no service registered for "
+                                   "UNREG_REQ handle %llx\n", ds->id,
+                                  unreg_req->payload.handle);
+
+                               /*
+                                * Our service could have been unregistered and
+                                * removed. Go ahead and ACK it. This allows
+                                * the other side to still clean up properly.
+                                */
+                               ds_service_unreg_ack(ds,
+                                   unreg_req->payload.handle);
+
+                               goto done;
+                       }
+               }
+
+
+               if (svc_info->reg_state != DS_REG_STATE_REGISTERED_LDC) {
+                       pr_err("ds-%llu: UNREG_REQ for %s service in "
+                           "%llu state (%llx)\n", ds->id, svc_info->id,
+                           svc_info->reg_state, unreg_req->payload.handle);
+                       goto done;
+               }
+
+               dprintk("ds-%llu: Unregistered %s service (%llx) "
+                   "from (%llx).\n", ds->id, svc_info->id,
+                   svc_info->con_handle, unreg_req->payload.handle);
+
+               if (svc_info->is_client)
+                       ds_disconnect_service_client(ds, svc_info);
+               else
+                       ds_disconnect_service_provider(ds, svc_info);
+
+               /* ACK the unreg request */
+               ds_service_unreg_ack(ds, unreg_req->payload.handle);
+
+       } else if (pkt->type == DS_UNREG_ACK) {
+
+               /* Got an ACK to our UNREG_REQ */
+
+               unreg_ack = (struct ds_unreg_ack *)pkt;
+
+               svc_info = ds_find_service_client_con_handle(ds,
+                   unreg_ack->payload.handle);
+               if (svc_info == NULL) {
+                       svc_info = ds_find_service_provider_con_handle(ds,
+                           unreg_ack->payload.handle);
+                       if (svc_info == NULL) {
+                               /*
+                                * There is no service for this handle.
+                                * It's possible the service was
+                                * unregistered and removed.
+                                */
+                               dprintk("ds-%llu: UNREG ACK for unknown "
+                                   "handle %llx\n", ds->id,
+                                   unreg_ack->payload.handle);
+                               goto done;
+                       }
+               }
+
+               dprintk("ds-%llu: Unregistered %s service (%llx).\n",
+                   ds->id, svc_info->id, unreg_ack->payload.handle);
+
+               if (svc_info->is_client)
+                       ds_disconnect_service_client(ds, svc_info);
+               else
+                       ds_disconnect_service_provider(ds, svc_info);
+
+       } else if (pkt->type == DS_UNREG_NACK) {
+
+               /* Got a NACK to our UNREG_REQ */
+
+               unreg_nack = (struct ds_unreg_nack *)pkt;
+
+               /* XXX - what to do on an unreg NACK??? */
+
+               dprintk("ds-%llu: Received UNREG_NACK for (%llx).\n",
+                   ds->id, unreg_nack->payload.handle);
+
+       } else {
+               /* Unexpected packet type. Reset to get back to a sane state. */
+               goto conn_reset;
+       }
+
+done:
+       return 0;
+
+conn_reset:
+
+       ds_reset(ds);
+
+       return -ECONNRESET;
+}
+
+static int ds_is_local_ds(struct ds_dev *ds)
+{
+       struct mdesc_handle *hp;
+       u64 cd_node;
+       u64 anode;
+       u64 target;
+       const u64 *local_handle;
+
+       if (!ds_local_ldom_handle_set) {
+               /*
+                * Find the virtual-domain-service node under the
+                * channel-devices node in the MD which
+                * contains the vlds-domain-handle property.
+                * This is the "local" ldom handle.
+                * Cache it in ds_local_ldom_handle global var.
+                */
+               hp = mdesc_grab();
+               if (hp) {
+                       /* get the channel-devices node in the MD */
+                       cd_node = mdesc_node_by_name(hp, MDESC_NODE_NULL,
+                           "channel-devices");
+                       if (cd_node != MDESC_NODE_NULL) {
+                               /*
+                                * For each node under it, look for the
+                                * virtual-device node which contains the
+                                * vlds-domain-handle property.
+                                */
+                               mdesc_for_each_arc(anode, hp, cd_node,
+                                   MDESC_ARC_TYPE_FWD) {
+
+                                       target = mdesc_arc_target(hp, anode);
+
+                                       local_handle = mdesc_get_property(hp,
+                                           target, "vlds-domain-handle", NULL);
+                                       if (local_handle != NULL) {
+                                               ds_local_ldom_handle =
+                                                   *local_handle;
+                                               ds_local_ldom_handle_set = true;
+                                       }
+                               }
+                       }
 
-       rp = (struct ds_var_resp *) (dpkt + 1);
+                       mdesc_release(hp);
+               }
+       }
 
-       if (rp->hdr.type != DS_VAR_SET_RESP &&
-           rp->hdr.type != DS_VAR_DELETE_RESP)
-               return;
+       if (ds_local_ldom_handle_set &&
+           ds->handle == ds_local_ldom_handle) {
+               return 1;
+       }
 
-       ds_var_response = rp->result;
-       wmb();
-       ds_var_doorbell = 1;
+       return 0;
 }
 
-void ldom_set_var(const char *var, const char *value)
+static void ds_timer_register_service(struct ds_dev *ds,
+               struct ds_service_info *svc_info)
 {
-       struct ds_cap_state *cp;
-       struct ds_info *dp;
-       unsigned long flags;
+       struct ds_service_info *peer_svc_info;
+       int rv;
 
-       spin_lock_irqsave(&ds_lock, flags);
-       cp = NULL;
-       for (dp = ds_info_list; dp; dp = dp->next) {
-               struct ds_cap_state *tmp;
+       /* Check if the service is allowed to register yet */
+       if (jiffies < svc_info->svc_reg_timeout)
+               return;
 
-               tmp = find_cap_by_string(dp, "var-config");
-               if (tmp && tmp->state == CAP_STATE_REGISTERED) {
-                       cp = tmp;
-                       break;
-               }
-       }
-       if (!cp) {
-               for (dp = ds_info_list; dp; dp = dp->next) {
-                       struct ds_cap_state *tmp;
+       if (svc_info->reg_state != DS_REG_STATE_UNREG)
+               return;
 
-                       tmp = find_cap_by_string(dp, "var-config-backup");
-                       if (tmp && tmp->state == CAP_STATE_REGISTERED) {
-                               cp = tmp;
-                               break;
+       /* We have a service ready to be registered. */
+
+       /*
+        * First check to see if there is a local unconnected loopback peer
+        * for this service id and if so, connect it in loopback mode.
+        * NOTE: we only allow loopback connections on the "local" DS port.
+        */
+       if (ds_is_local_ds(ds)) {
+               if (svc_info->is_client)
+                       peer_svc_info = ds_find_service_provider_id(ds,
+                           svc_info->id);
+               else
+                       peer_svc_info = ds_find_service_client_id(ds,
+                           svc_info->id);
+
+               if (peer_svc_info && !peer_svc_info->is_connected) {
+                       rv = ds_connect_loopback_service(ds, svc_info,
+                           peer_svc_info);
+                       if (rv == 0) {
+                               dprintk("ds-%llu: Registered loopback "
+                                   "service %s (%llu)\n", ds->id,
+                                   svc_info->id, svc_info->con_handle);
+                               return;
+                       } else {
+                               pr_err("ds-%llu: failed to connect "
+                                   "loopback %s service\n", ds->id,
+                                       svc_info->id);
                        }
+                       /* fallthrough and attempt LDC registration? */
                }
        }
-       spin_unlock_irqrestore(&ds_lock, flags);
-
-       if (cp) {
-               union {
-                       struct {
-                               struct ds_data          data;
-                               struct ds_var_set_msg   msg;
-                       } header;
-                       char                    all[512];
-               } pkt;
-               char  *base, *p;
-               int msg_len, loops;
-
-               if (strlen(var) + strlen(value) + 2 >
-                   sizeof(pkt) - sizeof(pkt.header)) {
-                       printk(KERN_ERR PFX
-                               "contents length: %zu, which more than max: %lu,"
-                               "so could not set (%s) variable to (%s).\n",
-                               strlen(var) + strlen(value) + 2,
-                               sizeof(pkt) - sizeof(pkt.header), var, value);
+
+       /* Only attempt LDC registration if the HS is complete */
+       if (ds->hs_state == DS_HS_COMPLETE) {
+               rv = ds_service_reg(ds, svc_info);
+               if (rv == 0) {
+                       svc_info->reg_state = DS_REG_STATE_REG_SENT;
+                       /*
+                        * Clear the reg SENT timeout.
+                        * We don't retry unless the LDC is reconnected.
+                        * Or if we receive a client "ping" for the service.
+                        */
+                       svc_info->svc_reg_timeout = 0;
                        return;
+               } else {
+                       dprintk("ds-%llu: failed to send REG_REQ for "
+                               " \"%s\" service (%d)\n", ds->id,
+                               svc_info->id, rv);
                }
+       }
 
-               memset(&pkt, 0, sizeof(pkt));
-               pkt.header.data.tag.type = DS_DATA;
-               pkt.header.data.handle = cp->handle;
-               pkt.header.msg.hdr.type = DS_VAR_SET_REQ;
-               base = p = &pkt.header.msg.name_and_value[0];
-               strcpy(p, var);
-               p += strlen(var) + 1;
-               strcpy(p, value);
-               p += strlen(value) + 1;
-
-               msg_len = (sizeof(struct ds_data) +
-                          sizeof(struct ds_var_set_msg) +
-                          (p - base));
-               msg_len = (msg_len + 3) & ~3;
-               pkt.header.data.tag.len = msg_len - sizeof(struct ds_msg_tag);
-
-               mutex_lock(&ds_var_mutex);
-
-               spin_lock_irqsave(&ds_lock, flags);
-               ds_var_doorbell = 0;
-               ds_var_response = -1;
-
-               __ds_send(dp->lp, &pkt, msg_len);
-               spin_unlock_irqrestore(&ds_lock, flags);
-
-               loops = 1000;
-               while (ds_var_doorbell == 0) {
-                       if (loops-- < 0)
-                               break;
-                       barrier();
-                       udelay(100);
-               }
+       /*
+        * We failed to register the service.
+        * Try again in the future.
+        */
+       svc_info->svc_reg_timeout = ds_get_service_timeout();
+}
 
-               mutex_unlock(&ds_var_mutex);
+static void ds_exec_reg_timer(unsigned long data)
+{
+       struct ds_dev *ds = (struct ds_dev *)data;
+       unsigned long flags;
+       struct ds_service_info *svc_info;
+       int rv;
 
-               if (ds_var_doorbell == 0 ||
-                   ds_var_response != DS_VAR_SUCCESS)
-                       pr_info("ds-%llu: var-config [%s:%s] failed, response(%d).\n",
-                              dp->id, var, value, ds_var_response);
-       } else {
-               pr_info("var-config not registered so could not set (%s) variable to (%s).\n",
-                      var, value);
+#ifdef DS_KERNEL_TIMER_BUG_WAR
+       /*
+        * There appears to be a bug in the UEK kernel where
+        * timers can execute on a CPU where local interrupts
+        * have been disabled. Deadlocks have been observed
+        * where the DS registration timer (ds_reg_tmr) can
+        * execute on a CPU, interrupting a thread on the CPU
+        * which is holding the ds->ds_lock or the ds->lp->lock
+        * resulting in a deadlock when the timer attempts
+        * to grab the lock. As a workaround, the timer handler will
+        * first check if the locks are held and if so, simply
+        * reschedule the timer and exit (without grabbing the
+        * locks - thus avoiding the deadlock). The kernel needs
+        * to be fixed at some point since executing timers
+        * on CPUs with local interrupts disabled is a violation
+        * of spin_lock_irqsave() semantics.
+        */
+       if (spin_is_locked(&ds->ds_lock) || spin_is_locked(&ds->lp->lock)) {
+               mod_timer(&ds->ds_reg_tmr,
+                   jiffies + msecs_to_jiffies(DS_REG_TIMER_FREQ));
+               return;
        }
-}
+#endif /* DS_KERNEL_TIMER_BUG_WAR */
 
-static char full_boot_str[256] __aligned(32);
-static int reboot_data_supported;
+       LOCK_DS_DEV(ds, flags)
 
-void ldom_reboot(const char *boot_command)
-{
-       /* Don't bother with any of this if the boot_command
-        * is empty.
+       /*
+        * Walk through the services for this ds and for those
+        * which are not yet registered, (re)send a REG_REQ.
         */
-       if (boot_command && strlen(boot_command)) {
-               unsigned long len;
+       list_for_each_entry(svc_info, &ds->service_provider_list, list)
+               ds_timer_register_service(ds, svc_info);
 
-               snprintf(full_boot_str, sizeof(full_boot_str), "boot %s",
-                        boot_command);
-               len = strlen(full_boot_str);
+       list_for_each_entry(svc_info, &ds->service_client_list, list)
+               ds_timer_register_service(ds, svc_info);
 
-               if (reboot_data_supported) {
-                       unsigned long ra = kimage_addr_to_ra(full_boot_str);
-                       unsigned long hv_ret;
+       /* reset the timer to fire again in DS_REG_TIMER_FREQ ms */
+       rv = mod_timer(&ds->ds_reg_tmr,
+           jiffies + msecs_to_jiffies(DS_REG_TIMER_FREQ));
 
-                       hv_ret = sun4v_reboot_data_set(ra, len);
-                       if (hv_ret != HV_EOK)
-                               pr_err("SUN4V: Unable to set reboot data hv_ret=%lu\n",
-                                       hv_ret);
-               } else {
-                       ldom_set_var("reboot-command", full_boot_str);
-               }
-       }
-       sun4v_mach_sir();
-}
+       UNLOCK_DS_DEV(ds, flags)
 
-void ldom_power_off(void)
-{
-       sun4v_mach_exit(0);
 }
 
-static void ds_conn_reset(struct ds_info *dp)
+static void ds_start_service_reg_timer(struct ds_dev *ds)
 {
-       pr_err("ds-%llu: ds_conn_reset() from %pf\n", dp->id,
-               __builtin_return_address(0));
+       int rv;
+
+       dprintk("entered.\n");
+
+       setup_timer(&ds->ds_reg_tmr, ds_exec_reg_timer,
+           (unsigned long)ds);
+
+       /* kick off the first timer in DS_REG_TIMER_FREQ ms */
+       rv = mod_timer(&ds->ds_reg_tmr,
+           jiffies + msecs_to_jiffies(DS_REG_TIMER_FREQ));
+
+       if (rv)
+               pr_err("ds-%llu: Error setting ds registration timer",
+                       ds->id);
 }
 
-static unsigned long long register_services(struct ds_info *dp)
+
+/*
+ * NOTE: All kernel ds services are defined as providers, no matter if
+ * they actually behave as a server or as client.
+ */
+static void ds_add_builtin_services(struct ds_dev *ds,
+       struct ds_builtin_service *ds_builtin_service_template,
+       int num_template_services)
 {
-       struct ldc_channel *lp = dp->lp;
+
+       struct ds_service_info  *svc_info;
        int i;
-       unsigned long long nreg = 0;
-
-       for (i = 0; i < dp->num_ds_states; i++) {
-               struct {
-                       struct ds_reg_req req;
-                       u8 id_buf[256];
-               } pbuf;
-               struct ds_cap_state *cp = &dp->ds_states[i];
-               int err, msg_len;
-               u64 new_count;
-
-               if (cp->state == CAP_STATE_REGISTERED)
-                       continue;
 
-               nreg |= (1 << i);
+       dprintk("entered.\n");
 
-               /* solaris service domains think 0x80000000 indicates clients */
-               new_count = sched_clock() & 0x7fffffff;
-               cp->handle = ((u64) i << 32) | new_count;
+       /* walk the builtin service provider array and add to the ds */
+       for (i = 0; i < num_template_services; i++) {
 
-               msg_len = (sizeof(struct ds_reg_req) +
-                          strlen(cp->service_id));
+               /*
+                * If there is already a registered service provider
+                * for this id, skip it since there can only be 1
+                * service provider per ds/service id.
+                */
+               svc_info = ds_find_service_provider_id(ds,
+                   ds_builtin_service_template[i].id);
 
-               memset(&pbuf, 0, sizeof(pbuf));
-               pbuf.req.tag.type = DS_REG_REQ;
-               pbuf.req.tag.len = (msg_len - sizeof(struct ds_msg_tag));
-               pbuf.req.handle = cp->handle;
-               pbuf.req.major = 1;
-               pbuf.req.minor = 0;
-               strcpy(pbuf.req.svc_id, cp->service_id);
+               if (svc_info != NULL)
+                       continue;
 
-               err = __ds_send(lp, &pbuf, msg_len);
-               if (err > 0)
-                       cp->state = CAP_STATE_REG_SENT;
+               /* if no existing service provider, add the builtin */
+               svc_info = ds_add_service_provider(ds,
+                   ds_builtin_service_template[i].id,
+                   ds_builtin_service_template[i].vers,
+                   &ds_builtin_service_template[i].ops,
+                   true);
+
+               if (svc_info == NULL)
+                       pr_err("ds-%llu: Failed to add builtin "
+                           "provider service %s", ds->id,
+                           ds_builtin_service_template[i].id);
        }
-       return nreg;
-}
 
-static struct timer_list ds_reg_tmr;
-static int reg_cnt;
+}
 
-static void ds_run_timer(unsigned long data)
+static int ds_init_req(struct ds_dev *ds)
 {
-       unsigned long flags;
-       unsigned long long ret;
-       struct ds_info *dp = (struct ds_info *)data;
+       struct ds_ver_req_payload req;
+       int rv;
 
-       spin_lock_irqsave(&ds_lock, flags);
-       ret = register_services(dp);
-       ++reg_cnt;
-       spin_unlock_irqrestore(&ds_lock, flags);
+       dprintk("entered.\n");
 
-       if (!ret)
-               return;
+       /* send a DS version init request */
+       req.ver.major = DS_MAJOR_VERSION;
+       req.ver.minor = DS_MINOR_VERSION;
 
-       if (reg_cnt > 5) {
-               int i;
-               for (i = 0; i < dp->num_ds_states; i++)
-                       if (ret & (1 << i)) {
-                               struct ds_cap_state *cp = &dp->ds_states[i];
-                               pr_err("ds-%llu: registration of \"%s\" failed\n",
-                                       dp->id, cp->service_id);
-                       }
-       } else {
-               ret = mod_timer(&ds_reg_tmr, jiffies + msecs_to_jiffies(3000));
-               if (ret)
-                       pr_err("ds-%llu: Error setting timer callback\n",
-                               dp->id);
-       }
+       rv = ds_ldc_send_payload(ds->lp, DS_INIT_REQ, &req, sizeof(req));
+
+       return (rv <= 0);
 }
 
-static void ds_setup_retry_timer(struct ds_info *dp)
+static void ds_init_ack(struct ds_dev *ds)
 {
-       int ret;
+       struct ds_ver_ack_payload req;
+       int rv;
 
-       /*
-        * "reliable" ldc communication will not catch if ack/nack's are
-        * not received for service registering attempts. retry via timer.
-        */
-       setup_timer(&ds_reg_tmr, ds_run_timer, (unsigned long)dp);
+       dprintk("entered.\n");
+
+       req.minor = DS_MINOR_VERSION;
+
+       rv = ds_ldc_send_payload(ds->lp, DS_INIT_ACK, &req, sizeof(req));
+       if (rv <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, rv);
 
-       ret = mod_timer(&ds_reg_tmr, jiffies + msecs_to_jiffies(2000));;
-       if (ret)
-               pr_err("ds-%llu: Error setting ds registration retry timer\n",
-                       dp->id);
 }
 
-static int ds_handshake(struct ds_info *dp, struct ds_msg_tag *pkt)
+static void ds_init_nack(struct ds_dev *ds, u16 major)
 {
+       struct ds_ver_nack_payload req;
+       int rv;
 
-       if (dp->hs_state == DS_HS_START) {
-               if (pkt->type != DS_INIT_ACK)
-                       goto conn_reset;
+       dprintk("entered.\n");
 
-               dp->hs_state = DS_HS_DONE;
-               ds_setup_retry_timer(dp);
-               return register_services(dp);
-       }
+       req.major = major;
 
-       if (dp->hs_state != DS_HS_DONE)
-               goto conn_reset;
+       rv = ds_ldc_send_payload(ds->lp, DS_INIT_NACK, &req, sizeof(req));
+       if (rv <= 0)
+               pr_err("ds-%llu: %s: ldc_send failed. (%d)\n ", ds->id,
+                   __func__, rv);
 
-       if (pkt->type == DS_REG_ACK) {
-               struct ds_reg_ack *ap = (struct ds_reg_ack *) pkt;
-               struct ds_cap_state *cp = find_cap(dp, ap->handle);
+}
 
-               if (!cp) {
-                       pr_err("ds-%llu: REG ACK for unknown handle %llx\n",
-                               dp->id, ap->handle);
-                       return 0;
+/* Process DS init packets received from LDC. */
+static int ds_handshake_init(struct ds_dev *ds, struct ds_msg_tag *pkt)
+{
+       struct ds_ver_req *init_req;
+       struct ds_ver_ack *init_ack;
+       u16 neg_ds_major;
+       u16 neg_ds_minor;
+
+       dprintk("entered.\n");
+
+       if (ds->hs_state != DS_HS_START) {
+
+               if (ds->hs_state == DS_HS_COMPLETE) {
+                       /*
+                        * If an INIT type pkt comes through while in
+                        * HS_COMPLETE state, it could be an extraneous packet
+                        * left over from a (simultaneous) handshake. So, we
+                        * will just ignore it since the connection has already
+                        * been established. No need to error out.
+                        */
+                       goto done;
                }
-               pr_info("ds-%llu: Registered %s service.\n", dp->id,
-                       cp->service_id);
-               cp->state = CAP_STATE_REGISTERED;
-       } else if (pkt->type == DS_REG_NACK) {
-               struct ds_reg_nack *np = (struct ds_reg_nack *) pkt;
-               struct ds_cap_state *cp = find_cap(dp, np->handle);
 
-               if (!cp) {
-                       pr_err("ds-%llu: REG NACK for unknown handle %llx\n",
-                              dp->id, np->handle);
-                       return 0;
-               }
-               cp->state = CAP_STATE_UNKNOWN;
+               /* Invalid state, reset to get sane again */
+               goto conn_reset;
        }
 
-       return 0;
+       /*
+        * In the DS_HS_START state, only valid pkt types are:
+        * DS_INIT_REQ: Other end of LDC is requesting INIT of DS.
+        *              Action:
+        *              If the sent major is compatible, ACK
+        *              with supported minor.
+        *              Use major sent in request and lowest minor.
+        * DS_INIT_ACK: Other end of LDC has ack'd our DS INIT request.
+        *              Action:
+        *              Use major sent in original INIT_REQ and
+        *              lowest minor.
+        * DS_INIT_NACK: Other end of LDC nack'd our DS INIT request.
+        *              Action:
+                        *              Remain in HS_START state. The other side could try to
+        *              init the DS (with an acceptable major #).
+        */
 
-conn_reset:
-       ds_conn_reset(dp);
-       return -ECONNRESET;
-}
+       if (pkt->type == DS_INIT_REQ) {
 
-static void __send_ds_nack(struct ds_info *dp, u64 handle)
-{
-       struct ds_data_nack nack = {
-               .tag = {
-                       .type = DS_NACK,
-                       .len = (sizeof(struct ds_data_nack) -
-                               sizeof(struct ds_msg_tag)),
-               },
-               .handle = handle,
-               .result = DS_INV_HDL,
-       };
+               init_req = (struct ds_ver_req *)pkt;
 
-       __ds_send(dp->lp, &nack, sizeof(nack));
-}
+               /* Check if the major is compatible */
 
-static LIST_HEAD(ds_work_list);
-static DECLARE_WAIT_QUEUE_HEAD(ds_wait);
+               /* NOTE - we currently only support DS_MAJOR_VERSION.  */
+               if (init_req->payload.ver.major != DS_MAJOR_VERSION) {
+                       /*
+                        * Incompatible major, NACK it. But remain in
+                        * HS_START state since it's possible our
+                        * INIT_REQ will still be successfully ACK'd.
+                        */
+                       ds_init_nack(ds, 0);
+                       goto done;
+               }
 
-struct ds_queue_entry {
-       struct list_head                list;
-       struct ds_info                  *dp;
-       int                             req_len;
-       int                             __pad;
-       u64                             req[0];
-};
+               /* Use the requested DS major version */
+               neg_ds_major = init_req->payload.ver.major;
 
-static void process_ds_work(void)
-{
-       struct ds_queue_entry *qp, *tmp;
-       unsigned long flags;
-       LIST_HEAD(todo);
+               /* Use the lowest negotiated DS minor version */
+               neg_ds_minor = min_t(u16, init_req->payload.ver.minor,
+                   DS_MINOR_VERSION);
 
-       spin_lock_irqsave(&ds_lock, flags);
-       list_splice_init(&ds_work_list, &todo);
-       spin_unlock_irqrestore(&ds_lock, flags);
+               /* ACK the init request */
+               ds_init_ack(ds);
 
-       list_for_each_entry_safe(qp, tmp, &todo, list) {
-               struct ds_data *dpkt = (struct ds_data *) qp->req;
-               struct ds_info *dp = qp->dp;
-               struct ds_cap_state *cp = find_cap(dp, dpkt->handle);
-               int req_len = qp->req_len;
+       } else if (pkt->type == DS_INIT_ACK) {
 
-               if (!cp) {
-                       pr_err("ds-%llu: Data for unknown handle %llu\n",
-                              dp->id, dpkt->handle);
+               init_ack = (struct ds_ver_ack *)pkt;
 
-                       spin_lock_irqsave(&ds_lock, flags);
-                       __send_ds_nack(dp, dpkt->handle);
-                       spin_unlock_irqrestore(&ds_lock, flags);
-               } else {
-                       cp->data(dp, cp, dpkt, req_len);
-               }
+               /* Use the major version we sent in the INIT request */
+               neg_ds_major = DS_MAJOR_VERSION;
+
+               /* Use the lowest negotiated DS minor version */
+               neg_ds_minor = min_t(u16, init_ack->payload.minor,
+                   DS_MINOR_VERSION);
 
-               list_del(&qp->list);
-               kfree(qp);
+       } else if (pkt->type == DS_INIT_NACK) {
+               /*
+                * If we get a NACK, per spec, we could try another
+                * request with an alternate major number. However, for now,
+                * we do not and we just remain in HS_START state.
+                * We remain in START state so the other end could
+                * still potentially make/complete a HS init request.
+                * If code is ever added in the future to retry the INIT_REQ
+                * with an alternate major, per spec, the code should use the
+                * major returned in the NACK.
+                */
+               goto done;
+
+       } else {
+
+               /* Unexpected packet type. Reset to get back to a sane state. */
+               goto conn_reset;
        }
-}
 
-static int ds_thread(void *__unused)
-{
-       DEFINE_WAIT(wait);
+       /* assign the negotiated maj/min for the DS connection */
+       ds->neg_vers.major = (u64)neg_ds_major;
+       ds->neg_vers.minor = (u64)neg_ds_minor;
 
-       while (1) {
-               prepare_to_wait(&ds_wait, &wait, TASK_INTERRUPTIBLE);
-               if (list_empty(&ds_work_list))
-                       schedule();
-               finish_wait(&ds_wait, &wait);
+       /* Handshake established, move to complete state */
+       ds->hs_state = DS_HS_COMPLETE;
 
-               if (kthread_should_stop())
-                       break;
+       /*
+        * If there were any services which failed to
+        * register before, then try to re-register them.
+        */
+       ds_reregister_ldc_services(ds);
 
-               process_ds_work();
-       }
+       dprintk("ds-%llu: DS INIT HS Complete Version=%llu.%llu.\n", ds->id,
+           ds->neg_vers.major, ds->neg_vers.minor);
 
+done:
        return 0;
+
+conn_reset:
+
+       ds_reset(ds);
+
+       return -ECONNRESET;
+
 }
 
-static int ds_data(struct ds_info *dp, struct ds_msg_tag *pkt, int len)
+static int ds_handshake_msg(struct ds_dev *ds, struct ds_msg_tag *pkt)
 {
-       struct ds_data *dpkt = (struct ds_data *) pkt;
-       struct ds_queue_entry *qp;
 
-       qp = kmalloc(sizeof(struct ds_queue_entry) + len, GFP_ATOMIC);
-       if (!qp) {
-               __send_ds_nack(dp, dpkt->handle);
-       } else {
-               qp->dp = dp;
-               memcpy(&qp->req, pkt, len);
-               list_add_tail(&qp->list, &ds_work_list);
-               wake_up(&ds_wait);
+       dprintk("entered.\n");
+
+       dprintk("ds-%llu: ds_handshake: hs_state=%d, pkt_type = %d\n", ds->id,
+           ds->hs_state, pkt->type);
+
+       if (ds->hs_state == DS_HS_LDC_DOWN) {
+
+               /* We should not be getting HS packets until the LDC is UP */
+
+               pr_err("ds-%llu: ds_handshake: received HS packet "
+                   "but LDC is down!\n", ds->id);
+
+               /* reset the connection to get back to a sane state */
+               goto conn_reset;
        }
-       return 0;
-}
 
-static void ds_up(struct ds_info *dp)
-{
-       struct ldc_channel *lp = dp->lp;
-       struct ds_ver_req req;
-       int err;
+       switch (pkt->type) {
+       case DS_INIT_REQ:
+       case DS_INIT_ACK:
+       case DS_INIT_NACK:
+
+               /* handle ds initialization packets */
+               return ds_handshake_init(ds, pkt);
+
+       case DS_REG_REQ:
+       case DS_REG_ACK:
+       case DS_REG_NACK:
+       case DS_UNREG_REQ:
+       case DS_UNREG_ACK:
+       case DS_UNREG_NACK:
+
+               /* handle service registration packets */
+               return ds_handshake_reg(ds, pkt);
 
-       req.tag.type = DS_INIT_REQ;
-       req.tag.len = sizeof(req) - sizeof(struct ds_msg_tag);
-       req.ver.major = 1;
-       req.ver.minor = 0;
+       default:
+               /* Invalid pkt type */
+               pr_err("ds-%llu: Invalid pkt received %d\n", ds->id, pkt->type);
+               return -EINVAL;
+       }
+
+conn_reset:
 
-       err = __ds_send(lp, &req, sizeof(req));
-       if (err > 0)
-               dp->hs_state = DS_HS_START;
+       ds_reset(ds);
+
+       return -ECONNRESET;
 }
 
-static void ds_reset(struct ds_info *dp)
+static void ds_up(struct ds_dev *ds)
 {
-       int i;
+       int rv;
 
-       dp->hs_state = 0;
+       dprintk("entered.\n");
 
-       for (i = 0; i < dp->num_ds_states; i++) {
-               struct ds_cap_state *cp = &dp->ds_states[i];
+       /* reset the HS state machine */
+       ds->hs_state = DS_HS_START;
 
-               cp->state = CAP_STATE_UNKNOWN;
-       }
+       /* send a DS init request */
+       rv = ds_init_req(ds);
+
+       if (rv != 0)
+               pr_err("ds-%llu: failed to send DS_INIT_REQ (%d)\n",
+                   ds->id, rv);
 }
 
 static void ds_event(void *arg, int event)
 {
-       struct ds_info *dp = arg;
-       struct ldc_channel *lp = dp->lp;
+       struct ds_dev *ds = arg;
        unsigned long flags;
-       int err;
+       int rv;
 
-       spin_lock_irqsave(&ds_lock, flags);
+       dprintk("ds-%llu: CPU[%d] event received = %d\n", ds->id,
+           smp_processor_id(), event);
+
+       /*
+        * NOTE - we don't use the UN/LOCK_DS_DEV macros here
+        * since we do not need to disable the HV interrupt - since
+        * we are in the interrupt handler.
+        */
+       spin_lock_irqsave(&ds->ds_lock, flags);
 
        if (event == LDC_EVENT_UP) {
-               ds_up(dp);
-               spin_unlock_irqrestore(&ds_lock, flags);
+               ds_up(ds);
+               spin_unlock_irqrestore(&ds->ds_lock, flags);
                return;
        }
 
        if (event == LDC_EVENT_RESET) {
-               ds_reset(dp);
-               spin_unlock_irqrestore(&ds_lock, flags);
+               ds_reset(ds);
+               spin_unlock_irqrestore(&ds->ds_lock, flags);
                return;
        }
 
        if (event != LDC_EVENT_DATA_READY) {
-               pr_warn("ds-%llu: Unexpected LDC event %d\n", dp->id, event);
-               spin_unlock_irqrestore(&ds_lock, flags);
+               pr_err("ds-%llu: Unexpected LDC event %d\n", ds->id, event);
+               spin_unlock_irqrestore(&ds->ds_lock, flags);
                return;
        }
 
-       err = 0;
+       rv = 0;
        while (1) {
                struct ds_msg_tag *tag;
 
-               err = ldc_read(lp, dp->rcv_buf, sizeof(*tag));
+               rv = ldc_read(ds->lp, ds->rcv_buf, sizeof(*tag));
 
-               if (unlikely(err < 0)) {
-                       if (err == -ECONNRESET)
-                               ds_conn_reset(dp);
+               if (unlikely(rv < 0)) {
+                       if (rv == -ECONNRESET)
+                               ds_reset(ds);
                        break;
                }
-               if (err == 0)
+
+               if (rv == 0)
                        break;
 
-               tag = dp->rcv_buf;
-               err = ldc_read(lp, tag + 1, tag->len);
+               tag = (struct ds_msg_tag *)ds->rcv_buf;
 
-               if (unlikely(err < 0)) {
-                       if (err == -ECONNRESET)
-                               ds_conn_reset(dp);
+               /* Make sure the read won't overrun our buffer */
+               if (tag->len > (DS_DEFAULT_BUF_SIZE -
+                   sizeof(struct ds_msg_tag))) {
+                       pr_err("ds-%llu: %s: msg tag length too big.\n",
+                           ds->id, __func__);
+                       ds_reset(ds);
                        break;
                }
-               if (err < tag->len)
+
+               rv = ldc_read(ds->lp, tag + 1, tag->len);
+
+               if (unlikely(rv < 0)) {
+                       if (rv == -ECONNRESET)
+                               ds_reset(ds);
                        break;
+               }
 
-               if (tag->type < DS_DATA)
-                       err = ds_handshake(dp, dp->rcv_buf);
-               else
-                       err = ds_data(dp, dp->rcv_buf,
-                                     sizeof(*tag) + err);
-               if (err == -ECONNRESET)
+               if (rv < tag->len)
                        break;
+
+               if (tag->type < DS_DATA) {
+                       dprintk("ds-%llu: hs data received (%d bytes)\n",
+                           ds->id, rv);
+                       rv = ds_handshake_msg(ds,
+                           (struct ds_msg_tag *)ds->rcv_buf);
+               } else {
+                       dprintk("ds-%llu: data received (%d bytes)\n",
+                           ds->id, rv);
+                       /* only process data if the HS is complete */
+                       if (ds->hs_state == DS_HS_COMPLETE) {
+                               rv = ds_data_msg(ds,
+                                   (struct ds_msg_tag *)ds->rcv_buf);
+                       } else {
+                               /* just eat the data packet */
+                               pr_err("ds-%llu: %s: received data for "
+                                   "unconnected DS - ignored.\n",
+                                   ds->id, __func__);
+                               rv = 0;
+                       }
+               }
+
+               if (unlikely(rv < 0)) {
+
+                       if (rv == -ECONNRESET)
+                               break;
+
+                       pr_err("ds-%llu: %s: failed process data "
+                               "packet rv = %d\n", ds->id, __func__, rv);
+               }
+       }
+
+       spin_unlock_irqrestore(&ds->ds_lock, flags);
+}
+
+static long ds_fops_ioctl(struct file *filp, unsigned int cmd,
+               unsigned long arg)
+{
+       ds_ioctl_sptok_data_t __user *uarg;
+       u32                     major_version;
+       u32                     minor_version;
+       u32                     sp_token_result;
+       ds_sptok_t              sp_token_data;
+       char                    service_name[DS_MAX_SVC_NAME_LEN];
+       int rv;
+
+       dprintk("entered.\n");
+
+       rv = 0;
+
+       switch (cmd) {
+       case DS_SPTOK_GET:
+               pr_info("%s Getting sp-token\n", __func__);
+               uarg = (ds_ioctl_sptok_data_t __user *)arg;
+               if (get_user(major_version, &uarg->major_version) != 0 ||
+                   get_user(minor_version, &uarg->minor_version) != 0 ||
+                   copy_from_user(service_name, &uarg->service_name,
+                           DS_MAX_SVC_NAME_LEN)) {
+                       return -EFAULT;
+               }
+               if ((major_version > DS_MAJOR_VERSION) ||
+                   (major_version == DS_MAJOR_VERSION &&
+                    minor_version > DS_MINOR_VERSION)) {
+                       pr_err("%s Invalid version number %u.%u\n",
+                           __func__, major_version, minor_version);
+                       return -EINVAL;
+               }
+               rv = ldom_req_sp_token(service_name, &sp_token_result,
+                   &sp_token_data);
+               if (!rv && sp_token_result == DS_SP_TOKEN_RES_OK) {
+                       dprintk("Copying sp token to userland\n");
+                       if (copy_to_user(&uarg->sp_tok,
+                           (void *)&sp_token_data,
+                           sizeof(struct ds_sptok))) {
+                               rv = -EFAULT;
+                       }
+               }
+               break;
+       default:
+               pr_err("%s Invalid cmd (%d)\n", __func__, cmd);
+               rv = -EINVAL;
        }
 
-       spin_unlock_irqrestore(&ds_lock, flags);
+       return rv;
 }
 
 static int ds_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 {
-       static int ds_version_printed;
        struct ldc_channel_config ds_cfg = {
                .event          = ds_event,
-               .mtu            = 4096,
+               .mtu            = DS_DEFAULT_MTU,
                .mode           = LDC_MODE_STREAM,
        };
        struct mdesc_handle *hp;
        struct ldc_channel *lp;
-       struct ds_info *dp;
+       struct ds_dev *ds;
        const u64 *val;
-       int err, i;
+       char ds_irq_name[LDC_IRQ_NAME_MAX];
+       unsigned long flags;
+       unsigned long ds_flags = 0;
+       bool is_sp;
+       u64 node;
+       int rv;
 
-       if (ds_version_printed++ == 0)
-               pr_info("%s", version);
+       dprintk("entered.\n");
 
-       dp = kzalloc(sizeof(*dp), GFP_KERNEL);
-       err = -ENOMEM;
-       if (!dp)
+       ds = kzalloc(sizeof(struct ds_dev), GFP_KERNEL);
+       rv = -ENOMEM;
+       if (unlikely(!ds))
                goto out_err;
 
+       spin_lock_init(&ds->ds_lock);
+
+       INIT_LIST_HEAD(&ds->service_provider_list);
+       INIT_LIST_HEAD(&ds->service_client_list);
+       INIT_LIST_HEAD(&ds->callout_list);
+
+       ds->co_ref_cnt = 0;
+       ds->active = true;
+
        hp = mdesc_grab();
-       val = mdesc_get_property(hp, vdev->mp, "id", NULL);
-       if (val)
-               dp->id = *val;
-       mdesc_release(hp);
 
-       dp->rcv_buf = kzalloc(4096, GFP_KERNEL);
-       if (!dp->rcv_buf)
-               goto out_free_dp;
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               dprintk("ds: Failed to get vdev MD node.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto out_free_ds;
+       }
+
+       val = mdesc_get_property(hp, node, "id", NULL);
+       if (val == NULL) {
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto out_free_ds;
+       } else {
+               ds->id = *val;
+       }
 
-       dp->rcv_buf_len = 4096;
+       /* The SP DS port is identified by a unique ldc-ids property */
+       val = mdesc_get_property(hp, node, "ldc-ids", NULL);
+       is_sp = (val != NULL);
 
-       dp->ds_states = kmemdup(ds_states_template,
-                               sizeof(ds_states_template), GFP_KERNEL);
-       if (!dp->ds_states)
-               goto out_free_rcv_buf;
+       val = mdesc_get_property(hp, node, "vlds-remote-domain-handle",
+           NULL);
+       if (val == NULL) {
+               /* Not all DS ports have a handle (such as the SP DS port). */
+               ds->handle = DS_INVALID_HANDLE;
+       } else {
+               ds->handle = *val;
+       }
+
+       mdesc_release(hp);
 
-       dp->num_ds_states = ARRAY_SIZE(ds_states_template);
+       /* If this is not the SP DS, then this is a domain DS */
+       ds->is_domain = !is_sp;
 
-       for (i = 0; i < dp->num_ds_states; i++)
-               dp->ds_states[i].handle = ((u64)i << 32);
+       ds->rcv_buf = kzalloc(DS_DEFAULT_BUF_SIZE, GFP_KERNEL);
+       if (unlikely(!ds->rcv_buf))
+               goto out_free_ds;
 
+       ds->rcv_buf_len = DS_DEFAULT_BUF_SIZE;
+
+       ds->hs_state = DS_HS_LDC_DOWN;
+
+       ds_cfg.debug = 0;
        ds_cfg.tx_irq = vdev->tx_irq;
        ds_cfg.rx_irq = vdev->rx_irq;
+       ds_cfg.rx_ino = vdev->rx_ino;
+       ds_cfg.tx_ino = vdev->tx_ino;
+       ds_cfg.dev_handle = vdev->dev_handle;
 
-       lp = ldc_alloc(vdev->channel_id, &ds_cfg, dp, "DS");
+       /* create the irq name for the ldc */
+       (void) scnprintf(ds_irq_name, LDC_IRQ_NAME_MAX, "DS-%llu", ds->handle);
+
+       lp = ldc_alloc(vdev->channel_id, &ds_cfg, ds, ds_irq_name);
        if (IS_ERR(lp)) {
-               err = PTR_ERR(lp);
-               goto out_free_ds_states;
+               rv = PTR_ERR(lp);
+               goto out_free_rcv_buf;
        }
-       dp->lp = lp;
+       ds->lp = lp;
 
-       err = ldc_bind(lp);
-       if (err)
+       /*
+        * As soon as we bind the LDC, we can start getting
+        * events. So grab the ds_lock here and hold it
+        * until we are done initializing the ds.
+        */
+       LOCK_DS_DEV(ds, ds_flags)
+
+       rv = ldc_bind(lp);
+       if (rv) {
+               UNLOCK_DS_DEV(ds, ds_flags)
                goto out_free_ldc;
+       }
 
-       spin_lock_irq(&ds_lock);
-       dp->next = ds_info_list;
-       ds_info_list = dp;
-       spin_unlock_irq(&ds_lock);
+       (void) ldc_connect(ds->lp);
 
-       return err;
+       dev_set_drvdata(&vdev->dev, ds);
 
-out_free_ldc:
-       ldc_free(dp->lp);
+       ds->next_service_handle = 1; /* start assigning handles from 1 */
+
+       /* add primary builtin services */
+       if (ds->id == DS_PRIMARY_ID)
+               ds_add_builtin_services(ds, ds_primary_builtin_template,
+                   ARRAY_SIZE(ds_primary_builtin_template));
+
+       /* add SP builtin services */
+       if (is_sp)
+               ds_add_builtin_services(ds, ds_sp_builtin_template,
+                   ARRAY_SIZE(ds_sp_builtin_template));
 
-out_free_ds_states:
-       kfree(dp->ds_states);
+       /* add the ds_dev to the global ds_data device list */
+       spin_lock_irqsave(&ds_data_lock, flags);
+       list_add_tail(&ds->list, &ds_data.ds_dev_list);
+       ds_data.num_ds_dev_list++;
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
+       /*
+        * begin the process of registering services.
+        * Note - we do this here to allow loopback services
+        * even if the DS LDC connection/handshake fails to establish.
+        */
+       ds_start_service_reg_timer(ds);
+
+       dprintk("ds-%llu: probe successful for domain %llu (channel_id=%lu).\n",
+           ds->id, ds->handle, vdev->channel_id);
+
+       UNLOCK_DS_DEV(ds, ds_flags)
+
+       return rv;
+
+out_free_ldc:
+       ldc_free(ds->lp);
 
 out_free_rcv_buf:
-       kfree(dp->rcv_buf);
+       kfree(ds->rcv_buf);
 
-out_free_dp:
-       kfree(dp);
+out_free_ds:
+       kfree(ds);
 
 out_err:
-       return err;
+       return rv;
 }
 
 static int ds_remove(struct vio_dev *vdev)
 {
+       struct ds_dev *ds;
+       struct ds_callout_entry_hdr *qhdrp;
+       struct ds_callout_entry_hdr *tmp;
+       unsigned long flags;
+       unsigned long ds_flags;
+
+       dprintk("entered.\n");
+
+       ds = dev_get_drvdata(&vdev->dev);
+
+       if (ds == NULL)
+               return 0;
+
+       /*
+        * Lock the global ds_dev list to prevent another thread
+        * from finding the ds in the list while we are removing it.
+        */
+       spin_lock_irqsave(&ds_data_lock, flags);
+
+       /*
+        * Lock down the ds_dev to prevent removing it
+        * while being used by another thread.
+        */
+       LOCK_DS_DEV(ds, ds_flags)
+
+       /* remove the ds_dev from the global ds_data device list */
+       list_del(&ds->list);
+       ds_data.num_ds_dev_list--;
+
+       del_timer(&ds->ds_reg_tmr);
+
+       ds_remove_services(ds);
+
+       ds->hs_state = DS_HS_LDC_DOWN;
+
+       ldc_disconnect(ds->lp);
+
+       ldc_unbind(ds->lp);
+
+       ldc_free(ds->lp);
+
+       kfree(ds->rcv_buf);
+
+       /* free any entries left on the callout list */
+       list_for_each_entry_safe(qhdrp, tmp, &ds->callout_list, list) {
+               list_del(&qhdrp->list);
+               kfree(qhdrp);
+               ds->co_ref_cnt--;
+       }
+
+       dprintk("ds-%llu: removing domain %llu (co_ref_cnt=%llu)\n",
+           ds->id, ds->handle, ds->co_ref_cnt);
+
+       /*
+        * When the callout thread processes work entries, it
+        * creates a local list of entries which can contain
+        * references to this ds. So, we maintain
+        * a ds reference count for entries on the callout todo list.
+        * If there are no outstanding references to this ds, free
+        * the ds now (it's safely locked down). If there are outstanding
+        * references (because the callout thread is currently processing them),
+        * allow the callout thread to clean things up - we do not want to
+        * remove the ds here since the callout thread will reference it.
+        */
+       if (ds->co_ref_cnt == 0) {
+               UNLOCK_DS_DEV(ds, ds_flags);
+               kfree(ds);
+       } else {
+               /*
+                * Mark the ds_dev as inactive.
+                * ds_dev will be cleaned up by the
+                * callout processing.
+                */
+               ds->active = false;
+               UNLOCK_DS_DEV(ds, ds_flags)
+       }
+
+       spin_unlock_irqrestore(&ds_data_lock, flags);
+
        return 0;
 }
 
@@ -1294,22 +3996,54 @@ static struct vio_driver ds_driver = {
        .id_table       = ds_match,
        .probe          = ds_probe,
        .remove         = ds_remove,
-       .name           = "ds",
+       .name           = DRV_MODULE_NAME,
+};
+
+static struct file_operations ds_fops = {
+       .owner                  = THIS_MODULE,
+       .unlocked_ioctl         = ds_fops_ioctl
+};
+
+static struct miscdevice ds_miscdev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = DRV_MODULE_NAME,
+       .fops = &ds_fops
 };
 
 static int __init ds_init(void)
 {
        unsigned long hv_ret, major, minor;
+       struct task_struct *callout_task;
+       int             err;
+
+       /* set the default ldoms debug level */
+       dsdbg_level = ldoms_debug_level;
+
+       dprintk("%s", version);
+
+       INIT_LIST_HEAD(&ds_data.ds_dev_list);
+       ds_data.num_ds_dev_list = 0;
+
+       err = misc_register(&ds_miscdev);
+       if (err)
+               return err;
+
+       dprintk("minor is %d.\n", ds_miscdev.minor);
 
        if (tlb_type == hypervisor) {
                hv_ret = sun4v_get_version(HV_GRP_REBOOT_DATA, &major, &minor);
                if (hv_ret == HV_EOK) {
-                       pr_info("SUN4V: Reboot data supported (maj=%lu,min=%lu).\n",
-                               major, minor);
+                       dprintk("SUN4V: Reboot data supported "
+                           "(maj=%lu,min=%lu).\n", major, minor);
                        reboot_data_supported = 1;
                }
        }
-       kthread_run(ds_thread, NULL, "kldomd");
+
+       callout_task = kthread_run(ds_callout_thread, NULL, "ldoms-ds");
+       if (IS_ERR(callout_task)) {
+               misc_deregister(&ds_miscdev);
+               return PTR_ERR(callout_task);
+       }
 
        return vio_register_driver(&ds_driver);
 }
index 1ae5eb1bb045130f05c8edc4573dbcd75e00475f..d6ac8969d315880d7e302cd0670e046e597a2c1c 100644 (file)
@@ -34,7 +34,6 @@
 
 static char version[] =
        DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
-#define LDC_PACKET_SIZE                64
 
 /* Packet header layout for unreliable and reliable mode frames.
  * When in RAW mode, packets are simply straight 64-byte payloads
@@ -178,6 +177,8 @@ do {        if (lp->cfg.debug & LDC_DEBUG_##TYPE) \
                printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \
 } while (0)
 
+#define        LDC_ABORT(lp)   ldc_abort((lp), __func__)
+
 static const char *state_to_str(u8 state)
 {
        switch (state) {
@@ -196,15 +197,6 @@ static const char *state_to_str(u8 state)
        }
 }
 
-static void ldc_set_state(struct ldc_channel *lp, u8 state)
-{
-       ldcdbg(STATE, "STATE (%s) --> (%s)\n",
-              state_to_str(lp->state),
-              state_to_str(state));
-
-       lp->state = state;
-}
-
 static unsigned long __advance(unsigned long off, unsigned long num_entries)
 {
        off += LDC_PACKET_SIZE;
@@ -516,11 +508,12 @@ static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
        return err;
 }
 
-static int ldc_abort(struct ldc_channel *lp)
+static int ldc_abort(struct ldc_channel *lp, const char *msg)
 {
        unsigned long hv_err;
 
-       ldcdbg(STATE, "ABORT\n");
+       ldcdbg(STATE, "ABORT[%s]\n", msg);
+       ldc_print(lp);
 
        /* We report but do not act upon the hypervisor errors because
         * there really isn't much we can do if they fail at this point.
@@ -605,7 +598,7 @@ static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
                }
        }
        if (err)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        return 0;
 }
@@ -618,13 +611,13 @@ static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
        if (lp->hs_state == LDC_HS_GOTVERS) {
                if (lp->ver.major != vp->major ||
                    lp->ver.minor != vp->minor)
-                       return ldc_abort(lp);
+                       return LDC_ABORT(lp);
        } else {
                lp->ver = *vp;
                lp->hs_state = LDC_HS_GOTVERS;
        }
        if (send_rts(lp))
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
        return 0;
 }
 
@@ -635,17 +628,17 @@ static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
        unsigned long new_tail;
 
        if (vp->major == 0 && vp->minor == 0)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        vap = find_by_major(vp->major);
        if (!vap)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
                                           vap, sizeof(*vap),
                                           &new_tail);
        if (!p)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        return send_tx_packet(lp, p, new_tail);
 }
@@ -668,7 +661,7 @@ static int process_version(struct ldc_channel *lp,
                return process_ver_nack(lp, vp);
 
        default:
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
        }
 }
 
@@ -681,13 +674,13 @@ static int process_rts(struct ldc_channel *lp,
        if (p->stype     != LDC_INFO       ||
            lp->hs_state != LDC_HS_GOTVERS ||
            p->env       != lp->cfg.mode)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        lp->snd_nxt = p->seqid;
        lp->rcv_nxt = p->seqid;
        lp->hs_state = LDC_HS_SENTRTR;
        if (send_rtr(lp))
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        return 0;
 }
@@ -700,7 +693,7 @@ static int process_rtr(struct ldc_channel *lp,
 
        if (p->stype     != LDC_INFO ||
            p->env       != lp->cfg.mode)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        lp->snd_nxt = p->seqid;
        lp->hs_state = LDC_HS_COMPLETE;
@@ -723,7 +716,7 @@ static int process_rdx(struct ldc_channel *lp,
 
        if (p->stype != LDC_INFO ||
            !(rx_seq_ok(lp, p->seqid)))
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        lp->rcv_nxt = p->seqid;
 
@@ -750,14 +743,14 @@ static int process_control_frame(struct ldc_channel *lp,
                return process_rdx(lp, p);
 
        default:
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
        }
 }
 
 static int process_error_frame(struct ldc_channel *lp,
                               struct ldc_packet *p)
 {
-       return ldc_abort(lp);
+       return LDC_ABORT(lp);
 }
 
 static int process_data_ack(struct ldc_channel *lp,
@@ -776,12 +769,45 @@ static int process_data_ack(struct ldc_channel *lp,
                        return 0;
                }
                if (head == lp->tx_tail)
-                       return ldc_abort(lp);
+                       return LDC_ABORT(lp);
        }
 
        return 0;
 }
 
+void ldc_enable_hv_intr(struct ldc_channel *lp)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       ldcdbg(RX, "ldc_enable_hv_intr: dh=%llu, ino=%llu\n",
+              lp->cfg.dev_handle, lp->cfg.rx_ino);
+       sun4v_vintr_set_valid(lp->cfg.dev_handle, lp->cfg.rx_ino,
+                             HV_INTR_ENABLED);
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+}
+EXPORT_SYMBOL(ldc_enable_hv_intr);
+
+
+void ldc_disable_hv_intr(struct ldc_channel *lp)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       ldcdbg(RX, "ldc_disable_hv_intr: dh=%llu, ino=%llu\n",
+              lp->cfg.dev_handle, lp->cfg.rx_ino);
+       sun4v_vintr_set_valid(lp->cfg.dev_handle, lp->cfg.rx_ino,
+                             HV_INTR_DISABLED);
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+}
+EXPORT_SYMBOL(ldc_disable_hv_intr);
+
 static void send_events(struct ldc_channel *lp, unsigned int event_mask)
 {
        if (event_mask & LDC_EVENT_RESET)
@@ -820,16 +846,21 @@ static irqreturn_t ldc_rx(int irq, void *dev_id)
                lp->hs_state = LDC_HS_COMPLETE;
                ldc_set_state(lp, LDC_STATE_CONNECTED);
 
-               event_mask |= LDC_EVENT_UP;
-
-               orig_state = lp->chan_state;
+               /*
+                * Generate an LDC_EVENT_UP event if the channel
+                * was not already up.
+                */
+               if (orig_state != LDC_CHANNEL_UP) {
+                       event_mask |= LDC_EVENT_UP;
+                       orig_state = lp->chan_state;
+               }
        }
 
        /* If we are in reset state, flush the RX queue and ignore
         * everything.
         */
        if (lp->flags & LDC_FLAG_RESET) {
-               (void) __set_rx_head(lp, lp->rx_tail);
+               (void) ldc_rx_reset(lp);
                goto out;
        }
 
@@ -880,7 +911,7 @@ handshake_complete:
                        break;
 
                default:
-                       err = ldc_abort(lp);
+                       err = LDC_ABORT(lp);
                        break;
                }
 
@@ -895,7 +926,7 @@ handshake_complete:
 
                err = __set_rx_head(lp, new);
                if (err < 0) {
-                       (void) ldc_abort(lp);
+                       (void) LDC_ABORT(lp);
                        break;
                }
                if (lp->hs_state == LDC_HS_COMPLETE)
@@ -936,7 +967,14 @@ static irqreturn_t ldc_tx(int irq, void *dev_id)
                lp->hs_state = LDC_HS_COMPLETE;
                ldc_set_state(lp, LDC_STATE_CONNECTED);
 
-               event_mask |= LDC_EVENT_UP;
+               /*
+                * Generate an LDC_EVENT_UP event if the channel
+                * was not already up.
+                */
+               if (orig_state != LDC_CHANNEL_UP) {
+                       event_mask |= LDC_EVENT_UP;
+                       orig_state = lp->chan_state;
+               }
        }
 
        spin_unlock_irqrestore(&lp->lock, flags);
@@ -1342,6 +1380,14 @@ int ldc_bind(struct ldc_channel *lp)
        lp->hs_state = LDC_HS_OPEN;
        ldc_set_state(lp, LDC_STATE_BOUND);
 
+       if (lp->cfg.mode == LDC_MODE_RAW) {
+               /*
+                * There is no handshake in RAW mode, so handshake
+                * is completed.
+                */
+               lp->hs_state = LDC_HS_COMPLETE;
+       }
+
        spin_unlock_irqrestore(&lp->lock, flags);
 
        return 0;
@@ -1447,12 +1493,62 @@ int ldc_state(struct ldc_channel *lp)
 }
 EXPORT_SYMBOL(ldc_state);
 
+void ldc_set_state(struct ldc_channel *lp, u8 state)
+{
+       ldcdbg(STATE, "STATE (%s) --> (%s)\n",
+              state_to_str(lp->state),
+              state_to_str(state));
+
+       lp->state = state;
+}
+EXPORT_SYMBOL(ldc_set_state);
+
+int ldc_mode(struct ldc_channel *lp)
+{
+       return lp->cfg.mode;
+}
+EXPORT_SYMBOL(ldc_mode);
+
+int ldc_rx_reset(struct ldc_channel *lp)
+{
+       return __set_rx_head(lp, lp->rx_tail);
+}
+EXPORT_SYMBOL(ldc_rx_reset);
+
+void ldc_clr_reset(struct ldc_channel *lp)
+{
+       lp->flags &= ~LDC_FLAG_RESET;
+}
+EXPORT_SYMBOL(ldc_clr_reset);
+
+void ldc_print(struct ldc_channel *lp)
+{
+       pr_info("%s: id=0x%lx flags=0x%x state=%s cstate=0x%lx hsstate=0x%x\n"
+               "\trx_h=0x%lx rx_t=0x%lx rx_n=%ld\n"
+               "\ttx_h=0x%lx tx_t=0x%lx tx_n=%ld\n"
+               "\trcv_nxt=%u snd_nxt=%u\n",
+       __func__, lp->id, lp->flags, state_to_str(lp->state),
+       lp->chan_state, lp->hs_state,
+       lp->rx_head, lp->rx_tail, lp->rx_num_entries,
+       lp->tx_head, lp->tx_tail, lp->tx_num_entries,
+       lp->rcv_nxt, lp->snd_nxt);
+}
+EXPORT_SYMBOL(ldc_print);
+
 static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size)
 {
        struct ldc_packet *p;
-       unsigned long new_tail;
+       unsigned long new_tail, hv_err;
        int err;
 
+       hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
+                                       &lp->chan_state);
+       if (unlikely(hv_err))
+               return -EBUSY;
+
+       if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
+               return LDC_ABORT(lp);
+
        if (size > LDC_PACKET_SIZE)
                return -EMSGSIZE;
 
@@ -1483,7 +1579,7 @@ static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size)
                                        &lp->rx_tail,
                                        &lp->chan_state);
        if (hv_err)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        if (lp->chan_state == LDC_CHANNEL_DOWN ||
            lp->chan_state == LDC_CHANNEL_RESETTING)
@@ -1526,7 +1622,7 @@ static int write_nonraw(struct ldc_channel *lp, const void *buf,
                return -EBUSY;
 
        if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        if (!tx_has_space_for(lp, size))
                return -EAGAIN;
@@ -1592,9 +1688,9 @@ static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p,
        if (err)
                return err;
 
-       err = __set_rx_head(lp, lp->rx_tail);
+       err = ldc_rx_reset(lp);
        if (err < 0)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        return 0;
 }
@@ -1607,7 +1703,7 @@ static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p)
                        return err;
        }
        if (p->stype & LDC_NACK)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        return 0;
 }
@@ -1627,7 +1723,7 @@ static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head)
                                                &lp->rx_tail,
                                                &lp->chan_state);
                if (hv_err)
-                       return ldc_abort(lp);
+                       return LDC_ABORT(lp);
 
                if (lp->chan_state == LDC_CHANNEL_DOWN ||
                    lp->chan_state == LDC_CHANNEL_RESETTING)
@@ -1650,7 +1746,7 @@ static int rx_set_head(struct ldc_channel *lp, unsigned long head)
        int err = __set_rx_head(lp, head);
 
        if (err < 0)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        lp->rx_head = head;
        return 0;
@@ -1689,7 +1785,7 @@ static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
                                        &lp->rx_tail,
                                        &lp->chan_state);
        if (hv_err)
-               return ldc_abort(lp);
+               return LDC_ABORT(lp);
 
        if (lp->chan_state == LDC_CHANNEL_DOWN ||
            lp->chan_state == LDC_CHANNEL_RESETTING)
@@ -1733,9 +1829,14 @@ static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
 
                lp->rcv_nxt = p->seqid;
 
+               /*
+                * If this is a control-only packet, there is nothing
+                * else to do but advance the rx queue since the packet
+                * was already processed above.
+                */
                if (!(p->type & LDC_DATA)) {
                        new = rx_advance(lp, new);
-                       goto no_data;
+                       break;
                }
                if (p->stype & (LDC_ACK | LDC_NACK)) {
                        err = data_ack_nack(lp, p);
@@ -1871,6 +1972,25 @@ static const struct ldc_mode_ops stream_ops = {
        .read           =       read_stream,
 };
 
+int ldc_tx_space_available(struct ldc_channel *lp, unsigned long size)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       /* tx_has_space_for() works for all modes */
+       if (!tx_has_space_for(lp, size)) {
+               spin_unlock_irqrestore(&lp->lock, flags);
+               return 0;
+       }
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return 1;
+
+}
+EXPORT_SYMBOL(ldc_tx_space_available);
+
 int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
 {
        unsigned long flags;
@@ -1895,11 +2015,36 @@ int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
 }
 EXPORT_SYMBOL(ldc_write);
 
+int ldc_rx_data_available(struct ldc_channel *lp)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lp->lock, flags);
+
+       if (lp->cfg.mode == LDC_MODE_STREAM && lp->mssbuf_len > 0) {
+               spin_unlock_irqrestore(&lp->lock, flags);
+               return 1;
+       }
+
+       if (lp->rx_head == lp->rx_tail) {
+               spin_unlock_irqrestore(&lp->lock, flags);
+               return 0;
+       }
+
+       spin_unlock_irqrestore(&lp->lock, flags);
+
+       return 1;
+
+}
+EXPORT_SYMBOL(ldc_rx_data_available);
+
 int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
 {
        unsigned long flags;
        int err;
 
+       ldcdbg(RX, "ldc_read: entered size=%d\n", size);
+
        if (!buf)
                return -EINVAL;
 
@@ -1915,6 +2060,9 @@ int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
 
        spin_unlock_irqrestore(&lp->lock, flags);
 
+       ldcdbg(RX, "ldc_read:mode=%d, head=%lu, tail=%lu rv=%d\n",
+              lp->cfg.mode, lp->rx_head, lp->rx_tail, err);
+
        return err;
 }
 EXPORT_SYMBOL(ldc_read);
index b9fe42450308cc45a4e47b2483225eebd1c46891..4d6242ab5f342a6f6b2268538385f699f917883f 100644 (file)
@@ -75,6 +75,58 @@ struct mdesc_handle {
        struct mdesc_hdr        mdesc;
 };
 
+typedef int (*mdesc_node_info_f)(struct mdesc_handle *, u64,
+       union md_node_info *);
+typedef bool (*mdesc_node_match_f)(union md_node_info *, union md_node_info *);
+
+struct md_node_ops {
+       char *name;
+       mdesc_node_info_f get_info;
+       mdesc_node_match_f node_match;
+};
+
+static int get_vdev_port_node_info(struct mdesc_handle *md, u64 node,
+       union md_node_info *node_info);
+static bool vdev_port_node_match(union md_node_info *a_node_info,
+       union md_node_info *b_node_info);
+static int get_ds_port_node_info(struct mdesc_handle *md, u64 node,
+       union md_node_info *node_info);
+static bool ds_port_node_match(union md_node_info *a_node_info,
+       union md_node_info *b_node_info);
+
+/* supported node types which can be registered */
+static struct md_node_ops md_node_ops_table[] = {
+       {"virtual-device-port", get_vdev_port_node_info, vdev_port_node_match},
+       {"domain-services-port", get_ds_port_node_info, ds_port_node_match},
+       {NULL, NULL, NULL}
+};
+
+void mdesc_get_node_ops(char *node_name, mdesc_node_info_f *node_info_f,
+       mdesc_node_match_f *node_match_f)
+{
+       int i;
+       mdesc_node_info_f get_info_func;
+       mdesc_node_match_f node_match_func;
+
+       get_info_func = NULL;
+       node_match_func = NULL;
+
+       if (node_name != NULL) {
+               for (i = 0; md_node_ops_table[i].name != NULL; i++) {
+                       if (strcmp(md_node_ops_table[i].name, node_name) == 0) {
+                               get_info_func = md_node_ops_table[i].get_info;
+                               node_match_func =
+                                   md_node_ops_table[i].node_match;
+                               break;
+                       }
+               }
+       }
+
+       *node_info_f = get_info_func;
+       *node_match_f = node_match_func;
+
+}
+
 static void mdesc_handle_init(struct mdesc_handle *hp,
                              unsigned int handle_size,
                              void *base)
@@ -130,26 +182,26 @@ static struct mdesc_mem_ops memblock_mdesc_ops = {
 static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
 {
        unsigned int handle_size;
-       struct mdesc_handle *hp;
-       unsigned long addr;
        void *base;
 
        handle_size = (sizeof(struct mdesc_handle) -
                       sizeof(struct mdesc_hdr) +
                       mdesc_size);
+       base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_REPEAT);
+       if (base) {
+               struct mdesc_handle *hp;
+               unsigned long addr;
 
-       /*
-        * Allocation has to succeed because mdesc update would be missed
-        * and such events are not retransmitted.
-        */
-       base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL);
-       addr = (unsigned long)base;
-       addr = (addr + 15UL) & ~15UL;
-       hp = (struct mdesc_handle *) addr;
+               addr = (unsigned long)base;
+               addr = (addr + 15UL) & ~15UL;
+               hp = (struct mdesc_handle *) addr;
 
-       mdesc_handle_init(hp, handle_size, base);
+               mdesc_handle_init(hp, handle_size, base);
+               return hp;
+       }
+
+       return NULL;
 
-       return hp;
 }
 
 static void mdesc_kfree(struct mdesc_handle *hp)
@@ -220,15 +272,35 @@ static struct mdesc_notifier_client *client_list;
 void mdesc_register_notifier(struct mdesc_notifier_client *client)
 {
        u64 node;
+       int i;
+       bool supported;
 
        mutex_lock(&mdesc_mutex);
+
+       /* check to see if the node is supported for registration */
+       supported = false;
+       for (i = 0; md_node_ops_table[i].name != NULL; i++) {
+               if (strcmp(md_node_ops_table[i].name, client->node_name) == 0) {
+                       supported = true;
+                       break;
+               }
+       }
+
+       if (!supported) {
+               printk(KERN_ERR "MD: %s: %s node not supported\n",
+                   __func__, client->node_name);
+               mutex_unlock(&mdesc_mutex);
+               return;
+       }
+
        client->next = client_list;
        client_list = client;
 
        mdesc_for_each_node_by_name(cur_mdesc, node, client->node_name)
-               client->add(cur_mdesc, node);
+               client->add(cur_mdesc, node, client->node_name);
 
        mutex_unlock(&mdesc_mutex);
+
 }
 
 static const u64 *parent_cfg_handle(struct mdesc_handle *hp, u64 node)
@@ -250,59 +322,128 @@ static const u64 *parent_cfg_handle(struct mdesc_handle *hp, u64 node)
        return id;
 }
 
+static int get_vdev_port_node_info(struct mdesc_handle *md, u64 node,
+       union md_node_info *node_info)
+{
+       const u64 *idp;
+       const u64 *parent_cfg_hdlp;
+       const char *name;
+
+       /*
+        * Virtual device nodes are distinguished by:
+        * 1. "id" property
+        * 2. "name" property
+        * 3. parent node "cfg-handle" property
+        */
+       idp = mdesc_get_property(md, node, "id", NULL);
+       name = mdesc_get_property(md, node, "name", NULL);
+       parent_cfg_hdlp = parent_cfg_handle(md, node);
+
+       if (!idp || !name || !parent_cfg_hdlp)
+               return -1;
+
+       node_info->vdev_port.id = *idp;
+       strncpy(node_info->vdev_port.name, name, MDESC_MAX_STR_LEN);
+       node_info->vdev_port.parent_cfg_hdl = *parent_cfg_hdlp;
+
+       return 0;
+}
+
+static bool vdev_port_node_match(union md_node_info *a_node_info,
+       union md_node_info *b_node_info)
+{
+       if (a_node_info->vdev_port.id != b_node_info->vdev_port.id)
+               return false;
+
+       if (a_node_info->vdev_port.parent_cfg_hdl !=
+           b_node_info->vdev_port.parent_cfg_hdl)
+               return false;
+
+       if (strncmp(a_node_info->vdev_port.name,
+           b_node_info->vdev_port.name, MDESC_MAX_STR_LEN) != 0)
+               return false;
+
+       return true;
+
+}
+
+static int get_ds_port_node_info(struct mdesc_handle *md, u64 node,
+       union md_node_info *node_info)
+{
+       const u64 *idp;
+
+       /* DS port nodes use the "id" property to distinguish them */
+       idp = mdesc_get_property(md, node, "id", NULL);
+       if (!idp)
+               return -1;
+
+       node_info->ds_port.id = *idp;
+
+       return 0;
+}
+
+
+static bool ds_port_node_match(union md_node_info *a_node_info,
+       union md_node_info *b_node_info)
+{
+       if (a_node_info->ds_port.id != b_node_info->ds_port.id)
+               return false;
+
+       return true;
+}
+
 /* Run 'func' on nodes which are in A but not in B.  */
 static void invoke_on_missing(const char *name,
-                             struct mdesc_handle *a,
-                             struct mdesc_handle *b,
-                             void (*func)(struct mdesc_handle *, u64))
+               struct mdesc_handle *a,
+               struct mdesc_handle *b,
+               void (*func)(struct mdesc_handle *, u64, const char *node_name))
 {
-       u64 node;
+       u64 a_node;
+       u64 b_node;
+       union md_node_info a_node_info;
+       union md_node_info b_node_info;
+       mdesc_node_info_f get_info_func;
+       mdesc_node_match_f node_match_func;
+       int rv;
+       bool found;
+
+       /* Find the get_info and node_match ops for the given node name */
+       mdesc_get_node_ops((char *)name, &get_info_func, &node_match_func);
+
+       /* If we didn't find a match, the node type is not supported */
+       if (get_info_func == NULL || node_match_func == NULL) {
+               printk(KERN_ERR "MD: %s: %s node type is not supported\n",
+                   __func__, name);
+               return;
+       }
 
-       mdesc_for_each_node_by_name(a, node, name) {
-               int found = 0, is_vdc_port = 0;
-               const char *name_prop;
-               const u64 *id;
-               u64 fnode;
-
-               name_prop = mdesc_get_property(a, node, "name", NULL);
-               if (name_prop && !strcmp(name_prop, "vdc-port")) {
-                       is_vdc_port = 1;
-                       id = parent_cfg_handle(a, node);
-               } else
-                       id = mdesc_get_property(a, node, "id", NULL);
-
-               if (!id) {
-                       printk(KERN_ERR "MD: Cannot find ID for %s node.\n",
-                              (name_prop ? name_prop : name));
+       mdesc_for_each_node_by_name(a, a_node, name) {
+
+               found = false;
+
+               rv = get_info_func(a, a_node, &a_node_info);
+               if (rv != 0) {
+                       printk(KERN_ERR "MD: %s: Cannot find 1 or more required "
+                           "match properties for %s node.\n", __func__, name);
                        continue;
                }
 
-               mdesc_for_each_node_by_name(b, fnode, name) {
-                       const u64 *fid;
-
-                       if (is_vdc_port) {
-                               name_prop = mdesc_get_property(b, fnode,
-                                                              "name", NULL);
-                               if (!name_prop ||
-                                   strcmp(name_prop, "vdc-port"))
-                                       continue;
-                               fid = parent_cfg_handle(b, fnode);
-                               if (!fid) {
-                                       printk(KERN_ERR "MD: Cannot find ID "
-                                              "for vdc-port node.\n");
-                                       continue;
-                               }
-                       } else
-                               fid = mdesc_get_property(b, fnode,
-                                                        "id", NULL);
-
-                       if (*id == *fid) {
-                               found = 1;
+               /* Check each node in B for node matching a_node */
+               mdesc_for_each_node_by_name(b, b_node, name) {
+
+                       rv = get_info_func(b, b_node, &b_node_info);
+                       if (rv != 0)
+                               continue;
+
+                       if (node_match_func(&a_node_info, &b_node_info)) {
+                               found = true;
                                break;
                        }
                }
+
                if (!found)
-                       func(a, node);
+                       func(a, a_node, name);
+
        }
 }
 
@@ -368,6 +509,77 @@ out:
        mutex_unlock(&mdesc_mutex);
 }
 
+u64 mdesc_get_node(struct mdesc_handle *hp, char *node_name,
+       union md_node_info *node_info)
+{
+       mdesc_node_info_f get_info_func;
+       mdesc_node_match_f node_match_func;
+       u64 hp_node;
+       union md_node_info hp_node_info;
+       int rv;
+
+       if (hp == NULL || node_name == NULL || node_info == NULL)
+               return MDESC_NODE_NULL;
+
+       /* Find the ops for the given node name */
+       mdesc_get_node_ops(node_name, &get_info_func, &node_match_func);
+
+       /* If we didn't find a node_match func, the node is not supported */
+       if (get_info_func == NULL || node_match_func == NULL) {
+               printk(KERN_ERR "MD: %s: %s node is not supported\n",
+                   __func__, node_name);
+               return -EINVAL;
+       }
+
+       mdesc_for_each_node_by_name(hp, hp_node, node_name) {
+
+               rv = get_info_func(hp, hp_node, &hp_node_info);
+               if (rv != 0)
+                       continue;
+
+               if (node_match_func(node_info, &hp_node_info))
+                       break;
+       }
+
+       return hp_node;
+
+}
+EXPORT_SYMBOL(mdesc_get_node);
+
+int mdesc_get_node_info(struct mdesc_handle *hp, u64 node, char *node_name,
+       union md_node_info *node_info)
+{
+       mdesc_node_info_f get_info_func;
+       mdesc_node_match_f node_match_func;
+       int rv;
+
+       if (hp == NULL || node == MDESC_NODE_NULL ||
+           node_name == NULL || node_info == NULL)
+               return -EINVAL;
+
+       /* Find the get_info op for the given node name */
+       mdesc_get_node_ops(node_name, &get_info_func, &node_match_func);
+
+       /* If we didn't find a get_info_func, the node name is not supported */
+       if (get_info_func == NULL) {
+               printk(KERN_ERR "MD: %s: %s node is not supported\n",
+                   __func__, node_name);
+               return -EINVAL;
+       }
+
+       rv = get_info_func(hp, node, node_info);
+       if (rv != 0) {
+               printk(KERN_ERR "MD: %s: Cannot find 1 or more required "
+                   "match properties for %s node.\n", __func__, node_name);
+               return -1;
+       }
+
+       return 0;
+
+}
+EXPORT_SYMBOL(mdesc_get_node_info);
+
+
 static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
 {
        return (struct mdesc_elem *) (mdesc + 1);
index cb5789c9f9613ed692733d50dcf0e2c39784b1f7..4e02b3df6d57430775db28c584bdd69c202c5fb9 100644 (file)
@@ -62,14 +62,31 @@ static int vio_device_probe(struct device *dev)
        struct vio_dev *vdev = to_vio_dev(dev);
        struct vio_driver *drv = to_vio_driver(dev->driver);
        const struct vio_device_id *id;
-       int error = -ENODEV;
+       int error;
+
+       if (!drv->probe)
+               return -ENODEV;
+
+       id = vio_match_device(drv->id_table, vdev);
+       if (!id)
+               return -ENODEV;
+
+       /* alloc irqs (unless the driver specified not to) */
+       if (!drv->no_irq) {
+               if (vdev->tx_irq == 0 && vdev->tx_ino != ~0UL)
+                       vdev->tx_irq =
+                           sun4v_build_virq(vdev->dev_handle,
+                           vdev->tx_ino);
+
+               if (vdev->rx_irq == 0 && vdev->rx_ino != ~0UL)
+                       vdev->rx_irq =
+                           sun4v_build_virq(vdev->dev_handle,
+                           vdev->rx_ino);
 
-       if (drv->probe) {
-               id = vio_match_device(drv->id_table, vdev);
-               if (id)
-                       error = drv->probe(vdev, id);
        }
 
+       error = drv->probe(vdev, id);
+
        return error;
 }
 
@@ -78,9 +95,17 @@ static int vio_device_remove(struct device *dev)
        struct vio_dev *vdev = to_vio_dev(dev);
        struct vio_driver *drv = to_vio_driver(dev->driver);
 
-       if (drv->remove)
+       if (drv->remove) {
+
                return drv->remove(vdev);
 
+               /*
+                * Ideally, we would remove/deallocate tx/rx virqs
+                * here - however, there are currently no support
+                * routines to do so at the moment. TBD
+                */
+       }
+
        return 1;
 }
 
@@ -163,11 +188,55 @@ static struct device_node *cdev_node;
 static struct vio_dev *root_vdev;
 static u64 cdev_cfg_handle;
 
+static const u64 *vio_cfg_handle(struct mdesc_handle *hp, u64 node)
+{
+       const u64 *cfg_handle;
+       u64 a;
+
+       cfg_handle = NULL;
+       mdesc_for_each_arc(a, hp, node, MDESC_ARC_TYPE_BACK) {
+               u64 target;
+
+               target = mdesc_arc_target(hp, a);
+               cfg_handle = mdesc_get_property(hp, target,
+                                               "cfg-handle", NULL);
+               if (cfg_handle)
+                       break;
+       }
+
+       return cfg_handle;
+}
+
+/*
+ * vio_dev_node
+ * Find the node in the current MD which matches the
+ * given vio_dev. This must be done dynamically since the
+ * node value can change if the MD is updated.
+ * NOTE: the MD must be locked, using mdesc_grab(),
+ * when calling this routine!
+ */
+u64 vio_vdev_node(struct mdesc_handle *hp, struct vio_dev *vdev)
+{
+       u64 node;
+
+       if (vdev == NULL)
+               return MDESC_NODE_NULL;
+
+       node = mdesc_get_node(hp, vdev->node_name, &vdev->md_node_info);
+
+       return node;
+
+}
+EXPORT_SYMBOL(vio_vdev_node);
+
 static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
                                  struct vio_dev *vdev)
 {
        u64 a;
 
+       vdev->tx_ino = ~0UL;
+       vdev->rx_ino = ~0UL;
+       vdev->channel_id = ~0UL;
        mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) {
                const u64 *chan_id;
                const u64 *irq;
@@ -177,18 +246,19 @@ static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
 
                irq = mdesc_get_property(hp, target, "tx-ino", NULL);
                if (irq)
-                       vdev->tx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
+                       vdev->tx_ino = *irq;
 
                irq = mdesc_get_property(hp, target, "rx-ino", NULL);
-               if (irq) {
-                       vdev->rx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
+               if (irq)
                        vdev->rx_ino = *irq;
-               }
 
                chan_id = mdesc_get_property(hp, target, "id", NULL);
                if (chan_id)
                        vdev->channel_id = *chan_id;
        }
+
+       vdev->dev_handle = cdev_cfg_handle;
+
 }
 
 int vio_set_intr(unsigned long dev_ino, int state)
@@ -201,14 +271,13 @@ int vio_set_intr(unsigned long dev_ino, int state)
 EXPORT_SYMBOL(vio_set_intr);
 
 static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
-                                     struct device *parent)
+       char *node_name, struct device *parent)
 {
-       const char *type, *compat, *bus_id_name;
+       const char *type, *compat;
        struct device_node *dp;
        struct vio_dev *vdev;
        int err, tlen, clen;
        const u64 *id, *cfg_handle;
-       u64 a;
 
        type = mdesc_get_property(hp, mp, "device-type", &tlen);
        if (!type) {
@@ -218,7 +287,7 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
                        tlen = strlen(type) + 1;
                }
        }
-       if (tlen > VIO_MAX_TYPE_LEN) {
+       if (tlen > VIO_MAX_TYPE_LEN || strlen(type) >= VIO_MAX_TYPE_LEN) {
                printk(KERN_ERR "VIO: Type string [%s] is too long.\n",
                       type);
                return NULL;
@@ -226,31 +295,7 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
 
        id = mdesc_get_property(hp, mp, "id", NULL);
 
-       cfg_handle = NULL;
-       mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) {
-               u64 target;
-
-               target = mdesc_arc_target(hp, a);
-               cfg_handle = mdesc_get_property(hp, target,
-                                               "cfg-handle", NULL);
-               if (cfg_handle)
-                       break;
-       }
-
-       bus_id_name = type;
-       if (!strcmp(type, "domain-services-port"))
-               bus_id_name = "ds";
-
-       /*
-        * 20 char is the old driver-core name size limit, which is no more.
-        * This check can probably be removed after review and possible
-        * adaption of the vio users name length handling.
-        */
-       if (strlen(bus_id_name) >= 20 - 4) {
-               printk(KERN_ERR "VIO: bus_id_name [%s] is too long.\n",
-                      bus_id_name);
-               return NULL;
-       }
+       cfg_handle = vio_cfg_handle(hp, mp);
 
        compat = mdesc_get_property(hp, mp, "device-type", &clen);
        if (!compat) {
@@ -267,7 +312,6 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
                return NULL;
        }
 
-       vdev->mp = mp;
        memcpy(vdev->type, type, tlen);
        if (compat)
                memcpy(vdev->compat, compat, clen);
@@ -275,22 +319,23 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
                memset(vdev->compat, 0, sizeof(vdev->compat));
        vdev->compat_len = clen;
 
-       vdev->channel_id = ~0UL;
-       vdev->tx_irq = ~0;
-       vdev->rx_irq = ~0;
+       vdev->port_id = ~0UL;
+       vdev->tx_irq = 0;
+       vdev->rx_irq = 0;
 
        vio_fill_channel_info(hp, mp, vdev);
 
        if (!id) {
-               dev_set_name(&vdev->dev, "%s", bus_id_name);
+               dev_set_name(&vdev->dev, "%s", type);
                vdev->dev_no = ~(u64)0;
        } else if (!cfg_handle) {
-               dev_set_name(&vdev->dev, "%s-%llu", bus_id_name, *id);
+               dev_set_name(&vdev->dev, "%s-%llu", type, *id);
                vdev->dev_no = *id;
        } else {
-               dev_set_name(&vdev->dev, "%s-%llu-%llu", bus_id_name,
+               dev_set_name(&vdev->dev, "%s-%llu-%llu", type,
                             *cfg_handle, *id);
                vdev->dev_no = *cfg_handle;
+               vdev->port_id = *id;
        }
 
        vdev->dev.parent = parent;
@@ -312,7 +357,27 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
        }
        vdev->dp = dp;
 
-       printk(KERN_INFO "VIO: Adding device %s\n", dev_name(&vdev->dev));
+       /*
+        * node_name is NULL for the parent/channel-devices node and
+        * the parent doesn't require the MD node info.
+        */
+       if (node_name != NULL) {
+
+               strncpy(vdev->node_name, node_name, VIO_MAX_NAME_LEN);
+
+               err = mdesc_get_node_info(hp, mp, node_name,
+                   &vdev->md_node_info);
+               if (err) {
+                       printk(KERN_ERR "VIO: Could not get MD node "
+                           "info %s, err=%d\n", dev_name(&vdev->dev), err);
+                       kfree(vdev);
+                       return NULL;
+               }
+       }
+
+       printk(KERN_INFO "VIO: Adding device %s (tx_ino = %llx, "
+           "rx_ino = %llx)\n", dev_name(&vdev->dev), vdev->tx_ino,
+           vdev->rx_ino);
 
        err = device_register(&vdev->dev);
        if (err) {
@@ -328,26 +393,42 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
        return vdev;
 }
 
-static void vio_add(struct mdesc_handle *hp, u64 node)
+static void vio_add(struct mdesc_handle *hp, u64 node,
+       const char *node_name)
 {
-       (void) vio_create_one(hp, node, &root_vdev->dev);
+       (void) vio_create_one(hp, node, (char *)node_name, &root_vdev->dev);
 }
 
+struct vio_remove_node_data {
+       struct mdesc_handle *hp;
+       u64 node;
+};
+
 static int vio_md_node_match(struct device *dev, void *arg)
 {
        struct vio_dev *vdev = to_vio_dev(dev);
+       u64 node;
+       struct vio_remove_node_data *node_data;
 
-       if (vdev->mp == (u64) arg)
-               return 1;
+       node_data = (struct vio_remove_node_data *)arg;
+
+       node = vio_vdev_node(node_data->hp, vdev);
 
-       return 0;
+       if (node == node_data->node)
+               return 1;
+       else
+               return 0;
 }
 
-static void vio_remove(struct mdesc_handle *hp, u64 node)
+static void vio_remove(struct mdesc_handle *hp, u64 node, const char *node_name)
 {
        struct device *dev;
+       struct vio_remove_node_data node_data;
+
+       node_data.hp = hp;
+       node_data.node = node;
 
-       dev = device_find_child(&root_vdev->dev, (void *) node,
+       dev = device_find_child(&root_vdev->dev, (void *)&node_data,
                                vio_md_node_match);
        if (dev) {
                printk(KERN_INFO "VIO: Removing device %s\n", dev_name(dev));
@@ -368,7 +449,8 @@ static struct mdesc_notifier_client vio_device_notifier = {
  * under "openboot" that we should not mess with as aparently that is
  * reserved exclusively for OBP use.
  */
-static void vio_add_ds(struct mdesc_handle *hp, u64 node)
+static void vio_add_ds(struct mdesc_handle *hp, u64 node,
+       const char *node_name)
 {
        int found;
        u64 a;
@@ -385,7 +467,8 @@ static void vio_add_ds(struct mdesc_handle *hp, u64 node)
        }
 
        if (found)
-               (void) vio_create_one(hp, node, &root_vdev->dev);
+               (void) vio_create_one(hp, node, (char *)node_name,
+                   &root_vdev->dev);
 }
 
 static struct mdesc_notifier_client vio_ds_notifier = {
@@ -452,7 +535,7 @@ static int __init vio_init(void)
 
        cdev_cfg_handle = *cfg_handle;
 
-       root_vdev = vio_create_one(hp, root, NULL);
+       root_vdev = vio_create_one(hp, root, NULL, NULL);
        err = -ENODEV;
        if (!root_vdev) {
                printk(KERN_ERR "VIO: Could not create root device.\n");
index 526fcb5d8ce95d54c7afa7f5ea7c9c3a652dce3a..aff57cdd10b053fa99bbd2dbddb15648c1e40391 100644 (file)
@@ -113,7 +113,7 @@ void vio_link_state_change(struct vio_driver_state *vio, int event)
                        break;
                case VDEV_DISK_SERVER:
                        vio->dr_state = VIO_DR_STATE_RXREQ;
-                       break;
+                       return; /* VDS never initiates a handshake */
                }
                start_handshake(vio);
        } else if (event == LDC_EVENT_RESET) {
@@ -222,7 +222,11 @@ static int send_rdx(struct vio_driver_state *vio)
 
 static int send_attr(struct vio_driver_state *vio)
 {
-       return vio->ops->send_attr(vio);
+       if (vio->ops && vio->ops->send_attr)
+               return vio->ops->send_attr(vio);
+
+       return -EINVAL;
+
 }
 
 static struct vio_version *find_by_major(struct vio_driver_state *vio,
@@ -282,6 +286,7 @@ static int process_ver_info(struct vio_driver_state *vio,
                        ver.minor = vap->minor;
                pkt->minor = ver.minor;
                pkt->tag.stype = VIO_SUBTYPE_ACK;
+               pkt->dev_class = vio->dev_class;
                viodbg(HS, "SEND VERSION ACK maj[%u] min[%u]\n",
                       pkt->major, pkt->minor);
                err = send_ctrl(vio, &pkt->tag, sizeof(*pkt));
@@ -373,20 +378,23 @@ static int process_attr(struct vio_driver_state *vio, void *pkt)
        if (!(vio->hs_state & VIO_HS_GOTVERS))
                return handshake_failure(vio);
 
-       err = vio->ops->handle_attr(vio, pkt);
-       if (err < 0) {
-               return handshake_failure(vio);
-       } else {
-               vio->hs_state |= VIO_HS_GOT_ATTR;
+       if (vio->ops && vio->ops->handle_attr) {
+               err = vio->ops->handle_attr(vio, pkt);
+               if (err < 0) {
+                       return handshake_failure(vio);
+               } else {
+                       vio->hs_state |= VIO_HS_GOT_ATTR;
 
-               if ((vio->dr_state & VIO_DR_STATE_TXREQ) &&
-                   !(vio->hs_state & VIO_HS_SENT_DREG)) {
-                       if (send_dreg(vio) < 0)
-                               return handshake_failure(vio);
+                       if ((vio->dr_state & VIO_DR_STATE_TXREQ) &&
+                           !(vio->hs_state & VIO_HS_SENT_DREG)) {
+                               if (send_dreg(vio) < 0)
+                                       return handshake_failure(vio);
 
-                       vio->hs_state |= VIO_HS_SENT_DREG;
+                               vio->hs_state |= VIO_HS_SENT_DREG;
+                       }
                }
        }
+
        return 0;
 }
 
@@ -646,10 +654,14 @@ int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt)
                err = process_unknown(vio, pkt);
                break;
        }
+
        if (!err &&
            vio->hs_state != prev_state &&
-           (vio->hs_state & VIO_HS_COMPLETE))
-               vio->ops->handshake_complete(vio);
+           (vio->hs_state & VIO_HS_COMPLETE)) {
+
+               if (vio->ops && vio->ops->handshake_complete)
+                       vio->ops->handshake_complete(vio);
+       }
 
        return err;
 }
@@ -724,6 +736,10 @@ int vio_ldc_alloc(struct vio_driver_state *vio,
        cfg.tx_irq = vio->vdev->tx_irq;
        cfg.rx_irq = vio->vdev->rx_irq;
 
+       cfg.rx_ino = vio->vdev->rx_ino;
+       cfg.tx_ino = vio->vdev->tx_ino;
+       cfg.dev_handle = vio->vdev->dev_handle;
+
        lp = ldc_alloc(vio->vdev->channel_id, &cfg, event_arg, vio->name);
        if (IS_ERR(lp))
                return PTR_ERR(lp);
@@ -764,7 +780,11 @@ void vio_port_up(struct vio_driver_state *vio)
        }
 
        if (!err) {
-               err = ldc_connect(vio->lp);
+               if (ldc_mode(vio->lp) == LDC_MODE_RAW)
+                       ldc_set_state(vio->lp, LDC_STATE_CONNECTED);
+               else
+                       err = ldc_connect(vio->lp);
+
                if (err)
                        printk(KERN_WARNING "%s: Port %lu connect failed, "
                               "err=%d\n",
@@ -798,16 +818,22 @@ int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
        case VDEV_NETWORK_SWITCH:
        case VDEV_DISK:
        case VDEV_DISK_SERVER:
+       case VDEV_CONSOLE_CON:
+       case VDEV_VLDC:
                break;
 
        default:
                return -EINVAL;
        }
 
-       if (!ops->send_attr ||
-           !ops->handle_attr ||
-           !ops->handshake_complete)
-               return -EINVAL;
+       if (dev_class == VDEV_NETWORK ||
+           dev_class == VDEV_NETWORK_SWITCH ||
+           dev_class == VDEV_DISK ||
+           dev_class == VDEV_DISK_SERVER) {
+               if (!ops || !ops->send_attr || !ops->handle_attr ||
+                   !ops->handshake_complete)
+                       return -EINVAL;
+       }
 
        if (!ver_table || ver_table_size < 0)
                return -EINVAL;
index 3ccef9eba6f9dc53cecb785c23582cbdeb3b8618..fc6eb17192a9026c3aeed98881b0375fe6d80321 100644 (file)
@@ -570,4 +570,11 @@ config BLK_DEV_RSXX
          To compile this driver as a module, choose M here: the
          module will be called rsxx.
 
+config VDS
+       tristate "Sun virtual disk server (VDS)"
+       depends on SUN_LDOMS
+       default m
+       help
+               Support for Sun logical domain disks.
+
 endif # BLK_DEV
index 9cc6c18a1c7e2af444620b2d3257b100e9bc6f0f..b80fd295da423307a4d1852d224c554b2409c396 100644 (file)
@@ -26,6 +26,7 @@ obj-$(CONFIG_SUNVDC)          += sunvdc.o
 obj-$(CONFIG_BLK_DEV_NVME)     += nvme.o
 obj-$(CONFIG_BLK_DEV_SKD)      += skd.o
 obj-$(CONFIG_BLK_DEV_OSD)      += osdblk.o
+obj-$(CONFIG_VDS)              += vds/
 
 obj-$(CONFIG_BLK_DEV_UMEM)     += umem.o
 obj-$(CONFIG_BLK_DEV_NBD)      += nbd.o
index 4b911ed96ea3e8ee6996c74299fe8a9d414c1be0..2c59f2778064f3a611aa9d95c9fd00ab2c0e2dbc 100644 (file)
@@ -866,11 +866,19 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        struct vdc_port *port;
        int err;
        const u64 *ldc_timeout;
+       u64 node;
 
        print_version();
 
        hp = mdesc_grab();
 
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               printk(KERN_ERR PFX "Failed to get vdev MD node.\n");
+               err = -ENXIO;
+               goto err_out_release_mdesc;
+       }
+
        err = -ENODEV;
        if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) {
                printk(KERN_ERR PFX "Port id [%llu] too large.\n",
@@ -899,7 +907,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
         * a readahead I/O first, and once that fails it will try to read a
         * single page.
         */
-       ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
+       ldc_timeout = mdesc_get_property(hp, node, "vdc-timeout", NULL);
        port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
        setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer,
                    (unsigned long)port);
diff --git a/drivers/block/vds/Makefile b/drivers/block/vds/Makefile
new file mode 100644 (file)
index 0000000..102f076
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_VDS) := vds.o
+
+vds-y := vds_blk.o vds_efi.o vds_io.o vds_label.o vds_main.o vds_reg.o \
+        vds_vtoc.o
+
diff --git a/drivers/block/vds/vds.h b/drivers/block/vds/vds.h
new file mode 100644 (file)
index 0000000..28be0e2
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * vds.h: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/namei.h>
+#include <linux/device-mapper.h>
+#include <linux/sysfs.h>
+
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+/* One logical partition: start LBA and length in sectors (from VTOC/GPT). */
+struct vds_part {
+       sector_t                start;
+       sector_t                size;
+};
+
+#define        VDS_MAXPART     128     /* max # of logical partitions */
+#define        DK_LABEL_SIZE   512     /* size of disk label */
+
+/*
+ * Per-client state for one virtual disk server port.  Embeds the generic
+ * vio_driver_state as its first member so to_vds_port() can recover it.
+ */
+struct vds_port {
+       struct vio_driver_state vio;
+       u8                      flags;          /* VDS_PORT_* bits */
+       u8                      xfer_mode;
+       u8                      media_type;
+       u8                      label_type;     /* VDS_LABEL_{NONE,VTOC,EFI} */
+       u8                      npart;          /* valid entries in ->part */
+       u64                     max_xfer_size;
+       u64                     vdisk_size;     /* in vdisk_bsize units */
+       u32                     vdisk_bsize;    /* logical block size, bytes */
+       u32                     msglen;
+       u64                     seq;
+       const char              *path;          /* backend device/file path */
+       void                    *msgbuf;
+       struct vds_be_ops       *be_ops;        /* backend ops */
+       void                    *be_data;       /* backend private (e.g. bdev) */
+       struct mutex            label_lock;     /* serializes label access */
+       char                    label[DK_LABEL_SIZE];   /* for vtoc/gpt */
+       struct vds_part         *part;
+       struct vio_disk_geom    *geom;
+       struct vio_disk_vtoc    *vtoc;
+       struct workqueue_struct *ioq;           /* normal IO requests */
+       struct workqueue_struct *rtq;           /* reset/teardown work */
+};
+
+#define        VDS_PORT_SEQ            0x1
+
+/* Map an embedded vio_driver_state back to its enclosing vds_port. */
+static inline struct vds_port *to_vds_port(struct vio_driver_state *vio)
+{
+       return container_of(vio, struct vds_port, vio);
+}
+
+struct vds_io;
+
+/*
+ * Backend interface.
+ */
+struct vds_be_ops {
+       int (*init)(struct vds_port *port);     /* open/attach the backend */
+       void (*fini)(struct vds_port *port);    /* release the backend */
+       int (*rw)(struct vds_io *io);           /* synchronous read/write */
+       int (*flush)(struct vds_port *port);    /* flush backend cache */
+};
+
+struct vds_be_ops *vds_blk_get_ops(void);
+struct vds_be_ops *vds_reg_get_ops(void);
+
+int vds_be_init(struct vds_port *port);
+void vds_be_fini(struct vds_port *port);
+
+/*
+ * Label interface.
+ */
+void vds_label_init(struct vds_port *port);
+void vds_label_fini(struct vds_port *port);
+void vds_label_reset(struct vds_port *port);
+void vds_label_clear_part(struct vds_port *port);
+int vds_label_get_vtoc(struct vds_port *port);
+int vds_label_get_start(struct vds_port *port, int slice, sector_t *start);
+int vds_label_chk_iso(struct vds_port *port, bool *iso);
+
+int vds_efi_get(struct vds_port *port, sector_t lba, size_t len, void *data);
+int vds_efi_set(struct vds_port *port, sector_t lba, size_t len, void *data);
+int vds_efi_clear(struct vds_port *port);
+int vds_efi_validate(struct vds_port *port);
+
+int vds_vtoc_get(struct vds_port *port);
+int vds_vtoc_set(struct vds_port *port, struct vio_disk_vtoc *vtoc);
+int vds_vtoc_clear(struct vds_port *port);
+
+/*
+ * Serialize access to port label state.  The 'v' argument is not expanded
+ * directly, but vdsdbg() requires a local named 'vio' at the call site.
+ */
+#define        vds_label_lock(p, v)                                    \
+       do {                                                    \
+               vdsdbg(LOCK, "label lock\n");   \
+               mutex_lock(&(p)->label_lock);                   \
+       } while (0)
+
+#define        vds_label_unlock(p, v)                                  \
+       do {                                                    \
+               vdsdbg(LOCK, "label unlock\n"); \
+               mutex_unlock(&(p)->label_lock);                 \
+       } while (0)
+
+#define        VDS_LABEL_NONE          0
+#define        VDS_LABEL_VTOC          1
+#define        VDS_LABEL_EFI           2
+
+#define        VDS_EFI_GPT             1
+
+/*
+ * Solaris ENOTSUP error.  Solaris vdisk expects to receive this error
+ * when getting the vtoc or geometry of a disk with and EFI label.
+ */
+#define        VDS_ENOTSUP             48
+
+#define        ONE_MEGABYTE    (1ULL << 20)
+#define        ONE_GIGABYTE    (1ULL << 30)
+
+/*
+ * Take/release the vio state spinlock with IRQ save; 'f' is the caller's
+ * unsigned long flags variable.  Traced via the LOCK debug class.
+ */
+#define        vds_vio_lock(v, f)                              \
+       do {                                            \
+               vdsdbg(LOCK, "%s: lock\n", __func__);   \
+               spin_lock_irqsave(&(v)->lock, (f));     \
+       } while (0)
+
+#define        vds_vio_unlock(v, f)                            \
+       do {                                            \
+               vdsdbg(LOCK, "%s: unlock\n", __func__); \
+               spin_unlock_irqrestore(&(v)->lock, (f));        \
+       } while (0)
+
+#define VDS_DEBUG_INIT         0x01
+#define VDS_DEBUG_HS           0x02
+#define VDS_DEBUG_DATA         0x04
+#define VDS_DEBUG_LOCK         0x08
+#define VDS_DEBUG_WQ           0x10
+#define VDS_DEBUG_MEM          0x20
+#define VDS_DEBUG_IOC          0x40
+#define VDS_DEBUG_FLUSH                0x80
+#define VDS_DEBUG_IO           0x100
+#define VDS_DEBUG_BIO          0x200
+#define VDS_DEBUG_FIO          0x400
+
+extern int vds_dbg;
+extern int vds_dbg_ldc;
+extern int vds_dbg_vio;
+
+/*
+ * Class-filtered debug printout.  Relies on a local 'vio' pointer being in
+ * scope at the call site; enabled per-class through the vds_dbg bitmask.
+ */
+#define vdsdbg(TYPE, f, a...)                                          \
+       do {                                                            \
+               if (vds_dbg & VDS_DEBUG_##TYPE)                         \
+                       pr_info("vds: ID[%lu] %s " f,                   \
+                           vio->vdev->channel_id, __func__, ## a);     \
+       } while (0)
+
+#define        vdsmsg(type, f, a...)                                   \
+       pr_##type("%s: " f, __func__, ## a);
diff --git a/drivers/block/vds/vds_blk.c b/drivers/block/vds/vds_blk.c
new file mode 100644 (file)
index 0000000..42bc4a3
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ * vds_blk.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+
+#define        VDS_FMODE               (FMODE_READ | FMODE_WRITE | FMODE_EXCL)
+
+/*
+ * Block-device backend init: open the device named by port->path exclusively
+ * and derive disk geometry (block size, size in blocks, max transfer).
+ * Returns 0 or a negative errno from blkdev_get_by_path().
+ */
+static int vds_blk_init(struct vds_port *port)
+{
+       struct block_device *bdev;
+
+       bdev = blkdev_get_by_path(port->path, VDS_FMODE, (void *)port);
+       if (IS_ERR(bdev))
+               return (int)(PTR_ERR(bdev));
+
+       port->vdisk_bsize = bdev_logical_block_size(bdev);
+       port->vdisk_size = i_size_read(bdev->bd_inode) / port->vdisk_bsize;
+       /* max transfer expressed in vdisk blocks, from queue max sectors */
+       port->max_xfer_size = to_bytes(blk_queue_get_max_sectors(
+                             bdev_get_queue(bdev), 0)) / port->vdisk_bsize;
+
+       port->be_data = bdev;
+
+       return 0;
+}
+
+/* Release the block device opened by vds_blk_init(), if any. */
+static void vds_blk_fini(struct vds_port *port)
+{
+       struct block_device *bdev = port->be_data;
+
+       if (bdev)
+               blkdev_put(bdev, VDS_FMODE);
+}
+
+/*
+ * bio completion callback.  Records the first error seen and, under the
+ * vio lock, decrements io->count; the submitter holds one extra reference
+ * so complete() only fires once all bios AND the submitter are done.
+ */
+static void vds_blk_end_io(struct bio *bio, int error)
+{
+       struct vds_io *io = bio->bi_private;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       unsigned long flags;
+       int done;
+
+       vdsdbg(BIO, "bio_put(%p), count=%d\n", bio, atomic_read(&io->count));
+       bio_put(bio);
+
+       if (error) {
+               vdsmsg(err, "IO error (%d)\n", error);
+               /* keep only the first error; later ones are dropped */
+               if (!io->error)
+                       io->error = error;
+       }
+
+       /*
+        * Make sure complete() is called atomically for
+        * io.count == 0 and the IO operation is completely
+        * finished in case vds_event checks io.count.
+        */
+       BUG_ON(atomic_read(&io->count) <= 0);
+       vdsdbg(LOCK, "lock\n");
+       spin_lock_irqsave(&port->vio.lock, flags);
+       vdsdbg(WQ, "cpu=%d work=%p\n", smp_processor_id(), &io->vds_work);
+       done = atomic_dec_and_test(&io->count);
+       mb();   /* XXX need barrier? */
+       if (done)
+               complete(&io->event);
+       spin_unlock_irqrestore(&port->vio.lock, flags);
+       vdsdbg(LOCK, "unlock\n");
+}
+
+/*
+ * Block backend read/write.  Splits the request described by io (pages,
+ * offset, size) into one or more bios bounded by the negotiated max
+ * transfer size, submits them under a plug, and waits for all completions
+ * via io->count / io->event (see vds_blk_end_io()).
+ * Returns 0 on successful submission; per-bio errors land in io->error.
+ */
+static int vds_blk_rw(struct vds_io *io)
+{
+       int i;
+       int rw;
+       int done;
+       int err = 0;
+       struct bio *bio;
+       struct page *page, *pages;
+       unsigned npages;
+       unsigned long len;
+       unsigned long biolen, biomax;
+       sector_t offset, size, resid;
+       struct blk_plug plug;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct block_device *bdev = port->be_data;
+
+       vdsdbg(BIO, "(0x%p, %lld, %ld, %d)\n", io->pages, io->size,
+              io->offset, io->rw);
+
+       rw = io->rw;
+       size = to_sector(io->size);
+       offset = io->offset;
+       pages = io->pages;
+       npages = io->npages;
+       len = npages << PAGE_SHIFT;
+
+       rw |= REQ_SYNC; /* device IO is always sync */
+       resid = size;
+       i = 0;
+
+       /* extra self-reference so completions can't finish before we wait */
+       BUG_ON(atomic_read(&io->count));
+       atomic_set(&io->count, 1);
+       init_completion(&io->event);
+
+       /*
+        * Tell the driver to coalesce bio operations if possible.
+        */
+       blk_start_plug(&plug);
+
+       biomax = port->max_xfer_size * port->vdisk_bsize;
+
+       /*
+        * Break up the request into bio operations and submit them.
+        */
+       while (resid) {
+               bio = bio_alloc(GFP_NOIO, npages);
+               bio->bi_iter.bi_sector = offset + (size - resid);
+               bio->bi_bdev = bdev;
+               bio->bi_end_io = vds_blk_end_io;
+               bio->bi_private = io;
+
+               for (biolen = 0; resid; biolen += len) {
+                       int rv;
+
+                       /*
+                        * Try and add as many pages as possible.
+                        */
+                       BUG_ON(biolen > biomax);
+                       len = min(PAGE_SIZE, to_bytes(resid));
+                       len = min(len, biomax - biolen);
+                       if (!len)
+                               break;
+                       page = pages + i;
+
+                       /*
+                        * XXX Can offset be non-zero?
+                        */
+                       rv = bio_add_page(bio, page, len, 0);
+                       vdsdbg(BIO, "bio_add_page(%p, %p, %lx)=%d\n",
+                              bio, page, len, rv);
+                       vdsdbg(BIO, "bi_sector=%lu, bi_size=%u\n",
+                              bio->bi_iter.bi_sector, bio->bi_iter.bi_size);
+
+                       if (!rv) {
+                               vdsmsg(err,
+                                      "bio_add_page: resid=%ld biolen=%ld\n",
+                                      resid, biolen);
+                               err = -EIO;
+                               break;
+                       }
+
+                       i++;
+                       npages--;
+                       resid -= to_sector(len);
+                       vdsdbg(BIO, "npages=%d, resid=%lu\n", npages, resid);
+               }
+
+               if (err)
+                       break;
+
+               atomic_inc(&io->count);
+               mb();   /* XXX need barrier? */
+               vdsdbg(BIO, "submit_bio(%d, %p) count=%d\n",
+                      rw, bio, atomic_read(&io->count));
+               submit_bio(rw, bio);
+       }
+
+       blk_finish_plug(&plug); /* let the bio ops go... */
+
+       /*
+        * If the last bio completes after the dec_and_test check
+        * wait_for_completion() should not block and just return.
+        */
+       done = atomic_dec_and_test(&io->count);
+       mb();   /* XXX need barrier? */
+       if (!done)
+               wait_for_completion(&io->event);
+       vdsdbg(BIO, "io complete count=%d\n", atomic_read(&io->count));
+
+       return err;
+}
+
+/* Flush the backing device's write cache. */
+static int vds_blk_flush(struct vds_port *port)
+{
+       struct block_device *bdev = port->be_data;
+
+       return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+}
+
+/*
+ * Block-device backend ops table.  Designated initializers (kernel
+ * convention) so the table stays correct if vds_be_ops is ever reordered.
+ */
+struct vds_be_ops vds_blk_ops = {
+       .init   = vds_blk_init,
+       .fini   = vds_blk_fini,
+       .rw     = vds_blk_rw,
+       .flush  = vds_blk_flush,
+};
+
+/*
+ * Return the block-device backend ops table.  '(void)' matches the
+ * prototype in vds.h; an empty '()' parameter list means "unspecified
+ * arguments" in C and defeats prototype checking.
+ */
+struct vds_be_ops *vds_blk_get_ops(void)
+{
+       return &vds_blk_ops;
+}
diff --git a/drivers/block/vds/vds_efi.c b/drivers/block/vds/vds_efi.c
new file mode 100644 (file)
index 0000000..43c929c
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * vds_efi.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+#include <../block/partitions/check.h>
+#include <../block/partitions/efi.h>
+#include <linux/byteorder/generic.h>
+#include <linux/crc32.h>
+
+#define        VDS_EFI_GPE_LEN(port, nparts) \
+       roundup((sizeof(gpt_entry) * (nparts)), (port)->vdisk_bsize)
+
+/*
+ * Return a 32-bit CRC of the contents of the buffer.
+ *
+ * The seed is 0xffffffff and the result is XORed with 0xffffffff
+ * because this is what the Itanium firmware expects.
+ */
+static unsigned int vds_efi_crc32(const unsigned char *s, unsigned int len)
+{
+       /* seed ~0 and final XOR ~0 per the EFI/Itanium firmware convention */
+       return crc32(~0L, (void *)s, len) ^ ~0L;
+
+}
+
+/*
+ * vds_efi_crc_check
+ *
+ * Compute the CRC on the range of memory specified by (addr, len)
+ * and return whether that CRC value matches the value stored at
+ * the location referenced by crc_field.
+ */
+static int vds_efi_crc_check(u32 *crc_field, unsigned char *addr, u32 len)
+{
+       u32             crc_stored;
+       u32             crc_computed;
+       int             rv = 0;
+
+       /* CRC is computed with the stored field zeroed, then restored */
+       crc_stored = *crc_field;
+       *crc_field = cpu_to_le32(0);
+       crc_computed = vds_efi_crc32(addr, len);
+       *crc_field = crc_stored;
+
+       if (le32_to_cpu(crc_stored) != crc_computed) {
+               vdsmsg(warn,
+                      "Bad EFI CRC: (stored, computed): (0x%x, 0x%x)\n",
+                      crc_stored, crc_computed);
+               rv = -EINVAL;
+       }
+
+       return rv;      /* 0 on match, -EINVAL on mismatch */
+}
+
+/*
+ * Check that an EFI GPT is valid. This function should be called with a raw
+ * EFI GPT i.e. GPT data should be in little endian format as indicated in the
+ * EFI specification and they should not have been swapped to match with the
+ * system endianness.
+ */
+static int vds_efi_check_gpt(struct vio_driver_state *vio,
+                            gpt_header *gpt, size_t block_size)
+{
+       /* signature check on the raw (little-endian) on-disk header */
+       if (gpt->signature != cpu_to_le64(GPT_HEADER_SIGNATURE)) {
+               vdsdbg(IOC, "Bad EFI signature: 0x%llx != 0x%llx\n",
+                   (long long)gpt->signature,
+                   (long long)cpu_to_le64(GPT_HEADER_SIGNATURE));
+               return -EINVAL;
+       }
+
+       /*
+        * check CRC of the header; the size of the header should
+        * never be larger than one block
+        */
+       if (le32_to_cpu(gpt->header_size) > block_size) {
+               vdsmsg(warn, "Header (%u bytes) larger than one block (%u)\n",
+                      le32_to_cpu(gpt->header_size),
+                      (unsigned int)block_size);
+               return -EINVAL;
+       }
+
+       return vds_efi_crc_check(&gpt->header_crc32,
+           (unsigned char *)gpt, le32_to_cpu(gpt->header_size));
+}
+
+/*
+ * Rebuild port->part[] from the raw GPT entry array; entries with a zero
+ * start or end LBA are treated as unused and left cleared.
+ */
+static void vds_efi_update_part(struct vds_port *port, gpt_entry *gpe)
+{
+       int i;
+       u64 start, end;
+
+       vds_label_clear_part(port);
+
+       for (i = 0; i < port->npart; i++) {
+
+               start = le64_to_cpu(gpe[i].starting_lba);
+               end = le64_to_cpu(gpe[i].ending_lba);
+
+               if (start && end) {
+                       port->part[i].start = start;
+                       /* ending_lba is inclusive, hence the +1 */
+                       port->part[i].size = end - start + 1;
+               }
+       }
+}
+
+/*
+ * Validate a raw GPT header, then read the partition entry array from the
+ * backend and refresh port partition state.  Caps the partition count at
+ * VDS_MAXPART (non-fatal; see comment below).  Returns 0 or negative errno.
+ */
+static int vds_efi_update(struct vds_port *port, gpt_header *gpt)
+{
+       int rv;
+       u32 nparts;
+       size_t gpe_len;
+       sector_t lba;
+       gpt_entry *gpe = NULL;
+       struct vio_driver_state *vio = &port->vio;
+
+       /*
+        * Validate GPT and update partition info.
+        */
+       rv = vds_efi_check_gpt(vio, gpt, port->vdisk_bsize);
+       if (rv) {
+               vdsdbg(IOC, "bad EFI GPT\n");
+               return rv;
+       }
+
+       lba = le64_to_cpu(gpt->partition_entry_lba);
+       nparts = le32_to_cpu(gpt->num_partition_entries);
+
+       /*
+        * If the number of partitions represented in the GPT
+        * Header is larger than what is created by convention
+        * force the vdisk subsystem to use the conventional value.
+        *
+        * Note that we do not force a fatal error.  The vdisk
+        * client will not be able to access partitions beyond
+        * the specified value, but the vdisk client will also
+        * not fail on operations that access an EFI disk having
+        * a large number of unused partitions.
+        */
+       nparts = min_t(u32, nparts, VDS_MAXPART);
+       port->npart = nparts;
+
+       gpe_len = VDS_EFI_GPE_LEN(port, nparts);
+       if (gpe_len) {
+               gpe = kzalloc(gpe_len, GFP_KERNEL);
+               if (!gpe) {
+                       /* allocation failed: leave no stale partition count */
+                       port->npart = 0;
+                       return -ENOMEM;
+               }
+
+               rv = vds_read(port, (void *)gpe, lba, gpe_len);
+               if (rv) {
+                       kfree(gpe);
+                       port->npart = 0;
+                       return rv;
+               }
+
+               vds_efi_update_part(port, gpe);
+               kfree(gpe);
+       }
+
+       port->label_type = VDS_LABEL_EFI;
+
+       return 0;
+}
+
+/*
+ * Get the EFI GPT or GPE from the disk backend. The on-disk GPT and GPE
+ * are stored in little endian format and this function converts selected
+ * fields using the endianness of the system for it's internal use but the
+ * client data is returned unmodified.
+ *
+ * The number of partitions in an EFI GPT can be larger than what the vdisk
+ * subsystem supports.  Return the smaller of what is in the label and what
+ * the vdisk subsystem supports.
+ */
+int vds_efi_validate(struct vds_port *port)
+{
+       int rv;
+       struct vio_driver_state *vio = &port->vio;
+
+       /* read the primary GPT header (LBA 1) into the label cache */
+       rv = vds_read(port, port->label, VDS_EFI_GPT, DK_LABEL_SIZE);
+
+       if (!rv)
+               rv = vds_efi_update(port, (gpt_header *)port->label);
+
+       if (rv)
+               vdsdbg(IOC, "failed: rv=%d\n", rv);
+
+       return rv;
+}
+
+/* Read raw EFI data (GPT or GPE) from the backend; client data unmodified. */
+inline int vds_efi_get(struct vds_port *port, sector_t lba, size_t len,
+                      void *data)
+{
+       return vds_read(port, data, lba, len);
+}
+
+/*
+ * Write raw EFI data to the backend.  If the primary GPT header was just
+ * written, re-validate it to refresh cached partition state; validation
+ * errors are logged but do not fail the write (see comment below).
+ */
+int vds_efi_set(struct vds_port *port, sector_t lba, size_t len, void *data)
+{
+       int rv, err;
+       struct vio_driver_state *vio = &port->vio;
+
+       vdsdbg(IOC, "data=%p lba=%lu len=%lu\n", data, lba, len);
+
+       err = vds_write(port, data, lba, len);
+
+       if (err) {
+               vdsmsg(err, "write EFI label failed: rv=%d\n", err);
+       } else if (lba == VDS_EFI_GPT) {
+               rv = vds_efi_validate(port);
+               if (rv)
+                       /*
+                        * To convert from EFI to VTOC, Solaris format(1M)
+                        * clears the EFI signature, issues a GETGEOM command
+                        * and puts the EFI signature back on the disk, so
+                        * ignore invalid signature errors here just in case.
+                        */
+                       vdsdbg(IOC, "read EFI label failed: rv=%d\n", rv);
+       }
+
+       return err;
+}
+
+/*
+ * Zero the primary GPT (and best-effort the backup GPT at the last LBA),
+ * then reset cached label state.  Failure to clear the backup is logged
+ * but not treated as fatal.
+ */
+int vds_efi_clear(struct vds_port *port)
+{
+       int rv;
+       struct vio_driver_state *vio = &port->vio;
+
+       /*
+        * Clear primary and backup GPT.
+        */
+       rv = vds_clear(port, VDS_EFI_GPT, port->vdisk_bsize);
+       if (rv)
+               return rv;
+
+       rv = vds_clear(port, port->vdisk_size - 1, port->vdisk_bsize);
+       if (rv)
+               vdsdbg(IOC, "Clearing backup GPT failed rv=%d\n", rv);
+
+       vds_label_reset(port);
+
+       return 0;
+}
diff --git a/drivers/block/vds/vds_io.c b/drivers/block/vds/vds_io.c
new file mode 100644 (file)
index 0000000..d44bee6
--- /dev/null
@@ -0,0 +1,622 @@
+/*
+ * vds_io.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+
+#define        VDS_MAX_XFER_SIZE       (128 * 1024)
+#define        VDS_RETRIES             5
+#define VDS_DEV_DELAY           1000000 /* 1 sec */
+#define        VDS_SLICE_NONE          0xff
+
+static struct kmem_cache *vds_io_cache;
+static int vds_ioc_size;
+static char *vds_ioc_name = "vds_io";
+
+/*
+ * Module init: size and create the vds_io kmem cache.  The object size is
+ * chosen to hold a vds_io plus the larger of the dring-mode or inband
+ * descriptor footprint at the maximum cookie count.  Returns 0 or -ENOMEM.
+ */
+int vds_io_init(void)
+{
+       int max_entry;
+       int max_cookies;
+       int max_dring_mode;
+       int max_desc_mode;
+
+       /*
+        * Create a kmem_cache for vds_io allocations.
+        *
+        * The size of the cache object accomdate the largest possible
+        * IO transfer initiated from either dring or descriptor mode.
+        */
+       max_cookies = (roundup(VDS_MAX_XFER_SIZE, PAGE_SIZE) / PAGE_SIZE) + 1;
+       max_cookies = max(max_cookies, VIO_MAX_RING_COOKIES);
+       max_entry = max_cookies * sizeof(struct ldc_trans_cookie);
+
+       max_dring_mode = LDC_PACKET_SIZE + sizeof(struct vio_disk_desc) +
+                        max_entry;
+       max_desc_mode = sizeof(struct vio_disk_desc_inband) + max_entry;
+
+       vds_ioc_size = sizeof(struct vds_io) +
+                      max(max_dring_mode, max_desc_mode);
+
+       vds_io_cache = kmem_cache_create(vds_ioc_name, vds_ioc_size, 0,
+                                        0, NULL);
+       if (!vds_io_cache) {
+               vdsmsg(err, "Failed to create vds_io_cache\n");
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* Module teardown: destroy the vds_io kmem cache. */
+void vds_io_fini(void)
+{
+       kmem_cache_destroy(vds_io_cache);
+}
+
+/*
+ * Allocate a vds_io request structure.
+ *
+ * Allocate the structure from vds_io_cache if the total required
+ * space fits within a vds_io_cache object; otherwise use kmalloc().
+ *
+ * XXX In principle, the kmalloc() method should not be required
+ * since vds_io_cache should accommodate the largest supported IO
+ * transfer size defined as VDS_MAX_XFER_SIZE.  The max_xfer_size
+ * parameter is negotiated during the handshake and should be honored
+ * by all clients; however, it seems that OBP does not do that.
+ * This should not be an issue since VDS_MAX_XFER_SIZE should
+ * always be larger than any OBP transfer size but the kmalloc()
+ * option is there since an OBP transfer size > VDS_MAX_XFER_SIZE
+ * could theoretically cause memory corruption.
+ *
+ * The proper thing to do would be nack an non-conforming transfer size.
+ */
+/*
+ * Allocate and zero a vds_io, using the kmem cache when the request fits
+ * and kmalloc otherwise (see rationale above).  GFP_ATOMIC: callable from
+ * interrupt context.  Optionally binds a work function.  Returns NULL on
+ * allocation failure.
+ *
+ * Fix vs. original: on 'io' allocation failure we must return directly --
+ * the shared error path dereferences io->msgbuf/io->desc_buf and would
+ * NULL-deref if io itself was never allocated.
+ */
+struct vds_io *vds_io_alloc(struct vio_driver_state *vio,
+                           void (*func)(struct work_struct *))
+{
+       struct vds_port *port = to_vds_port(vio);
+       struct vds_io *io;
+       int size;
+
+       size = sizeof(*io) + port->msglen + vio->desc_buf_len;
+       vdsdbg(MEM, "size=%d ioc_size=%d\n", size, vds_ioc_size);
+
+       if (size <= vds_ioc_size) {
+               io = kmem_cache_zalloc(vds_io_cache, GFP_ATOMIC);
+
+               if (!io)
+                       return NULL;
+               io->flags = VDS_IO_CACHE;
+               /* msgbuf and desc_buf carved out of the trailing buffer */
+               io->msgbuf = io->buf;
+               io->desc_buf = io->buf + port->msglen;
+       } else {
+               io = kzalloc(sizeof(*io), GFP_ATOMIC);
+               if (!io)
+                       return NULL;
+               io->msgbuf = kzalloc(port->msglen, GFP_ATOMIC);
+               if (!io->msgbuf)
+                       goto err;
+               BUG_ON(!vio->desc_buf_len);
+               io->desc_buf = kzalloc(vio->desc_buf_len, GFP_ATOMIC);
+               if (!io->desc_buf)
+                       goto err;
+       }
+       io->vio = vio;
+       if (func)
+               INIT_WORK(&io->vds_work, func);
+
+       return io;
+
+err:
+       /* io is non-NULL here; kfree(NULL) is a no-op for the members */
+       kfree(io->msgbuf);
+       kfree(io->desc_buf);
+       kfree(io);
+
+       return NULL;
+}
+
+/* Free a vds_io back to its origin: kmem cache or the three kmallocs. */
+void vds_io_free(struct vds_io *io)
+{
+       if (io->flags & VDS_IO_CACHE) {
+               kmem_cache_free(vds_io_cache, io);
+       } else {
+               kfree(io->msgbuf);
+               kfree(io->desc_buf);
+               kfree(io);
+       }
+}
+
+/*
+ * Allocate a physically contiguous, page-aligned buffer of 'len' bytes
+ * (len must be a PAGE_SIZE multiple) for the IO data path.
+ * Returns 0 or -ENOMEM.
+ */
+static int vds_io_alloc_pages(struct vds_io *io, unsigned long len)
+{
+       struct vio_driver_state *vio = io->vio;
+
+       BUG_ON(len % PAGE_SIZE != 0);
+       io->ord = get_order(len);
+       io->pages = alloc_pages(GFP_KERNEL | __GFP_COMP, io->ord);
+       if (!io->pages)
+               return -ENOMEM;
+       io->npages = len >> PAGE_SHIFT;
+
+       vdsdbg(MEM, "ord=%d pages=%p npages=%d\n", io->ord, io->pages,
+              io->npages);
+
+       return 0;
+}
+
+/* Release pages from vds_io_alloc_pages() and clear the bookkeeping. */
+static void vds_io_free_pages(struct vds_io *io)
+{
+       __free_pages(io->pages, io->ord);
+
+       io->pages = NULL;
+       io->npages = 0;
+       io->ord = 0;
+}
+
+/*
+ * Queue a vds_io for worker execution from interrupt context:
+ * reset/teardown work (VDS_IO_FINI) goes to the rtq, normal IO to the ioq.
+ */
+void vds_io_enq(struct vds_io *io)
+{
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       vdsdbg(WQ, "cpu=%d\n", smp_processor_id());
+
+       BUG_ON(!in_interrupt());
+
+       if (io->flags & VDS_IO_FINI)
+               queue_work(port->rtq, &io->vds_work);
+       else
+               queue_work(port->ioq, &io->vds_work);
+}
+
+/*
+ * Perform a read or write through the backend using a bounce buffer:
+ * stage io->addr into freshly allocated pages for writes, call the
+ * backend rw op, and copy back out for reads.  Returns 0 or -errno.
+ */
+static int vds_io_rw(struct vds_io *io)
+{
+       int err;
+       void *buf;
+       unsigned long len;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       vdsdbg(IO, "(0x%p, %lld, %ld, %d)\n", io->addr, io->size,
+              io->offset, io->rw);
+
+       if (!to_sector(io->size))
+               return -EINVAL;
+
+       if (!port->be_ops)
+               return -EIO;
+
+       len = (unsigned long)roundup(io->size, PAGE_SIZE);
+       err = vds_io_alloc_pages(io, len);
+       if (err)
+               return err;
+
+       buf = page_address(io->pages);
+
+       BUG_ON(!buf);
+       BUG_ON(!io->addr);
+
+       if (io->rw & WRITE)
+               memcpy(buf, io->addr, io->size);
+
+       err = port->be_ops->rw(io);
+
+       if (!err && !(io->rw & WRITE))
+               memcpy(io->addr, buf, io->size);
+
+       vds_io_free_pages(io);
+
+       return err;
+}
+
+/*
+ * Common routine for read/write/clear interfaces.
+ */
+/*
+ * Common routine for read/write/clear interfaces.
+ * Wraps a one-shot vds_io around vds_io_rw(); rw is 0 for read, WRITE
+ * for write.  Returns 0, -ENOMEM on allocation failure, or the IO error.
+ */
+static int vds_rw(struct vds_port *port, void *addr, sector_t offset, u64 size,
+                 int rw)
+{
+       int rv = -ENOMEM;
+       struct vds_io *io;
+       struct vio_driver_state *vio = &port->vio;
+
+       io = vds_io_alloc(vio, NULL);
+       if (io) {
+               io->addr = addr;
+               io->offset = offset;
+               io->size = size;
+               io->rw = rw;
+               rv = vds_io_rw(io);
+               vds_io_free(io);
+       }
+
+       vdsdbg(IO, "addr=%p offset=%lu size=%llu rw=%d rv=%d\n",
+              addr, offset, size, rw, rv);
+
+       return rv;
+}
+
+/* Convenience wrappers over vds_rw() for reads and writes. */
+inline int vds_read(struct vds_port *port, void *addr, sector_t off, u64 size)
+{
+       return vds_rw(port, addr, off, size, 0);
+}
+
+inline int vds_write(struct vds_port *port, void *addr, sector_t off, u64 size)
+{
+       return vds_rw(port, addr, off, size, WRITE);
+}
+
+/* Zero 'size' bytes at 'offset' by writing a temporary zeroed buffer. */
+inline int vds_clear(struct vds_port *port, sector_t offset, u64 size)
+{
+       int rv;
+       void *addr;
+
+       addr = kzalloc(size, GFP_KERNEL);
+       if (!addr)
+               return -ENOMEM;
+
+       rv = vds_rw(port, addr, offset, size, WRITE);
+
+       kfree(addr);
+
+       return rv;
+}
+
+/*
+ * LDC copy between local buf and the client's cookies; size 0 means use
+ * desc->size.  ldc_copy returns bytes copied or negative errno; a short
+ * copy is reported as -EIO.  Returns 0 on a full copy.
+ */
+static int vds_copy(struct vio_driver_state *vio, int dir, void *buf,
+                   struct vio_disk_dring_payload *desc, u64 size, u64 offset)
+{
+       int rv, err;
+
+       if (!size)
+               size = desc->size;
+
+       rv = ldc_copy(vio->lp, dir, buf, size, offset, desc->cookies,
+                     desc->ncookies);
+       if (rv > 0) {
+               if (rv == size)
+                       err = 0;
+               else
+                       err = -EIO;
+       } else
+               err = rv;
+
+       vdsdbg(BIO, "dir=%d size=%llu offset=%llu rv=%d err=%d\n",
+              dir, size, offset, rv, err);
+
+       return err;
+}
+
+/*
+ * VD_OP_GET_VTOC: refresh label info and copy the VTOC out to the client.
+ * -EINVAL from the label read still copies out (presumably so an EFI disk
+ * returns the Solaris-expected state -- TODO confirm against vds_label).
+ */
+int vd_op_get_vtoc(struct vds_io *io)
+{
+       int rv;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       rv = vds_label_get_vtoc(port);
+       if (rv)
+               vdsdbg(IOC, "vds_label_get_vtoc rv=%d\n", rv);
+
+       if (rv == 0 || rv == -EINVAL)
+               rv = vds_copy(vio, LDC_COPY_OUT, port->vtoc, io->desc, 0, 0);
+
+       vdsdbg(IOC, "VD_OP_GET_VTOC ascii=%s\n", port->vtoc->ascii_label);
+       vdsdbg(IOC, "VD_OP_GET_VTOC rv=%d\n", rv);
+
+       return rv;
+}
+
+/*
+ * VD_OP_SET_VTOC: copy the client's VTOC in and write it; if the disk
+ * currently has an EFI label it is cleared first.  Serialized by the
+ * label lock.
+ */
+int vd_op_set_vtoc(struct vds_io *io)
+{
+       int rv = 0;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       vds_label_lock(port, vio);
+
+       rv = vds_copy(vio, LDC_COPY_IN, port->vtoc, io->desc, 0, 0);
+
+       if (rv == 0 && port->label_type == VDS_LABEL_EFI)
+               rv = vds_efi_clear(port);
+
+       if (!rv)
+               rv = vds_vtoc_set(port, port->vtoc);
+
+       vds_label_unlock(port, vio);
+
+       vdsdbg(IOC, "VD_OP_SET_VTOC ascii=%s\n", port->vtoc->ascii_label);
+       vdsdbg(IOC, "VD_OP_SET_VTOC rv=%d\n", rv);
+       return rv;
+}
+
+/*
+ * VD_OP_GET_DISKGEOM: refresh label info and copy the geometry out.
+ * As with vd_op_get_vtoc(), -EINVAL from the label read still copies out.
+ */
+int vd_op_get_geom(struct vds_io *io)
+{
+       int rv;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       rv = vds_label_get_vtoc(port);
+       if (rv)
+               vdsdbg(IOC, "vds_label_get_vtoc rv=%d\n", rv);
+
+       if (rv == 0 || rv == -EINVAL) {
+               struct vio_disk_geom *geom = port->geom;
+
+               vdsdbg(IOC, "ncyl=%u nhd=%u nsec=%u\n",
+                      geom->phy_cyl, geom->num_hd, geom->num_sec);
+
+               rv = vds_copy(vio, LDC_COPY_OUT, geom, io->desc, 0, 0);
+       }
+
+       vdsdbg(IOC, "VD_OP_GET_DISKGEOM rv=%d\n", rv);
+
+       return rv;
+}
+
+/* VD_OP_SET_DISKGEOM: copy the client-supplied geometry into port->geom. */
+int vd_op_set_geom(struct vds_io *io)
+{
+       int rv;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       rv = vds_copy(vio, LDC_COPY_IN, port->geom, io->desc, 0, 0);
+
+       vdsdbg(IOC, "VD_OP_SET_DISKGEOM rv=%d\n", rv);
+
+       return rv;
+}
+
+/*
+ * VD_OP_GET_EFI: copy in the client's request header (lba/len), read the
+ * raw EFI data, and copy the header + data back out in one contiguous
+ * buffer.  Serialized by the label lock.
+ */
+int vd_op_get_efi(struct vds_io *io)
+{
+       int rv;
+       size_t len;
+       void *data;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_efi efi_in;
+       struct vio_disk_efi *efi_out = NULL;
+
+       rv = vds_copy(vio, LDC_COPY_IN, &efi_in, io->desc, sizeof(efi_in), 0);
+       if (rv)
+               goto done;
+
+       vds_label_lock(port, vio);
+
+       /*
+        * Adjust the required len by an additional VIO EFI header
+        * so that the returned results are contiguous and can be
+        * copied out all at once.
+        */
+       len = efi_in.len + sizeof(struct vio_disk_efi);
+       efi_out = kzalloc(len, GFP_KERNEL);
+       if (efi_out) {
+               data = (void *)efi_out + sizeof(struct vio_disk_efi);
+               rv = vds_efi_get(port, efi_in.lba, efi_in.len, data);
+       } else
+               rv = -ENOMEM;
+
+       if (!rv) {
+               efi_out->lba = efi_in.lba;
+               efi_out->len = efi_in.len;
+               rv = vds_copy(vio, LDC_COPY_OUT, efi_out, io->desc, len, 0);
+       }
+
+       vds_label_unlock(port, vio);
+
+done:
+       vdsdbg(IOC, "VD_OP_GET_EFI rv=%d\n", rv);
+       kfree(efi_out);
+
+       return rv;
+}
+
+/*
+ * VD_OP_SET_EFI: copy the client's EFI data in and write it via
+ * vds_efi_set(); an existing VTOC label is cleared first.  Serialized by
+ * the label lock.
+ */
+int vd_op_set_efi(struct vds_io *io)
+{
+       int rv;
+       struct vio_disk_efi *efi;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+
+       /* roundup to 8 keeps the LDC copy length aligned */
+       efi = kzalloc(roundup(io->desc->size, 8), GFP_KERNEL);
+       if (!efi) {
+               rv = -ENOMEM;
+               goto done;
+       }
+
+       vds_label_lock(port, vio);
+
+       rv = vds_copy(vio, LDC_COPY_IN, efi, io->desc, 0, 0);
+
+       if (rv == 0 && port->label_type == VDS_LABEL_VTOC)
+               rv = vds_vtoc_clear(port);
+
+       if (!rv)
+               rv = vds_efi_set(port, efi->lba, efi->len, efi->data);
+
+       vds_label_unlock(port, vio);
+
+done:
+       vdsdbg(IOC, "VD_OP_SET_EFI rv=%d\n", rv);
+       kfree(efi);
+
+       return rv;
+}
+
+/*
+ * VD_OP_FLUSH: drain all queued IO for the port, then ask the
+ * backend to flush.  Returns the backend's result, or -EIO when no
+ * backend is attached.
+ */
+int vd_op_flush(struct vio_driver_state *vio)
+{
+       struct vds_port *port = to_vds_port(vio);
+       int rv = -EIO;
+
+       if (port->be_ops) {
+               /* Make sure in-flight requests hit the backend first. */
+               flush_workqueue(port->ioq);
+               rv = port->be_ops->flush(port);
+       }
+
+       vdsdbg(FLUSH, "VD_OP_FLUSH rv=%d\n", rv);
+       return rv;
+}
+
+/*
+ * VD_OP_BREAD/VD_OP_BWRITE: transfer desc->size bytes between the
+ * client's LDC cookies and the backend at the requested block offset.
+ *
+ * Returns 0 on success or a negative errno.  Requests that are
+ * malformed (zero size, unknown slice) are NACKed via io->ack rather
+ * than treated as transport errors.
+ */
+int vd_op_rw(struct vds_io *io)
+{
+       int err = 0;
+       u8 slice;
+       unsigned long len, dsz;
+       sector_t offset, size, start;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_dring_payload *desc;
+       void *buf;
+
+       desc = io->desc;
+
+       /*
+        * Get the request size and block offset.
+        */
+       offset = to_sector((desc->offset * port->vdisk_bsize));
+       size = to_sector(desc->size);
+       if (!size) {
+               io->ack = VIO_SUBTYPE_NACK;
+               goto done;
+       }
+
+       /*
+        * If a slice is provided, make sure there is label info
+        * to read the slice offset from.
+        */
+       slice = desc->slice;
+       if (slice != VDS_SLICE_NONE) {
+               err = vds_label_get_start(port, slice, &start);
+               if (err) {
+                       io->ack = VIO_SUBTYPE_NACK;
+                       goto done;
+               }
+               offset += start;
+       }
+
+       /*
+        * Allocate pages for io.
+        *
+        * Calculate one page per cookie rather than using desc->size
+        * because for example a PAGE_SIZE request may be split across a
+        * number of cookies.
+        *
+        * XXX Coalesce cookies with contiguous addresses in order to
+        * reduce the number of page allocations and bio requests.
+        */
+       len = (unsigned long)desc->ncookies * PAGE_SIZE;
+       dsz = (unsigned long)roundup(desc->size, PAGE_SIZE);
+       len = max(len, dsz);
+       err = vds_io_alloc_pages(io, len);
+       if (err)
+               goto done;
+
+       buf = page_address(io->pages);
+
+       /* For writes, pull the client data in before calling the backend. */
+       if (io->rw & WRITE) {
+               err = vds_copy(vio, LDC_COPY_IN, buf, desc, 0, 0);
+               if (err)
+                       goto free;
+       }
+
+       /*
+        * Call the backend to perform the actual operation.
+        */
+       io->size = desc->size;
+       io->offset = offset;
+
+       if (port->be_ops)
+               err = port->be_ops->rw(io);
+       else
+               err = -EIO;
+
+       /* For reads, push the result back out to the client's cookies. */
+       if (!err && !(io->rw & WRITE))
+               err = vds_copy(vio, LDC_COPY_OUT, buf, desc, 0, 0);
+
+free:
+       vds_io_free_pages(io);
+
+       /* A write to the first sectors may have replaced the disk label;
+        * re-read and re-cache the partition info. */
+       if (offset <= 1 && (io->rw & WRITE))
+               vds_label_init(port);
+
+done:
+       return err;
+}
+
+/*
+ * Backend operations.
+ */
+/*
+ * Resolve the exported path, attach the matching backend (regular
+ * file or block device), initialize it, and cache media/label info.
+ * Returns 0 on success or a negative errno.
+ */
+int vds_be_init(struct vds_port *port)
+{
+       int i, rv;
+       bool iso;
+       umode_t mode;
+       struct path path;
+       struct inode *inode;
+       struct vio_driver_state *vio = &port->vio;
+
+       rv = kern_path(port->path, LOOKUP_FOLLOW, &path);
+       if (rv)
+               goto done;
+
+       inode = path.dentry->d_inode;
+       mode = inode->i_mode;
+       path_put(&path);
+
+       if (S_ISREG(mode))
+               port->be_ops = vds_reg_get_ops();
+       else if (S_ISBLK(mode))
+               port->be_ops = vds_blk_get_ops();
+       else
+               rv = -ENODEV;
+
+       /* Backends may need a few attempts while the device settles. */
+       if (!rv)
+               for (i = 0; i < VDS_RETRIES; i++) {
+                       rv = port->be_ops->init(port);
+                       if (rv == 0 || rv != -EAGAIN)
+                               break;
+                       udelay(VDS_DEV_DELAY);
+               }
+
+       /*
+        * Bail out on any failure so far; falling through would mask
+        * the original errno with -EINVAL or run the media/label probes
+        * against an uninitialized backend.
+        */
+       if (rv)
+               goto done;
+
+       vdsdbg(HS, "vdisk_blk_sz=%u vdisk_sz=%llu max_xfer_sz=%llu\n",
+              port->vdisk_bsize, port->vdisk_size, port->max_xfer_size);
+
+       if (!(port->vdisk_bsize && port->vdisk_size && port->max_xfer_size)) {
+               rv = -EINVAL;
+               goto done;
+       }
+
+       rv = vds_label_chk_iso(port, &iso);
+       if (rv) {
+               vdsmsg(err, "media check error\n");
+               goto done;
+       }
+
+       /*
+        * Indicate whether to call this a CD or DVD from the size
+        * of the ISO image (images for both drive types are stored
+        * in the ISO 9660 format). CDs can store up to just under 1Gb
+        */
+       if (!iso)
+               port->media_type = VD_MEDIA_TYPE_FIXED;
+       else if ((port->vdisk_size * port->vdisk_bsize) > ONE_GIGABYTE)
+               port->media_type = VD_MEDIA_TYPE_DVD;
+       else
+               port->media_type = VD_MEDIA_TYPE_CD;
+
+       /* Cache partition/label info for subsequent sliced IO. */
+       vds_label_init(port);
+
+done:
+       if (rv)
+               vdsmsg(err, "%s: init failed (%d)\n", port->path, rv);
+
+       return rv;
+}
+
+/*
+ * Tear down the backend for a port: drain queued IO work, drop the
+ * cached label state, then let the backend release its resources.
+ */
+void vds_be_fini(struct vds_port *port)
+{
+       flush_workqueue(port->ioq);
+       vds_label_fini(port);
+       if (port->be_ops) {
+               port->be_ops->fini(port);
+               port->be_data = NULL;
+       }
+}
diff --git a/drivers/block/vds/vds_io.h b/drivers/block/vds/vds_io.h
new file mode 100644 (file)
index 0000000..6faf752
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * vds_io.h: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+struct vds_port;
+
+/*
+ * IO interface.
+ *
+ * I/O struct allocated dynamically per client request.
+ * A request is scheduled in interrupt context and executed later
+ * in a worker kernel thread in process context.  The default events
+ * worker threads are used (1 per cpu).
+ * A client request may cause a number of bio operations which
+ * are tracked by count below.
+ */
+struct vds_io {
+       int flags;                      /* VDS_IO_* request flags */
+       int ack;                        /* VIO_SUBTYPE_ACK or _NACK reply */
+       int error;                      /* errno reported back to client */
+       u32 msglen;                     /* length of msgbuf contents */
+       atomic_t count;                 /* outstanding bio operations */
+       void *msgbuf;                   /* copy of the triggering VIO msg */
+       void *desc_buf;                 /* raw descriptor (dring/inband) */
+       struct vio_disk_dring_payload *desc;    /* payload within desc_buf */
+       struct vio_driver_state *vio;   /* owning channel state */
+       int rw;                         /* 0 for read, WRITE for write */
+       u64 size;                       /* request size in bytes */
+       unsigned ord;                   /* page allocation order */
+       void *addr;
+       sector_t offset;                /* starting sector on the backend */
+       unsigned npages;
+       struct page *pages;             /* data pages for the transfer */
+       struct completion event;
+       struct work_struct vds_work;    /* worker-thread execution hook */
+       char buf[0];                    /* trailing msgbuf/desc_buf space */
+};
+
+#define        VDS_IO_CACHE            0x1
+#define        VDS_IO_INIT             0x2
+#define        VDS_IO_FINI             0x4
+
+/* IO request lifecycle: module setup, per-request alloc/free/queue. */
+int vds_io_init(void);
+void vds_io_fini(void);
+struct vds_io *vds_io_alloc(struct vio_driver_state *vio,
+                           void (*func)(struct work_struct *));
+void vds_io_free(struct vds_io *io);
+void vds_io_enq(struct vds_io *io);
+
+/* Synchronous backend accessors used by the label/vtoc/efi code. */
+void *vds_get(struct vds_port *port, sector_t offset, u64 size);
+int vds_clear(struct vds_port *port, sector_t offset, u64 size);
+int vds_read(struct vds_port *port, void *addr, sector_t offset, u64 size);
+int vds_write(struct vds_port *port, void *addr, sector_t offset, u64 size);
+
+/*
+ * VIO interface.
+ */
+int vd_op_get_vtoc(struct vds_io *io);
+int vd_op_set_vtoc(struct vds_io *io);
+int vd_op_get_geom(struct vds_io *io);
+int vd_op_set_geom(struct vds_io *io);
+int vd_op_get_efi(struct vds_io *io);
+int vd_op_set_efi(struct vds_io *io);
+int vd_op_flush(struct vio_driver_state *vio);
+int vd_op_rw(struct vds_io *io);
diff --git a/drivers/block/vds/vds_label.c b/drivers/block/vds/vds_label.c
new file mode 100644 (file)
index 0000000..bf08a10
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * vds_label.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+#include <linux/iso_fs.h>
+
+#define        ISO_VOLDESC_SEC 16      /* 1st sector of volume descriptors */
+
+/* Zero the cached partition table (all VDS_MAXPART entries). */
+inline void vds_label_clear_part(struct vds_port *port)
+{
+       memset(port->part, 0, sizeof(*port->part) * VDS_MAXPART);
+}
+
+/*
+ * Drop all cached label state: partition table, partition count and
+ * label type.  Callers must hold the label lock.
+ */
+void vds_label_reset(struct vds_port *port)
+{
+       struct vio_driver_state *vio = &port->vio;
+
+       vdsdbg(IOC, "media=%u label=%u\n", port->media_type, port->label_type);
+       vds_label_clear_part(port);
+       port->npart = 0;
+       port->label_type = VDS_LABEL_NONE;
+}
+
+/*
+ * Determine whether the exported disk holds an ISO 9660 image.
+ * Sets *iso accordingly; returns 0 on success or a negative errno
+ * (in which case *iso is untouched).
+ */
+int vds_label_chk_iso(struct vds_port *port, bool *iso)
+{
+       struct vio_driver_state *vio = &port->vio;
+       struct iso_volume_descriptor *desc;
+       char buf[ISOFS_BLOCK_SIZE];
+       sector_t sec;
+       int rv;
+
+       /*
+        * The volume descriptors of an ISO 9660 image start at byte
+        * offset ISO_VOLDESC_SEC * ISOFS_BLOCK_SIZE; a compliant image
+        * carries the standard identifier "CD001" in the id field.
+        */
+       sec = (ISO_VOLDESC_SEC * ISOFS_BLOCK_SIZE) / port->vdisk_bsize;
+       rv = vds_read(port, buf, sec, ISOFS_BLOCK_SIZE);
+       if (rv)
+               goto done;
+
+       desc = (struct iso_volume_descriptor *)buf;
+       *iso = (strncmp(desc->id, ISO_STANDARD_ID, sizeof(desc->id)) == 0);
+
+done:
+       vdsdbg(IOC, "media=%d rv=%d\n", port->media_type, rv);
+       return rv;
+}
+
+/*
+ * Cache the label info since partition offsets are needed for
+ * IO requests against a particular slice vs. VD_SLICE_NONE.
+ *
+ * A call to vds_label_init() unconditionally reads the label
+ * (VTOC/EFI) from the disk and caches the result if the read
+ * succeeds.
+ *
+ * Don't check for errors here since VD_SLICE_NONE requests
+ * don't need partition offsets; instead any IO request requiring
+ * partition info will later fail.
+ */
+/*
+ * Re-read the disk label and refresh the cached partition info.
+ * Failure to identify a label is not an error here; see the comment
+ * above about VD_SLICE_NONE requests.
+ */
+void vds_label_init(struct vds_port *port)
+{
+       struct vio_driver_state *vio = &port->vio;
+       int rv;
+
+       /*
+        * Set the ops according to the label type (VTOC/EFI)
+        * and init as appropriate.  Make sure ops is set
+        * atomically and cannot change while the label info is
+        * fetched.  This is conceivably possible if multiple
+        * requests are processed in concurrent work threads.
+        */
+       vds_label_lock(port, vio);
+
+       if (port->npart)
+               vdsdbg(INIT, "existing partitions (%d).\n", port->npart);
+
+       vds_label_reset(port);
+
+       /* Try VTOC first; -EINVAL means "not a VTOC", so probe EFI. */
+       rv = vds_vtoc_get(port);
+       if (rv == -EINVAL)
+               rv = vds_efi_validate(port);
+
+       if (rv)
+               vdsdbg(INIT, "unknown disk label\n");
+
+       vds_label_unlock(port, vio);
+}
+
+/* Discard cached label state under the label lock (teardown path). */
+void vds_label_fini(struct vds_port *port)
+{
+       struct vio_driver_state *vio = &port->vio;
+
+       vds_label_lock(port, vio);
+       vds_label_reset(port);
+       vds_label_unlock(port, vio);
+}
+
+/*
+ * Refresh and return the VTOC label.  If the disk turns out to carry
+ * an EFI label instead, report -VDS_ENOTSUP so the caller can tell
+ * the client that VTOC operations do not apply.
+ */
+int vds_label_get_vtoc(struct vds_port *port)
+{
+       int rv;
+       struct vio_driver_state *vio = &port->vio;
+
+       vds_label_lock(port, vio);
+
+       vds_label_reset(port);
+
+       rv = vds_vtoc_get(port);
+       if (rv == -EINVAL) {
+               /* Not a VTOC; probe for EFI purely to classify the disk. */
+               (void) vds_efi_validate(port);
+               if (port->label_type == VDS_LABEL_EFI)
+                       rv = -VDS_ENOTSUP;
+       }
+
+       vds_label_unlock(port, vio);
+
+       return rv;
+}
+
+/*
+ * Look up the starting sector of a slice in the cached partition
+ * table.  Returns 0 and sets *start on success; returns -EIO (with
+ * *start zeroed) when the slice is unknown or no label is cached.
+ */
+int vds_label_get_start(struct vds_port *port, int slice, sector_t *start)
+{
+       struct vio_driver_state *vio = &port->vio;
+       int rv = -EIO;
+
+       /* Initialize the out-parameter so the failure path never
+        * reads (and logs) uninitialized stack memory. */
+       *start = 0;
+
+       vds_label_lock(port, vio);
+       /* Guard against a negative slice as well: slice < npart alone
+        * would let it index before the partition array. */
+       if (slice >= 0 && slice < port->npart) {
+               *start = port->part[slice].start;
+               rv = 0;
+       }
+       vds_label_unlock(port, vio);
+
+       vdsdbg(IO, "(%d)=(%d, %lu)\n", slice, rv, *start);
+
+       return rv;
+}
diff --git a/drivers/block/vds/vds_main.c b/drivers/block/vds/vds_main.c
new file mode 100644 (file)
index 0000000..e9eae09
--- /dev/null
@@ -0,0 +1,949 @@
+/*
+ * vds_main.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+
+#define DRV_MOD_NAME           "vds"
+#define DRV_MOD_VERSION                "1.0"
+
+/* Version banner; presumably printed at module load — see probe code. */
+static char version[] = DRV_MOD_NAME ".c:v" DRV_MOD_VERSION "\n";
+MODULE_DESCRIPTION("LDOM virtual disk server driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MOD_VERSION);
+
+/* Bitmask of disk operations advertised to clients in the ATTR reply. */
+#define        VDS_OPS                 (1 << VD_OP_BREAD |             \
+                                1 << VD_OP_BWRITE |            \
+                                1 << VD_OP_GET_VTOC |          \
+                                1 << VD_OP_SET_VTOC |          \
+                                1 << VD_OP_GET_DISKGEOM |      \
+                                1 << VD_OP_SET_DISKGEOM |      \
+                                1 << VD_OP_GET_EFI |           \
+                                1 << VD_OP_SET_EFI |           \
+                                1 << VD_OP_FLUSH)
+/*
+ * XXX The recommended value is 0 but that creates threads
+ * which scale with ncpu and because of some apparent
+ * flow control issues cause scsi timeouts so limit to
+ * 1 thread for now.
+ */
+int vds_wq = 1;
+/* Debug verbosity knobs for the driver, LDC and VIO layers. */
+int vds_dbg;
+int vds_dbg_ldc;
+int vds_dbg_vio;
+
+module_param(vds_dbg, uint, 0664);
+module_param(vds_dbg_ldc, uint, 0664);
+module_param(vds_dbg_vio, uint, 0664);
+module_param(vds_wq, uint, 0664);
+
+/* Ordered from largest major to lowest */
+static struct vio_version vds_versions[] = {
+       { .major = 1, .minor = 1 },
+       { .major = 1, .minor = 0 },
+};
+
+/* Handshake finished: restart both RX-ring sequence counters at 1. */
+static void vds_handshake_complete(struct vio_driver_state *vio)
+{
+       struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_RX_RING];
+
+       dr->rcv_nxt = 1;
+       dr->snd_nxt = 1;
+}
+
+/*
+ * An unrecognized message arrived: log its tag, drop the LDC link and
+ * report -ECONNRESET so the caller tears the session down.
+ */
+static int vds_handle_unknown(struct vds_port *port)
+{
+       struct vio_msg_tag *pkt = port->msgbuf;
+
+       vdsmsg(err, "Received unknown msg [%02x:%02x:%04x:%08x]\n",
+              pkt->type, pkt->stype, pkt->stype_env, pkt->sid);
+       vdsmsg(err, "Resetting connection.\n");
+
+       ldc_disconnect(port->vio.lp);
+
+       return -ECONNRESET;
+}
+
+/* vio_driver_init() expects this. */
+/* The server never initiates the attribute exchange; no-op by design. */
+static int vds_send_attr(struct vio_driver_state *vio)
+{
+       return 0;
+}
+
+/*
+ * Handle the client's ATTR INFO message: validate it, size the message
+ * buffer for in-band descriptor mode, then ACK with the server's disk
+ * geometry and supported operations.  Returns the vio_ldc_send()
+ * result, or -ECONNRESET on a malformed message or allocation failure.
+ */
+static int vds_handle_attr(struct vio_driver_state *vio, void *arg)
+{
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_attr_info *pkt = arg;
+
+       /* checkpatch.pl doesn't like split format strings */
+       vdsdbg(HS, "GOT ATTR stype[0x%x] stype_env[0x%x] ",
+              pkt->tag.stype, pkt->tag.stype_env);
+
+       vdsdbg(HS, "xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+              pkt->xfer_mode, pkt->vdisk_block_size, pkt->max_xfer_size);
+
+       if (pkt->tag.type != VIO_TYPE_CTRL ||
+           pkt->tag.stype != VIO_SUBTYPE_INFO ||
+           pkt->tag.stype_env != VIO_ATTR_INFO ||
+           pkt->max_xfer_size == 0) {
+               vdsmsg(err, "%s: Attribute NACK\n", vio->name);
+               return -ECONNRESET;
+       }
+
+       if (pkt->xfer_mode == VIO_DESC_MODE) {
+               struct vio_disk_attr_info tmp;
+
+               /*
+                * vio_disk_dring_inband contains no cookies; need room
+                * for up to n cookies, where "n" is the number of full
+                * pages plus possibly one partial page required to cover
+                * "max_xfer_size".  Add room for one more cookie if
+                * "max_xfer_size" isn't an integral multiple of the page size.
+                * Must first get the maximum transfer size in bytes.
+                */
+               size_t max_xfer_bytes = pkt->vdisk_block_size ?
+                   pkt->vdisk_block_size * pkt->max_xfer_size :
+                   pkt->max_xfer_size;
+
+               size_t max_inband_msglen =
+                   sizeof(struct vio_disk_desc_inband) +
+                   (((roundup(max_xfer_bytes, PAGE_SIZE) / PAGE_SIZE) + 1) *
+                   sizeof(struct ldc_trans_cookie));
+
+               vdsdbg(HS, "DESC ATTR max_ibm=%ld\n", max_inband_msglen);
+
+               /*
+                * Set the maximum expected message length to
+                * accommodate in-band-descriptor messages with all
+                * their cookies.
+                */
+               vio->desc_buf_len = max_inband_msglen;
+
+               /*
+                * Reallocate before responding to the message since
+                * the next request in the handshake will use this size
+                * and a small msgbuf would make the ldc read fail.
+                * pkt aliases the old msgbuf, so stage a copy first.
+                */
+               tmp = *pkt;
+               kfree(port->msgbuf);
+               port->msglen = max_inband_msglen;
+               port->msgbuf = kzalloc(port->msglen, GFP_ATOMIC);
+               if (!port->msgbuf) {
+                       vdsmsg(err, "%s: kzalloc failed\n", vio->name);
+                       return -ECONNRESET;
+               }
+               memcpy(port->msgbuf, &tmp, sizeof(tmp));
+               pkt = port->msgbuf;
+
+       }
+
+       port->xfer_mode = pkt->xfer_mode;
+
+       /* Reuse the request packet as the ACK, overwriting the fields
+        * with the server's view of the disk. */
+       pkt->vdisk_block_size = port->vdisk_bsize;
+
+       /* XXX OBP doesn't seem to honor max_xfer_size */
+       pkt->max_xfer_size = port->max_xfer_size;
+       pkt->vdisk_size = port->vdisk_size;
+       pkt->vdisk_type = VD_DISK_TYPE_DISK;
+       pkt->vdisk_mtype = port->media_type;
+       pkt->operations = VDS_OPS;
+       pkt->tag.stype = VIO_SUBTYPE_ACK;
+       pkt->tag.sid = vio_send_sid(vio);
+
+       vdsdbg(HS, "SEND ATTR dksz[%llu] blksz[%u] max_xfer[%llu] ops[%llx]\n",
+              pkt->vdisk_size, pkt->vdisk_block_size,
+              pkt->max_xfer_size, pkt->operations);
+
+       return vio_ldc_send(&port->vio, pkt, sizeof(*pkt));
+}
+
+/* Callbacks handed to the common VIO handshake engine. */
+static struct vio_driver_ops vds_vio_ops = {
+       .send_attr              = vds_send_attr,
+       .handle_attr            = vds_handle_attr,
+       .handshake_complete     = vds_handshake_complete,
+};
+
+/* Forward declarations for the reset paths used below. */
+static void vds_reset(struct vio_driver_state *vio);
+static void vds_evt_reset(struct vio_driver_state *vio);
+
+/*
+ * Complete a dring-mode request: mark the descriptor DONE with the IO
+ * status, write it back into the client's ring and send the ack/nack.
+ * Returns the IO result on success, or -ECONNRESET after resetting the
+ * link when the descriptor or the reply cannot be delivered.
+ * Frees io on every path.
+ */
+static int vds_dring_done(struct vds_io *io)
+{
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_dring_data *pkt = io->msgbuf;
+       struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_RX_RING];
+       struct vio_disk_desc *desc;
+       int rv;
+       int idx;
+
+       desc = io->desc_buf;
+       desc->status = io->error;
+       desc->hdr.state = VIO_DESC_DONE;
+
+       vdsdbg(DATA, "DRING DONE [%08llx:%08x:%08x:%02x:%08llx:%08llx]\n",
+              pkt->dring_ident,
+              pkt->start_idx,
+              pkt->end_idx,
+              pkt->state,
+              pkt->seq,
+              port->seq);
+
+       vdsdbg(DATA,
+              "DRING DONE"
+              " [%02x:%02x:%08llx:%02x:%02x:%04d:%08llx:%08llx:%08x]\n",
+              desc->hdr.state,
+              desc->hdr.ack,
+              desc->req_id,
+              desc->operation,
+              desc->slice,
+              desc->status,
+              desc->offset,
+              desc->size,
+              desc->ncookies);
+
+       /* Write the completed descriptor back into the client's ring. */
+       idx = pkt->start_idx;
+       rv = ldc_put_dring_entry(vio->lp, io->desc_buf, dr->entry_size,
+                                 (idx * dr->entry_size), dr->cookies,
+                                 dr->ncookies);
+       if (rv != dr->entry_size)
+               goto reset;
+
+       /*
+        * If we successfully responded to the request (ack or nack),
+        * then return the actual IO operation return value, otherwise
+        * reset the connection.
+        */
+       pkt->tag.stype = io->ack;
+       rv = vio_ldc_send(vio, pkt, sizeof(*pkt));
+       if (rv > 0) {
+               rv = io->error;
+               vds_io_free(io);
+               vdsdbg(DATA, "DRING RET %d\n", rv);
+               return rv;
+       }
+
+reset:
+       vdsmsg(err, "Reset VDS LDC rv[%d]\n", rv);
+       vds_reset(vio);
+       vds_io_free(io);
+
+       vdsdbg(DATA, "DRING RESET\n");
+       return -ECONNRESET;
+}
+
+/*
+ * Complete an in-band (descriptor mode) request: fill in the status,
+ * set ack/nack in the tag and send the message back.  Returns the IO
+ * result, or -ECONNRESET after resetting the link when the reply
+ * cannot be sent.  Frees io on every path.
+ */
+static int vds_desc_done(struct vds_io *io)
+{
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_desc_inband *pkt = io->msgbuf;
+       struct vio_desc_data *hdr = &pkt->hdr;
+       int rv;
+
+       pkt->payload.status = io->error;
+       hdr->tag.stype = io->ack;
+
+       vdsdbg(DATA, "DESC DONE [%02x:%02x:%04x:%08x:%08llx:%08llx:%08llx]\n",
+              hdr->tag.type,
+              hdr->tag.stype,
+              hdr->tag.stype_env,
+              hdr->tag.sid,
+              hdr->desc_handle,
+              hdr->seq,
+              port->seq);
+
+       vdsdbg(DATA, "DESC DONE [%08llx:%02x:%02x:%04d:%08llx:%08llx:%08x]\n",
+              pkt->payload.req_id,
+              pkt->payload.operation,
+              pkt->payload.slice,
+              pkt->payload.status,
+              pkt->payload.offset,
+              pkt->payload.size,
+              pkt->payload.ncookies);
+
+       rv = vio_ldc_send(vio, pkt, io->msglen);
+       if (rv <= 0) {
+               vdsmsg(err, "Reset VDS LDC rv[%d]\n", rv);
+               vds_reset(vio);
+               rv = -ECONNRESET;
+       } else {
+               rv = io->error;
+       }
+
+       vds_io_free(io);
+       return rv;
+}
+
+/*
+ * Point io->desc at the request payload inside io->desc_buf, whose
+ * layout depends on the negotiated transfer mode.  Leaves io->desc
+ * NULL for an unknown mode.
+ */
+static void vds_get_desc(struct vds_io *io)
+{
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_dring_payload *payload = NULL;
+
+       if (port->xfer_mode == VIO_DRING_MODE) {
+               struct vio_disk_desc *d = io->desc_buf;
+
+               /* The dring descriptor's payload starts at req_id. */
+               payload = (struct vio_disk_dring_payload *)&d->req_id;
+               vdsdbg(DATA, "DRING desc[%08llx:%08x:%08llx:%08llx]\n",
+                      payload->size, payload->ncookies,
+                      payload->cookies[0].cookie_addr,
+                      payload->cookies[0].cookie_size);
+       } else if (port->xfer_mode == VIO_DESC_MODE) {
+               struct vio_disk_desc_inband *d = io->desc_buf;
+               int i;
+
+               payload = &d->payload;
+               for (i = 0; i < payload->ncookies; i++)
+                       vdsdbg(DATA, "DESC desc[%08llx:%04x:%08llx:%08llx]\n",
+                              payload->size, payload->ncookies,
+                              payload->cookies[i].cookie_addr,
+                              payload->cookies[i].cookie_size);
+       }
+
+       io->desc = payload;
+}
+
+/*
+ * Bottom half handshake routine.
+ */
+static void vds_bh_hs(struct work_struct *work)
+{
+       struct vds_io *io = container_of(work, struct vds_io, vds_work);
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       int err = 0;
+
+       vdsdbg(HS, "%s\n", port->path);
+
+       /* Handshake work must run in process context, never in irq. */
+       BUG_ON(in_interrupt());
+
+       /* First control message of a session: bring the backend up. */
+       if (io->flags & VDS_IO_INIT)
+               err = vds_be_init(port);
+
+       if (!err)
+               err = vio_control_pkt_engine(vio, port->msgbuf);
+
+       if (err)
+               vdsmsg(err, "%s: handshake failed (%d)\n", port->path, err);
+
+       vds_io_free(io);
+}
+
+/*
+ * Bottom half IO routine.
+ */
+/*
+ * Bottom half IO routine: dispatch a queued client request to the
+ * matching VD_OP_* handler and send the completion according to the
+ * negotiated transfer mode.  Runs in process context.
+ */
+static void vds_bh_io(struct work_struct *work)
+{
+       struct vds_io *io = container_of(work, struct vds_io, vds_work);
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       int err;
+
+       BUG_ON(in_interrupt());
+
+       vds_get_desc(io);
+       BUG_ON(!io->desc);
+
+       io->ack = VIO_SUBTYPE_ACK;
+       io->error = 0;
+
+       switch (io->desc->operation) {
+       case VD_OP_BREAD:
+               err = vd_op_rw(io);
+               break;
+       case VD_OP_BWRITE:
+               io->rw = WRITE;
+               err = vd_op_rw(io);
+               break;
+       case VD_OP_GET_VTOC:
+               err = vd_op_get_vtoc(io);
+               break;
+       case VD_OP_SET_VTOC:
+               err = vd_op_set_vtoc(io);
+               break;
+       case VD_OP_GET_DISKGEOM:
+               err = vd_op_get_geom(io);
+               break;
+       case VD_OP_SET_DISKGEOM:
+               err = vd_op_set_geom(io);
+               break;
+       case VD_OP_GET_EFI:
+               err = vd_op_get_efi(io);
+               break;
+       case VD_OP_SET_EFI:
+               err = vd_op_set_efi(io);
+               break;
+       case VD_OP_FLUSH:
+               err = vd_op_flush(vio);
+               break;
+       default:
+               /* -EOPNOTSUPP, not -ENOTSUPP: the latter is a kernel-
+                * internal NFS errno that should never leave the kernel. */
+               err = -EOPNOTSUPP;
+               break;
+       }
+
+       /* Normalize to a positive errno in the descriptor status. */
+       if (io->ack == VIO_SUBTYPE_ACK && err != 0 && io->error == 0)
+               io->error = err > 0 ? err : -err;
+
+       if (port->xfer_mode == VIO_DRING_MODE)
+               (void) vds_dring_done(io);
+       else if (port->xfer_mode == VIO_DESC_MODE)
+               (void) vds_desc_done(io);
+       else
+               BUG();
+}
+
+/*
+ * Full session reset: tear down the backend, replay the link-reset
+ * state machine, reallocate a minimal message buffer and try to
+ * reconnect the LDC channel.  Runs in process context.
+ */
+static void vds_reset(struct vio_driver_state *vio)
+{
+       struct vds_port *port = to_vds_port(vio);
+       unsigned long flags;
+       int err;
+
+       vdsdbg(HS, "%s\n", port->path);
+
+       BUG_ON(in_interrupt());
+
+       vds_vio_lock(vio, flags);
+       vds_be_fini(port);
+
+       vio_link_state_change(vio, LDC_EVENT_RESET);
+       vio->desc_buf_len = 0;
+
+       port->flags = 0;
+       kfree(port->msgbuf);
+       port->msglen = LDC_PACKET_SIZE;
+       /* NOTE(review): on allocation failure port->msgbuf stays NULL
+        * and no reconnect is attempted — later ldc_read() paths appear
+        * to assume a valid msgbuf; confirm the port is really dead here. */
+       port->msgbuf = kzalloc(port->msglen, GFP_ATOMIC);
+       if (!port->msgbuf) {
+               vdsmsg(err, "%s: kzalloc failed\n", vio->name);
+               goto done;
+       }
+
+       err = ldc_connect(vio->lp);
+       if (err)
+               vdsmsg(warn, "%s: Port %lu connect failed, err=%d\n",
+                        vio->name, vio->vdev->channel_id, err);
+
+done:
+       vds_vio_unlock(vio, flags);
+}
+
+/*
+ * Worker-thread side of an LDC reset event: free the carrier request,
+ * perform the reset, then re-enable the HV interrupt that
+ * vds_evt_reset() disabled.
+ */
+static void vds_bh_reset(struct work_struct *work)
+{
+       struct vds_io *io = container_of(work, struct vds_io, vds_work);
+       struct vio_driver_state *vio = io->vio;
+
+       vds_io_free(io);
+       vds_reset(vio);
+       ldc_enable_hv_intr(vio->lp);
+}
+
+/*
+ * Interrupt-context handler for a dring-mode data message: validate
+ * sequencing and transfer mode, fetch the descriptor from the client's
+ * ring and queue the request for the worker thread.  Returns 0 when
+ * queued, a negative errno otherwise.
+ */
+static int vds_dring_io(struct vio_driver_state *vio)
+{
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_dring_data *pkt = port->msgbuf;
+       struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_RX_RING];
+       struct vio_disk_desc *desc;
+       struct vds_io *io;
+       int reset = 0;
+       int rv;
+       int idx;
+
+       vdsdbg(DATA, "DRING [%08llx:%08x:%08x:%02x:%08llx:%08llx]\n",
+              pkt->dring_ident,
+              pkt->start_idx,
+              pkt->end_idx,
+              pkt->state,
+              pkt->seq,
+              port->seq);
+
+       io = vds_io_alloc(vio, vds_bh_io);
+       if (!io)
+               return -ENOMEM;
+
+       memcpy(io->msgbuf, port->msgbuf, port->msglen);
+
+       /* A sequence gap before any state is consumed just NACKs. */
+       if ((port->flags & VDS_PORT_SEQ) && (pkt->seq != port->seq + 1)) {
+               vdsmsg(err,
+                      "Message out of sequence seq[0x%llx] vds_seq[0x%llx]\n",
+                      pkt->seq, port->seq);
+               goto err;
+       }
+       port->seq = pkt->seq;
+       port->flags |= VDS_PORT_SEQ;
+       /* From here on connection state has been consumed, so any
+        * failure must reset the link rather than NACK. */
+       reset = 1;
+
+       if (port->xfer_mode != VIO_DRING_MODE) {
+               vdsmsg(err, "Invalid xfer mode pkt[0x%x] port[0x%x]\n",
+                      pkt->tag.stype_env, port->xfer_mode);
+               goto err;
+       }
+
+       /* Only single-descriptor requests are supported. */
+       idx = pkt->start_idx;
+       if (idx != pkt->end_idx) {
+               vdsmsg(err,
+                      "Invalid idx start[%d] end[%d]\n", idx, pkt->end_idx);
+               goto err;
+       }
+
+       rv = ldc_get_dring_entry(vio->lp, io->desc_buf, dr->entry_size,
+                                 (idx * dr->entry_size), dr->cookies,
+                                 dr->ncookies);
+       if (rv != dr->entry_size)
+               goto err;
+
+       desc = (struct vio_disk_desc *)io->desc_buf;
+
+       vdsdbg(DATA,
+              "DRING [%02x:%02x:%08llx:%02x:%02x:%04d:%08llx:%08llx:%08x]\n",
+              desc->hdr.state,
+              desc->hdr.ack,
+              desc->req_id,
+              desc->operation,
+              desc->slice,
+              desc->status,
+              desc->offset,
+              desc->size,
+              desc->ncookies);
+
+       /*
+        * Queue the request.
+        */
+       if (desc->hdr.state == VIO_DESC_READY) {
+               vds_io_enq(io);
+               return 0;
+       }
+
+err:
+       if (reset) {
+               vdsmsg(err, "Reset VDS LDC\n");
+               vds_io_free(io);
+               vds_evt_reset(vio);
+               rv = -ECONNRESET;
+       } else {
+               vdsmsg(err, "NACK request io=%p\n", io);
+               io->ack = VIO_SUBTYPE_NACK;
+               io->error = 0;
+               rv = vds_dring_done(io);
+       }
+       return rv;
+}
+
+/*
+ * Interrupt-context handler for an in-band (descriptor mode) data
+ * message: validate sequencing and transfer mode, then queue the
+ * request for the worker thread.  Returns 0 when queued, a negative
+ * errno otherwise.
+ */
+static int vds_desc_io(struct vio_driver_state *vio, int msglen)
+{
+       struct vds_port *port = to_vds_port(vio);
+       struct vio_disk_desc_inband *pkt = port->msgbuf;
+       struct vio_desc_data *hdr = &pkt->hdr;
+       struct vds_io *io;
+       int rv;
+
+       vdsdbg(DATA, "DESC [%02x:%02x:%04x:%08x:%08llx:%08llx:%08llx]\n",
+              hdr->tag.type,
+              hdr->tag.stype,
+              hdr->tag.stype_env,
+              hdr->tag.sid,
+              hdr->desc_handle,
+              hdr->seq,
+              port->seq);
+
+       vdsdbg(DATA, "DESC [%08llx:%02x:%02x:%04d:%08llx:%08llx:%08x]\n",
+              pkt->payload.req_id,
+              pkt->payload.operation,
+              pkt->payload.slice,
+              pkt->payload.status,
+              pkt->payload.offset,
+              pkt->payload.size,
+              pkt->payload.ncookies);
+
+       io = vds_io_alloc(vio, vds_bh_io);
+       if (!io)
+               return -ENOMEM;
+
+       memcpy(io->msgbuf, port->msgbuf, msglen);
+
+       /* Sequence gaps are logged but tolerated; see the XXX below. */
+       if ((port->flags & VDS_PORT_SEQ) && (hdr->seq != port->seq + 1)) {
+               vdsmsg(err,
+                      "Message out of sequence seq[0x%llx] vds_seq[0x%llx]\n",
+                      hdr->seq, port->seq);
+#if 0
+               /* XXX OBP seems to send out of sequence messages */
+               goto nack;
+#endif
+       }
+       port->seq = hdr->seq;
+       port->flags |= VDS_PORT_SEQ;
+
+       if (port->xfer_mode != VIO_DESC_MODE) {
+               vdsmsg(err, "Invalid xfer mode pkt[0x%x] port[0x%x]\n",
+                      hdr->tag.stype_env, port->xfer_mode);
+               goto nack;
+       }
+
+       /*
+        * Queue the request.
+        */
+       memcpy(io->desc_buf, port->msgbuf, msglen);
+       io->msglen = msglen;
+       vds_io_enq(io);
+
+       return 0;
+
+nack:
+       io->ack = VIO_SUBTYPE_NACK;
+       io->error = 0;
+       rv = vds_desc_done(io);
+       return rv;
+}
+
+/*
+ * Interrupt-context LDC reset event: mask the HV interrupt and hand
+ * the actual reset to a worker (vds_bh_reset), which re-enables it.
+ * NOTE(review): if vds_io_alloc() fails the reset is silently dropped
+ * and the interrupt stays enabled — confirm this is recoverable.
+ */
+static void vds_evt_reset(struct vio_driver_state *vio)
+{
+       struct vds_io *io;
+
+       vdsdbg(HS, "\n");
+
+       BUG_ON(!in_interrupt());
+
+       io = vds_io_alloc(vio, vds_bh_reset);
+       if (!io)
+               return;
+
+       ldc_disable_hv_intr(vio->lp);
+       io->flags |= VDS_IO_FINI;
+
+       vds_io_enq(io);
+}
+
+/* Interrupt-context LDC "link up" event. */
+static void vds_evt_up(struct vio_driver_state *vio)
+{
+       BUG_ON(!in_interrupt());
+
+       vio_link_state_change(vio, LDC_EVENT_UP);
+       /* this is needed in dring mode */
+       vio->dr_state &= ~VIO_DR_STATE_RXREQ;
+}
+
+/*
+ * Interrupt-context control message: queue handshake processing to a
+ * worker thread.  The very first control message of a session also
+ * triggers backend initialization (VDS_IO_INIT).
+ * Returns 0 when queued, -ENOMEM otherwise.
+ */
+static int
+vds_evt_ctl(struct vio_driver_state *vio)
+{
+       struct vds_io *io;
+
+       BUG_ON(!in_interrupt());
+
+       io = vds_io_alloc(vio, vds_bh_hs);
+       if (!io)
+               return -ENOMEM;
+
+       if (vio->hs_state == VIO_HS_INVALID)
+               io->flags |= VDS_IO_INIT;
+
+       vds_io_enq(io);
+
+       return 0;
+}
+
+static void vds_evt_data(struct vio_driver_state *vio)
+{
+       int rv;
+       int msglen;
+       struct vio_msg_tag *tag;
+       struct vds_port *port = to_vds_port(vio);
+
+       BUG_ON(!in_interrupt());
+
+       while (1) {
+               rv = ldc_read(vio->lp, port->msgbuf, port->msglen);
+               vdsdbg(DATA, "ldc_read(%d)=%d\n", port->msglen, rv);
+               if (rv < 0) {
+                       if (rv == -ECONNRESET)
+                               vds_evt_reset(vio);
+                       break;
+               }
+               if (rv == 0)
+                       break;
+               tag = port->msgbuf;
+               vdsdbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
+                      tag->type,
+                      tag->stype,
+                      tag->stype_env,
+                      tag->sid);
+               msglen = rv;
+               rv = vio_validate_sid(vio, tag);
+               if (rv < 0)
+                       break;
+               switch (tag->type) {
+               case VIO_TYPE_CTRL:
+                       /*
+                        * This is needed in dring mode.
+                        */
+                       if (tag->stype == VIO_SUBTYPE_INFO &&
+                           tag->stype_env == VIO_DRING_REG)
+                               vio->dr_state |= VIO_DR_STATE_RXREQ;
+                       rv = vds_evt_ctl(vio);
+                       break;
+               case VIO_TYPE_DATA:
+                       switch (tag->stype) {
+                       case VIO_SUBTYPE_INFO:
+                               switch (tag->stype_env) {
+                               case VIO_DRING_DATA:
+                                       rv = vds_dring_io(vio);
+                                       break;
+                               case VIO_DESC_DATA:
+                                       rv = vds_desc_io(vio, msglen);
+                                       break;
+                               default:
+                                       rv = -EINVAL;
+                                       break;
+                               }
+                               break;
+                       default:
+                               rv = vds_handle_unknown(port);
+                               break;
+                       }
+                       break;
+               default:
+                       rv = vds_handle_unknown(port);
+                       break;
+               }
+               if (rv < 0)
+                       break;
+       }
+}
+
+static void vds_event(void *arg, int event)
+{
+       unsigned long flags;
+       struct vds_port *port = arg;
+       struct vio_driver_state *vio = &port->vio;
+
+       vdsdbg(DATA, "event=%d cpu=%d\n", event, smp_processor_id());
+
+       vds_vio_lock(vio, flags);
+
+       switch (event) {
+       case LDC_EVENT_RESET:
+               vds_evt_reset(vio);
+               break;
+       case LDC_EVENT_UP:
+               vds_evt_up(vio);
+               break;
+       case LDC_EVENT_DATA_READY:
+               vds_evt_data(vio);
+               break;
+       default:
+               vdsmsg(warn, "Unexpected LDC event %d\n", event);
+               break;
+       }
+
+       vds_vio_unlock(vio, flags);
+}
+
+static struct ldc_channel_config vds_ldc_cfg = {
+       .event          = vds_event,
+       .mtu            = 64,
+       .mode           = LDC_MODE_UNRELIABLE,
+};
+
+static ssize_t vds_sysfs_path_show(struct device *device,
+       struct device_attribute *attr, char *buf)
+{
+       int rv;
+       unsigned long flags;
+       struct vds_port *port = dev_get_drvdata(device);
+       struct vio_driver_state *vio = &port->vio;
+
+       vds_vio_lock(vio, flags);
+       rv = scnprintf(buf, PAGE_SIZE, "%s\n", port->path);
+       vds_vio_unlock(vio, flags);
+
+       return rv;
+}
+
+static DEVICE_ATTR(path, S_IRUSR, vds_sysfs_path_show, NULL);
+
+static struct attribute *vds_sysfs_entries[] = {
+       &dev_attr_path.attr,
+       NULL
+};
+
+static struct attribute_group vds_attribute_group = {
+       .name = NULL,   /* put in device directory */
+       .attrs = vds_sysfs_entries,
+};
+
+static void print_version(void)
+{
+       printk_once(KERN_INFO "%s", version);
+}
+
+static int vds_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
+{
+       struct mdesc_handle *hp;
+       struct vds_port *port;
+       struct vio_driver_state *vio;
+       const char *path;
+       u64 node;
+       int err;
+
+       print_version();
+
+       port = kzalloc(sizeof(*port), GFP_KERNEL);
+       if (!port) {
+               vdsmsg(err, "Cannot allocate vds_port.\n");
+               return -ENOMEM;
+       }
+
+       port->msglen = LDC_PACKET_SIZE;
+       port->msgbuf = kzalloc(port->msglen, GFP_KERNEL);
+       if (!port->msgbuf) {
+               err = -ENOMEM;
+               goto free_port;
+       }
+
+       vio = &port->vio;
+
+       err = vio_driver_init(vio, vdev, VDEV_DISK_SERVER,
+                             vds_versions, ARRAY_SIZE(vds_versions),
+                             &vds_vio_ops, (char *)dev_name(&vdev->dev));
+       if (err)
+               goto free_msgbuf;
+
+       vio->debug = vds_dbg_vio;
+       vds_ldc_cfg.debug = vds_dbg_ldc;
+
+       err = vio_ldc_alloc(vio, &vds_ldc_cfg, port);
+       if (err)
+               goto free_msgbuf;
+
+       hp = mdesc_grab();
+
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               err = -ENXIO;
+               mdesc_release(hp);
+               goto free_ldc;
+       }
+
+       path = mdesc_get_property(hp, node, "vds-block-device", NULL);
+       if (!path) {
+               err = -ENXIO;
+               mdesc_release(hp);
+               goto free_ldc;
+       }
+       port->path = kstrdup(path, GFP_KERNEL);
+       mdesc_release(hp);
+       vdsdbg(INIT, "path=%s\n", path);
+       port->vtoc = kzalloc(roundup(sizeof(*port->vtoc), 8), GFP_KERNEL);
+       port->geom = kzalloc(roundup(sizeof(*port->geom), 8), GFP_KERNEL);
+       port->part = kzalloc(sizeof(*port->part) * VDS_MAXPART, GFP_KERNEL);
+
+       /*
+        * The io and reset work queues are separate because the
+        * io work queue is flushed during reset which would hang
+        * if reset itself was scheduled on the io queue.
+        */
+       port->ioq = alloc_workqueue("vds_io", WQ_UNBOUND, vds_wq);
+       port->rtq = alloc_ordered_workqueue("vds_reset", 0);
+       if (!port->ioq || !port->rtq) {
+               err = -ENXIO;
+               goto free_path;
+       }
+
+       mutex_init(&port->label_lock);
+
+       dev_set_drvdata(&vdev->dev, port);
+
+       err = sysfs_create_group(&vdev->dev.kobj, &vds_attribute_group);
+       if (err)
+               goto free_path;
+
+       vio_port_up(vio);
+
+       return 0;
+
+free_path:
+       kfree(port->path);
+       kfree(port->vtoc);
+       kfree(port->geom);
+       kfree(port->part);
+
+free_ldc:
+       vio_ldc_free(vio);
+
+free_msgbuf:
+       kfree(port->msgbuf);
+
+free_port:
+       kfree(port);
+
+       return err;
+}
+
+static int vds_port_remove(struct vio_dev *vdev)
+{
+       struct vds_port *port = dev_get_drvdata(&vdev->dev);
+       struct vio_driver_state *vio = &port->vio;
+
+       if (!port)
+               return 0;
+
+       del_timer_sync(&vio->timer);
+       ldc_disconnect(vio->lp);        /* XXX vds_port_down() */
+       vio_ldc_free(vio);
+       sysfs_remove_group(&vdev->dev.kobj, &vds_attribute_group);
+       dev_set_drvdata(&vdev->dev, NULL);
+
+       mutex_destroy(&port->label_lock);
+       kfree(port->path);
+       kfree(port->msgbuf);
+       kfree(port->vtoc);
+       kfree(port->geom);
+       kfree(port->part);
+       kfree(port);
+
+       return 0;
+}
+
+static const struct vio_device_id vds_port_match[] = {
+       {
+               .type = "vds-port",
+       },
+       {},
+};
+
+static struct vio_driver vds_port_driver = {
+       .id_table       = vds_port_match,
+       .probe          = vds_port_probe,
+       .remove         = vds_port_remove,
+       .name           = "vds_port",
+};
+
+static int __init vds_init(void)
+{
+       int rv;
+
+       rv = vds_io_init();
+       if (!rv) {
+               rv = vio_register_driver(&vds_port_driver);
+               if (rv < 0)
+                       vds_io_fini();
+       }
+
+       return rv;
+}
+
+static void __exit vds_exit(void)
+{
+       vio_unregister_driver(&vds_port_driver);
+       vds_io_fini();
+}
+
+module_init(vds_init);
+module_exit(vds_exit);
diff --git a/drivers/block/vds/vds_reg.c b/drivers/block/vds/vds_reg.c
new file mode 100644 (file)
index 0000000..b790f99
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * vds_reg.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+
+static int vds_reg_init(struct vds_port *port)
+{
+       struct file *file;
+
+       file = filp_open(port->path, O_RDWR | O_EXCL | O_LARGEFILE, 0);
+       if (IS_ERR(file))
+               return (int)PTR_ERR(file);
+
+       port->vdisk_bsize = 512;
+       port->vdisk_size = i_size_read(file_inode(file)) /
+                                      port->vdisk_bsize;
+       port->max_xfer_size = 1024;
+
+       port->be_data = file;
+
+       return 0;
+}
+
+static void vds_reg_fini(struct vds_port *port)
+{
+       struct file *file = port->be_data;
+
+       if (file)
+               filp_close(file, NULL);
+}
+
+static int vds_reg_rw(struct vds_io *io)
+{
+       loff_t off;
+       ssize_t iosz;
+       void *addr;
+       struct vio_driver_state *vio = io->vio;
+       struct vds_port *port = to_vds_port(vio);
+       struct file *file = port->be_data;
+
+       vdsdbg(FIO, "(0x%p, %lld, %ld, %d)\n", io->pages, io->size,
+              io->offset, io->rw);
+
+       if (file == NULL) {
+               vdsmsg(err, "NULL file pointer for IO\n");
+               return -EIO;
+       }
+
+       addr = page_address(io->pages);
+       off = to_bytes(io->offset);
+
+       if (io->rw & WRITE)
+               iosz = file->f_op->write(file, addr, io->size, &off);
+       else
+               iosz = file->f_op->read(file, addr, io->size, &off);
+
+       if (iosz != io->size) {
+               vdsmsg(err, "file IO failed: iosz=%ld\n", iosz);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int vds_reg_flush(struct vds_port *port)
+{
+       struct file *file = port->be_data;
+
+       return vfs_fsync(file, 0);
+}
+
+struct vds_be_ops vds_reg_ops = {
+       vds_reg_init,
+       vds_reg_fini,
+       vds_reg_rw,
+       vds_reg_flush,
+};
+
+struct vds_be_ops *vds_reg_get_ops()
+{
+       return &vds_reg_ops;
+}
diff --git a/drivers/block/vds/vds_vtoc.c b/drivers/block/vds/vds_vtoc.c
new file mode 100644 (file)
index 0000000..06b692a
--- /dev/null
@@ -0,0 +1,427 @@
+/*
+ * vds_vtoc.c: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include "vds.h"
+#include "vds_io.h"
+#include "vds_vtoc.h"
+
+/*
+ * By Solaris convention, slice/partition 2 represents the entire disk;
+ * unfortunately, this convention does not appear to be codified.
+ */
+#define        VDS_ENTIRE_DISK_SLICE   2
+
+/* Number of backup labels */
+#define        VDS_DSKIMG_NUM_BACKUP   5
+
+static unsigned short vds_lbl2cksum(struct dk_label *label)
+{
+       int count;
+       unsigned short sum, *sp;
+
+       count = (sizeof(struct dk_label)) / (sizeof(short)) - 1;
+       sp = (unsigned short *)label;
+       sum = 0;
+       while (count--)
+               sum ^= *sp++;
+
+       return sum;
+}
+
+static void
+vds_vtoc_update_part(struct vds_port *port, struct dk_label *label)
+{
+       int i;
+
+       vds_label_clear_part(port);
+
+       for (i = 0; i < port->npart; i++) {
+               port->part[i].start = label->dkl_map[i].dkl_cylno *
+                                     label->dkl_nhead * label->dkl_nsect;
+               port->part[i].size = label->dkl_map[i].dkl_nblk;
+       }
+}
+
+/*
+ * Function:
+ *     vd_get_readable_size
+ *
+ * Description:
+ *     Convert a given size in bytes to a human readable format in
+ *     kilobytes, megabytes, gigabytes or terabytes.
+ *
+ * Parameters:
+ *     full_size       - the size to convert in bytes.
+ *     size            - the converted size.
+ *     unit            - the unit of the converted size: 'K' (kilobyte),
+ *                       'M' (Megabyte), 'G' (Gigabyte), 'T' (Terabyte).
+ *
+ * Return Code:
+ *     none
+ */
+static void vd_get_readable_size(size_t full_size, size_t *size, char *unit)
+{
+       if (full_size < (1ULL << 20)) {
+               *size = full_size >> 10;
+               *unit = 'K'; /* Kilobyte */
+       } else if (full_size < (1ULL << 30)) {
+               *size = full_size >> 20;
+               *unit = 'M'; /* Megabyte */
+       } else if (full_size < (1ULL << 40)) {
+               *size = full_size >> 30;
+               *unit = 'G'; /* Gigabyte */
+       } else {
+               *size = full_size >> 40;
+               *unit = 'T'; /* Terabyte */
+       }
+}
+
+/*
+ * Set the default label for a given disk size. This is used when the disk
+ * does not have a valid VTOC so that the user can get a valid default
+ * configuration. The default label has all slice sizes set to 0 (except
+ * slice 2 which is the entire disk) to force the user to write a valid
+ * label onto the disk image.
+ */
+static void vds_vtoc_set_default(struct vds_port *port, struct dk_label *label)
+{
+       char unit;
+       size_t size;
+       size_t bsize = port->vdisk_bsize;
+       size_t disk_size = port->vdisk_size * bsize;
+       struct vio_driver_state *vio = &port->vio;
+
+       memset(label, 0, sizeof(struct dk_label));
+
+       /*
+        * Ideally we would like the cylinder size (nsect * nhead) to be the
+        * same whatever the disk size is. That way the VTOC label could be
+        * easily updated in case the disk size is increased (keeping the
+        * same cylinder size allows to preserve the existing partitioning
+        * when updating the VTOC label). But it is not possible to have
+        * a fixed cylinder size and to cover all disk size.
+        *
+        * So we define different cylinder sizes depending on the disk size.
+        * The cylinder size is chosen so that we don't have too few cylinders
+        * for a small disk image, or so many on a big disk image that you
+        * waste space for backup superblocks or cylinder group structures.
+        * Also we must have a resonable number of cylinders and sectors so
+        * that newfs can run using default values.
+        *
+        *      +-----------+--------+---------+--------+
+        *      | disk_size |  < 2MB | 2MB-4GB | >= 8GB |
+        *      +-----------+--------+---------+--------+
+        *      | nhead     |    1   |     1   |    96  |
+        *      | nsect     |  200   |   600   |   768  |
+        *      +-----------+--------+---------+--------+
+        *
+        * Other parameters are computed from these values:
+        *
+        *      pcyl = disk_size / (nhead * nsect * 512)
+        *      acyl = (pcyl > 2)? 2 : 0
+        *      ncyl = pcyl - acyl
+        *
+        * The maximum number of cylinder is 65535 so this allows to define a
+        * geometry for a disk size up to 65535 * 96 * 768 * 512 = 2.24 TB
+        * which is more than enough to cover the maximum size allowed by the
+        * extended VTOC format (2TB).
+        */
+
+       if (disk_size >= 8 * ONE_GIGABYTE) {
+
+               label->dkl_nhead = 96;
+               label->dkl_nsect = 768;
+
+       } else if (disk_size >= 2 * ONE_MEGABYTE) {
+
+               label->dkl_nhead = 1;
+               label->dkl_nsect = 600;
+
+       } else {
+
+               label->dkl_nhead = 1;
+               label->dkl_nsect = 200;
+       }
+
+       label->dkl_pcyl = disk_size /
+           (label->dkl_nsect * label->dkl_nhead * bsize);
+
+       if (label->dkl_pcyl == 0)
+               label->dkl_pcyl = 1;
+
+       label->dkl_acyl = 0;
+
+       if (label->dkl_pcyl > 2)
+               label->dkl_acyl = 2;
+
+       label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
+       label->dkl_write_reinstruct = 0;
+       label->dkl_read_reinstruct = 0;
+       label->dkl_rpm = 7200;
+       label->dkl_apc = 0;
+       label->dkl_intrlv = 0;
+
+       vdsdbg(IOC, "requested disk size: %ld bytes\n", disk_size);
+       vdsdbg(IOC, "setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
+              label->dkl_nhead, label->dkl_nsect);
+       vdsdbg(IOC, "provided disk size: %lld bytes\n", (uint64_t)
+              (label->dkl_pcyl * label->dkl_nhead *
+              label->dkl_nsect * bsize));
+
+       vd_get_readable_size(disk_size, &size, &unit);
+
+       /*
+        * We must have a correct label name otherwise format(1m) will
+        * not recognized the disk as labeled.
+        */
+       (void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
+           "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
+           size, unit,
+           label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
+           label->dkl_nsect);
+
+       /* default VTOC */
+       label->dkl_vtoc.v_version = V_EXTVERSION;
+       label->dkl_vtoc.v_nparts = V_NUMPAR;
+       label->dkl_vtoc.v_sanity = VTOC_SANE;
+       label->dkl_vtoc.v_part[VDS_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
+       label->dkl_map[VDS_ENTIRE_DISK_SLICE].dkl_cylno = 0;
+       label->dkl_map[VDS_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl *
+           label->dkl_nhead * label->dkl_nsect;
+       label->dkl_magic = DKL_MAGIC;
+       label->dkl_cksum = vds_lbl2cksum(label);
+}
+
+/*
+ * Get the disk label.  If the type is unknown, initialize a default label.
+ */
+static int vds_vtoc_get_label(struct vds_port *port, struct dk_label **lp)
+{
+       int rv = -EIO;
+       struct dk_label *label = (struct dk_label *)port->label;
+       struct vio_driver_state *vio = &port->vio;
+
+       rv = vds_read(port, label, 0, DK_LABEL_SIZE);
+       if (rv)
+               return rv;
+
+       if (label->dkl_magic != DKL_MAGIC) {
+               vdsdbg(IOC, "bad VTOC label magic %04x\n", label->dkl_magic);
+               if (port->label_type == VDS_LABEL_NONE) {
+                       vds_vtoc_set_default(port, label);
+                       rv = -EINVAL;
+               }
+       } else if (label->dkl_cksum != vds_lbl2cksum(label)) {
+               vdsmsg(err, "bad VTOC label checksum\n");
+       } else {
+               vdsdbg(IOC, "VTOC magic=%04x\n", label->dkl_magic);
+               vdsdbg(IOC, "ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
+                      label->dkl_nhead, label->dkl_nsect);
+               rv = 0;
+       }
+
+       if (rv != 0 && rv != -EINVAL)
+               label = NULL;
+
+       *lp = label;
+
+       return rv;
+}
+
+static void
+vds_vtoc_l2g(struct dk_label *label, struct vio_disk_geom *geom)
+{
+       geom->num_cyl = label->dkl_ncyl;
+       geom->alt_cyl = label->dkl_acyl;
+       geom->num_hd = label->dkl_nhead;
+       geom->num_sec = label->dkl_nsect;
+       geom->ifact = label->dkl_intrlv;
+       geom->apc = label->dkl_apc;
+       geom->rpm = label->dkl_rpm;
+       geom->phy_cyl = label->dkl_pcyl;
+       geom->rd_skip = label->dkl_read_reinstruct;
+       geom->wr_skip = label->dkl_write_reinstruct;
+}
+
+static void
+vds_vtoc_g2l(struct vio_disk_geom *geom, struct dk_label *label)
+{
+       label->dkl_ncyl = geom->num_cyl;
+       label->dkl_acyl = geom->alt_cyl;
+       label->dkl_nhead = geom->num_hd;
+       label->dkl_nsect = geom->num_sec;
+       label->dkl_intrlv = geom->ifact;
+       label->dkl_apc = geom->apc;
+       label->dkl_rpm = geom->rpm;
+       label->dkl_pcyl = geom->phy_cyl;
+       label->dkl_read_reinstruct = geom->rd_skip;
+       label->dkl_write_reinstruct = geom->wr_skip;
+       label->dkl_cksum = vds_lbl2cksum(label);
+}
+
+/*
+ * Get the disk VTOC.  If there is no valid label,
+ * set a default VTOC.
+ */
+/*ARGSUSED*/
+int vds_vtoc_get(struct vds_port *port)
+{
+       int i, rv;
+       struct dk_label *label;
+       struct vio_disk_vtoc *vtoc = port->vtoc;
+
+       rv = vds_vtoc_get_label(port, &label);
+       if (!label)
+               return rv;
+
+       memcpy(vtoc->volume_name, label->dkl_vtoc.v_volume,
+              VIO_DISK_VNAME_LEN);
+       memcpy(vtoc->ascii_label, label->dkl_asciilabel, LEN_DKL_ASCII);
+       vtoc->sector_size = 512;
+       vtoc->num_partitions = label->dkl_vtoc.v_nparts;
+
+       for (i = 0; i < vtoc->num_partitions; i++) {
+               vtoc->partitions[i].id = label->dkl_vtoc.v_part[i].p_tag;
+               vtoc->partitions[i].perm_flags =
+                   label->dkl_vtoc.v_part[i].p_flag;
+               vtoc->partitions[i].start_block =
+                   label->dkl_map[i].dkl_cylno *
+                   label->dkl_nhead * label->dkl_nsect;
+               vtoc->partitions[i].num_blocks = label->dkl_map[i].dkl_nblk;
+       }
+
+       vds_vtoc_l2g(label, port->geom);
+
+       /*
+        * Always update the cached copy, in case this is
+        * a shared disk and the label has been updated.
+        */
+       if (!rv) {
+               port->label_type = VDS_LABEL_VTOC;
+               port->npart = label->dkl_vtoc.v_nparts;
+               vds_vtoc_update_part(port, label);
+       }
+
+       return rv;
+}
+
+static int
+vds_vtoc_set_backup(struct vds_port *port, struct dk_label *label, bool clear)
+{
+       int rv;
+       sector_t blk, sec, cyl, head, cnt, nsect;
+       struct vio_driver_state *vio = &port->vio;
+
+       /*
+        * Backup labels are on the last alternate cylinder's
+        * first five odd sectors.
+        */
+       if (label->dkl_acyl == 0) {
+               vdsdbg(IOC, "no alt cylinder, cannot store backup labels");
+               return 0;
+       }
+
+       cyl = label->dkl_ncyl  + label->dkl_acyl - 1;
+       head = label->dkl_nhead - 1;
+       nsect = label->dkl_nsect;
+
+       blk = (cyl * ((label->dkl_nhead * nsect) - label->dkl_apc)) +
+           (head * nsect);
+
+       if (clear == true)
+               memset(label, 0, sizeof(*label));
+
+       /*
+        * Write the backup labels. Make sure we don't try to write past
+        * the last cylinder.
+        */
+       sec = 1;
+
+       for (cnt = 0; cnt < VDS_DSKIMG_NUM_BACKUP; cnt++) {
+
+               if (sec >= nsect) {
+                       vdsdbg(IOC, "not enough sectors for backup labels");
+                       return 0;
+               }
+
+               rv = vds_write(port, label, blk + sec, DK_LABEL_SIZE);
+               if (rv) {
+                       vdsdbg(IOC, "error writing label at block %lu\n rv=%d",
+                              blk + sec, rv);
+                       return rv;
+               }
+
+               vdsdbg(IOC, "wrote backup label at block %lu\n", blk + sec);
+               vdsdbg(IOC, "ncyl=%d nhead=%d nsec=%d\n",
+                      label->dkl_pcyl, label->dkl_nhead, label->dkl_nsect);
+
+               sec += 2;
+       }
+
+       return 0;
+}
+
+int vds_vtoc_set(struct vds_port *port, struct vio_disk_vtoc *vtoc)
+{
+       int i, rv;
+       struct dk_label *label;
+
+       rv = vds_vtoc_get_label(port, &label);
+       if (!label)
+               return rv;
+
+       vds_vtoc_g2l(port->geom, label);
+
+       memcpy(label->dkl_vtoc.v_volume, vtoc->volume_name,
+              VIO_DISK_VNAME_LEN);
+       memcpy(label->dkl_asciilabel, vtoc->ascii_label, LEN_DKL_ASCII);
+       label->dkl_vtoc.v_nparts = vtoc->num_partitions;
+
+       for (i = 0; i < vtoc->num_partitions; i++) {
+               label->dkl_vtoc.v_part[i].p_tag = vtoc->partitions[i].id;
+               label->dkl_vtoc.v_part[i].p_flag =
+                   vtoc->partitions[i].perm_flags;
+               label->dkl_map[i].dkl_cylno = vtoc->partitions[i].start_block /
+                   (label->dkl_nhead * label->dkl_nsect);
+               label->dkl_map[i].dkl_nblk = vtoc->partitions[i].num_blocks;
+       }
+
+       label->dkl_cksum = vds_lbl2cksum(label);
+
+       rv = vds_write(port, label, 0, DK_LABEL_SIZE);
+
+       if (!rv) {
+               port->label_type = VDS_LABEL_VTOC;
+               port->npart = label->dkl_vtoc.v_nparts;
+               vds_vtoc_update_part(port, label);
+       }
+
+       /*
+        * There is no need to return an error for backups
+        * since the primary succeeded.
+        */
+       (void) vds_vtoc_set_backup(port, label, false);
+
+       return rv;
+}
+
+int vds_vtoc_clear(struct vds_port *port)
+{
+       int rv;
+       struct dk_label *label;
+
+       rv = vds_vtoc_get_label(port, &label);
+       if (!label)
+               return rv;
+
+       rv = vds_clear(port, 0, DK_LABEL_SIZE);
+       if (!rv) {
+               vds_label_reset(port);
+               (void) vds_vtoc_set_backup(port, label, true);
+       }
+
+       return rv;
+}
diff --git a/drivers/block/vds/vds_vtoc.h b/drivers/block/vds/vds_vtoc.h
new file mode 100644 (file)
index 0000000..12f1e90
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * vds_vtoc.h: LDOM Virtual Disk Server.
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ *
+ * Format of a Sun disk label.
+ * Resides in cylinder 0, head 0, sector 0.
+ *
+ * From Solaris dklabel.h
+ *
+ */
+
+#define        NDKMAP          8               /* # of logical partitions */
+#define        DKL_MAGIC       0xDABE          /* magic number */
+#define        LEN_DKL_ASCII   128             /* length of dkl_asciilabel */
+#define        LEN_DKL_VVOL    8               /* length of v_volume */
+
+
+/*
+ * partition headers:  section 1
+ * Fixed size for on-disk dk_label
+ */
+struct dk_map32 {
+       uint32_t        dkl_cylno;      /* starting cylinder */
+       uint32_t        dkl_nblk;       /* number of blocks;  if == 0, */
+                                       /* partition is undefined */
+};
+
+/*
+ * partition headers:  section 2,
+ * brought over from AT&T SVr4 vtoc structure.
+ */
+struct dk_map2 {
+       uint16_t        p_tag;          /* ID tag of partition */
+       uint16_t        p_flag;         /* permission flag */
+};
+
+/*
+ * VTOC inclusions from AT&T SVr4
+ * Fixed sized types for on-disk VTOC
+ */
+struct dk_vtoc {
+       uint32_t        v_version;              /* layout version */
+       char            v_volume[LEN_DKL_VVOL]; /* volume name */
+       uint16_t        v_nparts;               /* number of partitions  */
+       struct dk_map2  v_part[NDKMAP];         /* partition hdrs, sec 2 */
+       uint32_t        v_bootinfo[3];          /* info needed by mboot */
+       uint32_t        v_sanity;               /* to verify vtoc sanity */
+       uint32_t        v_reserved[10];         /* free space */
+       int32_t         v_timestamp[NDKMAP];    /* partition timestamp */
+};
+
+/*
+ * define the amount of disk label padding needed to make
+ * the entire structure occupy 512 bytes.
+ */
+#define        LEN_DKL_PAD     (DK_LABEL_SIZE \
+                           - ((LEN_DKL_ASCII) + \
+                           (sizeof(struct dk_vtoc)) + \
+                           (sizeof(struct dk_map32)  * NDKMAP) + \
+                           (14 * (sizeof(uint16_t))) + \
+                           (2 * (sizeof(uint16_t)))))
+
+struct dk_label {
+       char            dkl_asciilabel[LEN_DKL_ASCII]; /* for compatibility */
+       struct dk_vtoc  dkl_vtoc;       /* vtoc inclusions from AT&T SVr4 */
+       uint16_t        dkl_write_reinstruct;   /* # sectors to skip, writes */
+       uint16_t        dkl_read_reinstruct;    /* # sectors to skip, reads */
+       char            dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */
+       uint16_t        dkl_rpm;        /* rotations per minute */
+       uint16_t        dkl_pcyl;       /* # physical cylinders */
+       uint16_t        dkl_apc;        /* alternates per cylinder */
+       uint16_t        dkl_obs1;       /* obsolete */
+       uint16_t        dkl_obs2;       /* obsolete */
+       uint16_t        dkl_intrlv;     /* interleave factor */
+       uint16_t        dkl_ncyl;       /* # of data cylinders */
+       uint16_t        dkl_acyl;       /* # of alternate cylinders */
+       uint16_t        dkl_nhead;      /* # of heads in this partition */
+       uint16_t        dkl_nsect;      /* # of sectors per track */
+       uint16_t        dkl_obs3;       /* obsolete */
+       uint16_t        dkl_obs4;       /* obsolete */
+       struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */
+       uint16_t        dkl_magic;      /* identifies this label format */
+       uint16_t        dkl_cksum;      /* xor checksum of sector */
+};
+
+#define        V_NUMPAR        NDKMAP          /* The number of partitions */
+                                       /* (from dkio.h) */
+
+#define        VTOC_SANE       0x600DDEEE      /* Indicates a sane VTOC */
+#define        V_VERSION       0x01            /* layout version number */
+#define        V_EXTVERSION    V_VERSION       /* extvtoc layout version number */
+
+/*
+ * Partition identification tags
+ */
+#define        V_UNASSIGNED    0x00            /* unassigned partition */
+#define        V_BOOT          0x01            /* Boot partition */
+#define        V_ROOT          0x02            /* Root filesystem */
+#define        V_SWAP          0x03            /* Swap filesystem */
+#define        V_USR           0x04            /* Usr filesystem */
+#define        V_BACKUP        0x05            /* full disk */
+#define        V_STAND         0x06            /* Stand partition */
+#define        V_VAR           0x07            /* Var partition */
+#define        V_HOME          0x08            /* Home partition */
+#define        V_ALTSCTR       0x09            /* Alternate sector partition */
+#define        V_CACHE         0x0a            /* Obsolete (was for cachefs) */
+
+/* The following partition identification tags apply to EFI/GPT labels only */
+#define        V_RESERVED      0x0b            /* SMI reserved data */
+#define        V_SYSTEM        0x0c            /* EFI/GPT system partition */
+#define        V_BIOS_BOOT     0x18            /* BIOS Boot partition */
+
+#define        V_UNKNOWN       0xff            /* Unknown partition */
+
+/*
+ * Partition permission flags
+ */
+#define        V_UNMNT         0x01            /* Unmountable partition */
+#define        V_RONLY         0x10            /* Read only */
index a4af8221751e4d6905d37cefc628409f1fb30b7e..ad999ac07521a599d92f624a9ca18d6422f06c1a 100644 (file)
@@ -609,6 +609,20 @@ config TILE_SROM
          device appear much like a simple EEPROM, and knows
          how to partition a single ROM for multiple purposes.
 
+config VLDC
+       tristate "Logical Domains Virtual Channel"
+       depends on SUN_LDOMS
+       default m
+       help
+          Support for virtual channel under Logical Domains.
+
+config VLDS
+       tristate "Logical Domains Domain Services"
+       depends on SUN_LDOMS
+       default m
+       help
+          Support for domain services under Logical Domains.
+
 source "drivers/char/xillybus/Kconfig"
 
 endmenu
index d06cde26031b19e418c1ca66a560d9a59bab6af7..f752fe2746052502371e880217456505b30e0d22 100644 (file)
@@ -61,4 +61,8 @@ obj-$(CONFIG_JS_RTC)          += js-rtc.o
 js-rtc-y = rtc.o
 
 obj-$(CONFIG_TILE_SROM)                += tile-srom.o
+
+obj-$(CONFIG_VLDC)             += vldc.o
+obj-$(CONFIG_VLDS)             += vlds.o
+
 obj-$(CONFIG_XILLYBUS)         += xillybus/
diff --git a/drivers/char/vldc.c b/drivers/char/vldc.c
new file mode 100644 (file)
index 0000000..ef8d266
--- /dev/null
@@ -0,0 +1,1426 @@
+/*
+ * vldc.c: Sun4v Virtual LDC (Logical Domain Channel) Driver
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/ioctl.h>
+#include <linux/vldc.h>
+#include <linux/atomic.h>
+#include <linux/uaccess.h>
+#include <asm/mdesc.h>
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+
+#define VLDC_DEBUG 1   /* force VLDC_DEBUG on for development */
+
+#ifdef VLDC_DEBUG
+static bool vldcdbg;
+module_param(vldcdbg, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(vldcdbg, "Boolean to enable debug messages (0 == off, 1 == on)");
+
+#define dprintk(fmt, args...) do {\
+if (vldcdbg)\
+       printk(KERN_ERR "%s: " fmt, __func__, ##args);\
+} while (0)
+
+#else
+#define dprintk(fmt, args...)
+#endif /* VLDC_DEBUG */
+
+#define DRV_NAME               "vldc"
+#define DRV_VERSION            "1.0"
+#define VLDC_DEVICE_NAME DRV_NAME
+
+#define VLDC_MINOR_BASE 0
+#define VLDC_MAX_DEVS 64  /* Arbitrary # - hopefully enough */
+
+#define        VLDC_DEFAULT_MTU        0x1000   /* default mtu size 4K */
+#define VLDC_MAX_MTU           (256 * 1024) /* 256K */
+#define        VLDC_DEFAULT_MODE       LDC_MODE_RAW
+#define VLDC_MAX_COOKIE         (256 * 1024) /* 256K */
+
+/* Time (in ms) to sleep waiting for write space to become available */
+#define VLDC_WRITE_BLOCK_SLEEP_DELAY 1
+
+/* Timeout (in ms) to sleep waiting for LDC connection to complete */
+#define VLDC_CONNECTION_TIMEOUT 10000
+
+static char driver_version[] = DRV_NAME ".c:v" DRV_VERSION "\n";
+
+/* Global driver data struct for data common to all devices */
+struct vldc_driver_data {
+	struct list_head	vldc_dev_list; /* list of all vldc devices */
+	int			num_vldc_dev_list; /* count of devices on the list */
+	struct class		*chrdev_class; /* presumably used for device node creation - confirm in probe/init */
+	dev_t			devt; /* driver's base dev_t; MAJOR(devt) used to build per-device dev_t in probe */
+};
+struct vldc_driver_data vldc_data;
+static DEFINE_MUTEX(vldc_data_mutex); /* protect vldc_data */
+
+/*
+ * VLDC device struct. Each vldc device which is probed
+ * will have one of these structs associated with it.
+ * Integer type fields which could possibly be accessed by more
+ * than 1 thread simultaneously are declared as type atomic_t
+ * to assure atomic access.
+ */
+struct vldc_dev {
+	/* link into the global driver data dev list */
+	struct list_head	list;
+
+	struct mutex		vldc_mutex;	/* serializes open vs sysfs mode/mtu stores */
+	struct cdev		cdev;		/* char device instance */
+	char			*tx_buf;	/* write staging buffer (mtu bytes, alloc'd at open) */
+	char			*rx_buf;	/* read staging buffer (mtu bytes, alloc'd at open) */
+	dev_t			devt;		/* this device's major/minor */
+	char			*name;		/* service name from "vldc-svc-name" MD property */
+	struct device		*device;
+	struct vio_dev		*vdev;		/* underlying VIO device (irqs, channel id) */
+	struct ldc_channel	*lp;		/* LDC handle; valid only while device is open */
+	atomic_t		mtu;		/* VLDC_DEFAULT_MTU unless changed via sysfs */
+	atomic_t		mode;		/* LDC mode; VLDC_DEFAULT_MODE unless changed via sysfs */
+
+	/* each device gets its own read cookie buf */
+	void			*cookie_read_buf;
+
+	/* each device gets its own write cookie buf */
+	void			*cookie_write_buf;
+
+	/* waitqueue for poll() or blocking read() operations */
+	wait_queue_head_t	waitqueue;
+
+	/* atomic var to indicate if the device is released - i.e. not open */
+	atomic_t		is_released;
+
+	/* atomic var to indicate if reset has been asserted on the device */
+	atomic_t		is_reset_asserted;
+};
+
+/*
+ * Return true when a write of @count bytes would have to wait for
+ * LDC tx space.  A released or reset device never blocks; the write
+ * path detects that condition and fails the I/O instead.
+ */
+static bool vldc_will_write_block(struct vldc_dev *vldc, size_t count)
+{
+	bool gone;
+
+	gone = atomic_read(&vldc->is_released) ||
+	       atomic_read(&vldc->is_reset_asserted);
+
+	/* dead device: never block the caller */
+	if (gone)
+		return false;
+
+	return !ldc_tx_space_available(vldc->lp, count);
+}
+
+/*
+ * Push @len bytes to the LDC, retrying briefly (up to 1000 x 1us)
+ * while the channel reports -EAGAIN.  Returns the ldc_write() result:
+ * bytes written, 0, or a negative errno (-EINVAL if never attempted).
+ */
+static int vldc_ldc_send(struct vldc_dev *vldc, void *data, int len)
+{
+	int attempts;
+	int rv = -EINVAL;
+
+	for (attempts = 1000; attempts > 0; attempts--) {
+		rv = ldc_write(vldc->lp, data, len);
+		if (rv != -EAGAIN)
+			break;
+		udelay(1);
+	}
+
+	return rv;
+}
+
+/*
+ * write(2) handler: copy user data into the per-device tx buffer in
+ * chunks (LDC_PACKET_SIZE in RAW mode, otherwise mtu) and push each
+ * chunk over the LDC.
+ *
+ * Returns the number of bytes written (possibly short if the channel
+ * errored out part-way), or a negative errno if nothing was written.
+ */
+static ssize_t vldc_fops_write(struct file *filp, const char __user *ubuf,
+			       size_t count, loff_t *off)
+{
+	struct vldc_dev *vldc;
+	int rv;
+	const char __user *ubufp;	/* keep const/__user annotations (sparse) */
+	int nbytes_written;
+	int nbytes_left;
+	size_t size;
+
+	dprintk("entered.\n");
+
+	/* validate args */
+	if (filp == NULL || ubuf == NULL)
+		return -EINVAL;
+
+	nbytes_written = 0; /* number of bytes written */
+
+	vldc = filp->private_data;
+	rv = 0;
+
+	/*
+	 * If the device has been released/closed
+	 * or has been reset, exit with error.
+	 */
+	if (atomic_read(&vldc->is_released)) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	if (vldc_will_write_block(vldc, count) &&
+	    (filp->f_flags & O_NONBLOCK)) {
+		rv = -EAGAIN;
+		goto done;
+	}
+
+	/*
+	 * Loop here waiting for write space to become available.
+	 * NOTE: we can't wait on an event here because there is no event
+	 * to indicate that write space has become available.
+	 */
+	while (vldc_will_write_block(vldc, count)) {
+		msleep_interruptible(VLDC_WRITE_BLOCK_SLEEP_DELAY);
+		if (signal_pending(current)) {
+			/* task caught a signal during the sleep - abort. */
+			rv = -EINTR;
+			goto done;
+		}
+	}
+
+	/*
+	 * Check again if the device has been released/closed
+	 * or has been reset while we were waiting.
+	 */
+	if (atomic_read(&vldc->is_released)) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	nbytes_left = count; /* number of bytes left to write */
+	ubufp = ubuf;
+
+	while (nbytes_left > 0) {
+
+		/* NOTE: RAW mode can only write max size of LDC_PACKET_SIZE */
+		if (atomic_read(&vldc->mode) == LDC_MODE_RAW)
+			size = min_t(int, LDC_PACKET_SIZE, nbytes_left);
+		else
+			size = min_t(int, atomic_read(&vldc->mtu), nbytes_left);
+
+		if (copy_from_user(vldc->tx_buf, ubufp, size) != 0) {
+			rv = -EFAULT;
+			goto done;
+		}
+
+		rv = vldc_ldc_send(vldc, vldc->tx_buf, size);
+
+		dprintk("(%s) ldc_write() returns %d\n", vldc->name, rv);
+
+		/* stop on error or if the channel accepted nothing */
+		if (unlikely(rv <= 0))
+			break;
+
+		ubufp += rv;
+		nbytes_written += rv;
+		nbytes_left -= rv;
+	}
+
+	/* Return any data written (even if we got a subsequent error) */
+	if (nbytes_written > 0)
+		rv = nbytes_written;
+
+done:
+
+	dprintk("(%s) num bytes written=%d, return value=%d\n",
+		vldc->name, nbytes_written, rv);
+
+	return (ssize_t)rv;
+}
+
+/*
+ * Return true when a read would have to wait for LDC rx data.
+ * A released or reset device never blocks; the read path fails
+ * with an error instead.
+ */
+static bool vldc_will_read_block(struct vldc_dev *vldc)
+{
+	if (atomic_read(&vldc->is_released))
+		return false;	/* device closed - don't block */
+
+	if (atomic_read(&vldc->is_reset_asserted))
+		return false;	/* device reset - don't block */
+
+	return !ldc_rx_data_available(vldc->lp);
+}
+
+/*
+ * read(2) handler: wait (unless O_NONBLOCK) for LDC data, then copy
+ * up to @count bytes to userspace through the per-device rx buffer.
+ *
+ * Returns the number of bytes read (possibly short), 0, or a negative
+ * errno if nothing was read.  HV interrupts, disabled by vldc_event()
+ * on DATA_READY, are re-enabled before returning.
+ */
+static ssize_t vldc_fops_read(struct file *filp, char __user *ubuf,
+			      size_t count, loff_t *offp)
+{
+	struct vldc_dev *vldc;
+	int rv;
+	char __user *ubufp;	/* keep the __user annotation (sparse) */
+	int nbytes_read;
+	int nbytes_left;
+	size_t size;
+
+	dprintk("entered.\n");
+
+	/* validate args */
+	if (filp == NULL || ubuf == NULL)
+		return -EINVAL;
+
+	nbytes_read = 0; /* number of bytes read */
+
+	vldc = filp->private_data;
+	rv = 0;
+
+	/*  Per spec if reading 0 bytes, just return 0. */
+	if (count == 0) {
+		rv = 0;
+		goto done;
+	}
+
+	/*
+	 * If the device has been released/closed or
+	 * has been reset, exit with error.
+	 */
+	if (atomic_read(&vldc->is_released)) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	if (vldc_will_read_block(vldc) && (filp->f_flags & O_NONBLOCK)) {
+		rv = -EAGAIN;
+		goto done;
+	}
+
+	/*
+	 * NOTE: this will only wait if the vldc_will_read_block
+	 * initially returns true
+	 */
+	rv = wait_event_interruptible(vldc->waitqueue,
+				      !vldc_will_read_block(vldc));
+	if (rv < 0)
+		goto done;
+
+	/*
+	 * Check again if the device has been released/closed
+	 * or has been reset while we were waiting
+	 */
+	if (atomic_read(&vldc->is_released)) {
+		/* device was released, exit */
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	nbytes_left = count; /* number of bytes left to read */
+	ubufp = ubuf;
+
+	/* read count bytes or until LDC has no more read data (or error) */
+	while (nbytes_left > 0) {
+
+		/*
+		 * NOTE: RAW mode must request at least LDC_PACKET_SIZE
+		 * bytes per ldc_read().  NOTE(review): in RAW mode
+		 * 'size' can exceed the mtu-sized rx_buf when
+		 * count > mtu; this assumes a RAW ldc_read() never
+		 * returns more than one packet - confirm against the
+		 * LDC layer.
+		 */
+		if (atomic_read(&vldc->mode) == LDC_MODE_RAW)
+			size = max_t(int, LDC_PACKET_SIZE, nbytes_left);
+		else
+			size = min_t(int, atomic_read(&vldc->mtu), nbytes_left);
+
+		rv = ldc_read(vldc->lp, vldc->rx_buf, size);
+
+		dprintk("(%s) ldc_read() returns %d\n", vldc->name, rv);
+
+		/* stop on error or no more data */
+		if (unlikely(rv <= 0))
+			break;
+
+		if (copy_to_user(ubufp, vldc->rx_buf, rv) != 0) {
+			rv = -EFAULT;
+			goto done;
+		}
+
+		ubufp += rv;
+		nbytes_read += rv;
+		nbytes_left -= rv;
+	}
+
+	/* Return any data read (even if we got a subsequent error) */
+	if (nbytes_read > 0)
+		rv = nbytes_read;
+
+done:
+
+	dprintk("(%s) num bytes read=%d, return value=%d\n",
+		vldc->name, nbytes_read, rv);
+
+	/* re-enable interrupts (disabled in vldc_event on DATA_READY) */
+	ldc_enable_hv_intr(vldc->lp);
+
+	return (ssize_t)rv;
+}
+
+/*
+ * poll(2)/select(2) handler.
+ *
+ * A poll handler returns an event mask, never a negative errno (the
+ * return type is unsigned; -ENODEV cast to the mask would set bogus
+ * bits).  A released device is reported as POLLHUP and a reset device
+ * as POLLERR; the subsequent read()/write() returns the real error.
+ */
+static unsigned int vldc_fops_poll(struct file *filp, poll_table *wait)
+{
+	struct vldc_dev *vldc;
+	unsigned int mask;
+
+	dprintk("entered.\n");
+
+	vldc = filp->private_data;
+
+	/*
+	 * If the device has been released/closed or has been reset,
+	 * report the condition via the event mask.
+	 */
+	if (atomic_read(&vldc->is_released))
+		return POLLHUP;
+
+	if (atomic_read(&vldc->is_reset_asserted))
+		return POLLERR;
+
+	poll_wait(filp, &vldc->waitqueue, wait);
+
+	/*
+	 * Check again if the device has been released/closed
+	 * or has been reset while we were waiting
+	 */
+	if (atomic_read(&vldc->is_released))
+		return POLLHUP;
+
+	if (atomic_read(&vldc->is_reset_asserted))
+		return POLLERR;
+
+	mask = 0;
+
+	if (!vldc_will_read_block(vldc))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Check that we can write at least MTU bytes */
+	if (!vldc_will_write_block(vldc, (size_t)atomic_read(&vldc->mtu)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	return mask;
+}
+
+/*
+ * VLDC_IOCTL_READ_COOKIE backend: pull up to @len bytes from the LDC
+ * cookie (exported memory) at @dst_addr into the userspace buffer at
+ * @src_addr, staging through the per-device cookie_read_buf.
+ *
+ * NOTE(review): the parameter names are inverted relative to the data
+ * direction - src_addr is used as the userspace *destination* and
+ * dst_addr as the cookie *source*; confirm against the
+ * struct vldc_data_t ioctl ABI.
+ *
+ * Returns the number of bytes copied, or a negative errno.
+ */
+static long vldc_read_cookie(struct vldc_dev *vldc, u64 src_addr, u64 dst_addr,
+			     u64 len)
+{
+	struct ldc_trans_cookie cookie;
+	int rv;
+	char *ubufp;
+	u32 nbytes_read;
+	u32 nbytes_left;
+
+	dprintk("entered.\n");
+
+	nbytes_read = 0; /* number of bytes read */
+
+	/* validate args */
+	if (vldc == NULL || src_addr == 0 || dst_addr == 0) {
+		rv = -EINVAL;
+		goto done;
+	}
+
+	dprintk("(%s) src_addr=0x%llx dst_addr=0x%llx len=0x%llx\n",
+		vldc->name, src_addr, dst_addr, len);
+
+	if (atomic_read(&vldc->is_released)) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	if (len == 0) {
+		rv = 0;
+		goto done;
+	}
+
+	/* cap the transfer at the size of the preallocated staging buffer */
+	if (unlikely(len > VLDC_MAX_COOKIE)) {
+		rv = -E2BIG;
+		goto done;
+	}
+
+	rv = 0;
+	nbytes_left = (u32)len; /* number of bytes left to read */
+	ubufp = (char *)src_addr;
+
+	/* copy in len bytes or until LDC has no more read data (or error) */
+	while (nbytes_left > 0) {
+
+		cookie.cookie_addr = dst_addr;
+		cookie.cookie_size = nbytes_left;
+
+		rv = ldc_copy(vldc->lp, LDC_COPY_IN, vldc->cookie_read_buf,
+			      nbytes_left, 0, &cookie, 1);
+
+		dprintk("(%s) ldc_copy() returns %d\n", vldc->name, rv);
+
+		if (unlikely(rv < 0))
+			goto done;
+
+		if (unlikely(rv == 0))
+			break;
+
+		/* hand this chunk to userspace before advancing */
+		if (copy_to_user(ubufp, vldc->cookie_read_buf, rv) != 0) {
+			rv = -EFAULT;
+			goto done;
+		}
+
+		ubufp += rv;
+		dst_addr += rv;
+		nbytes_read += rv;
+		nbytes_left -= rv;
+	}
+
+	rv = nbytes_read;
+
+done:
+
+	dprintk("(%s) num bytes read=%d, return value=%d\n",
+		vldc->name, nbytes_read, rv);
+
+	return rv;
+
+}
+
+/*
+ * VLDC_IOCTL_WRITE_COOKIE backend: push up to @len bytes from the
+ * userspace buffer at @src_addr out to the LDC cookie (exported
+ * memory) at @dst_addr, staging through the per-device
+ * cookie_write_buf.
+ *
+ * Returns the number of bytes copied, or a negative errno.
+ */
+static long vldc_write_cookie(struct vldc_dev *vldc, u64 src_addr, u64 dst_addr,
+			      u64 len)
+{
+	struct ldc_trans_cookie cookie;
+	int rv;
+	char *ubufp;
+	u32 nbytes_written;
+	u32 nbytes_left;
+
+	dprintk("entered.\n");
+
+	nbytes_written = 0; /* number of bytes written */
+
+	/* validate args */
+	if (vldc == NULL || src_addr == 0 || dst_addr == 0) {
+		rv = -EINVAL;
+		goto done;
+	}
+
+	dprintk("(%s) src_addr=0x%llx dst_addr=0x%llx len=0x%llx\n",
+		vldc->name, src_addr, dst_addr, len);
+
+	if (atomic_read(&vldc->is_released)) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	if (atomic_read(&vldc->is_reset_asserted)) {
+		rv = -EIO;
+		goto done;
+	}
+
+	if (len == 0) {
+		rv = 0;
+		goto done;
+	}
+
+	/* cap the transfer at the size of the preallocated staging buffer */
+	if (unlikely(len > VLDC_MAX_COOKIE)) {
+		rv = -E2BIG;
+		goto done;
+	}
+
+	rv = 0;
+	nbytes_left = (u32)len; /* number of bytes left to write */
+	ubufp = (char *)src_addr;
+
+	/* copy out len bytes or until the LDC accepts no more (or error) */
+	while (nbytes_left > 0) {
+
+		if (copy_from_user(vldc->cookie_write_buf,
+		    ubufp, nbytes_left) != 0) {
+			rv = -EFAULT;
+			goto done;
+		}
+
+		cookie.cookie_addr = dst_addr;
+		cookie.cookie_size = nbytes_left;
+
+		rv = ldc_copy(vldc->lp, LDC_COPY_OUT, vldc->cookie_write_buf,
+			      nbytes_left, 0, &cookie, 1);
+
+		dprintk("(%s) ldc_copy() returns %d\n", vldc->name, rv);
+
+		if (unlikely(rv < 0))
+			goto done;
+
+		if (unlikely(rv == 0))
+			break;
+
+		ubufp += rv;
+		dst_addr += rv;
+		nbytes_written += rv;
+		nbytes_left -= rv;
+	}
+
+	rv = nbytes_written;
+
+done:
+
+	dprintk("(%s) num bytes written=%d, return value=%d\n",
+		vldc->name, nbytes_written, rv);
+
+	return rv;
+
+}
+
+/*
+ * unlocked_ioctl handler.  Supports VLDC_IOCTL_READ_COOKIE and
+ * VLDC_IOCTL_WRITE_COOKIE, whose argument is a userspace
+ * struct vldc_data_t (src_addr, dst_addr, length).
+ *
+ * Returns bytes transferred, or a negative errno.
+ */
+static long vldc_fops_ioctl(struct file *filp, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct vldc_dev *vldc;
+	struct vldc_data_t __user *uarg;
+	u64 src_addr;
+	u64 dst_addr;
+	u64 len;
+	long rv;	/* long: matches return type and vldc_*_cookie() */
+
+	dprintk("entered.\n");
+
+	rv = 0;
+	src_addr = 0;
+	dst_addr = 0;
+	len = 0;
+
+	vldc = filp->private_data;
+
+	/* get the arg for the read/write cookie ioctls */
+	if (cmd == VLDC_IOCTL_READ_COOKIE || cmd == VLDC_IOCTL_WRITE_COOKIE) {
+		uarg = (struct vldc_data_t __user *)arg;
+		if (get_user(src_addr, &uarg->src_addr) != 0 ||
+		    get_user(dst_addr, &uarg->dst_addr) != 0 ||
+		    get_user(len, &uarg->length) != 0) {
+			rv = -EFAULT;
+			goto done;
+		}
+	}
+
+	switch (cmd) {
+	case VLDC_IOCTL_READ_COOKIE:
+		rv = vldc_read_cookie(vldc, src_addr, dst_addr, len);
+		break;
+
+	case VLDC_IOCTL_WRITE_COOKIE:
+		rv = vldc_write_cookie(vldc, src_addr, dst_addr, len);
+		break;
+
+	default:
+		/* unknown ioctl: -ENOTTY per kernel convention (was -EINVAL) */
+		rv = -ENOTTY;
+		break;
+	}
+
+done:
+	return rv;
+}
+
+/*
+ * Event function does the following:
+ * 1. If data is ready from the LDC, indicate it
+ *    in the corresponding device struct.
+ * 2. Wake up any (poll or read) waiters on this device
+ *
+ * NOTE - this routine is called in interrupt context.
+ */
+static void vldc_event(void *arg, int event)
+{
+	struct vldc_dev *vldc = arg;
+
+	dprintk("entered.\n");
+
+	dprintk("%s: LDC event %d\n", vldc->name, event);
+
+	/*
+	 * A RESET latches the flag until the device is closed and
+	 * re-opened (cleared in vldc_fops_release).
+	 * NOTE(review): RESET does not wake the waitqueue, so a blocked
+	 * reader/poller will not observe is_reset_asserted until some
+	 * other wakeup occurs - consider a wake_up here.
+	 */
+	if (event == LDC_EVENT_RESET) {
+		atomic_set(&vldc->is_reset_asserted, 1);
+		return;
+	}
+
+	if (event == LDC_EVENT_UP)
+		return;
+
+	if (unlikely(event != LDC_EVENT_DATA_READY)) {
+		dprintk("Unexpected LDC event %d\n", event);
+		return;
+	}
+
+	/*
+	 *  disable interrupts until we have completed reading the data.
+	 *  NOTE: this will hold off all types of events including RESET
+	 *  until read has complete. If a device reset occurs within this
+	 *  window (while interrupts are disabled), attempts to read/write
+	 *  the device should/will fail at the LDC level (since a check is
+	 *  at that level - via an HV call - to first ensure the LDC is UP).
+	 */
+
+	ldc_disable_hv_intr(vldc->lp);
+
+	/* wake up any read or poll waiters (re-enabled in vldc_fops_read) */
+	wake_up_interruptible(&vldc->waitqueue);
+
+}
+
+
+/*
+ * Wait for the LDC link to reach LDC_STATE_CONNECTED.
+ *
+ * RAW mode needs no handshake and returns immediately.  Otherwise a
+ * connect is (re)initiated and the channel state is polled for up to
+ * ~VLDC_CONNECTION_TIMEOUT ms.  Returns 0 when connected, else
+ * -ETIMEDOUT.
+ */
+static int vldc_connect(struct ldc_channel *lp)
+{
+	int timeout;
+	int state;
+
+	/* no connection required in RAW mode */
+	if (ldc_mode(lp) == LDC_MODE_RAW)
+		return 0;
+
+	/*
+	 * Issue a ldc_connect to make sure the handshake is initiated.
+	 * NOTE: ldc_connect can fail if the LDC connection handshake
+	 * completed since we called bind(). So, ignore
+	 * ldc_connect() failures.
+	 */
+	(void) ldc_connect(lp);
+
+	/* wait for the connection to complete */
+	timeout = VLDC_CONNECTION_TIMEOUT;
+	do {
+		state = ldc_state(lp);
+		if (state == LDC_STATE_CONNECTED)
+			break;
+		/*
+		 * NOTE(review): with a signal pending,
+		 * msleep_interruptible() returns without sleeping, so
+		 * this degrades to a busy loop until timeout; consider
+		 * bailing out on signal_pending().
+		 */
+		msleep_interruptible(1);
+	} while (timeout-- > 0);
+
+	if (state == LDC_STATE_CONNECTED)
+		return 0;
+	else
+		return -ETIMEDOUT;
+}
+
+/*
+ * Open function does the following:
+ * 1. Alloc and bind LDC to the device (using sysfs parameters)
+ *
+ * Only one opener at a time is allowed (enforced via is_released).
+ * On failure, only resources allocated by *this* call are released;
+ * vldc->lp may hold a stale pointer from a previous open/release
+ * cycle and must never be freed here unless this call set it.
+ */
+static int vldc_fops_open(struct inode *inode, struct file *filp)
+{
+	struct vldc_dev *vldc;
+	char *tbuffer;
+	char *rbuffer;
+	char *crbuffer;
+	char *cwbuffer;
+	struct ldc_channel_config ldc_cfg;
+	struct ldc_channel *lp;
+	u32 mtu;
+	int rv;
+	bool ldc_bound;
+
+	dprintk("entered.\n");
+
+	rv = 0;
+	ldc_bound = false;
+	tbuffer = NULL;
+	rbuffer = NULL;
+	crbuffer = NULL;
+	cwbuffer = NULL;
+	lp = NULL;
+
+	vldc = container_of(inode->i_cdev, struct vldc_dev, cdev);
+
+	/* just to be safe, if the device is in reset, deny the open. */
+	if (atomic_read(&vldc->is_reset_asserted))
+		return -EIO;
+
+	dprintk("(%s)\n", vldc->name);
+
+	/*
+	 * We hold the vldc_mutex during the open to prevent
+	 * a race with vldc_sysfs_mode_store() and vldc_sysfs_mtu_store().
+	 * See comments in those routines for more detail.
+	 */
+	mutex_lock(&vldc->vldc_mutex);
+
+	/*
+	 * Atomically test and mark the device as opened.
+	 * This limits the usage of the device to one process at
+	 * a time which is good enough for our purposes (and which
+	 * simplifies locking).
+	 */
+	if (!atomic_dec_and_test(&vldc->is_released)) {
+		atomic_inc(&vldc->is_released);
+		dprintk("failed: Multiple open.\n");
+		mutex_unlock(&vldc->vldc_mutex);
+		return -EBUSY;
+	}
+
+	mutex_unlock(&vldc->vldc_mutex);
+
+	mtu = (u32) atomic_read(&vldc->mtu);
+
+	tbuffer = kzalloc(mtu, GFP_KERNEL);
+	if (tbuffer == NULL) {
+		dprintk("failed to allocate tbuffer.\n");
+		rv = -ENOMEM;
+		goto error;
+	}
+	vldc->tx_buf = tbuffer;
+
+	rbuffer = kzalloc(mtu, GFP_KERNEL);
+	if (rbuffer == NULL) {
+		dprintk("failed to allocate rbuffer.\n");
+		rv = -ENOMEM;
+		goto error;
+	}
+	vldc->rx_buf = rbuffer;
+
+	crbuffer = kzalloc(VLDC_MAX_COOKIE, GFP_KERNEL);
+	if (crbuffer == NULL) {
+		dprintk("failed to allocate crbuffer.\n");
+		rv = -ENOMEM;
+		goto error;
+	}
+	vldc->cookie_read_buf = crbuffer;
+
+	cwbuffer = kzalloc(VLDC_MAX_COOKIE, GFP_KERNEL);
+	if (cwbuffer == NULL) {
+		dprintk("failed to allocate cwbuffer.\n");
+		rv = -ENOMEM;
+		goto error;
+	}
+	vldc->cookie_write_buf = cwbuffer;
+
+	ldc_cfg.event = vldc_event;
+	ldc_cfg.mtu = mtu;
+	ldc_cfg.mode = atomic_read(&vldc->mode);
+	ldc_cfg.debug = 0;
+	ldc_cfg.tx_irq = vldc->vdev->tx_irq;
+	ldc_cfg.rx_irq = vldc->vdev->rx_irq;
+	ldc_cfg.rx_ino = vldc->vdev->rx_ino;
+	ldc_cfg.tx_ino = vldc->vdev->tx_ino;
+	ldc_cfg.dev_handle = vldc->vdev->dev_handle;
+
+	/* Alloc and init the associated LDC */
+	lp = ldc_alloc(vldc->vdev->channel_id, &ldc_cfg, vldc, vldc->name);
+	if (IS_ERR(lp)) {
+		rv = PTR_ERR(lp);
+		dprintk("ldc_alloc() failed. err=%d\n", rv);
+		lp = NULL; /* never ldc_free() an ERR_PTR */
+		goto error;
+	}
+	vldc->lp = lp;
+
+	rv = ldc_bind(vldc->lp);
+	if (rv != 0) {
+		dprintk("ldc_bind() failed, err=%d.\n", rv);
+		goto error;
+	}
+	ldc_bound = true;
+
+	rv = vldc_connect(vldc->lp);
+	if (rv != 0) {
+		dprintk("vldc_connect() failed, err=%d.\n", rv);
+		goto error;
+	}
+
+	/* tuck away the vldc device for subsequent fops */
+	filp->private_data = vldc;
+
+	dprintk("Success.\n");
+
+	return 0;
+
+error:
+
+	if (ldc_bound)
+		ldc_unbind(lp);
+
+	/*
+	 * Free only the channel allocated by this open.  Using
+	 * vldc->lp here could double-free a channel left over from a
+	 * previous open/release cycle if ldc_alloc() failed above.
+	 */
+	if (lp != NULL)
+		ldc_free(lp);
+	vldc->lp = NULL;
+
+	/* kfree(NULL) is a no-op, so the allocations need no guards */
+	kfree(cwbuffer);
+	kfree(crbuffer);
+	kfree(rbuffer);
+	kfree(tbuffer);
+	vldc->cookie_write_buf = NULL;
+	vldc->cookie_read_buf = NULL;
+	vldc->rx_buf = NULL;
+	vldc->tx_buf = NULL;
+
+	/* mark the device released again so it can be re-opened */
+	atomic_inc(&vldc->is_released);
+
+	return rv;
+}
+
+/*
+ * release (close) handler: tear down the LDC and free the per-open
+ * buffers.  All freed pointers are cleared so that a later failed
+ * open cannot accidentally reuse stale (freed) resources in its
+ * error path.
+ */
+static int vldc_fops_release(struct inode *inode, struct file *filp)
+{
+	struct vldc_dev *vldc;
+
+	dprintk("entered.\n");
+
+	vldc = filp->private_data;
+
+	ldc_unbind(vldc->lp);
+	ldc_free(vldc->lp);
+	vldc->lp = NULL;
+
+	kfree(vldc->cookie_write_buf);
+	vldc->cookie_write_buf = NULL;
+
+	kfree(vldc->cookie_read_buf);
+	vldc->cookie_read_buf = NULL;
+
+	kfree(vldc->rx_buf);
+	vldc->rx_buf = NULL;
+
+	kfree(vldc->tx_buf);
+	vldc->tx_buf = NULL;
+
+	/* mark the device as released */
+	atomic_inc(&vldc->is_released);
+
+	/*
+	 * User must close and re-open the device to clear
+	 * the reset asserted flag.
+	 */
+	atomic_set(&vldc->is_reset_asserted, 0);
+
+	/*
+	 * Wake up any rogue read or poll waiters.
+	 * They will exit (with an error) since is_released is now set.
+	 */
+	wake_up_interruptible(&vldc->waitqueue);
+
+	return 0;
+}
+
+/* Character-device entry points for the vldc device nodes. */
+static const struct file_operations vldc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vldc_fops_open,
+	.release	= vldc_fops_release,
+	.poll		= vldc_fops_poll,
+	.read		= vldc_fops_read,
+	.write		= vldc_fops_write,
+	.unlocked_ioctl	= vldc_fops_ioctl,
+};
+
+/*
+ * Scan the global device list for the lowest unused minor number.
+ * Returns the minor, or -1 when all VLDC_MAX_DEVS minors are taken.
+ */
+static int vldc_get_next_avail_minor(void)
+{
+	struct vldc_dev *dev;
+	bool in_use;
+	int minor;
+
+	/*
+	 * Hold the mutex so the list cannot change while we search
+	 * for a free minor.
+	 */
+	mutex_lock(&vldc_data_mutex);
+	for (minor = VLDC_MINOR_BASE; minor < VLDC_MAX_DEVS; minor++) {
+		in_use = false;
+		list_for_each_entry(dev, &vldc_data.vldc_dev_list, list) {
+			if (MINOR(dev->devt) == minor) {
+				in_use = true;
+				break;
+			}
+		}
+		if (!in_use)
+			break; /* found a free minor, use it */
+	}
+	mutex_unlock(&vldc_data_mutex);
+
+	if (minor == VLDC_MAX_DEVS) {
+		dprintk("no more minors left for allocation!\n");
+		return -1;
+	}
+
+	return minor;
+}
+
+/* sysfs 'mode' attribute: report the current LDC mode as an integer. */
+static ssize_t vldc_sysfs_mode_show(struct device *device,
+			      struct device_attribute *attr, char *buffer)
+{
+	struct vldc_dev *vldc = dev_get_drvdata(device);
+
+	dprintk("entered.\n");
+
+	return scnprintf(buffer, PAGE_SIZE, "%d\n", atomic_read(&vldc->mode));
+}
+
+/*
+ * sysfs 'mode' attribute: set the LDC mode (LDC_MODE_RAW,
+ * LDC_MODE_UNRELIABLE or LDC_MODE_STREAM).  Only allowed while the
+ * device is closed, since the mode is latched at open time.
+ */
+static ssize_t vldc_sysfs_mode_store(struct device *device,
+		 struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vldc_dev *vldc;
+	unsigned int mode;
+
+	dprintk("entered.\n");
+
+	/* was "%ud": parsed as %u plus a literal 'd'; plain "%u" is correct */
+	if (sscanf(buf, "%u", &mode) != 1)
+		return -EINVAL;
+
+	/* validate the value from the user */
+	if (!(mode == LDC_MODE_RAW ||
+	      mode == LDC_MODE_UNRELIABLE ||
+	      mode == LDC_MODE_STREAM)) {
+		return -EINVAL;
+	}
+
+	vldc = dev_get_drvdata(device);
+
+	/*
+	 * Only allow the mode to be set if the device is closed.
+	 * Use vldc_mutex to ensure that an open does not
+	 * come in between the check for is_released and the set
+	 * of the mode.
+	 */
+	mutex_lock(&vldc->vldc_mutex);
+
+	if (!atomic_read(&vldc->is_released)) {
+		/* can't change the mode while the device is open */
+		mutex_unlock(&vldc->vldc_mutex);
+		return -EBUSY;
+	}
+
+	atomic_set(&vldc->mode, mode);
+
+	mutex_unlock(&vldc->vldc_mutex);
+
+	dprintk("mode changed to %d.\n", mode);
+
+	return strnlen(buf, count);
+}
+
+
+/* sysfs 'mtu' attribute: report the currently configured mtu. */
+static ssize_t vldc_sysfs_mtu_show(struct device *device,
+			      struct device_attribute *attr, char *buffer)
+{
+	struct vldc_dev *vldc = dev_get_drvdata(device);
+
+	dprintk("entered.\n");
+
+	return scnprintf(buffer, PAGE_SIZE, "%d\n", atomic_read(&vldc->mtu));
+}
+
+/*
+ * sysfs 'mtu' attribute: set the LDC mtu (LDC_PACKET_SIZE ..
+ * VLDC_MAX_MTU bytes).  Only allowed while the device is closed,
+ * since the mtu sizes the buffers allocated at open time.
+ */
+static ssize_t vldc_sysfs_mtu_store(struct device *device,
+		 struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vldc_dev *vldc;
+	unsigned int mtu;
+
+	dprintk("entered.\n");
+
+	/* was "%ud": parsed as %u plus a literal 'd'; plain "%u" is correct */
+	if (sscanf(buf, "%u", &mtu) != 1)
+		return -EINVAL;
+
+	/* validate the value from the user */
+	if (mtu < LDC_PACKET_SIZE || mtu > VLDC_MAX_MTU)
+		return -EINVAL;
+
+	vldc = dev_get_drvdata(device);
+
+	/*
+	 * Only allow the mtu to be set if the device is closed.
+	 * Use vldc_mutex to ensure that an open does not
+	 * come in between the check for is_released and the set
+	 * of the mtu.
+	 */
+	mutex_lock(&vldc->vldc_mutex);
+
+	if (!atomic_read(&vldc->is_released)) {
+		/* can't change the mtu while the device is open */
+		mutex_unlock(&vldc->vldc_mutex);
+		return -EBUSY;
+	}
+
+	atomic_set(&vldc->mtu, mtu);
+
+	mutex_unlock(&vldc->vldc_mutex);
+
+	dprintk("mtu changed to %d.\n", mtu);
+
+	return strnlen(buf, count);
+}
+
+
+
+/* sysfs attributes: 'mode' and 'mtu', root read/write only. */
+static DEVICE_ATTR(mode, (S_IRUSR|S_IWUSR), vldc_sysfs_mode_show,
+		   vldc_sysfs_mode_store);
+static DEVICE_ATTR(mtu, (S_IRUSR|S_IWUSR), vldc_sysfs_mtu_show,
+		   vldc_sysfs_mtu_store);
+
+/* attribute group placed directly in the device's sysfs directory */
+static struct attribute *vldc_sysfs_entries[] = {
+	&dev_attr_mode.attr,
+	&dev_attr_mtu.attr,
+	NULL
+};
+
+static struct attribute_group vldc_attribute_group = {
+	.name = NULL,		/* put in device directory */
+	.attrs = vldc_sysfs_entries,
+};
+
+/*
+ * Probe function does the following:
+ * 1. Create/Init vldc_dev for newly probed device
+ * 2. Create /dev entry for the device
+ * 3. Create sysfs entries for the device
+ */
+static int vldc_probe(struct vio_dev *vdev, const struct vio_device_id *vio_did)
+{
+       struct vldc_dev *vldc;
+       struct mdesc_handle *hp;
+       const char *valstr;
+       const u64 *id;
+       int rv, slen;
+       dev_t devt;
+       struct device *device;
+       int next_minor;
+       bool created_sysfs_group;
+       u64 node;
+#ifdef VLDC_DEBUG
+       unsigned char devt_buf[32];
+#endif
+
+       dprintk("entered.\n");
+
+       vldc = NULL;
+       hp = NULL;
+       valstr = NULL;
+       devt = 0;
+       device = NULL;
+       created_sysfs_group = false;
+
+       vldc = kzalloc(sizeof(struct vldc_dev), GFP_KERNEL);
+       if (vldc == NULL) {
+               dprintk("failed to allocate vldc_dev\n");
+               rv = -ENOMEM;
+               goto error;
+       }
+
+       mutex_init(&vldc->vldc_mutex);
+
+       hp = mdesc_grab();
+
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               dprintk("Failed to get vdev MD node.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       id = mdesc_get_property(hp, node, "id", NULL);
+       if (id == NULL) {
+               dprintk("failed to get id property.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       /* get the name of the service this vldc-port provides */
+       valstr = mdesc_get_property(hp, node, "vldc-svc-name", &slen);
+       if (valstr == NULL) {
+               dprintk("failed to get vldc-svc-name property.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       mdesc_release(hp);
+
+       vldc->name = kzalloc(slen+1, GFP_KERNEL); /* +1 for NUll byte */
+       if (vldc->name == NULL) {
+               dprintk("failed to alloc vldc->name.\n");
+               rv = -ENOMEM;
+               goto error;
+       }
+       memcpy(vldc->name, valstr, slen);
+       vldc->name[slen] = '\0';
+
+       dprintk("%s: cfg_handle=%llu, id=%llu\n", vldc->name,
+               vdev->dev_no, *id);
+
+       init_waitqueue_head(&vldc->waitqueue);
+
+       /* mark the device as initially released (e.g. closed) */
+       atomic_set(&vldc->is_released, 1);
+
+       /* clear the reset asserted flag */
+       atomic_set(&vldc->is_reset_asserted, 0);
+
+       dev_set_drvdata(&vdev->dev, vldc);
+
+       /* create the devt for this device */
+       next_minor = vldc_get_next_avail_minor();
+       if (next_minor == -1) {
+               dprintk("vldc_get_next_avail_minor() failed.\n");
+               rv = -ENXIO;
+               goto error;
+       }
+       devt = MKDEV(MAJOR(vldc_data.devt), next_minor);
+       vldc->devt = devt;
+
+       dprintk("%s: dev_t=%s\n", vldc->name, format_dev_t(devt_buf,
+               vldc->devt));
+
+       /*
+        * Use the default mode and mtu for starters.
+        * They are exported via sysfs for modification by the user
+        */
+       atomic_set(&vldc->mode, VLDC_DEFAULT_MODE);
+       atomic_set(&vldc->mtu, VLDC_DEFAULT_MTU);
+
+       /* create/add the associated cdev */
+       cdev_init(&vldc->cdev, &vldc_fops);
+       vldc->cdev.owner = THIS_MODULE;
+       rv = cdev_add(&vldc->cdev, devt, 1);
+       if (rv != 0) {
+               dprintk("cdev_add() failed.\n");
+               devt = 0;
+               goto error;
+       }
+
+       /* create the associated /sys and /dev entries */
+       device = device_create(vldc_data.chrdev_class, &vdev->dev, devt,
+                              vldc, "%s", vldc->name);
+       if (IS_ERR(device)) {
+               dprintk("device_create() failed.\n");
+               rv = PTR_ERR(device);
+               device = NULL;
+               goto error;
+       }
+       vldc->device = device;
+
+       vldc->vdev = vdev;
+
+       rv = sysfs_create_group(&device->kobj, &vldc_attribute_group);
+       if (rv)
+               goto error;
+
+       created_sysfs_group = true;
+
+       /* add the vldc to the global vldc_data device list */
+       mutex_lock(&vldc_data_mutex);
+       list_add_tail(&vldc->list, &vldc_data.vldc_dev_list);
+       vldc_data.num_vldc_dev_list++;
+       mutex_unlock(&vldc_data_mutex);
+
+       dprintk("%s: probe successful\n", vldc->name);
+
+       return 0;
+
+error:
+
+       if (!created_sysfs_group)
+               sysfs_remove_group(&device->kobj, &vldc_attribute_group);
+
+       if (device)
+               device_destroy(vldc_data.chrdev_class, devt);
+
+       if (devt)
+               cdev_del(&vldc->cdev);
+
+       if (vldc->name)
+               kfree(vldc->name);
+
+       if (vldc != NULL) {
+               mutex_destroy(&vldc->vldc_mutex);
+               kfree(vldc);
+       }
+
+       dprintk("probe failed (rv=%d)\n", rv);
+
+       return rv;
+}
+
+/*
+ * Tear down and free a vldc_dev: unlink it from the global device
+ * list, then remove its sysfs group, /dev node and char device, and
+ * release its memory.  Called from vldc_remove().
+ */
+static int vldc_free_vldc_dev(struct vldc_dev *vldc)
+{
+
+       dprintk("entered. (%s)\n", vldc->name);
+
+       /* unlink from the global list first so no new lookups find us */
+       mutex_lock(&vldc_data_mutex);
+       list_del(&vldc->list);
+       vldc_data.num_vldc_dev_list--;
+       mutex_unlock(&vldc_data_mutex);
+
+       /* teardown in reverse order of creation in vldc_probe() */
+       sysfs_remove_group(&vldc->device->kobj, &vldc_attribute_group);
+       device_destroy(vldc_data.chrdev_class, vldc->devt);
+       cdev_del(&vldc->cdev);
+       kfree(vldc->name);
+       mutex_destroy(&vldc->vldc_mutex);
+       kfree(vldc);
+
+       return 0;
+}
+
+/*
+ * VIO remove entry point: free the vldc_dev attached to this vio_dev.
+ * Returns -ENXIO when no driver data is attached.
+ */
+static int vldc_remove(struct vio_dev *vdev)
+{
+       struct vldc_dev *vldc = dev_get_drvdata(&vdev->dev);
+
+       dprintk("entered.\n");
+
+       if (vldc == NULL) {
+               dprintk("failed to get vldc_dev from vio_dev.\n");
+               return -ENXIO;
+       }
+
+       dprintk("removing (%s)\n", vldc->name);
+
+       return vldc_free_vldc_dev(vldc);
+}
+
+/* match any MD node of type "vldc-port" */
+static const struct vio_device_id vldc_match[] = {
+       {
+               .type = "vldc-port",
+       },
+       {},
+};
+
+/* VIO driver glue: probe/remove called once per matching vldc-port */
+static struct vio_driver vldc_driver = {
+       .id_table       = vldc_match,
+       .probe          = vldc_probe,
+       .remove         = vldc_remove,
+       .name           = VLDC_DEVICE_NAME,
+};
+
+/* devnode callback: place nodes under /dev/vldc/ with 0600 perms */
+static char *vldc_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0600;
+
+       return kasprintf(GFP_KERNEL, "vldc/%s", dev_name(dev));
+}
+
+/*
+ * Init function does the following
+ * 1. Init vldc_data struct fields
+ * 2. Register VIO driver
+ */
+static int __init vldc_init(void)
+{
+       int rv;
+#ifdef VLDC_DEBUG
+       unsigned char devt_buf[32];
+#endif
+
+       dprintk("entered. (DEBUG enabled)\n");
+
+       printk(KERN_INFO "%s", driver_version);
+
+       INIT_LIST_HEAD(&vldc_data.vldc_dev_list);
+       vldc_data.num_vldc_dev_list = 0;
+
+       /* reserve a dynamic major plus VLDC_MAX_DEVS minors */
+       rv = alloc_chrdev_region(&vldc_data.devt, VLDC_MINOR_BASE,
+                                VLDC_MAX_DEVS, VLDC_DEVICE_NAME);
+       if (rv < 0) {
+               dprintk("alloc_chrdev_region failed: %d\n", rv);
+               return rv;
+       }
+
+       /*
+        * NOTE(review): defensive check - a zero devt on a successful
+        * alloc_chrdev_region() should not happen; nothing is allocated
+        * in that case, so the direct return performs no cleanup.
+        */
+       if (vldc_data.devt == (dev_t)0) {
+               dprintk("alloc_chrdev_region failed: (vldc_data.devt == 0)\n");
+               rv = -ENXIO;
+               return rv;
+       }
+
+       dprintk("dev_t allocated = %s\n",
+               format_dev_t(devt_buf, vldc_data.devt));
+
+       vldc_data.chrdev_class = class_create(THIS_MODULE, VLDC_DEVICE_NAME);
+       if (IS_ERR(vldc_data.chrdev_class)) {
+               rv = PTR_ERR(vldc_data.chrdev_class);
+               dprintk("class_create() failed: %d\n", rv);
+               vldc_data.chrdev_class = NULL;
+               goto error;
+       }
+
+       /* set callback to create devices under /dev/vldc directory */
+       vldc_data.chrdev_class->devnode = vldc_devnode;
+
+       rv = vio_register_driver(&vldc_driver);
+       if (rv != 0) {
+               dprintk("vio_register_driver() failed: %d\n", rv);
+               goto error;
+       }
+
+       return 0;
+
+error:
+       /* release whatever was acquired above, in reverse order */
+       if (vldc_data.chrdev_class)
+               class_destroy(vldc_data.chrdev_class);
+
+       if (vldc_data.devt)
+               unregister_chrdev_region(vldc_data.devt, VLDC_MAX_DEVS);
+
+       return rv;
+}
+
+/* Module exit: unregister the driver and release class/chrdev region. */
+static void __exit vldc_exit(void)
+{
+
+       dprintk("entered.\n");
+
+       /*
+        * Note - vio_unregister_driver() will invoke a call to
+        * vldc_remove() for every successfully probed device.
+        */
+       vio_unregister_driver(&vldc_driver);
+
+       if (vldc_data.chrdev_class)
+               class_destroy(vldc_data.chrdev_class);
+
+       if (vldc_data.devt)
+               unregister_chrdev_region(vldc_data.devt, VLDC_MAX_DEVS);
+}
+
+/* module entry and exit points */
+module_init(vldc_init);
+module_exit(vldc_exit);
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("Sun4v Virtual LDC Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+
diff --git a/drivers/char/vlds.c b/drivers/char/vlds.c
new file mode 100644 (file)
index 0000000..fd65777
--- /dev/null
@@ -0,0 +1,2092 @@
+/*
+ * vlds.c: Sun4v LDOMs Virtual Domain Services Driver
+ *
+ * Copyright (C) 2015 Oracle. All rights reserved.
+ */
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/ioctl.h>
+#include <linux/vlds.h>
+#include <linux/atomic.h>
+#include <linux/uaccess.h>
+#include <linux/pid.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate.h>
+#include <linux/eventfd.h>
+#include <linux/ds.h>
+#include <asm/mdesc.h>
+#include <asm/vio.h>
+
+extern unsigned int ldoms_debug_level;
+static unsigned int vldsdbg_level;
+module_param(vldsdbg_level, uint, S_IRUGO|S_IWUSR);
+
+/* driver identity strings */
+#define DRV_NAME               "vlds"
+#define DRV_VERSION            "1.0"
+#define VLDS_DEVICE_NAME DRV_NAME
+
+/* char-dev minor range and per-message size cap */
+#define VLDS_MINOR_BASE 0
+#define VLDS_MAX_DEVS  65535 /* need one per guest domain - max is 2^20 */
+#define VLDS_MAX_MSG_SIZE (256 * 1024)
+
+#define        VLDS_SP_INT_NAME        DS_SP_NAME /* SP DS internal name */
+#define        VLDS_SP_DEV_NAME        "sp" /* SP DS device name */
+#define VLDS_PATH_MAX          256
+
+#define VLDS_INVALID_HANDLE    0xFFFFFFFFFFFFFFFFUL
+
+static char driver_version[] = DRV_NAME ".c:v" DRV_VERSION "\n";
+
+/* debug printk, gated on the vldsdbg_level module parameter */
+#define dprintk(fmt, args...) do {\
+if (vldsdbg_level > 0)\
+       printk(KERN_ERR "%s: %s: " fmt, DRV_NAME, __func__, ##args);\
+} while (0)
+
+/* Global driver data struct for data common to all devices */
+struct vlds_driver_data {
+       struct list_head        vlds_dev_list; /* list of all vlds devices */
+       int                     num_vlds_dev_list; /* entries on the list */
+       struct class            *chrdev_class; /* /sys/class entry */
+       dev_t                   devt; /* base dev_t for all vlds devices */
+};
+struct vlds_driver_data vlds_data;
+static DEFINE_MUTEX(vlds_data_mutex); /* protect vlds_data */
+
+/* Per-device state: one per vlds device node */
+struct vlds_dev {
+       /* link into the global driver data dev list */
+       struct list_head        list;
+
+       struct mutex            vlds_mutex; /* protect this vlds_dev */
+       struct cdev             cdev; /* char device for this vlds_dev */
+       dev_t                   devt;
+       char                    *int_name; /* internal name for device */
+       struct device           *device; /* created /sys//dev entry */
+       u64                     domain_handle; /* only valid for domain dev */
+
+       /* list of all services for this vlds device */
+       struct list_head        service_info_list;
+
+};
+
+/* we maintain a global vlds_dev for the SP device */
+struct vlds_dev *sp_vlds;
+
+/* Per-service state: one per DS service registered on a vlds_dev */
+struct vlds_service_info {
+       /* link into the vlds_dev service info list */
+       struct list_head        list;
+
+       /* name/id of the service */
+       char                    *name;
+
+       /* VLDS_HDL_STATE_* connection state */
+       u64                     state;
+
+       /* VLDS_REG_* registration flags */
+       u64                     flags;
+
+       /* the thread group id which is using this service */
+       pid_t                   tgid;
+
+       /* unique handle assigned to this service */
+       u64                     handle;
+
+       /* version that was negotiated */
+       vlds_ver_t              neg_vers;
+
+       /* Queue of received data messages for this service */
+       struct list_head        msg_queue;
+       u64                     msg_queue_size; /* current queue depth */
+
+};
+#define VLDS_SVC_IS_CLIENT(svc) ((svc)->flags & VLDS_REG_CLIENT)
+#define VLDS_SVC_IS_EVENT(svc) ((svc)->flags & VLDS_REG_EVENT)
+
+/* One queued received message; data[] holds the payload inline */
+struct vlds_msg_data {
+       /* link into the vlds_service_info message queue */
+       struct list_head        list;
+
+       size_t                  size;  /* message data size */
+       u8                      data[0]; /* message data */
+};
+/* max messages held per service before vlds_add_msg() returns -ENOSPC */
+#define VLDS_MAX_MSG_LIST_NUM          16
+
+/*
+ * If a process registers an event fd, we create an
+ * event_info to track events for the process.
+ */
+struct vlds_event_info {
+       /* link into the vlds_event_info_list */
+       struct list_head        list;
+
+       /* the thread group id (i.e. pid) to which this event_info belongs */
+       pid_t                   tgid;
+
+       /* fd to signal process of received event - See eventfd(2) */
+       int                     fd;
+
+       /* List of received events */
+       struct list_head        event_list;
+};
+
+/* global list of event_infos, one per registered process */
+struct list_head       vlds_event_info_list;
+static DEFINE_MUTEX(vlds_event_info_list_mutex);
+
+/* One pending event queued for a process */
+struct vlds_event {
+       /* link into the vlds_event_info event_list */
+       struct list_head        list;
+
+       /* service associated with the event */
+       struct vlds_service_info *svc_info;
+
+       /* type of event - reg/unreg/data */
+       u64                     type;
+
+       /* negotiated version (for reg events) */
+       vlds_ver_t              neg_vers;
+};
+
+/*
+ * When holding multiple locks in this driver, locking
+ * MUST be consistently performed in this order:
+ * vlds_data_mutex
+ * vlds_dev->vlds_mutex
+ * vlds_event_info_list_mutex
+ */
+
+/* vlds_event_info_list_mutex must be held */
+static int vlds_add_event_info(pid_t tgid, int fd)
+{
+       struct vlds_event_info *new_info;
+
+       dprintk("called\n");
+
+       /* one event_info per process, keyed by tgid */
+       new_info = kzalloc(sizeof(*new_info), GFP_KERNEL);
+       if (unlikely(new_info == NULL)) {
+               dprintk("failed to allocate event_info\n");
+               return -ENOMEM;
+       }
+
+       new_info->tgid = tgid;
+       new_info->fd = fd;
+       INIT_LIST_HEAD(&new_info->event_list);
+       list_add_tail(&new_info->list, &vlds_event_info_list);
+
+       return 0;
+}
+
+/* vlds_event_info_list_mutex must be held */
+static int vlds_get_event_info(pid_t tgid,
+       struct vlds_event_info **ret_event_info)
+{
+       struct vlds_event_info *ei;
+
+       /* return the event_info registered for this tgid, if any */
+       list_for_each_entry(ei, &vlds_event_info_list, list) {
+               if (ei->tgid == tgid) {
+                       *ret_event_info = ei;
+                       return 0;
+               }
+       }
+
+       return -ENODEV;
+}
+
+/* vlds_event_info_list_mutex must be held */
+static void vlds_remove_event_info(pid_t tgid)
+{
+       struct vlds_event_info *ei;
+       struct vlds_event *ev;
+       struct vlds_event *tmp;
+
+       dprintk("called\n");
+
+       list_for_each_entry(ei, &vlds_event_info_list, list) {
+               if (ei->tgid != tgid)
+                       continue;
+
+               /* drain any events still queued for this process */
+               list_for_each_entry_safe(ev, tmp, &ei->event_list, list) {
+                       list_del(&ev->list);
+                       kfree(ev);
+               }
+
+               list_del(&ei->list);
+               kfree(ei);
+               return;
+       }
+}
+
+/*
+ * Queue an event of the given type for the process (tgid) that owns
+ * svc_info's service and signal the process via its registered
+ * eventfd.  Returns 0 (silently) when the process did not register
+ * for events, since polling is a valid alternative.
+ */
+static int vlds_add_event(pid_t tgid, struct vlds_service_info *svc_info,
+       u64 type, vlds_ver_t *neg_vers)
+{
+       struct vlds_event_info *event_info;
+       struct vlds_event *event;
+       struct task_struct *utask;
+       struct file *efd_file;
+       struct eventfd_ctx *efd_ctx;
+       int efd;
+       int rv;
+
+       mutex_lock(&vlds_event_info_list_mutex);
+
+       event_info = NULL;
+       rv = vlds_get_event_info(tgid, &event_info);
+       if (rv || event_info == NULL) {
+               /*
+                * If we failed to find an event_info, it probably just
+                * means the process did not register for events in favor
+                * of using polling - which is valid.
+                */
+               mutex_unlock(&vlds_event_info_list_mutex);
+               return 0;
+       }
+
+       /*
+        * Fix: capture the eventfd while still holding the list mutex.
+        * event_info may be freed by vlds_remove_event_info() as soon as
+        * the mutex is dropped, so it must not be dereferenced below.
+        */
+       efd = event_info->fd;
+
+       event = kzalloc(sizeof(struct vlds_event), GFP_KERNEL);
+       if (unlikely(event == NULL)) {
+               dprintk("failed to allocate event for "
+                   "service %llx\n", svc_info->handle);
+               mutex_unlock(&vlds_event_info_list_mutex);
+               return -ENOMEM;
+       } else {
+               event->type = type;
+               event->svc_info = svc_info;
+               if (neg_vers != NULL)
+                       event->neg_vers = *neg_vers;
+
+               list_add_tail(&event->list,
+                   &event_info->event_list);
+       }
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       /*
+        * Signal the process that there is an event pending
+        * This is tricky as it requires searching the task's
+        * file table for the entry corresponding to the event fd
+        * to get the event fd context.
+        */
+
+       rcu_read_lock();
+
+       /* Get the task struct */
+       utask = pid_task(find_vpid(tgid), PIDTYPE_PID);
+       if (!utask) {
+               rcu_read_unlock();
+               return -EIO;
+       }
+
+       /* Get the file corresponding to the captured eventfd */
+       efd_file = fcheck_files(utask->files, efd);
+       if (!efd_file) {
+               rcu_read_unlock();
+               return -EIO;
+       }
+
+       /* Get the eventfd context associated with the file */
+       efd_ctx = eventfd_ctx_fileget(efd_file);
+       if (!efd_ctx) {
+               rcu_read_unlock();
+               return -EIO;
+       }
+
+       /* signal the task by incrementing the counter by 1 */
+       eventfd_signal(efd_ctx, 1);
+
+       /* release the eventfd context */
+       eventfd_ctx_put(efd_ctx);
+
+       rcu_read_unlock();
+
+       /* rv is 0 here (lookup succeeded) */
+       return rv;
+
+}
+
+/* Peek at the oldest queued event; NULL when the queue is empty. */
+static struct vlds_event *vlds_get_event(struct vlds_event_info *event_info)
+{
+       if (list_empty(&event_info->event_list))
+               return NULL;
+
+       /* list_first_entry() on a non-empty list is always valid */
+       return list_first_entry(&event_info->event_list,
+           struct vlds_event, list);
+}
+
+/* Unlink an event from its event_info queue and free it. */
+static void vlds_remove_event(struct vlds_event_info *event_info,
+       struct vlds_event *event)
+{
+       if (!event || list_empty(&event_info->event_list))
+               return;
+
+       /* Check here that the event is actually on the list? TBD */
+       list_del(&event->list);
+       kfree(event);
+}
+
+/*
+ * Purge every queued event (for any process) that references the
+ * given service - called before the service_info is freed so no
+ * queued event is left holding a dangling svc_info pointer.
+ */
+static void vlds_remove_svc_events(struct vlds_service_info *svc_info)
+{
+       struct vlds_event_info *event_info;
+       struct vlds_event *event;
+       struct vlds_event *next;
+
+       mutex_lock(&vlds_event_info_list_mutex);
+
+       /* scan every process's event queue for matching events */
+       list_for_each_entry(event_info, &vlds_event_info_list, list) {
+
+               list_for_each_entry_safe(event, next, &event_info->event_list,
+                   list) {
+                       if (event->svc_info == svc_info) {
+                               list_del(&event->list);
+                               kfree(event);
+                       }
+               }
+       }
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+}
+
+/*
+ * Look up a service on this vlds device by name and client/provider
+ * role.  Returns NULL when not found.  Callers in this file hold
+ * vlds->vlds_mutex around the call.
+ */
+static struct vlds_service_info *vlds_get_svc_info(struct vlds_dev *vlds,
+       char *svc_str, bool is_client)
+{
+       struct vlds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &vlds->service_info_list, list) {
+               if (!strncmp(svc_info->name, svc_str, VLDS_MAX_NAMELEN) &&
+                   VLDS_SVC_IS_CLIENT(svc_info) == is_client) {
+                       return svc_info;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up a service on this vlds device by its DS handle.
+ * Returns NULL when not found.  Callers in this file hold
+ * vlds->vlds_mutex around the call.
+ */
+static struct vlds_service_info *vlds_get_svc_info_hdl(struct vlds_dev *vlds,
+       u64 hdl)
+{
+       struct vlds_service_info *svc_info;
+
+       list_for_each_entry(svc_info, &vlds->service_info_list, list) {
+               if (svc_info->handle == hdl)
+                       return svc_info;
+       }
+
+       return NULL;
+}
+
+/*
+ * Add a received message to a service's message queue.
+ * Returns -ENOSPC when the queue is full, -EFBIG when the message
+ * exceeds VLDS_MAX_MSG_SIZE, -EINVAL for zero-length messages and
+ * -ENOMEM on allocation failure.
+ */
+static int vlds_add_msg(struct vlds_service_info *svc_info, void *buf,
+       size_t buflen)
+{
+       struct vlds_msg_data *msg_data;
+
+       /*
+        * check if we've reached the max num of queued messages
+        * (fix: was '>', which allowed one message beyond the
+        * VLDS_MAX_MSG_LIST_NUM limit)
+        */
+       if (svc_info->msg_queue_size >= VLDS_MAX_MSG_LIST_NUM)
+               return -ENOSPC;
+
+       /* make sure the message size isn't too large */
+       if (buflen > VLDS_MAX_MSG_SIZE)
+               return -EFBIG;
+
+       /* we don't allow enqueing zero length messages */
+       if (buflen == 0)
+               return -EINVAL;
+
+       /* allocate/copy a buffer for the message */
+       msg_data = kzalloc(sizeof(struct vlds_msg_data) + buflen, GFP_KERNEL);
+       if (unlikely(msg_data == NULL))
+               return -ENOMEM;
+
+       /* copy the message/size */
+       memcpy(msg_data->data, buf, buflen);
+       msg_data->size = buflen;
+
+       /* add it to the queue */
+       list_add_tail(&msg_data->list, &svc_info->msg_queue);
+
+       svc_info->msg_queue_size++;
+
+       return 0;
+}
+
+/*
+ * Get a message (data and size) from a service message queue.
+ * NOTE: the message remains on the queue.
+ */
+static struct vlds_msg_data *vlds_get_msg(struct vlds_service_info *svc_info)
+{
+       struct vlds_msg_data *msg_data;
+
+       if (list_empty(&svc_info->msg_queue)) {
+               /*
+                * TBD: Block instead of return here
+                * (unless NONBLOCK flag specified).
+                */
+               return NULL;
+       }
+
+       /* oldest message first (FIFO order) */
+       msg_data = list_first_entry(&svc_info->msg_queue, struct vlds_msg_data,
+           list);
+
+       BUG_ON(msg_data == NULL);
+
+       return msg_data;
+}
+
+/* Dequeue a message from a service message queue and free it. */
+static void vlds_dequeue_msg(struct vlds_service_info *svc_info,
+       struct vlds_msg_data *msg_data)
+{
+       if (msg_data == NULL || list_empty(&svc_info->msg_queue))
+               return;
+
+       /* Check here that the message is actually on the queue? TBD */
+
+       list_del(&msg_data->list);
+
+       kfree(msg_data);
+
+       /* keep the queue depth accounting in sync */
+       svc_info->msg_queue_size--;
+}
+
+/* Drain and free every queued message on the service. */
+static void vlds_free_msg_queue(struct vlds_service_info *svc_info)
+{
+       struct vlds_msg_data *cur;
+       struct vlds_msg_data *tmp;
+
+       list_for_each_entry_safe(cur, tmp, &svc_info->msg_queue, list) {
+               list_del(&cur->list);
+               kfree(cur);
+               svc_info->msg_queue_size--;
+       }
+}
+
+/*
+ * Callback ops
+ */
+/*
+ * DS register callback: called when version negotiation for a service
+ * completes.  Records the negotiated version, marks the service
+ * connected and, if the service registered for events, queues a REG
+ * event for the owning process.  arg is the owning vlds_dev.
+ */
+static void
+vlds_ds_reg_cb(ds_cb_arg_t arg, ds_svc_hdl_t hdl, ds_ver_t *ver)
+{
+       struct vlds_dev *vlds;
+       struct vlds_service_info *svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       vlds = (struct vlds_dev *)arg;
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, hdl);
+       if (svc_info == NULL) {
+               dprintk("%s: received invalid handle (%llx)\n",
+                   vlds->int_name, hdl);
+               mutex_unlock(&vlds->vlds_mutex);
+               return;
+       }
+
+       /* record the negotiated version and mark connected */
+       svc_info->neg_vers.vlds_major = (u16)ver->major;
+       svc_info->neg_vers.vlds_minor = (u16)ver->minor;
+       svc_info->state = VLDS_HDL_STATE_CONNECTED;
+
+       /*
+        * if the service requires events,
+        * add an event to the process's event_info queue
+        */
+       if (VLDS_SVC_IS_EVENT(svc_info)) {
+               rv = vlds_add_event(svc_info->tgid, svc_info,
+                   VLDS_EVENT_TYPE_REG, &svc_info->neg_vers);
+               if (rv) {
+                       /* just give an error if we failed to add the event */
+                       pr_err("%s: failed to create registration event "
+                           "(%llx)\n", vlds->int_name, hdl);
+               }
+       }
+
+       dprintk("%s: service %s registered version (%u.%u) hdl=%llx\n",
+           vlds->int_name, svc_info->name, svc_info->neg_vers.vlds_major,
+           svc_info->neg_vers.vlds_minor, hdl);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+}
+
+/*
+ * DS unregister callback: called when a service loses its DS
+ * connection.  Clears the negotiated version, marks the service
+ * disconnected and, if the service registered for events, queues an
+ * UNREG event for the owning process.  arg is the owning vlds_dev.
+ */
+static void
+vlds_ds_unreg_cb(ds_cb_arg_t arg, ds_svc_hdl_t hdl)
+{
+       struct vlds_dev *vlds;
+       struct vlds_service_info *svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       vlds = (struct vlds_dev *)arg;
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, hdl);
+       if (svc_info == NULL) {
+               /* fix: typo "recevied" in the diagnostic message */
+               dprintk("%s: received invalid handle (%llx)\n",
+                   vlds->int_name, hdl);
+               mutex_unlock(&vlds->vlds_mutex);
+               return;
+       }
+
+       svc_info->neg_vers.vlds_major = 0;
+       svc_info->neg_vers.vlds_minor = 0;
+       svc_info->state = VLDS_HDL_STATE_DISCONNECTED;
+
+       /*
+        * if the service requires events,
+        * add an event to the process's event_info queue
+        */
+       if (VLDS_SVC_IS_EVENT(svc_info)) {
+               rv = vlds_add_event(svc_info->tgid, svc_info,
+                   VLDS_EVENT_TYPE_UNREG, NULL);
+               if (rv) {
+                       /* just give an error if we failed to add the event */
+                       pr_err("%s: failed to create unregistration event "
+                           "(%llx)\n", vlds->int_name, hdl);
+               }
+       }
+
+       dprintk("%s: service %s unregistered hdl=%llx\n",
+           vlds->int_name, svc_info->name, hdl);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+}
+
+/*
+ * DS data callback: called with one complete received message for the
+ * service identified by hdl.  Queues the message on the service and,
+ * if the service registered for events, queues a DATA event for the
+ * owning process.  arg is the owning vlds_dev.
+ */
+static void
+vlds_ds_data_cb(ds_cb_arg_t arg, ds_svc_hdl_t hdl, void *buf, size_t buflen)
+{
+       struct vlds_dev *vlds;
+       struct vlds_service_info *svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       vlds = (struct vlds_dev *)arg;
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, hdl);
+       if (svc_info == NULL) {
+               /* fix: typo "recevied" in the diagnostic message */
+               dprintk("%s: received invalid handle (%llx)\n",
+                   vlds->int_name, hdl);
+               mutex_unlock(&vlds->vlds_mutex);
+               return;
+       }
+
+       /* received data is assumed to be 1 complete message */
+       rv = vlds_add_msg(svc_info, buf, buflen);
+       if (rv) {
+               if (rv == -ENOSPC)
+                       dprintk("%s: service %s: message queue overflow!\n",
+                           vlds->int_name, svc_info->name);
+               else if (rv == -EFBIG)
+                       /* fix: %zu is the correct specifier for size_t */
+                       dprintk("%s: service %s: message too large "
+                           "(%zu bytes)!\n", vlds->int_name, svc_info->name,
+                           buflen);
+               else
+                       dprintk("%s: service %s: failed to add message "
+                           "(err = %d)!\n", vlds->int_name,
+                           svc_info->name, rv);
+
+               mutex_unlock(&vlds->vlds_mutex);
+
+               return;
+       }
+
+       /*
+        * if the service requires events,
+        * add an event to the process's event_info queue
+        */
+       if (VLDS_SVC_IS_EVENT(svc_info)) {
+               rv = vlds_add_event(svc_info->tgid, svc_info,
+                   VLDS_EVENT_TYPE_DATA, NULL);
+               if (rv) {
+                       /* just give an error if we failed to add the event */
+                       pr_err("%s: failed to create data event (%llx)\n",
+                           vlds->int_name, hdl);
+               }
+       }
+
+       dprintk("%s: %s service: Received %zu bytes hdl=%llx\n",
+           vlds->int_name, svc_info->name, buflen, hdl);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+}
+
+/* DS callback vector passed to ds_cap_init() for every service */
+static ds_ops_t vlds_ds_ops = {
+       vlds_ds_reg_cb,         /* register */
+       vlds_ds_unreg_cb,       /* unregister */
+       vlds_ds_data_cb,        /* data */
+       NULL                    /* optional arg to ops (set in vlds_svc_reg) */
+};
+
+/*
+ * Register a DS service (provider or client) for the calling process
+ * on this vlds device: copy the registration args and capability from
+ * userland, register with DS via ds_cap_init(), return the DS handle
+ * to the caller and record a vlds_service_info on the device.
+ * Returns 0 on success or a negative errno.
+ */
+static int vlds_svc_reg(struct vlds_dev *vlds, const void __user *uarg)
+{
+
+       vlds_svc_reg_arg_t svc_reg;
+       vlds_cap_t cap;
+       char *svc_str;
+       bool is_client_reg;
+       ds_capability_t dscap;
+       u32 flags;
+       ds_svc_hdl_t ds_hdl;
+       int rv;
+       struct vlds_service_info *svc_info;
+
+       dprintk("entered.\n");
+
+       svc_str = NULL;
+       svc_info = NULL;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&svc_reg, uarg,
+           sizeof(vlds_svc_reg_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate svc_reg.vlds_hdlp is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)svc_reg.vlds_hdlp,
+           sizeof(u64))) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       if (copy_from_user(&cap, (const void __user *)svc_reg.vlds_capp,
+           sizeof(vlds_cap_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* make sure the service strlen is sane */
+       if (cap.vlds_service.vlds_strlen == 0 ||
+           cap.vlds_service.vlds_strlen > VLDS_MAX_NAMELEN) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       /* get the service string from userland */
+       svc_str = kzalloc(cap.vlds_service.vlds_strlen + 1, GFP_KERNEL);
+       if (unlikely(svc_str == NULL)) {
+               rv = -ENOMEM;
+               goto error_out1;
+       }
+
+       if (copy_from_user(svc_str,
+           (const void __user *)cap.vlds_service.vlds_strp,
+           cap.vlds_service.vlds_strlen) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       is_client_reg = (svc_reg.vlds_reg_flags & VLDS_REG_CLIENT);
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       /* Check if the service is already being used */
+       svc_info = vlds_get_svc_info(vlds, svc_str, is_client_reg);
+       if (svc_info != NULL) {
+               /* This service is already in use */
+               rv = -EBUSY;
+               /* do not free the existing service on the error path */
+               svc_info = NULL;
+               goto error_out2;
+       }
+
+       /* init the ds capability structure */
+       dscap.svc_id = svc_str;
+       dscap.vers.major = (u64)cap.vlds_vers.vlds_major;
+       dscap.vers.minor = (u64)cap.vlds_vers.vlds_minor;
+
+       /*
+        * The vlds_dev is passed back as the arg to the callbacks.
+        * NOTE(review): vlds_ds_ops is a single global, so cb_arg is
+        * overwritten by every registration - verify ds_cap_init()
+        * copies the ops before concurrent registrations can race.
+        */
+       vlds_ds_ops.cb_arg = (void *)vlds;
+
+       flags = 0x0;
+       if (is_client_reg)
+               flags |= DS_CAP_IS_CLIENT;
+       else
+               flags |= DS_CAP_IS_PROVIDER;
+
+       if (vlds != sp_vlds)
+               flags |= DS_TARGET_IS_DOMAIN;
+
+       ds_hdl = 0;
+       rv = ds_cap_init(&dscap, &vlds_ds_ops, flags, vlds->domain_handle,
+           &ds_hdl);
+       if (rv || ds_hdl == 0) {
+               dprintk("%s: ds_cap_init failed for %s service\n",
+                   vlds->int_name, svc_str);
+               goto error_out2;
+       }
+
+       /* return the DS handle to the caller */
+       if (copy_to_user((void __user *)(svc_reg.vlds_hdlp), (u64 *)&ds_hdl,
+           sizeof(u64)) != 0) {
+               (void) ds_cap_fini(ds_hdl);
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       /* create a service info for the new service */
+       svc_info = kzalloc(sizeof(struct vlds_service_info), GFP_KERNEL);
+       if (unlikely(svc_info == NULL)) {
+               /* fix: original checked svc_str here instead of svc_info */
+               (void) ds_cap_fini(ds_hdl);
+               rv = -ENOMEM;
+               goto error_out2;
+       }
+
+       /* svc_str ownership transfers to svc_info from here on */
+       svc_info->name = svc_str;
+       svc_info->state = VLDS_HDL_STATE_NOT_YET_CONNECTED;
+       svc_info->flags = svc_reg.vlds_reg_flags;
+       svc_info->tgid = task_tgid_vnr(current);
+       svc_info->handle = (u64)ds_hdl;
+       INIT_LIST_HEAD(&svc_info->msg_queue);
+       svc_info->msg_queue_size = 0;
+
+       /* add the service_info to the vlds device */
+       list_add_tail(&svc_info->list, &vlds->service_info_list);
+
+       dprintk("%s: registered %s service (client = %llu) "
+           "(hdl = %llx) (tgid = %u) with ds\n", vlds->int_name, svc_str,
+           VLDS_SVC_IS_CLIENT(svc_info), svc_info->handle, svc_info->tgid);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to register service rv = %d\n", vlds->int_name, rv);
+
+       /* kfree(NULL) is a no-op, so no guards are needed */
+       kfree(svc_info);
+
+       kfree(svc_str);
+
+       return rv;
+}
+
+/*
+ * VLDS_IOCTL_UNREG_HDL handler: unregister a service handle.
+ *
+ * Looks up the service by the handle passed from userland, finalizes
+ * the underlying ds capability, unlinks the service from this device,
+ * drops any events/messages still queued against it and frees it.
+ *
+ * Returns 0 on success, -EFAULT on bad user args, -ENODEV for an
+ * unknown handle, or the (nonzero) ds_cap_fini() result.
+ */
+static int vlds_unreg_hdl(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_unreg_hdl_arg_t unreg;
+       struct vlds_service_info *svc_info;
+       int rv;
+
+       dprintk("entered.\n");
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&unreg, uarg,
+           sizeof(vlds_unreg_hdl_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, unreg.vlds_hdl);
+       if (svc_info == NULL) {
+               rv = -ENODEV;
+               goto error_out2;
+       }
+
+       /* unregister the service from ds */
+       rv = ds_cap_fini(unreg.vlds_hdl);
+       if (rv) {
+               /* fix: message previously lacked its '\n' terminator */
+               dprintk("%s: ds_cap_fini failed for %s service\n",
+                   vlds->int_name, svc_info->name);
+               goto error_out2;
+       }
+
+       dprintk("%s: unregistered %s service (client = %llu) "
+           "(hdl = %llx) with ds\n", vlds->int_name, svc_info->name,
+           VLDS_SVC_IS_CLIENT(svc_info), unreg.vlds_hdl);
+
+       list_del(&svc_info->list);
+
+       /* remove any events referencing this svc_info */
+       vlds_remove_svc_events(svc_info);
+
+       kfree(svc_info->name);
+       vlds_free_msg_queue(svc_info);
+       kfree(svc_info);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to unregister service rv = %d\n",
+           vlds->int_name, rv);
+
+       return rv;
+}
+
+/*
+ * VLDS_IOCTL_HDL_LOOKUP handler: translate a service name (and
+ * client/provider role) into its registered service handle.
+ *
+ * Only a single returned handle is supported (vlds_maxhdls must be 1).
+ * Copies the handle and a handle count of 1 back to userland.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL for bad sizes,
+ * -EFAULT on bad user memory, -ENODEV if no such service, -ENOMEM).
+ */
+static int vlds_hdl_lookup(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_hdl_lookup_arg_t hdl_lookup;
+       struct vlds_service_info *svc_info;
+       char *svc_str;
+       u64 num_hdls;
+       int rv;
+
+       dprintk("entered.\n");
+
+       svc_str = NULL;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&hdl_lookup, uarg,
+           sizeof(vlds_hdl_lookup_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* we only support 1 return handle */
+       if (hdl_lookup.vlds_maxhdls != 1) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       /* make sure the service strlen is sane */
+       if (hdl_lookup.vlds_service.vlds_strlen == 0 ||
+           hdl_lookup.vlds_service.vlds_strlen > VLDS_MAX_NAMELEN) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       /* get the (NUL-terminated, via kzalloc) service string from user */
+       svc_str = kzalloc(hdl_lookup.vlds_service.vlds_strlen + 1, GFP_KERNEL);
+       if (unlikely(svc_str == NULL)) {
+               rv = -ENOMEM;
+               goto error_out1;
+       }
+
+       if (copy_from_user(svc_str,
+           (const void __user *)hdl_lookup.vlds_service.vlds_strp,
+           hdl_lookup.vlds_service.vlds_strlen) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info(vlds, svc_str, hdl_lookup.vlds_isclient);
+       if (svc_info == NULL) {
+               rv = -ENODEV;
+               goto error_out2;
+       }
+
+       if (copy_to_user((void __user *)(hdl_lookup.vlds_hdlsp),
+           &svc_info->handle, sizeof(u64)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       num_hdls = 1;
+       if (put_user(num_hdls, (u64 __user *)(hdl_lookup.vlds_nhdlsp)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       dprintk("%s: handle lookup for  %s service (client = %llu) "
+           "returned (hdl = %llx)\n", vlds->int_name, svc_str,
+           hdl_lookup.vlds_isclient, svc_info->handle);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       /* fix: svc_str was leaked on the success path */
+       kfree(svc_str);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to lookup handle rv = %d\n", vlds->int_name, rv);
+
+       /* kfree(NULL) is a no-op; no guard needed */
+       kfree(svc_str);
+
+       return rv;
+
+}
+
+/*
+ * VLDS_IOCTL_DMN_LOOKUP handler: return this device's domain handle
+ * and NUL-terminated internal domain name to userland.  The caller's
+ * name buffer must be large enough for the full name or -EINVAL is
+ * returned.
+ */
+static int vlds_dmn_lookup(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_dmn_lookup_arg_t dmn_lookup;
+       size_t name_bytes;
+       int rv;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&dmn_lookup, uarg,
+           sizeof(vlds_dmn_lookup_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* name is copied out including its NUL terminator */
+       name_bytes = strlen(vlds->int_name) + 1;
+
+       /* make sure the string buffer size is sane */
+       if (dmn_lookup.vlds_dname.vlds_strlen < name_bytes) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       if (put_user(vlds->domain_handle,
+           (u64 __user *)(dmn_lookup.vlds_dhdlp)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       if (copy_to_user((void __user *)(dmn_lookup.vlds_dname.vlds_strp),
+           vlds->int_name, name_bytes) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       return 0;
+
+error_out1:
+
+       dprintk("%s: failed to lookup domain info. rv = %d\n",
+           vlds->int_name, rv);
+
+       return rv;
+}
+
+/*
+ * VLDS_IOCTL_HDL_GET_STATE handler: report a service handle's
+ * connection state back to userland via hdl_get_state.vlds_statep.
+ * When the service is in the CONNECTED state the negotiated protocol
+ * version is included; otherwise the version fields stay zeroed.
+ *
+ * Returns 0 on success, -EFAULT on bad user memory, -ENODEV for an
+ * unknown handle.
+ */
+static int vlds_hdl_get_state(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_hdl_get_state_arg_t hdl_get_state;
+       struct vlds_service_info *svc_info;
+       vlds_hdl_state_t hdl_state;
+       int rv;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&hdl_get_state, uarg,
+           sizeof(vlds_hdl_get_state_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, hdl_get_state.vlds_hdl);
+       if (svc_info == NULL) {
+               rv = -ENODEV;
+               goto error_out2;
+       }
+
+       /* zeroed so the version fields are defined when not connected */
+       memset(&hdl_state, 0, sizeof(hdl_state));
+       hdl_state.state = svc_info->state;
+       /* if the state is connected, return the negotiated version */
+       if (svc_info->state == VLDS_HDL_STATE_CONNECTED) {
+               hdl_state.vlds_vers.vlds_major = svc_info->neg_vers.vlds_major;
+               hdl_state.vlds_vers.vlds_minor = svc_info->neg_vers.vlds_minor;
+       }
+
+       if (copy_to_user((void __user *)(hdl_get_state.vlds_statep),
+           &hdl_state, sizeof(vlds_hdl_state_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to get handle state rv = %d\n", vlds->int_name, rv);
+
+       return rv;
+
+}
+
+/*
+ * VLDS_IOCTL_SEND_MSG handler: copy a message from userland and send
+ * it over the service's ds capability via ds_cap_send().
+ *
+ * The service must be in the CONNECTED state (-EIO otherwise) and the
+ * message size must be in (0, VLDS_MAX_SENDBUF_LEN] (-EINVAL).
+ * Returns 0 on success or a negative errno.
+ */
+static int vlds_send_msg(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_send_msg_arg_t send_msg;
+       struct vlds_service_info *svc_info;
+       u8 *send_buf;
+       int rv;
+
+       dprintk("entered.\n");
+
+       send_buf = NULL;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&send_msg, uarg,
+           sizeof(vlds_send_msg_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       if (send_msg.vlds_buflen == 0 ||
+           send_msg.vlds_buflen > VLDS_MAX_SENDBUF_LEN) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, send_msg.vlds_hdl);
+       if (svc_info == NULL) {
+               rv = -ENODEV;
+               goto error_out2;
+       }
+
+       /* make sure we are in connected state before sending the data */
+       if (svc_info->state != VLDS_HDL_STATE_CONNECTED) {
+               rv = -EIO;
+               goto error_out2;
+       }
+
+       /* kernel bounce buffer for the userland payload */
+       send_buf = kzalloc(send_msg.vlds_buflen, GFP_KERNEL);
+       if (unlikely(send_buf == NULL)) {
+               rv = -ENOMEM;
+               goto error_out2;
+       }
+
+       if (copy_from_user(send_buf, (const void __user *)send_msg.vlds_bufp,
+           send_msg.vlds_buflen) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       rv = ds_cap_send(send_msg.vlds_hdl, send_buf, send_msg.vlds_buflen);
+       if (rv) {
+
+               /*
+                * TBD: If rv == -EAGAIN, block here trying again in loop
+                * (unless NONBLOCK flag specified).
+                */
+               dprintk("%s: ds_cap_send failed for %s service (rv=%d)\n",
+                   vlds->int_name, svc_info->name, rv);
+               goto error_out2;
+       }
+
+       /* success path frees here; failure paths free at error_out1 */
+       kfree(send_buf);
+
+       dprintk("%s: send msg hdl = %llx (buflen=%llu) SUCCESS\n",
+           vlds->int_name, send_msg.vlds_hdl, send_msg.vlds_buflen);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to send msg rv = %d\n", vlds->int_name, rv);
+
+       if (send_buf != NULL)
+               kfree(send_buf);
+
+       return rv;
+
+}
+
+/*
+ * VLDS_IOCTL_RECV_MSG handler: copy the next queued message for the
+ * service out to userland and dequeue it.
+ *
+ * The message length is always written to vlds_msglenp.  A buflen of
+ * 0 is a pure poll (see comment below).  Messages are never
+ * truncated: if the supplied buffer is smaller than the next message,
+ * -EFBIG is returned and the message stays queued.  The message is
+ * dequeued only after the copy to userland succeeds.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vlds_recv_msg(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_recv_msg_arg_t recv_msg;
+       struct vlds_service_info *svc_info;
+       u8 *msg;
+       size_t msglen;
+       int rv;
+       struct vlds_msg_data *msg_data;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&recv_msg, uarg,
+           sizeof(vlds_recv_msg_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       if (recv_msg.vlds_buflen > VLDS_MAX_SENDBUF_LEN) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       svc_info = vlds_get_svc_info_hdl(vlds, recv_msg.vlds_hdl);
+       if (svc_info == NULL) {
+               rv = -ENODEV;
+               goto error_out2;
+       }
+
+       /* peek at the head of the queue; an empty queue is not an error */
+       msg_data =  vlds_get_msg(svc_info);
+       if (msg_data == NULL) {
+               msg = NULL;
+               msglen = 0;
+       } else {
+               msg = msg_data->data;
+               msglen = msg_data->size;
+       }
+
+       if (put_user(msglen, (u64 __user *)(recv_msg.vlds_msglenp)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       /*
+        * Special handling for a buflen of 0: if buflen is 0, we return
+        * the number of bytes for the next message in the queue.
+        *
+        * This is a mechanism for the caller to use to poll the queue
+        * to detect if a msg is ready to be received and to get the
+        * size of the next message so the appropriate sized buffer can
+        * be allocated to receive the msg.
+        */
+       if (recv_msg.vlds_buflen == 0) {
+
+               if (msglen > 0)
+                       dprintk("%s: service %s: buflen==0 poll "
+                           "returned %zu bytes\n",
+                           vlds->int_name, svc_info->name, msglen);
+
+               mutex_unlock(&vlds->vlds_mutex);
+
+               return 0;
+       }
+
+       /*
+        * We do not return truncated data. Return EFBIG error if
+        * supplied buffer is too small to hold the next message.
+        */
+       if (msglen > 0 && recv_msg.vlds_buflen < msglen) {
+               dprintk("%s: service %s: recv buffer too small for "
+                   "next message (supplied buffer = %llu bytes, "
+                   "next message = %lu bytes)\n",
+                   vlds->int_name, svc_info->name, recv_msg.vlds_buflen,
+                   msglen);
+
+               rv = -EFBIG;
+               goto error_out2;
+       }
+
+       if (msglen > 0) {
+
+               if (copy_to_user((void __user *)(recv_msg.vlds_bufp),
+                   msg, msglen) != 0) {
+                       rv = -EFAULT;
+                       goto error_out2;
+               }
+
+               /*
+                * We successfully copied the data to user,
+                * so dequeue the message
+                */
+               vlds_dequeue_msg(svc_info, msg_data);
+
+               dprintk("%s: recv msg hdl = %llx (len=%lu) SUCCESS\n",
+                   vlds->int_name, recv_msg.vlds_hdl, msglen);
+       }
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       dprintk("%s: failed to recv msg rv = %d\n",
+           vlds->int_name, rv);
+
+       return rv;
+}
+
+/*
+ * VLDS_IOCTL_SET_EVENT_FD handler: associate an event file descriptor
+ * with the calling process (keyed by tgid) for event notification.
+ * Any fd previously registered by the same process is replaced.
+ *
+ * Returns 0 on success, -EFAULT on bad user args, or the
+ * vlds_add_event_info() result.
+ */
+static int vlds_set_event_fd(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_set_event_fd_arg_t set_event_fd;
+       int rv;
+       pid_t tgid;
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&set_event_fd, uarg,
+           sizeof(vlds_set_event_fd_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       tgid = task_tgid_vnr(current);
+
+       mutex_lock(&vlds_event_info_list_mutex);
+
+       /*
+        * If there is already an event fd
+        * registered for this process, remove it.
+        */
+       vlds_remove_event_info(tgid);
+
+       rv = vlds_add_event_info(tgid, set_event_fd.fd);
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       if (rv)
+               goto error_out1;
+
+       dprintk("%s: vlds_set_event_fd: SUCCESS\n", vlds->int_name);
+
+       return 0;
+
+
+error_out1:
+
+       dprintk("%s: failed to set event fd: rv = %d\n",
+           vlds->int_name, rv);
+
+       return rv;
+}
+
+/*
+ * VLDS_IOCTL_UNSET_EVENT_FD handler: drop any event fd registration
+ * held by the calling process (keyed by tgid).  Always succeeds.
+ */
+static int vlds_unset_event_fd(struct vlds_dev *vlds, const void __user *uarg)
+{
+       pid_t tgid = task_tgid_vnr(current);
+
+       mutex_lock(&vlds_event_info_list_mutex);
+       vlds_remove_event_info(tgid);
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       dprintk("%s: vlds_unset_event_fd: SUCCESS\n", vlds->int_name);
+
+       return 0;
+
+}
+
+/*
+ * VLDS_IOCTL_GET_NEXT_EVENT handler: dequeue and return the next
+ * pending event for the calling process (keyed by tgid).
+ *
+ * All destination user pointers are validated up front so a partial
+ * copy-out cannot happen half way through.  The service handle and
+ * event type are always populated; for REG events the negotiated
+ * version is also copied out, and for DATA events the next queued
+ * message for the service is copied into the caller's buffer (never
+ * truncated: -EFBIG if the buffer is too small).  The event is
+ * removed from the list only after every copy-out succeeded, so a
+ * failed call leaves the event retrievable.
+ *
+ * Returns 0 on success, -ENOENT when no events are pending, -EIO if
+ * the process has not registered an event fd, or another negative
+ * errno on failure.
+ */
+static int vlds_get_next_event(struct vlds_dev *vlds, const void __user *uarg)
+{
+       vlds_get_next_event_arg_t next_event;
+       struct vlds_event_info *event_info;
+       struct vlds_event *event;
+       struct vlds_msg_data *msg_data;
+       u8 *msg;
+       size_t msglen;
+       int rv;
+
+       dprintk("called\n");
+
+       /* Get (and validate) userland args */
+       if (uarg == NULL || copy_from_user(&next_event, uarg,
+           sizeof(vlds_get_next_event_arg_t)) != 0) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate next_event.vlds_hdlp is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)next_event.vlds_hdlp,
+           sizeof(u64))) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate next_event.vlds_event_typep is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)next_event.vlds_event_typep,
+           sizeof(u64))) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate next_event.neg_versp is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)next_event.neg_versp,
+           sizeof(u64))) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate next_event.vlds_buflen is valid */
+       if (next_event.vlds_buflen == 0 ||
+           next_event.vlds_buflen > VLDS_MAX_SENDBUF_LEN) {
+               rv = -EINVAL;
+               goto error_out1;
+       }
+
+       /* Validate next_event.vlds_bufp is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)next_event.vlds_bufp,
+           next_event.vlds_buflen)) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* Validate next_event.vlds_msglenp is present/accessible */
+       if (!access_ok(VERIFY_WRITE, (void __user *)next_event.vlds_msglenp,
+           sizeof(u64))) {
+               rv = -EFAULT;
+               goto error_out1;
+       }
+
+       /* user arg is valid, get the next event */
+
+       /* lock order: device mutex first, then the event-info list */
+       mutex_lock(&vlds->vlds_mutex);
+
+       mutex_lock(&vlds_event_info_list_mutex);
+
+
+       event_info = NULL;
+       rv = vlds_get_event_info(task_tgid_vnr(current), &event_info);
+       if (rv || event_info == NULL) {
+               /*
+                * Process didn't register an event fd!
+                * This is required to start receiving events.
+                */
+               rv = -EIO;
+               goto error_out2;
+       }
+
+       event = vlds_get_event(event_info);
+       if (event == NULL) {
+               /*
+                * No events left outstanding. Return -ENOENT (-2)
+                * to indicate no more events to process.
+                */
+               rv = -ENOENT;
+               goto error_out2;
+       }
+
+       /* populate the return event handle */
+       if (put_user(event->svc_info->handle,
+           (u64 __user *)(next_event.vlds_hdlp)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       /* populate the return event type */
+       if (put_user(event->type, (u64 __user *)(next_event.vlds_event_typep)) != 0) {
+               rv = -EFAULT;
+               goto error_out2;
+       }
+
+       /* if it's a reg type event, populate the negotiated version */
+       if (event->type == VLDS_EVENT_TYPE_REG) {
+               if (copy_to_user((void __user *)(next_event.neg_versp),
+                   &event->neg_vers, sizeof(vlds_ver_t)) != 0) {
+                       rv = -EFAULT;
+                       goto error_out2;
+               }
+       }
+
+       /*
+        * if it's a data type event, populate the data buffer
+        * with next message from the service
+        */
+       if (event->type == VLDS_EVENT_TYPE_DATA) {
+               msg_data =  vlds_get_msg(event->svc_info);
+               if (msg_data == NULL || msg_data->size == 0) {
+                       rv = -EIO;
+                       goto error_out2;
+               }
+
+               msg = msg_data->data;
+               msglen = msg_data->size;
+
+               if (next_event.vlds_buflen < msglen) {
+                       dprintk("%s: service %s: recv buffer too small for "
+                           "next message (supplied buffer = %llu bytes, "
+                           "next message = %lu bytes)\n",
+                           vlds->int_name, event->svc_info->name,
+                           next_event.vlds_buflen, msglen);
+
+                       rv = -EFBIG;
+                       goto error_out2;
+               }
+
+               if (put_user(msglen, (u64 __user *)(next_event.vlds_msglenp))
+                   != 0) {
+                       rv = -EFAULT;
+                       goto error_out2;
+               }
+
+               if (copy_to_user((void __user *)(next_event.vlds_bufp),
+                   msg, msglen) != 0) {
+                       rv = -EFAULT;
+                       goto error_out2;
+               }
+
+               /* we copied the data to user, so dequeue the message */
+               vlds_dequeue_msg(event->svc_info, msg_data);
+       }
+
+       /* We successfully transferred the event, remove it from the list */
+       vlds_remove_event(event_info, event);
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+       return 0;
+
+error_out2:
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+error_out1:
+
+       /* -ENOENT is the normal "queue drained" case; don't log it */
+       if (rv != -ENOENT)
+               dprintk("%s: failed to get next event: rv = %d\n",
+                   vlds->int_name, rv);
+
+       return rv;
+}
+
+/*
+ * open() entry point.  Every open is allowed; we only recover the
+ * owning vlds_dev from the character device and stash it in
+ * filp->private_data for the other fops.
+ */
+static int vlds_fops_open(struct inode *inode, struct file *filp)
+{
+       dprintk("entered.\n");
+
+       filp->private_data = container_of(inode->i_cdev,
+           struct vlds_dev, cdev);
+
+       return 0;
+}
+
+/*
+ * Unregister and free every service registered on this vlds device:
+ * finalize each ds capability, unlink the service, drop its queued
+ * events and messages, and free its state.  Used when the device
+ * itself is being torn down (see vlds_free_vlds_dev()).
+ */
+static void vlds_unreg_all(struct vlds_dev *vlds)
+{
+
+       struct vlds_service_info *svc_info;
+       struct vlds_service_info *next;
+
+       if (vlds == NULL)
+               return;
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       /* _safe variant: entries are deleted while walking the list */
+       list_for_each_entry_safe(svc_info, next, &vlds->service_info_list,
+           list) {
+
+               (void) ds_cap_fini(svc_info->handle);
+
+               dprintk("%s: unregistered %s service (client = %llu) "
+                   "(hdl = %llx) with ds\n", vlds->int_name,
+                   svc_info->name, VLDS_SVC_IS_CLIENT(svc_info),
+                   svc_info->handle);
+
+               list_del(&svc_info->list);
+               vlds_remove_svc_events(svc_info);
+               kfree(svc_info->name);
+               vlds_free_msg_queue(svc_info);
+               kfree(svc_info);
+
+       }
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+}
+
+/*
+ * Unregister and free every service on this vlds device that was
+ * registered by the given process (tgid).  Called from
+ * vlds_fops_release() when a process closes the device.
+ */
+static void vlds_unreg_all_tgid(struct vlds_dev *vlds, pid_t tgid)
+{
+
+       struct vlds_service_info *svc_info;
+       struct vlds_service_info *next;
+
+       mutex_lock(&vlds->vlds_mutex);
+
+       list_for_each_entry_safe(svc_info, next, &vlds->service_info_list,
+           list) {
+
+               if (svc_info->tgid == tgid) {
+
+                       (void) ds_cap_fini(svc_info->handle);
+
+                       dprintk("%s: unregistered %s service "
+                           "(client = %llu) (hdl = %llx) with ds\n",
+                           vlds->int_name, svc_info->name,
+                           VLDS_SVC_IS_CLIENT(svc_info), svc_info->handle);
+
+                       list_del(&svc_info->list);
+
+                       /*
+                        * fix: also drop any events still referencing this
+                        * svc_info (matching vlds_unreg_hdl()/vlds_unreg_all())
+                        * so no queued event is left with a dangling pointer.
+                        */
+                       vlds_remove_svc_events(svc_info);
+
+                       kfree(svc_info->name);
+                       vlds_free_msg_queue(svc_info);
+                       kfree(svc_info);
+               }
+
+       }
+
+       mutex_unlock(&vlds->vlds_mutex);
+
+}
+
+/*
+ * release() entry point: tear down all per-process state - the
+ * process's event fd registration / queued events, and every service
+ * the process (tgid) registered on this device.
+ */
+static int vlds_fops_release(struct inode *inode, struct file *filp)
+{
+       struct vlds_dev *vlds;
+       pid_t tgid;
+
+       dprintk("entered.\n");
+
+       if (filp == NULL)
+               return -EINVAL;
+
+       vlds = filp->private_data;
+
+       if (vlds == NULL) {
+               /* This should not happen, but... */
+               pr_err("vlds_fops_release: ERROR- failed to get "
+                   "associated vlds_dev\n");
+               return 0;
+       }
+
+       tgid = task_tgid_vnr(current);
+
+       dprintk("%s: unregistering all events and services for tgid = %u\n",
+           vlds->int_name, tgid);
+
+       /* Remove all events queued for this tgid */
+       mutex_lock(&vlds_event_info_list_mutex);
+
+       vlds_remove_event_info(tgid);
+
+       mutex_unlock(&vlds_event_info_list_mutex);
+
+       /* Close all services used by this process */
+       vlds_unreg_all_tgid(vlds, tgid);
+
+       return 0;
+}
+
+/*
+ * unlocked_ioctl entry point: decode the VLDS_IOCTL_* command and
+ * dispatch to the matching handler.  Unknown commands get -EINVAL.
+ */
+static long vlds_fops_ioctl(struct file *filp, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct vlds_dev *vlds = filp->private_data;
+       const void __user *uarg = (const void __user *)arg;
+
+       switch (cmd) {
+       case VLDS_IOCTL_SVC_REG:
+               return vlds_svc_reg(vlds, uarg);
+       case VLDS_IOCTL_UNREG_HDL:
+               return vlds_unreg_hdl(vlds, uarg);
+       case VLDS_IOCTL_HDL_LOOKUP:
+               return vlds_hdl_lookup(vlds, uarg);
+       case VLDS_IOCTL_DMN_LOOKUP:
+               return vlds_dmn_lookup(vlds, uarg);
+       case VLDS_IOCTL_SEND_MSG:
+               return vlds_send_msg(vlds, uarg);
+       case VLDS_IOCTL_RECV_MSG:
+               return vlds_recv_msg(vlds, uarg);
+       case VLDS_IOCTL_HDL_GET_STATE:
+               return vlds_hdl_get_state(vlds, uarg);
+       case VLDS_IOCTL_SET_EVENT_FD:
+               return vlds_set_event_fd(vlds, uarg);
+       case VLDS_IOCTL_UNSET_EVENT_FD:
+               return vlds_unset_event_fd(vlds, uarg);
+       case VLDS_IOCTL_GET_NEXT_EVENT:
+               return vlds_get_next_event(vlds, uarg);
+       default:
+               break;
+       }
+
+       return -EINVAL;
+}
+
+/* character device entry points for the /dev/vlds/* nodes */
+static const struct file_operations vlds_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vlds_fops_open,
+       .release        = vlds_fops_release,
+       .unlocked_ioctl = vlds_fops_ioctl,
+};
+
+/*
+ * Scan the global vlds_dev list and return the lowest unused minor
+ * number in [VLDS_MINOR_BASE, VLDS_MAX_DEVS), or -1 if they are all
+ * taken.
+ */
+static int vlds_get_next_avail_minor(void)
+{
+       struct vlds_dev *vlds;
+       bool in_use;
+       int minor;
+
+       mutex_lock(&vlds_data_mutex);
+       for (minor = VLDS_MINOR_BASE; minor < VLDS_MAX_DEVS; minor++) {
+               in_use = false;
+               list_for_each_entry(vlds, &vlds_data.vlds_dev_list, list) {
+                       if (MINOR(vlds->devt) == minor) {
+                               in_use = true;
+                               break;
+                       }
+               }
+               if (!in_use) {
+                       /* found a free minor, use it */
+                       break;
+               }
+       }
+       mutex_unlock(&vlds_data_mutex);
+
+       if (minor == VLDS_MAX_DEVS) {
+               dprintk("no more minors left for allocation!\n");
+               return -1;
+       }
+
+       return minor;
+}
+
+/*
+ * Allocate and register a new vlds_dev: internal name, minor number,
+ * cdev and /sys + /dev entries (under vlds_data.chrdev_class).  On
+ * success the device is linked onto the global vlds_dev list and,
+ * when vldsp is non-NULL, returned through it.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * partially created is torn down.
+ */
+static int vlds_alloc_vlds_dev(char *int_name, char *dev_name,
+       struct device *vdev_dev, const u64 domain_handle,
+       struct vlds_dev **vldsp)
+{
+       struct vlds_dev *vlds;
+       int rv;
+       dev_t devt;
+       struct device *device;
+       int next_minor;
+       unsigned char devt_buf[32];
+
+       dprintk("entered.\n");
+
+       devt = 0;
+       device = NULL;
+
+       vlds = kzalloc(sizeof(struct vlds_dev), GFP_KERNEL);
+       if (unlikely(vlds == NULL)) {
+               dprintk("failed to allocate vlds_dev\n");
+               rv = -ENOMEM;
+               goto error;
+       }
+
+       vlds->domain_handle = domain_handle;
+
+       mutex_init(&vlds->vlds_mutex);
+
+       INIT_LIST_HEAD(&vlds->service_info_list);
+
+       vlds->int_name = kmemdup(int_name, (strlen(int_name) + 1), GFP_KERNEL);
+       if (unlikely(vlds->int_name == NULL)) {
+               dprintk("failed to alloc vlds int name.\n");
+               rv = -ENOMEM;
+               goto error;
+       }
+
+       /* create the devt for this device */
+       next_minor = vlds_get_next_avail_minor();
+       if (next_minor == -1) {
+               dprintk("vlds_get_next_avail_minor() failed.\n");
+               rv = -ENXIO;
+               goto error;
+       }
+       devt = MKDEV(MAJOR(vlds_data.devt), next_minor);
+       vlds->devt = devt;
+
+       dprintk("%s: dev_t=%s\n", vlds->int_name, format_dev_t(devt_buf,
+               vlds->devt));
+       dprintk("%s: domain_handle = %llu\n", vlds->int_name, domain_handle);
+
+       /* create/add the associated cdev */
+       cdev_init(&vlds->cdev, &vlds_fops);
+       vlds->cdev.owner = THIS_MODULE;
+       rv = cdev_add(&vlds->cdev, devt, 1);
+       if (rv != 0) {
+               dprintk("cdev_add() failed.\n");
+               /* devt == 0 tells the error path the cdev was not added */
+               devt = 0;
+               goto error;
+       }
+
+       /* create the associated /sys and /dev entries */
+       device = device_create(vlds_data.chrdev_class, vdev_dev, devt,
+                      vlds, "%s", dev_name);
+       if (IS_ERR(device)) {
+               dprintk("device_create() failed.\n");
+               rv = PTR_ERR(device);
+               device = NULL;
+               goto error;
+       }
+
+       vlds->device = device;
+
+       /* add the vlds to the global vlds_data device list */
+       mutex_lock(&vlds_data_mutex);
+       list_add_tail(&vlds->list, &vlds_data.vlds_dev_list);
+       vlds_data.num_vlds_dev_list++;
+       mutex_unlock(&vlds_data_mutex);
+
+       if (vldsp != NULL)
+               *vldsp = vlds;
+
+       return 0;
+
+error:
+
+       if (device)
+               device_destroy(vlds_data.chrdev_class, devt);
+
+       if (devt)
+               cdev_del(&vlds->cdev);
+
+       /*
+        * fix: the old cleanup dereferenced vlds->int_name before
+        * checking vlds for NULL, crashing when the initial kzalloc
+        * failed.  kfree(NULL) is a no-op, so no int_name guard needed.
+        */
+       if (vlds != NULL) {
+               kfree(vlds->int_name);
+               mutex_destroy(&vlds->vlds_mutex);
+               kfree(vlds);
+       }
+
+       dprintk("dev alloc failed (rv=%d)\n", rv);
+
+       return rv;
+}
+
+/*
+ * vio probe entry point for "vlds-port" MD nodes: read the id,
+ * remote-domain handle and remote-domain name properties from the
+ * machine description and create the corresponding vlds device.
+ * Returns 0 on success or a negative errno.
+ */
+static int vlds_probe(struct vio_dev *vdev, const struct vio_device_id *vio_did)
+{
+       struct vlds_dev *vlds;
+       struct mdesc_handle *hp;
+       const u64 *id;
+       const char *name;
+       const u64 *dom_handle;
+       int name_len;
+       char int_name_buf[DS_MAX_DOM_NAME_LEN + 1];
+       char dev_name_buf[VLDS_PATH_MAX];
+       u64 node;
+       int rv;
+
+       dprintk("entered.\n");
+
+       rv = 0;
+
+       hp = mdesc_grab();
+
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               dprintk("Failed to get vdev MD node.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       id = mdesc_get_property(hp, node, "id", NULL);
+       if (id == NULL) {
+               dprintk("failed to get id property.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       dom_handle = mdesc_get_property(hp, node,
+           "vlds-remote-domain-handle", NULL);
+       if (dom_handle == NULL) {
+               dprintk("failed to get vlds-remote-domain-handle property.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       /* get the name of the ldom this vlds-port refers to */
+       name = mdesc_get_property(hp, node, "vlds-remote-domain-name",
+           &name_len);
+       if (name == NULL) {
+               dprintk("failed to get vlds-remote-domain-name property.\n");
+               mdesc_release(hp);
+               rv = -ENXIO;
+               goto error;
+       }
+
+       mdesc_release(hp);
+
+       /* sanity check - should never happen */
+       if (name_len > DS_MAX_DOM_NAME_LEN) {
+               /* fix: previously fell through with rv == 0 (bogus success) */
+               rv = -ENXIO;
+               goto error;
+       }
+
+       /* create the (NULL-terminated) internal name */
+       memcpy(int_name_buf, name, name_len);
+       int_name_buf[name_len] = '\0';
+
+       /* create the /dev name */
+       (void) scnprintf(dev_name_buf, VLDS_PATH_MAX, "%s%llu",
+           VLDS_DEV_DOMAIN_FILENAME_TAG, *dom_handle);
+
+       rv = vlds_alloc_vlds_dev(int_name_buf, dev_name_buf, &vdev->dev,
+           *dom_handle, &vlds);
+       if (rv != 0)
+               goto error;
+
+       dev_set_drvdata(&vdev->dev, vlds);
+
+       dprintk("%s: Probe successful: cfg_handle=%llu, id=%llu\n",
+           vlds->int_name, vdev->dev_no, *id);
+
+       return 0;
+
+error:
+
+       dprintk("probe failed (rv=%d)\n", rv);
+
+       return rv;
+}
+
+/*
+ * Tear down a vlds_dev: unregister all of its services, unlink it
+ * from the global device list, destroy its device node and cdev, and
+ * free its memory.  Always returns 0.
+ */
+static int vlds_free_vlds_dev(struct vlds_dev *vlds)
+{
+
+       dprintk("entered. (%s)\n", vlds->int_name);
+
+       /* Unregister all the services associated with this vlds. */
+       vlds_unreg_all(vlds);
+
+       mutex_lock(&vlds_data_mutex);
+       list_del(&vlds->list);
+       vlds_data.num_vlds_dev_list--;
+       mutex_unlock(&vlds_data_mutex);
+
+       device_destroy(vlds_data.chrdev_class, vlds->devt);
+       cdev_del(&vlds->cdev);
+       kfree(vlds->int_name);
+       mutex_destroy(&vlds->vlds_mutex);
+       kfree(vlds);
+
+       return 0;
+}
+
+/*
+ * vio remove callback: free the vlds device attached to this vio_dev.
+ * Returns -ENXIO if no driver data was attached, otherwise the result
+ * of vlds_free_vlds_dev().
+ */
+static int vlds_remove(struct vio_dev *vdev)
+{
+       int rv;
+       struct vlds_dev *vlds;
+
+       dprintk("entered.\n");
+
+       vlds = dev_get_drvdata(&vdev->dev);
+
+       if (vlds == NULL) {
+               dprintk("failed to get vlds_dev from vio_dev.\n");
+               rv = -ENXIO;
+       } else {
+               dprintk("removing (%s)\n", vlds->int_name);
+               rv = vlds_free_vlds_dev(vlds);
+       }
+
+       return rv;
+}
+
+static const struct vio_device_id vlds_match[] = {
+       {
+               .type = "vlds-port",
+       },
+       {},
+};
+
+/*
+ * Class devnode callback: place vlds device nodes under /dev/vlds/
+ * with owner-only (0600) access.
+ */
+static char *vlds_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode != NULL)
+               *mode = 0600;
+
+       return kasprintf(GFP_KERNEL, "vlds/%s", dev_name(dev));
+}
+
+static struct vio_driver vlds_driver = {
+       .id_table       = vlds_match,
+       .probe          = vlds_probe,
+       .remove         = vlds_remove,
+       .name           = VLDS_DEVICE_NAME,
+       .no_irq         = true,
+};
+
+/*
+ * Module init: allocate the char device region, create the device
+ * class, create the SP vlds device (there is no vlds-port MD node for
+ * the SP) and register the vio driver.  On failure everything acquired
+ * so far - including the SP device - is released.
+ */
+static int __init vlds_init(void)
+{
+       int rv;
+       unsigned char devt_buf[32];
+
+       /* set the default ldoms debug level */
+       vldsdbg_level = ldoms_debug_level;
+
+       dprintk("entered. (DEBUG enabled)\n");
+
+       dprintk("%s", driver_version);
+
+       INIT_LIST_HEAD(&vlds_data.vlds_dev_list);
+       vlds_data.num_vlds_dev_list = 0;
+
+       INIT_LIST_HEAD(&vlds_event_info_list);
+
+       rv = alloc_chrdev_region(&vlds_data.devt, VLDS_MINOR_BASE,
+                                VLDS_MAX_DEVS, VLDS_DEVICE_NAME);
+       if (rv < 0) {
+               dprintk("alloc_chrdev_region failed: %d\n", rv);
+               return rv;
+       }
+
+       /*
+        * Sanity check - should never happen.  Release the region via
+        * the common error path instead of leaking it.
+        */
+       if (vlds_data.devt == (dev_t)0) {
+               dprintk("alloc_chrdev_region failed: (vlds_data.devt == 0)\n");
+               rv = -ENXIO;
+               goto error;
+       }
+
+       dprintk("dev_t allocated = %s\n",
+               format_dev_t(devt_buf, vlds_data.devt));
+
+       vlds_data.chrdev_class = class_create(THIS_MODULE, VLDS_DEVICE_NAME);
+       if (IS_ERR(vlds_data.chrdev_class)) {
+               rv = PTR_ERR(vlds_data.chrdev_class);
+               dprintk("class_create() failed: %d\n", rv);
+               vlds_data.chrdev_class = NULL;
+               goto error;
+       }
+
+       /* set callback to create devices under /dev/vlds directory */
+       vlds_data.chrdev_class->devnode = vlds_devnode;
+
+       /*
+        * Add a device for the SP directly since there is no
+        * vlds-port MD node for the SP and we need one to provide
+        * access to SP domain services.
+        */
+       rv = vlds_alloc_vlds_dev(VLDS_SP_INT_NAME, VLDS_SP_DEV_NAME,
+           NULL, VLDS_INVALID_HANDLE, &sp_vlds);
+       if (rv != 0)
+               dprintk("Failed to create SP vlds device (%d)\n", rv);
+
+       rv = vio_register_driver(&vlds_driver);
+       if (rv != 0) {
+               dprintk("vio_register_driver() failed: %d\n", rv);
+               goto error_free_sp;
+       }
+
+       return 0;
+
+error_free_sp:
+       /* destroy the SP device before destroying the class it lives in */
+       if (sp_vlds) {
+               vlds_free_vlds_dev(sp_vlds);
+               sp_vlds = NULL;
+       }
+
+error:
+       if (vlds_data.chrdev_class)
+               class_destroy(vlds_data.chrdev_class);
+
+       if (vlds_data.devt)
+               unregister_chrdev_region(vlds_data.devt, VLDS_MAX_DEVS);
+
+       return rv;
+}
+
+/*
+ * Module exit: remove the SP vlds device (if it exists), unregister
+ * the vio driver and release the class and char device region.
+ */
+static void __exit vlds_exit(void)
+{
+
+       dprintk("entered.\n");
+
+       /*
+        * Remove the SP vlds.  It may be NULL if its creation failed
+        * in vlds_init() (init continues on that failure), so guard
+        * against dereferencing a NULL pointer here.
+        */
+       if (sp_vlds) {
+               vlds_free_vlds_dev(sp_vlds);
+               sp_vlds = NULL;
+       }
+
+       /*
+        * Note - vio_unregister_driver() will invoke a call to
+        * vlds_remove() for every successfully probed device.
+        */
+       vio_unregister_driver(&vlds_driver);
+
+       if (vlds_data.chrdev_class)
+               class_destroy(vlds_data.chrdev_class);
+
+       if (vlds_data.devt)
+               unregister_chrdev_region(vlds_data.devt, VLDS_MAX_DEVS);
+}
+
+module_init(vlds_init);
+module_exit(vlds_exit);
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("Sun4v LDOMs Virtual Domain Services Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
index 53fe200e0b7949b810071c23a08af49167ccdf63..0a8156db81ccfaa70edb8b1014faacf21b409835 100644 (file)
@@ -1918,19 +1918,27 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        struct vnet *vp;
        const u64 *rmac;
        int len, i, err, switch_port;
+       u64 node;
 
        print_version();
 
        hp = mdesc_grab();
 
-       vp = vnet_find_parent(hp, vdev->mp);
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               pr_err("Failed to get vdev MD node.\n");
+               err = -ENXIO;
+               goto err_out_put_mdesc;
+       }
+
+       vp = vnet_find_parent(hp, node);
        if (IS_ERR(vp)) {
                pr_err("Cannot find port parent vnet\n");
                err = PTR_ERR(vp);
                goto err_out_put_mdesc;
        }
 
-       rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len);
+       rmac = mdesc_get_property(hp, node, remote_macaddr_prop, &len);
        err = -ENODEV;
        if (!rmac) {
                pr_err("Port lacks %s property\n", remote_macaddr_prop);
@@ -1939,8 +1947,10 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 
        port = kzalloc(sizeof(*port), GFP_KERNEL);
        err = -ENOMEM;
-       if (!port)
+       if (!port) {
+               pr_err("Cannot allocate vnet_port\n");
                goto err_out_put_mdesc;
+       }
 
        for (i = 0; i < ETH_ALEN; i++)
                port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff;
@@ -1963,7 +1973,7 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
        INIT_LIST_HEAD(&port->list);
 
        switch_port = 0;
-       if (mdesc_get_property(hp, vdev->mp, "switch-port", NULL) != NULL)
+       if (mdesc_get_property(hp, node, "switch-port", NULL) != NULL)
                switch_port = 1;
        port->switch_port = switch_port;
        port->tso = true;
index c01f4509587779346e20cace71d192fe0a997c00..1f2eeb5ffe9769bf7c10a1250084428adcc6b642 100644 (file)
@@ -465,5 +465,10 @@ config MIPS_EJTAG_FDC_KGDB_CHAN
        default 3
        help
          FDC channel number to use for KGDB.
+config VCC
+       tristate "Sun Virtual Console Concentrator"
+       depends on SUN_LDOMS
+       help
+         Support for Sun logical domain consoles.
 
 endif # TTY
index 5817e2397463788a2e1631aa82ecc7bf330877c1..6712e9248f6852e6084280a708db0720c363722f 100644 (file)
@@ -30,5 +30,6 @@ obj-$(CONFIG_PPC_EPAPR_HV_BYTECHAN) += ehv_bytechan.o
 obj-$(CONFIG_GOLDFISH_TTY)     += goldfish.o
 obj-$(CONFIG_DA_TTY)           += metag_da.o
 obj-$(CONFIG_MIPS_EJTAG_FDC_TTY) += mips_ejtag_fdc.o
+obj-$(CONFIG_VCC)              += vcc.o
 
 obj-y += ipwireless/
diff --git a/drivers/tty/vcc.c b/drivers/tty/vcc.c
new file mode 100644 (file)
index 0000000..e1c90ea
--- /dev/null
@@ -0,0 +1,879 @@
+/*
+ * vcc.c: sun4v virtual channel concentrator
+ *
+ * Copyright (C) 2014 Oracle. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+#define DRV_MODULE_NAME                "vcc"
+#define DRV_MODULE_VERSION     "1.0"
+#define DRV_MODULE_RELDATE     "July 20, 2014"
+
+static char version[] =
+       DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_DESCRIPTION("Sun LDOM virtual console concentrator driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+struct vcc {
+       struct tty_port port;   /* must be first element */
+       spinlock_t lock;
+       char *domain;
+
+       /*
+        * This buffer is required to support the tty write_room interface
+        * and guarantee that any characters that the driver accepts will
+        * be eventually sent, either immediately or later.
+        */
+       int chars_in_buffer;
+       struct vio_vcc buffer;
+
+       struct timer_list rx_timer;
+       struct timer_list tx_timer;
+       struct vio_driver_state vio;
+};
+
+#define VCC_MAX_PORTS  256
+#define VCC_MINOR_START        0
+#define VCC_BUFF_LEN   VIO_VCC_MTU_SIZE
+
+#define        VCC_CTL_BREAK   -1
+#define        VCC_CTL_HUP     -2
+
+#define        TIMER_SET(v, x, t)      ((v)->x##_timer.expires = (t))
+#define        TIMER_CLEAR(v, x)       ((v)->x##_timer.expires = 0)
+#define        TIMER_ACTIVE(v, x)      ((v)->x##_timer.expires)
+
+static const char vcc_driver_name[] = "vcc";
+static const char vcc_device_node[] = "vcc";
+static struct tty_driver *vcc_tty_driver;
+
+int vcc_dbg;
+int vcc_dbg_ldc;
+int vcc_dbg_vio;
+
+module_param(vcc_dbg, uint, 0664);
+module_param(vcc_dbg_ldc, uint, 0664);
+module_param(vcc_dbg_vio, uint, 0664);
+
+#define        VCC_DBG_DRV     0x1
+#define        VCC_DBG_LDC     0x2
+#define        VCC_DBG_PKT     0x4
+
+#define vccdbg(f, a...)                                                        \
+       do {                                                            \
+               if (vcc_dbg & VCC_DBG_DRV)                              \
+                       pr_info(f, ## a);                               \
+       } while (0)                                                     \
+
+#define vccdbgl(l)                                                     \
+       do {                                                            \
+               if (vcc_dbg & VCC_DBG_LDC)                              \
+                       ldc_print(l);                                   \
+       } while (0)                                                     \
+
+#define vccdbgp(pkt)                                                   \
+       do {                                                            \
+               if (vcc_dbg & VCC_DBG_PKT) {                            \
+                       int i;                                          \
+                       for (i = 0; i < pkt.tag.stype; i++)             \
+                               pr_info("[%c]", pkt.data[i]);           \
+               }                                                       \
+       } while (0)                                                     \
+
+/*
+ * xxx Be careful when adding flags to this line discipline.  Don't add anything
+ * that will cause echoing or we'll go into recursive loop echoing chars back
+ * and forth with the console drivers.
+ */
+static struct ktermios vcc_tty_termios = {
+       .c_iflag = IGNBRK | IGNPAR,
+       .c_oflag = OPOST,
+       .c_cflag = B38400 | CS8 | CREAD | HUPCL,
+       .c_cc = INIT_C_CC,
+       .c_ispeed = 38400,
+       .c_ospeed = 38400
+};
+
+/*
+ * Arm a deferred RX retry one jiffy from now.  Called with vcc->lock
+ * held when the tty cannot currently accept a full read; HV interrupts
+ * stay masked until vcc_rx_timer() re-enables them.
+ */
+static void vcc_kick_rx(struct vcc *vcc)
+{
+       struct vio_driver_state *vio = &vcc->vio;
+
+       vccdbg("%s\n", __func__);
+
+       assert_spin_locked(&vcc->lock);
+
+       /* timer already pending - nothing to do */
+       if (TIMER_ACTIVE(vcc, rx))
+               return;
+
+       /*
+        * Disable interrupts until we can read the data again.
+        */
+       ldc_disable_hv_intr(vio->lp);
+
+       TIMER_SET(vcc, rx, jiffies + 1);
+       add_timer(&vcc->rx_timer);
+}
+
+/*
+ * Arm a deferred TX retry one jiffy from now.  Called with vcc->lock
+ * held after an ldc_write() failed; vcc_tx_timer() will try to flush
+ * the buffered characters again.
+ */
+static void vcc_kick_tx(struct vcc *vcc)
+{
+       vccdbg("%s\n", __func__);
+
+       assert_spin_locked(&vcc->lock);
+
+       /* timer already pending - nothing to do */
+       if (TIMER_ACTIVE(vcc, tx))
+               return;
+
+       TIMER_SET(vcc, tx, jiffies + 1);
+       add_timer(&vcc->tx_timer);
+}
+
+/*
+ * Return 1 if the tty can accept a full VCC_BUFF_LEN worth of input,
+ * 0 if it is throttled or lacks buffer room.  Safe to call in atomic
+ * context (see comment below).
+ */
+static int vcc_rx_check(struct tty_struct *tty, int size)
+{
+       BUG_ON(!tty);
+
+       /*
+        * tty_buffer_request_room eventually calls kmalloc with GFP_ATOMIC
+        * so it won't sleep.
+        */
+       if (test_bit(TTY_THROTTLED, &tty->flags) ||
+           tty_buffer_request_room(tty->port, VCC_BUFF_LEN) < VCC_BUFF_LEN)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Push received bytes into the tty flip buffer and, if any were
+ * inserted, push them up to the line discipline.  Returns the number
+ * of bytes actually inserted.
+ */
+static int vcc_rx(struct tty_struct *tty, char *buf, int size)
+{
+       int len;
+
+       BUG_ON(!tty);
+
+       /*
+        * tty_insert_flip_string... calls __tty_buffer_request_room.
+        */
+       len = tty_insert_flip_string(tty->port, buf, size);
+
+       /* This is synch because tty->low_latency == 1 */
+       if (len)
+               tty_flip_buffer_push(tty->port);
+
+       vccdbg("%s: rv=%d\n", __func__, len);
+
+       return len;
+}
+
+/*
+ * Drain pending LDC packets into the tty.  Runs in interrupt or timer
+ * context with vcc->lock held.  If no tty is attached the RX queue is
+ * reset and the data dropped.  Returns the last ldc_read() result, or
+ * -ECONNRESET if a packet of unknown type was received.
+ */
+static int vcc_ldc_read(struct vcc *vcc)
+{
+       struct vio_driver_state *vio = &vcc->vio;
+       struct tty_struct *tty;
+       struct vio_vcc pkt;
+       int rv = 0;
+       vccdbg("%s\n", __func__);
+
+       tty = vcc->port.tty;
+       if (!tty) {
+               rv = ldc_rx_reset(vio->lp);
+               vccdbg("%s: reset rx q: rv=%d\n", __func__, rv);
+               goto done;
+       }
+
+       /*
+        * Read as long as the LDC has incoming data.
+        * xxx Since we read in interrupt context, should we defer to
+        * a lower IRQ level?
+        */
+       while (1) {
+               if (!vcc_rx_check(tty, VIO_VCC_MTU_SIZE)) {
+                       vcc_kick_rx(vcc);
+                       break;
+               }
+               vccdbgl(vio->lp);
+               rv = ldc_read(vio->lp, &pkt, sizeof(pkt));
+               if (rv <= 0)
+                       break;
+
+               vccdbg("%s: ldc_read()=%d\n", __func__, rv);
+               vccdbg("TAG [%02x:%02x:%04x:%08x]\n",
+                      pkt.tag.type,
+                      pkt.tag.stype,
+                      pkt.tag.stype_env,
+                      pkt.tag.sid);
+
+               if (pkt.tag.type == VIO_TYPE_DATA) {
+                       /*
+                        * We called vcc_rx_check before which should allocate
+                        * space so this should not fail.
+                        */
+                       vccdbgp(pkt);
+                       /* stype carries the payload length in RAW mode */
+                       vcc_rx(tty, pkt.data, pkt.tag.stype);
+               } else {
+                       pr_err("%s: unknown msg [%02x:%02x:%04x:%08x]\n",
+                               __func__, pkt.tag.type, pkt.tag.stype,
+                               pkt.tag.stype_env, pkt.tag.sid);
+
+                       rv = -ECONNRESET;
+                       break;
+               }
+               BUG_ON(rv != LDC_PACKET_SIZE);
+       }
+done:
+       vccdbg("%s: rv=%d\n", __func__, rv);
+       return rv;
+}
+
+/*
+ * Deferred RX path: re-enable the HV interrupts that vcc_kick_rx()
+ * masked and drain any pending LDC data into the tty.  A connection
+ * reset reported by vcc_ldc_read() is forwarded to the vio layer.
+ */
+static void vcc_rx_timer(unsigned long arg)
+{
+       struct vcc *vcc = (struct vcc *)arg;
+       struct vio_driver_state *vio = &vcc->vio;
+       unsigned long flags;
+       int rv;
+
+       vccdbg("%s\n", __func__);
+       spin_lock_irqsave(&vcc->lock, flags);
+       TIMER_CLEAR(vcc, rx);
+
+       /*
+        * Re-enable interrupts.
+        */
+       ldc_enable_hv_intr(vio->lp);
+
+       rv = vcc_ldc_read(vcc);
+       if (rv == -ECONNRESET)
+               vio_conn_reset(vio);    /* xxx noop */
+
+       spin_unlock_irqrestore(&vcc->lock, flags);
+       vccdbg("%s done\n", __func__);
+}
+
+/*
+ * Deferred TX path: retry delivery of the characters staged in
+ * vcc->buffer that a prior ldc_write() could not send.  On success the
+ * buffer is emptied and any waiting tty writer is woken; on failure
+ * the timer is re-armed via vcc_kick_tx().
+ */
+static void vcc_tx_timer(unsigned long arg)
+{
+       struct vcc *vcc = (struct vcc *)arg;
+       struct vio_vcc *pkt;
+       unsigned long flags;
+       int tosend = 0;
+       int rv;
+
+       vccdbg("%s\n", __func__);
+       if (!vcc) {
+               pr_err("%s: vcc not found\n", __func__);
+               return;
+       }
+
+       spin_lock_irqsave(&vcc->lock, flags);
+       TIMER_CLEAR(vcc, tx);
+
+       tosend = min(VCC_BUFF_LEN, vcc->chars_in_buffer);
+       if (!tosend)
+               goto done;
+
+       pkt = &vcc->buffer;
+       pkt->tag.type = VIO_TYPE_DATA;
+       pkt->tag.stype = tosend;
+       vccdbgl(vcc->vio.lp);
+
+       /* won't send partial writes */
+       rv = ldc_write(vcc->vio.lp, pkt, VIO_TAG_SIZE + tosend);
+       BUG_ON(!rv);
+
+       if (rv < 0) {
+               vccdbg("%s: ldc_write()=%d\n", __func__, rv);
+               vcc_kick_tx(vcc);
+       } else {
+               struct tty_struct *tty = vcc->port.tty;
+
+               vcc->chars_in_buffer = 0;
+
+               /*
+                * We are still obligated to deliver the data to the
+                * hypervisor even if the tty has been closed because
+                * we committed to delivering it.  But don't try to wake
+                * a non-existent tty.
+                */
+               if (tty)
+                       tty_wakeup(tty);
+       }
+done:
+       spin_unlock_irqrestore(&vcc->lock, flags);
+       vccdbg("%s done\n", __func__);
+}
+
+/*
+ * LDC event callback.  Link state changes (RESET/UP) are forwarded to
+ * the vio layer; DATA_READY drains the channel via vcc_ldc_read();
+ * any other event is logged and ignored.
+ */
+static void vcc_event(void *arg, int event)
+{
+       struct vcc *vcc = arg;
+       struct vio_driver_state *vio = &vcc->vio;
+       unsigned long flags;
+       int rv;
+
+       vccdbg("%s(%d)\n", __func__, event);
+       spin_lock_irqsave(&vcc->lock, flags);
+
+       if (event == LDC_EVENT_RESET || event == LDC_EVENT_UP) {
+               vio_link_state_change(vio, event);
+               spin_unlock_irqrestore(&vcc->lock, flags);
+               return;
+       }
+
+       if (event != LDC_EVENT_DATA_READY) {
+               pr_err("%s: unexpected LDC event %d\n", __func__, event);
+               spin_unlock_irqrestore(&vcc->lock, flags);
+               return;
+       }
+
+       rv = vcc_ldc_read(vcc);
+       if (rv < 0) {
+               if (rv == -ECONNRESET)
+                       vio_conn_reset(vio);    /* xxx noop */
+       }
+       spin_unlock_irqrestore(&vcc->lock, flags);
+}
+
+static struct ldc_channel_config vcc_ldc_cfg = {
+       .event          = vcc_event,
+       .mtu            = VIO_VCC_MTU_SIZE,
+       .mode           = LDC_MODE_RAW,
+       .debug          = 0,
+};
+
+/* Ordered from largest major to lowest */
+static struct vio_version vcc_versions[] = {
+       { .major = 1, .minor = 0 },
+};
+
+static struct tty_port_operations vcc_port_ops = { 0 };
+
+/*
+ * sysfs 'domain' attribute (read-only): report the name of the peer
+ * domain this console port is connected to.
+ */
+static ssize_t vcc_sysfs_domain_show(struct device *device,
+       struct device_attribute *attr, char *buf)
+{
+       int rv;
+       unsigned long flags;
+       struct vcc *vcc = dev_get_drvdata(device);
+
+       spin_lock_irqsave(&vcc->lock, flags);
+       rv = scnprintf(buf, PAGE_SIZE, "%s\n", vcc->domain);
+       spin_unlock_irqrestore(&vcc->lock, flags);
+
+       return rv;
+}
+
+/*
+ * Send a control message (VCC_CTL_BREAK or VCC_CTL_HUP) to the peer.
+ * Called with vcc->lock held.  Returns the ldc_write() result; on
+ * failure the caller retries via the TX timer.
+ */
+static int vcc_send_ctl(struct vcc *vcc, int ctl)
+{
+       int rv;
+       struct vio_vcc pkt;
+
+       pkt.tag.type = VIO_TYPE_CTRL;
+       pkt.tag.sid = ctl;      /* ctrl_msg */
+       pkt.tag.stype = 0;      /* size */
+
+       rv = ldc_write(vcc->vio.lp, &pkt, sizeof(pkt.tag));
+       BUG_ON(!rv);
+       vccdbg("%s: ldc_write(%ld)=%d\n", __func__, sizeof(pkt.tag), rv);
+
+       return rv;
+}
+
+/*
+ * sysfs 'break' attribute (write-only): writing "1" sends a break
+ * control message to the peer domain.  Any other input is rejected
+ * with -EINVAL.
+ */
+static ssize_t vcc_sysfs_break_store(struct device *device,
+       struct device_attribute *attr, const char *buf, size_t count)
+{
+       ssize_t rv = count;
+       int brk;
+       unsigned long flags;
+       struct vcc *vcc = dev_get_drvdata(device);
+
+       spin_lock_irqsave(&vcc->lock, flags);
+
+       /*
+        * "%ud" was a malformed conversion (%u plus a literal 'd');
+        * parse a plain int and require the value 1.
+        */
+       if (sscanf(buf, "%d", &brk) != 1 || brk != 1)
+               rv = -EINVAL;
+       else if (vcc_send_ctl(vcc, VCC_CTL_BREAK) < 0)
+               /* retry the break via the TX timer if the write failed */
+               vcc_kick_tx(vcc);
+
+       spin_unlock_irqrestore(&vcc->lock, flags);
+
+       /* propagate -EINVAL instead of unconditionally returning count */
+       return rv;
+}
+
+static DEVICE_ATTR(domain, S_IRUSR, vcc_sysfs_domain_show, NULL);
+static DEVICE_ATTR(break, S_IWUSR, NULL, vcc_sysfs_break_store);
+
+static struct attribute *vcc_sysfs_entries[] = {
+       &dev_attr_domain.attr,
+       &dev_attr_break.attr,
+       NULL
+};
+
+static struct attribute_group vcc_attribute_group = {
+       .name = NULL,   /* put in device directory */
+       .attrs = vcc_sysfs_entries,
+};
+
+static void print_version(void)
+{
+       printk_once(KERN_INFO "%s", version);
+}
+
+/*
+ * vio probe callback for a "vcc-port" MD node.
+ *
+ * Allocates the per-port state, initializes the vio/LDC channel,
+ * registers the tty device, fetches the peer domain name from the MD,
+ * creates the sysfs attributes and finally brings the port up.  All
+ * acquired resources are released in reverse order on failure.
+ */
+static int vcc_probe(struct vio_dev *vdev,
+       const struct vio_device_id *id)
+{
+       int rv;
+       char *name;
+       const char *domain;
+       struct vcc *vcc;
+       struct device *dev;
+       struct mdesc_handle *hp;
+       u64 node;
+
+       print_version();
+
+       vccdbg("%s: name=%s port=%ld\n", __func__, dev_name(&vdev->dev),
+              vdev->port_id);
+
+       if (vdev->port_id >= VCC_MAX_PORTS)
+               return -ENXIO;
+
+       if (!vcc_tty_driver) {
+               pr_err("%s: vcc tty driver not registered\n", __func__);
+               return -ENODEV;
+       }
+
+       vcc = kzalloc(sizeof(*vcc), GFP_KERNEL);
+       if (!vcc) {
+               pr_err("%s: cannot allocate vcc\n", __func__);
+               return -ENOMEM;
+       }
+
+       /* check kstrdup - the original passed a possible NULL onward */
+       name = kstrdup(dev_name(&vdev->dev), GFP_KERNEL);
+       if (!name) {
+               rv = -ENOMEM;
+               goto free_port;
+       }
+
+       rv = vio_driver_init(&vcc->vio, vdev, VDEV_CONSOLE_CON,
+                             vcc_versions, ARRAY_SIZE(vcc_versions),
+                             NULL, name);
+       if (rv)
+               goto free_port;
+
+       vcc->vio.debug = vcc_dbg_vio;
+       vcc_ldc_cfg.debug = vcc_dbg_ldc;
+
+       rv = vio_ldc_alloc(&vcc->vio, &vcc_ldc_cfg, vcc);
+       if (rv)
+               goto free_port;
+
+       tty_port_init(&vcc->port);
+       spin_lock_init(&vcc->lock);
+       vcc->port.ops = &vcc_port_ops;
+
+       dev = tty_port_register_device(&vcc->port, vcc_tty_driver,
+               vdev->port_id, &vdev->dev);
+       if (IS_ERR(dev)) {
+               rv = PTR_ERR(dev);
+               goto free_ldc;
+       }
+
+       hp = mdesc_grab();
+
+       node = vio_vdev_node(hp, vdev);
+       if (node == MDESC_NODE_NULL) {
+               rv = -ENXIO;
+               mdesc_release(hp);
+               goto unreg_tty;
+       }
+
+       domain = mdesc_get_property(hp, node, "vcc-domain-name", NULL);
+       if (!domain) {
+               rv  = -ENXIO;
+               mdesc_release(hp);
+               goto unreg_tty;
+       }
+       vcc->domain = kstrdup(domain, GFP_KERNEL);
+
+       mdesc_release(hp);
+
+       if (!vcc->domain) {
+               rv = -ENOMEM;
+               goto unreg_tty;
+       }
+
+       /*
+        * On failure here the group was never created, so jump to
+        * free_domain; the original erroneously removed the group.
+        */
+       rv = sysfs_create_group(&vdev->dev.kobj, &vcc_attribute_group);
+       if (rv)
+               goto free_domain;
+
+       init_timer(&vcc->rx_timer);
+       vcc->rx_timer.function = vcc_rx_timer;
+       vcc->rx_timer.data = (unsigned long)vcc;
+
+       init_timer(&vcc->tx_timer);
+       vcc->tx_timer.function = vcc_tx_timer;
+       vcc->tx_timer.data = (unsigned long)vcc;
+
+       dev_set_drvdata(&vdev->dev, vcc);
+
+       /*
+        * Disable interrupts before the port is up.
+        *
+        * We can get an interrupt during vio_port_up() -> ldc_bind().
+        * vio_port_up() grabs the vio->lock beforehand so we cannot
+        * grab it in vcc_event().
+        *
+        * Once the port is up and the lock released, we can field
+        * interrupts.
+        */
+       ldc_disable_hv_intr(vcc->vio.lp);
+       vio_port_up(&vcc->vio);
+       ldc_enable_hv_intr(vcc->vio.lp);
+
+       return 0;
+
+free_domain:
+       kfree(vcc->domain);
+
+unreg_tty:
+       tty_unregister_device(vcc_tty_driver, vdev->port_id);
+
+free_ldc:
+       vio_ldc_free(&vcc->vio);
+
+free_port:
+       kfree(name);
+       kfree(vcc);
+
+       return rv;
+}
+
+/*
+ * vio remove callback: stop the RX/TX timers, hang up any attached
+ * tty, unregister the tty device and free all per-port resources.
+ * Returns -ENODEV if no driver data was attached.
+ */
+static int vcc_remove(struct vio_dev *vdev)
+{
+       struct vcc *vcc = dev_get_drvdata(&vdev->dev);
+       struct tty_struct *tty;
+       unsigned long flags;
+
+       vccdbg("%s\n", __func__);
+
+       if (!vcc)
+               return -ENODEV;
+
+       del_timer_sync(&vcc->rx_timer);
+       del_timer_sync(&vcc->tx_timer);
+
+       /* snapshot the tty pointer under the lock before hanging it up */
+       spin_lock_irqsave(&vcc->lock, flags);
+       tty = vcc->port.tty;
+       spin_unlock_irqrestore(&vcc->lock, flags);
+
+       if (tty)
+               tty_hangup(tty);
+
+       tty_unregister_device(vcc_tty_driver, vdev->port_id);
+
+       del_timer_sync(&vcc->vio.timer);
+       vio_ldc_free(&vcc->vio);
+       sysfs_remove_group(&vdev->dev.kobj, &vcc_attribute_group);
+       dev_set_drvdata(&vdev->dev, NULL);
+
+       kfree(vcc->vio.name);
+       kfree(vcc->domain);
+       kfree(vcc);
+
+       return 0;
+}
+
+static const struct vio_device_id vcc_match[] = {
+       {
+               .type = "vcc-port",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(vio, vcc_match);
+
+static struct vio_driver vcc_driver = {
+       .id_table       = vcc_match,
+       .probe          = vcc_probe,
+       .remove         = vcc_remove,
+       .name           = "vcc",
+};
+
+/*
+ * tty open callback.  Only one open per port is allowed: if the port
+ * count is already non-zero, tty_port_open() is still called (so the
+ * matching close balances the count) but -EBUSY is returned.
+ */
+static int vcc_open(struct tty_struct *tty, struct file *filp)
+{
+       struct vcc *vcc;
+       int rv, count;
+
+       vccdbg("%s\n", __func__);
+       if (!tty) {
+               pr_err("%s: NULL tty\n", __func__);
+               return -ENXIO;
+       }
+
+       if (!tty->port) {
+               pr_err("%s: NULL tty port\n", __func__);
+               return -ENXIO;
+       }
+       if (!tty->port->ops) {
+               pr_err("%s: NULL tty port ops\n", __func__);
+               return -ENXIO;
+       }
+
+       vcc = container_of(tty->port, struct vcc, port);
+
+       if (!vcc->vio.lp) {
+               pr_err("%s: NULL lp\n", __func__);
+               return -ENXIO;
+       }
+       vccdbgl(vcc->vio.lp);
+
+       /*
+        * vcc_close is called even if vcc_open fails so call
+        * tty_port_open() regardless in case of -EBUSY.
+        */
+       count = tty->port->count;
+       if (count)
+               pr_err("%s: tty port busy\n", __func__);
+       rv = tty_port_open(tty->port, tty, filp);
+       if (rv == 0 && count != 0)
+               rv = -EBUSY;
+
+       return rv;
+
+}
+
+/*
+ * tty close callback: validate the tty and delegate to
+ * tty_port_close().
+ */
+static void vcc_close(struct tty_struct *tty, struct file *filp)
+{
+       vccdbg("%s\n", __func__);
+       if (!tty) {
+               pr_err("%s: NULL tty\n", __func__);
+               return;
+       }
+       if (!tty->port) {
+               pr_err("%s: NULL tty port\n", __func__);
+               return;
+       }
+       tty_port_close(tty->port, tty, filp);
+}
+
+/*
+ * Send a hangup control message to the peer; if the immediate write
+ * fails, retry via the TX timer.
+ */
+static void vcc_ldc_hup(struct vcc *vcc)
+{
+       unsigned long flags;
+
+       vccdbg("%s\n", __func__);
+
+       spin_lock_irqsave(&vcc->lock, flags);
+
+       if (vcc_send_ctl(vcc, VCC_CTL_HUP) < 0)
+               vcc_kick_tx(vcc);
+
+       spin_unlock_irqrestore(&vcc->lock, flags);
+}
+
+/* tty hangup callback: notify the peer, then hang up the tty port. */
+static void vcc_hangup(struct tty_struct *tty)
+{
+       struct vcc *vcc = container_of(tty->port, struct vcc, port);
+
+       vcc_ldc_hup(vcc);
+       tty_port_hangup(tty->port);
+}
+
+/*
+ * tty write callback.  Characters are staged in vcc->buffer and
+ * pushed to the hypervisor with ldc_write().  Once staged they are
+ * counted as sent even if the hypervisor write fails, because the TX
+ * timer will deliver them later.  Returns the number of characters
+ * accepted, or a negative errno if none could be accepted.
+ */
+static int vcc_write(struct tty_struct *tty,
+               const unsigned char *buf, int count)
+{
+       struct vcc *vcc = container_of(tty->port, struct vcc, port);
+       struct vio_vcc *pkt;
+       unsigned long flags;
+       int total_sent = 0;
+       int tosend = 0;
+       int rv = -EINVAL;
+
+       vccdbg("%s\n", __func__);
+
+       spin_lock_irqsave(&vcc->lock, flags);
+
+       pkt = &vcc->buffer;
+       pkt->tag.type = VIO_TYPE_DATA;
+
+       while (count > 0) {
+               tosend = min(count, (VCC_BUFF_LEN - vcc->chars_in_buffer));
+               /*
+                * No more space, this probably means that the last call to
+                * vcc_write() didn't succeed and the buffer was filled up.
+                */
+               if (!tosend)
+                       break;
+
+               memcpy(&pkt->data[vcc->chars_in_buffer],
+                       &buf[total_sent],
+                       tosend);
+
+               vcc->chars_in_buffer += tosend;
+
+               /* stype carries the payload length in RAW mode */
+               pkt->tag.stype = tosend;
+               vccdbg("TAG [%02x:%02x:%04x:%08x]\n",
+                      pkt->tag.type,
+                      pkt->tag.stype,
+                      pkt->tag.stype_env,
+                      pkt->tag.sid);
+               vccdbg("DATA [%s]\n", pkt->data);
+               vccdbgl(vcc->vio.lp);
+
+               /* won't send partial writes */
+               rv = ldc_write(vcc->vio.lp, pkt, VIO_TAG_SIZE + tosend);
+               vccdbg("%s: ldc_write(%ld)=%d\n", __func__,
+                      VIO_TAG_SIZE + tosend, rv);
+
+               /*
+                * Since we know we have enough room in vcc->buffer for
+                * tosend we record that it was sent regardless of whether the
+                * hypervisor actually took it because we have it buffered.
+                */
+               total_sent += tosend;
+               count -= tosend;
+               if (rv < 0) {
+                       vcc_kick_tx(vcc);
+                       break;
+               }
+
+               vcc->chars_in_buffer = 0;
+       }
+
+       spin_unlock_irqrestore(&vcc->lock, flags);
+
+       vccdbg("%s: total=%d rv=%d\n", __func__, total_sent, rv);
+
+       return total_sent ? total_sent : rv;
+}
+
+/* tty write_room callback: free space left in the staging buffer. */
+static int vcc_write_room(struct tty_struct *tty)
+{
+       struct vcc *vcc = container_of(tty->port, struct vcc, port);
+
+       return VCC_BUFF_LEN - vcc->chars_in_buffer;
+}
+
+/* tty chars_in_buffer callback: bytes staged but not yet delivered. */
+static int vcc_chars_in_buffer(struct tty_struct *tty)
+{
+       struct vcc *vcc = container_of(tty->port, struct vcc, port);
+
+       return vcc->chars_in_buffer;
+}
+
+/*
+ * tty break_ctl callback: send a break control message to the peer
+ * when the break is turned on; turning it off is a no-op.
+ */
+static int vcc_break_ctl(struct tty_struct *tty, int state)
+{
+       struct vcc *vcc = container_of(tty->port, struct vcc, port);
+       unsigned long flags;
+
+       vccdbg("%s(%d)\n", __func__, state);
+
+       if (state == 0)         /* turn off break */
+               return 0;
+
+       spin_lock_irqsave(&vcc->lock, flags);
+
+       if (vcc_send_ctl(vcc, VCC_CTL_BREAK) < 0)
+               vcc_kick_tx(vcc);
+
+       spin_unlock_irqrestore(&vcc->lock, flags);
+
+       return 0;
+}
+
+static const struct tty_operations vcc_ops = {
+       .open = vcc_open,
+       .close = vcc_close,
+       .hangup = vcc_hangup,
+       .write = vcc_write,
+       .write_room = vcc_write_room,
+       .chars_in_buffer = vcc_chars_in_buffer,
+       .break_ctl = vcc_break_ctl
+};
+
+/*
+ * We want to dynamically manage our ports through the tty_port_*
+ * interfaces so we allocate and register/unregister on our own.
+ */
+#define        VCC_TTY_FLAGS   (TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_REAL_RAW)
+
+/*
+ * Allocate and register the vcc tty driver.
+ *
+ * Note: tty_alloc_driver() returns ERR_PTR() on failure, not NULL, so
+ * the original NULL check could never fire.  Also, a driver whose
+ * registration failed must only be put, never unregistered.
+ */
+static int vcc_tty_init(void)
+{
+       int rv;
+
+       vcc_tty_driver = tty_alloc_driver(VCC_MAX_PORTS, VCC_TTY_FLAGS);
+       if (IS_ERR(vcc_tty_driver)) {
+               rv = PTR_ERR(vcc_tty_driver);
+               vcc_tty_driver = NULL;
+               pr_err("%s: tty driver alloc failed\n", __func__);
+               return rv;
+       }
+
+       vcc_tty_driver->driver_name = vcc_driver_name;
+       vcc_tty_driver->name = vcc_device_node;
+
+       /*
+        * We'll let the system assign us a major number, indicated by leaving
+        * it blank.
+        */
+       vcc_tty_driver->minor_start = VCC_MINOR_START;
+       vcc_tty_driver->type = TTY_DRIVER_TYPE_SYSTEM;
+       vcc_tty_driver->init_termios = vcc_tty_termios;
+
+       tty_set_operations(vcc_tty_driver, &vcc_ops);
+
+       /*
+        * The following call will result in sysfs entries that denote the
+        * dynamically assigned major and minor numbers for our devices.
+        */
+       rv = tty_register_driver(vcc_tty_driver);
+       if (rv) {
+               pr_err("%s: tty driver register failed\n", __func__);
+               put_tty_driver(vcc_tty_driver);
+               vcc_tty_driver = NULL;
+               return rv;
+       }
+
+       vccdbg("%s: tty driver registered\n", __func__);
+
+       return 0;
+}
+
+/* Unregister and release the vcc tty driver. */
+static void vcc_tty_exit(void)
+{
+       vccdbg("%s\n", __func__);
+
+       tty_unregister_driver(vcc_tty_driver);
+       put_tty_driver(vcc_tty_driver);
+       vccdbg("%s: tty driver unregistered\n", __func__);
+
+       vcc_tty_driver = NULL;
+}
+
+/*
+ * Module init: register the tty driver first (probe needs it), then
+ * the vio driver.  The tty driver is torn down again if vio
+ * registration fails.
+ */
+static int __init vcc_init(void)
+{
+       int rv;
+
+       vccdbg("%s\n", __func__);
+
+       rv = vcc_tty_init();
+       if (rv) {
+               pr_err("%s: vcc_tty_init failed (%d)\n", __func__, rv);
+               return rv;
+       }
+
+       rv = vio_register_driver(&vcc_driver);
+       if (rv) {
+               pr_err("%s: vcc driver register failed (%d)\n", __func__, rv);
+               vcc_tty_exit();
+       } else {
+               vccdbg("%s: vcc driver registered\n", __func__);
+       }
+
+       return rv;
+}
+
+/*
+ * Module exit: unregister the vio driver (removes every probed port),
+ * then the tty driver - the reverse of vcc_init().
+ */
+static void __exit vcc_exit(void)
+{
+       vccdbg("%s\n", __func__);
+       vio_unregister_driver(&vcc_driver);
+       vccdbg("%s: vcc vio driver unregistered\n", __func__);
+       vcc_tty_exit();
+       vccdbg("%s: vcc tty driver unregistered\n", __func__);
+}
+
+module_init(vcc_init);
+module_exit(vcc_exit);
diff --git a/include/linux/ds.h b/include/linux/ds.h
new file mode 100644 (file)
index 0000000..021acb3
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2015 Oracle Corporation
+ */
+
+#ifndef _DS_H
+#define _DS_H
+
+#include <uapi/linux/ds.h>
+
+typedef u64    ds_svc_hdl_t;
+typedef void   *ds_cb_arg_t;
+
+typedef struct ds_ver {
+       u64     major;
+       u64     minor;
+} ds_ver_t;
+
+/*
+ * Domain Services Capability
+ *
+ * A DS capability is exported by a provider using a unique service
+ * identifier string. Along with this identifier the highest
+ * version that the capability that the client supports. It is
+ * assumed that the capability supports this specified version or
+ * any lower version (down to 1.0). The service may be negotiated to
+ * register at this specified version or at a lower version.
+ */
+typedef struct ds_capability {
+       char            *svc_id;        /* service identifier */
+       ds_ver_t        vers;           /* supported version */
+} ds_capability_t;
+
+/*
+ * Domain Services Client Event Callbacks
+ *
+ * A client implementing a DS capability provides a set of callbacks
+ * when it registers with the DS framework. The use of these callbacks
+ * is described below:
+ *
+ *    ds_reg_cb()
+ *
+ *         The ds_reg_cb() callback is invoked when the DS framework
+ *         has successfully completed version negotiation with the
+ *         remote endpoint for the capability. The cb also passes the
+ *         negotiated version of the service.
+ *
+ *    ds_unreg_cb()
+ *
+ *         The ds_unreg_cb() callback is invoked when the DS framework
+ *         detects an event that causes the registered capability to
+ *         become unavailable. This includes an explicit unregister
+ *         message, a failure in the underlying communication transport,
+ *         etc. Any such event invalidates the service handle that was
+ *         received from the register callback. Once this callback has
+ *         been made, the client must re-register (unreg+reg) the service.
+ *
+ *    ds_data_cb()
+ *
+ *         The ds_data_cb() callback is invoked whenever there is an
+ *         incoming data message for the client to process. It provides
+ *         the contents of the message along with the message length.
+ */
+typedef struct ds_ops {
+       void (*ds_reg_cb)(ds_cb_arg_t arg, ds_svc_hdl_t hdl, ds_ver_t *ver);
+       void (*ds_unreg_cb)(ds_cb_arg_t arg, ds_svc_hdl_t hdl);
+       void (*ds_data_cb)(ds_cb_arg_t arg, ds_svc_hdl_t hdl,
+           void *buf, size_t buflen);
+       ds_cb_arg_t     cb_arg; /* optional arg to ops - can be NULL */
+} ds_ops_t;
+
+/*
+ * Domain Services Capability Interface
+ */
+extern int ds_cap_init(ds_capability_t *cap, ds_ops_t *ops, u32 flags,
+       u64 domain_handle, ds_svc_hdl_t *hdlp);
+extern int ds_cap_fini(ds_svc_hdl_t hdl);
+extern int ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t buflen);
+
+#define DS_CAP_IS_CLIENT       0x0001 /* client service */
+#define DS_CAP_IS_PROVIDER     0x0002 /* provider service */
+#define DS_TARGET_IS_DOMAIN    0x0004 /* domain target */
+
+#endif /* _DS_H */
diff --git a/include/linux/vldc.h b/include/linux/vldc.h
new file mode 100644 (file)
index 0000000..cf269f7
--- /dev/null
@@ -0,0 +1,10 @@
+/*
+ * Copyright (C) 2014 Oracle Corporation
+ */
+
+#ifndef _VLDC_H
+#define _VLDC_H
+
+#include <uapi/linux/vldc.h>
+
+#endif /* _VLDC_H */
diff --git a/include/linux/vlds.h b/include/linux/vlds.h
new file mode 100644 (file)
index 0000000..e4c2420
--- /dev/null
@@ -0,0 +1,10 @@
+/*
+ * Copyright (C) 2015 Oracle Corporation
+ */
+
+#ifndef _VLDS_H
+#define _VLDS_H
+
+#include <uapi/linux/vlds.h>
+
+#endif /* _VLDS_H */
diff --git a/include/uapi/linux/ds.h b/include/uapi/linux/ds.h
new file mode 100644 (file)
index 0000000..c737098
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2015 Oracle Corporation
+ */
+
+#ifndef _UAPI_DS_H
+#define _UAPI_DS_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define        DS_MAJOR_VERSION        1
+#define        DS_MINOR_VERSION        0
+
+#define        DS_SPTOK_TOKEN_LEN      20      /* SP token length */
+
+#define        DS_MAX_DOM_NAME_LEN     256     /* Max length of DS domain name */
+#define        DS_MAX_SVC_NAME_LEN     256     /* Max length of DS service name */
+
+#define        DS_SP_NAME              "sp"    /* name assigned to the SP DS dev */
+
+typedef struct ds_sptok {
+       __u32   ds_sptok_ipaddr;        /* IP address on SP */
+       __u32   ds_sptok_portid;        /* Port number on SP */
+       __u8    ds_sptok_token[DS_SPTOK_TOKEN_LEN];
+} ds_sptok_t;
+
+typedef struct ds_ioctl_sptok_data {
+       __u32           major_version;
+       __u32           minor_version;
+       char            service_name[DS_MAX_SVC_NAME_LEN];
+       ds_sptok_t      sp_tok;
+} ds_ioctl_sptok_data_t;
+
+#define DS_IOCTL_BASE          'D'
+
+#define DS_SPTOK_GET   _IOR(DS_IOCTL_BASE, 1, ds_ioctl_sptok_data_t)
+
+#endif /* _UAPI_DS_H */
diff --git a/include/uapi/linux/vldc.h b/include/uapi/linux/vldc.h
new file mode 100644 (file)
index 0000000..2d821da
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 Oracle Corporation
+ */
+
+#ifndef _UAPI_VLDC_H
+#define _UAPI_VLDC_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct vldc_data_t {
+       __u64           src_addr;       /* source buffer address */
+       __u64           dst_addr;       /* destination buffer address */
+       __u64           length;         /* number of bytes to transfer */
+};
+
+#define VLDC_IOCTL_BASE                'V'
+
+#define VLDC_IOCTL_READ_COOKIE _IOR(VLDC_IOCTL_BASE, 1, struct vldc_data_t)
+#define VLDC_IOCTL_WRITE_COOKIE        _IOW(VLDC_IOCTL_BASE, 2, struct vldc_data_t)
+
+#endif /* _UAPI_VLDC_H */
diff --git a/include/uapi/linux/vlds.h b/include/uapi/linux/vlds.h
new file mode 100644 (file)
index 0000000..5b8d12b
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2015 Oracle Corporation
+ */
+
+#ifndef _UAPI_VLDS_H
+#define _UAPI_VLDS_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define        VLDS_DEV_DIR    "/dev/vlds"
+
+#define        VLDS_DEV_DOMAIN_FILENAME_TAG    "host:"
+
+/* String arguments to ioctl */
+typedef struct vlds_string_arg {
+       __u64   vlds_strp;      /* user pointer to string */
+       __u64   vlds_strlen;    /* string length */
+} vlds_string_t;
+#define        VLDS_MAX_NAMELEN        256
+
+/* Version (used by VLDS_IOCTL_SVC_REG) */
+typedef struct vlds_ver {
+       __u16   vlds_major;
+       __u16   vlds_minor;
+} vlds_ver_t;
+
+/* Capability structure (used by VLDS_IOCTL_SVC_REG) */
+typedef struct vlds_cap {
+       vlds_string_t   vlds_service;
+       vlds_ver_t      vlds_vers; /* max supported version */
+} vlds_cap_t;
+
+typedef struct vlds_svc_reg_arg {
+       __u64   vlds_hdlp;      /* DS Service Handle ptr. (returned) */
+       __u64   vlds_capp;      /* DS Capability Structure ptr. */
+       __u64   vlds_reg_flags; /* DS reg flags */
+} vlds_svc_reg_arg_t;
+
+/* vlds_reg_flags */
+#define        VLDS_REG_CLIENT         0x01    /* Register as client */
+#define        VLDS_REG_EVENT          0x02    /* Event driven service - not polled */
+
+typedef struct vlds_unreg_hdl_arg {
+       __u64   vlds_hdl;       /* DS Service Handle */
+} vlds_unreg_hdl_arg_t;
+
+typedef struct vlds_hdl_lookup_arg {
+       vlds_string_t   vlds_service;   /* DS Service Name */
+       __u64   vlds_isclient;  /* DS Client flag */
+       __u64   vlds_hdlsp;     /* DS Handle array ptr */
+       __u64   vlds_maxhdls;   /* DS Max no. of hdls to return */
+       __u64   vlds_nhdlsp;    /* DS No. of hdls returned */
+} vlds_hdl_lookup_arg_t;
+
+typedef struct vlds_dmn_lookup_arg {
+       __u64   vlds_dhdlp;     /* DS Domain hdl ptr. (returned) */
+       vlds_string_t   vlds_dname; /* DS Domain name (returned) */
+} vlds_dmn_lookup_arg_t;
+
+typedef struct vlds_send_msg_arg {
+       __u64   vlds_hdl;       /* DS Service Handle */
+       __u64   vlds_bufp;      /* buffer */
+       __u64   vlds_buflen;    /* message length/buffer size */
+} vlds_send_msg_arg_t;
+#define VLDS_MAX_SENDBUF_LEN   65535 /* 64k max buf size */
+
+typedef struct vlds_recv_msg_arg {
+       __u64   vlds_hdl;       /* DS Service Handle */
+       __u64   vlds_bufp;      /* buffer */
+       __u64   vlds_buflen;    /* message length/buffer size */
+       __u64   vlds_msglenp;   /* ptr to returned message length */
+} vlds_recv_msg_arg_t;
+
+typedef struct vlds_hdl_state {
+       __u64   state;
+       vlds_ver_t vlds_vers; /* negotiated version */
+} vlds_hdl_state_t;
+
+typedef struct vlds_hdl_get_state_arg {
+       __u64   vlds_hdl;       /* DS Service Handle */
+       __u64   vlds_statep;    /* Ptr to vlds_hdl_state */
+} vlds_hdl_get_state_arg_t;
+#define VLDS_HDL_STATE_NOT_YET_CONNECTED       0x0
+#define VLDS_HDL_STATE_CONNECTED               0x1
+#define VLDS_HDL_STATE_DISCONNECTED            0x2
+
+typedef struct vlds_set_event_fd_arg {
+       __s32   fd;             /* eventfd() fd used by process */
+} vlds_set_event_fd_arg_t;
+
+typedef struct vlds_get_next_event_arg {
+       __u64   vlds_hdlp;      /* Event Service Handle (returned) */
+       __u64   vlds_event_typep; /* Reg, Unreg or Data event? (returned) */
+       __u64   neg_versp;      /* reg event negotiated version (returned) */
+       __u64   vlds_bufp;      /* data event msg buffer (returned) */
+       __u64   vlds_buflen;    /* data event msg buffer size */
+       __u64   vlds_msglenp;   /* data event returned msg length (returned) */
+} vlds_get_next_event_arg_t;
+/* event types returned in event_typep field */
+#define        VLDS_EVENT_TYPE_REG                     0x0
+#define        VLDS_EVENT_TYPE_UNREG                   0x1
+#define        VLDS_EVENT_TYPE_DATA                    0x2
+
+#define VLDS_IOCTL_BASE                'D'
+
+#define        VLDS_IOCTL_SVC_REG      _IOWR(VLDS_IOCTL_BASE, 1, \
+                                    struct vlds_svc_reg_arg)
+#define        VLDS_IOCTL_UNREG_HDL    _IOW(VLDS_IOCTL_BASE, 2, \
+                                    struct vlds_unreg_hdl_arg)
+#define        VLDS_IOCTL_HDL_LOOKUP   _IOR(VLDS_IOCTL_BASE, 3, \
+                                    struct vlds_hdl_lookup_arg)
+#define        VLDS_IOCTL_DMN_LOOKUP   _IOR(VLDS_IOCTL_BASE, 4, \
+                                    struct vlds_dmn_lookup_arg)
+#define        VLDS_IOCTL_SEND_MSG     _IOW(VLDS_IOCTL_BASE, 5, \
+                                    struct vlds_send_msg_arg)
+#define        VLDS_IOCTL_RECV_MSG     _IOR(VLDS_IOCTL_BASE, 6, \
+                                    struct vlds_recv_msg_arg)
+#define        VLDS_IOCTL_HDL_GET_STATE _IOR(VLDS_IOCTL_BASE, 7, \
+                                    struct vlds_hdl_get_state_arg)
+
+/* start Linux specific ioctls at 32 */
+#define        VLDS_IOCTL_SET_EVENT_FD _IOW(VLDS_IOCTL_BASE, 32, \
+                                    struct vlds_set_event_fd_arg)
+#define        VLDS_IOCTL_UNSET_EVENT_FD _IO(VLDS_IOCTL_BASE, 33)
+#define        VLDS_IOCTL_GET_NEXT_EVENT _IOR(VLDS_IOCTL_BASE, 34, \
+                                    struct vlds_get_next_event_arg)
+
+#endif /* _UAPI_VLDS_H */
+
+