From 66bdec675259e61334bc40bac7dfbad40c39d04d Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 21 Sep 2015 15:54:06 +0800 Subject: [PATCH] ocfs2: o2hb: don't negotiate if last hb fail Sometimes an I/O error is returned when storage is down for a while. For example, for an iSCSI device, storage is taken offline when the session times out, and this makes all I/O return -EIO. In this case, nodes shouldn't negotiate the timeout but should fence themselves. So let nodes fence themselves when o2hb_do_disk_heartbeat returns an error; this is the same behavior as o2hb without the negotiate timer. Oracle-bug: 21862940 Signed-off-by: Junxiao Bi Reviewed-by: Ryan Ding --- fs/ocfs2/cluster/heartbeat.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 35a084d9e914..d5c19d9b27ea 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -283,6 +283,9 @@ struct o2hb_region { /* Message key for negotiate timeout message. */ unsigned int hr_key; struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; struct o2hb_bio_wait_ctxt { @@ -396,6 +399,12 @@ static void o2hb_nego_timeout(struct work_struct *work) unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; int master_node, i, ret; + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); /* lowest node as master node to make negotiate decision. */ master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); @@ -1253,6 +1262,7 @@ static int o2hb_thread(void *data) do_gettimeofday(&before_hb); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); -- 2.50.1