[PATCH] UBUNTU: Update drbd to 8.3.0

From: Ante <ivoks@ubuntu.com>
Date: Sat Jan 17 09:03:17 GMT 2009


Signed-off-by: Ante <ivoks@ubuntu.com>
---
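Notes (not part of the commit message): the bulk of this update is the
rewrite of struct drbd_bitmap.  The single vmalloc()ed word array ("bm")
is replaced by an array of individually allocated pages ("bm_pages"),
mapped on demand with kmap_atomic().  Below is a minimal sketch of the
access pattern the new code uses throughout; bm_map_paddr(), bm_unmap()
and MLPP() are taken from the patch, while read_bm_word() itself is a
hypothetical helper shown only to illustrate the idiom:

	/* A page holds LWPP == PAGE_SIZE/sizeof(long) long words, so
	 * bm_map_paddr() kmap_atomic()s page number
	 * offset >> (PAGE_SHIFT - LN2_BPL + 3) and MLPP() indexes
	 * within it. */
	static unsigned long read_bm_word(struct drbd_bitmap *b, size_t offset)
	{
		unsigned long *p_addr, word;

		p_addr = bm_map_paddr(b, offset);  /* map page holding 'offset' */
		word = *(p_addr + MLPP(offset));   /* word within that page */
		bm_unmap(p_addr);                  /* kunmap_atomic() again */
		return word;
	}
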
 ubuntu/drbd/BOM                    |    2 +-
 ubuntu/drbd/Makefile               |    4 +-
 ubuntu/drbd/drbd_actlog.c          |  138 ++--
 ubuntu/drbd/drbd_bitmap.c          |  818 ++++++++++++++-------
 ubuntu/drbd/drbd_buildtag.c        |    6 +-
 ubuntu/drbd/drbd_int.h             |  556 +++++++++-----
 ubuntu/drbd/drbd_main.c            |  992 ++++++++++++++++---------
 ubuntu/drbd/drbd_nl.c              |  461 +++++++++---
 ubuntu/drbd/drbd_proc.c            |   61 +-
 ubuntu/drbd/drbd_receiver.c        | 1489 +++++++++++++++++++++++++-----------
 ubuntu/drbd/drbd_req.c             |  148 +++--
 ubuntu/drbd/drbd_req.h             |   12 +-
 ubuntu/drbd/drbd_strings.c         |    2 +-
 ubuntu/drbd/drbd_worker.c          |  344 +++++++--
 ubuntu/drbd/drbd_wrappers.h        |   65 ++-
 ubuntu/drbd/linux/drbd.h           |   67 +-
 ubuntu/drbd/linux/drbd_config.h    |   14 +-
 ubuntu/drbd/linux/drbd_limits.h    |   12 +-
 ubuntu/drbd/linux/drbd_nl.h        |   15 +-
 ubuntu/drbd/linux/drbd_tag_magic.h |    2 +-
 ubuntu/drbd/lru_cache.c            |   26 +-
 ubuntu/drbd/lru_cache.h            |    6 +-
 22 files changed, 3587 insertions(+), 1653 deletions(-)
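
Also visible in the diffstat above: the bitmap lock bookkeeping moves
from recording file/line to recording a reason string plus the owning
task, and drbd_bm_lock() now takes the bm_change semaphore itself
(drbd_bm_resize() is converted from open-coded down()/up() to this
pair).  A sketch of the new calling convention, drawn from the
drbd_bm_resize() hunk; the function body here is illustrative only:

	int example_resize(struct drbd_conf *mdev, sector_t capacity)
	{
		int err = 0;

		drbd_bm_lock(mdev, "resize"); /* takes bm_change, records why/task */
		/* ... reallocate bm_pages, update bm_bits/bm_words ... */
		drbd_bm_unlock(mdev);         /* clears why/task, releases bm_change */
		return err;
	}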

diff --git a/ubuntu/drbd/BOM b/ubuntu/drbd/BOM
index 529693f..fe895b9 100644
--- a/ubuntu/drbd/BOM
+++ b/ubuntu/drbd/BOM
@@ -1,2 +1,2 @@
 Downloaded from:	http://oss.linbit.com/drbd/
-Current Version:	8.2.6
+Current Version:	8.3.0
diff --git a/ubuntu/drbd/Makefile b/ubuntu/drbd/Makefile
index 90b79ab..fa81f99 100644
--- a/ubuntu/drbd/Makefile
+++ b/ubuntu/drbd/Makefile
@@ -1,7 +1,9 @@
+#CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror
+
 drbd-objs  :=	drbd_buildtag.o drbd_bitmap.o drbd_proc.o \
 		drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \
 		lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o
 
 EXTRA_CFLAGS += -I$(src)
 
-#obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/ubuntu/drbd/drbd_actlog.c b/ubuntu/drbd/drbd_actlog.c
index 7cf95bb..0283910 100644
--- a/ubuntu/drbd/drbd_actlog.c
+++ b/ubuntu/drbd/drbd_actlog.c
@@ -48,7 +48,7 @@ STATIC int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 	md_io.error = 0;
 
 	if (rw == WRITE && !test_bit(MD_NO_BARRIER, &mdev->flags))
-	    rw |= (1<<BIO_RW_BARRIER);
+		rw |= (1<<BIO_RW_BARRIER);
 	rw |= (1 << BIO_RW_SYNC);
 
  retry:
@@ -64,18 +64,20 @@ STATIC int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 
 	dump_internal_bio("Md", mdev, bio, 0);
 
-	if (FAULT_ACTIVE(mdev, (rw & WRITE)? DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD))
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
 		submit_bio(rw, bio);
 	wait_for_completion(&md_io.event);
-	ok = bio_flagged(bio, BIO_UPTODATE);
+	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
 
-	/* check for unsupported barrier op */
-	if (unlikely(md_io.error == -EOPNOTSUPP && bio_barrier(bio))) {
+	/* check for unsupported barrier op.
+	 * would rather check on EOPNOTSUPP, but that is not reliable.
+	 * don't try again for ANY return value != 0 */
+	if (unlikely(bio_barrier(bio) && !ok)) {
 		/* Try again with no barrier */
-		DRBD_WARN("Barriers not supported on meta data device - disabling\n");
-		set_bit(MD_NO_BARRIER,&mdev->flags);
+		drbd_WARN("Barriers not supported on meta data device - disabling\n");
+		set_bit(MD_NO_BARRIER, &mdev->flags);
 		rw &= ~(1 << BIO_RW_BARRIER);
 		bio_put(bio);
 		goto retry;
@@ -113,16 +115,16 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 			if (!page)
 				return 0;
 
-			DRBD_WARN("Meta data's bdev hardsect = %d != %d\n",
+			drbd_WARN("Meta data's bdev hardsect = %d != %d\n",
 			     hardsect, MD_HARDSECT);
-			DRBD_WARN("Workaround engaged (has performace impact).\n");
+			drbd_WARN("Workaround engaged (has performace impact).\n");
 
 			mdev->md_io_tmpp = page;
 		}
 
-		mask = ( hardsect / MD_HARDSECT ) - 1;
-		D_ASSERT( mask == 1 || mask == 3 || mask == 7 );
-		D_ASSERT( hardsect == (mask+1) * MD_HARDSECT );
+		mask = (hardsect / MD_HARDSECT) - 1;
+		D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+		D_ASSERT(hardsect == (mask+1) * MD_HARDSECT);
 		offset = sector & mask;
 		sector = sector & ~mask;
 		iop = mdev->md_io_tmpp;
@@ -227,9 +229,9 @@ struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
 	/*
 	if (!al_ext) {
 		if (al_flags & LC_STARVING)
-			DRBD_WARN("Have to wait for LRU element (AL too small?)\n");
+			drbd_WARN("Have to wait for LRU element (AL too small?)\n");
 		if (al_flags & LC_DIRTY)
-			DRBD_WARN("Ongoing AL update (AL device too slow?)\n");
+			drbd_WARN("Ongoing AL update (AL device too slow?)\n");
 	}
 	*/
 
@@ -253,7 +255,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
 		    (int)BM_SECT_TO_EXT(sector));
 	       );
 
-	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)) );
+	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
 
 	if (al_ext->lc_number != enr) {
 		/* drbd_al_write_transaction(mdev,al_ext,enr);
@@ -313,20 +315,20 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
 int
 w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
-	struct update_al_work *aw = (struct update_al_work*)w;
+	struct update_al_work *aw = (struct update_al_work *)w;
 	struct lc_element *updated = aw->al_ext;
 	const unsigned int new_enr = aw->enr;
 	const unsigned int evicted = aw->old_enr;
 
-	struct al_transaction* buffer;
+	struct al_transaction *buffer;
 	sector_t sector;
-	int i,n,mx;
+	int i, n, mx;
 	unsigned int extent_nr;
-	u32 xor_sum=0;
+	u32 xor_sum = 0;
 
 	if (!inc_local(mdev)) {
 		ERR("inc_local() failed in w_al_write_transaction\n");
-		complete(&((struct update_al_work*)w)->event);
+		complete(&((struct update_al_work *)w)->event);
 		return 1;
 	}
 	/* do we have to do a bitmap write, first?
@@ -414,9 +416,9 @@ STATIC int drbd_al_read_tr(struct drbd_conf *mdev,
 	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
 		return -1;
 
-	rv = ( be32_to_cpu(b->magic) == DRBD_MAGIC );
+	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
 
-	for (i = 0; i < AL_EXTENTS_PT+1; i++)
+	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
 		xor_sum ^= be32_to_cpu(b->updates[i].extent);
 	rv &= (xor_sum == be32_to_cpu(b->xor_sum));
 
@@ -461,7 +463,6 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 			return 0;
 		}
 		cnr = be32_to_cpu(buffer->tr_number);
-		/* INFO("index %d valid tnr=%d\n",i,cnr); */
 
 		if (cnr == -1)
 			overflow = 1;
@@ -477,7 +478,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	}
 
 	if (from == -1 || to == -1) {
-		DRBD_WARN("No usable activity log found.\n");
+		drbd_WARN("No usable activity log found.\n");
 
 		up(&mdev->md_io_mutex);
 		return 1;
@@ -560,8 +561,7 @@ STATIC BIO_ENDIO_TYPE atodb_endio BIO_ENDIO_ARGS(struct bio *bio, int error)
 	BIO_ENDIO_FN_START;
 	/* strange behaviour of some lower level drivers...
 	 * fail the request by clearing the uptodate flag,
-	 * but do not return any error?!
-	 * do we want to DRBD_WARN() on this? */
+	 * but do not return any error?! */
 	if (!error && !uptodate)
 		error = -EIO;
 
@@ -606,7 +606,7 @@ STATIC int atodb_prepare_unless_covered(struct drbd_conf *mdev,
 	 * the last invocation iterates over all bios,
 	 * and finds the last NULL entry.
 	 */
-	while ( (bio = bios[i]) ) {
+	while ((bio = bios[i])) {
 		if (bio->bi_sector == on_disk_sector)
 			return 0;
 		i++;
@@ -632,9 +632,9 @@ STATIC int atodb_prepare_unless_covered(struct drbd_conf *mdev,
 	}
 
 	offset = S2W(enr);
-	drbd_bm_get_lel( mdev, offset,
-			 min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
-			 kmap(page) + page_offset );
+	drbd_bm_get_lel(mdev, offset,
+			min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+			kmap(page) + page_offset);
 	kunmap(page);
 
 	bio->bi_private = wc;
@@ -713,7 +713,7 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 	for (i = 0; i < nr_elements; i++) {
 		if (bios[i] == NULL)
 			break;
-		if (FAULT_ACTIVE( mdev, DRBD_FAULT_MD_WR )) {
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
 			bios[i]->bi_rw = WRITE;
 			bio_endio(bios[i], -EIO);
 		} else {
@@ -723,18 +723,17 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 
 	drbd_blk_run_queue(bdev_get_queue(mdev->bc->md_bdev));
 
+	/* always (try to) flush bitmap to stable storage */
+	drbd_md_flush(mdev);
+
 	/* In case we did not submit a single IO do not wait for
 	 * them to complete. ( Because we would wait forever here. )
 	 *
 	 * In case we had IOs and they are already complete, there
 	 * is not point in waiting anyways.
 	 * Therefore this if () ... */
-	if (atomic_read(&wc.count)) {
+	if (atomic_read(&wc.count))
 		wait_for_completion(&wc.io_done);
-		/* flush bitmap to stable storage */
-		if (!test_bit(MD_NO_BARRIER, &mdev->flags))
-			blkdev_issue_flush(mdev->bc->md_bdev, NULL);
-	}
 
 	dec_local(mdev);
 
@@ -751,7 +750,7 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 	kfree(bios);
 
  submit_one_by_one:
-	DRBD_WARN("Using the slow drbd_al_to_on_disk_bm()\n");
+	drbd_WARN("Using the slow drbd_al_to_on_disk_bm()\n");
 
 	for (i = 0; i < mdev->act_log->nr_elements; i++) {
 		enr = lc_entry(mdev->act_log, i)->lc_number;
@@ -759,7 +758,7 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 			continue;
 		/* Really slow: if we have al-extents 16..19 active,
 		 * sector 4 will be written four times! Synchronous! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
+		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
 	}
 
 	lc_unlock(mdev->act_log);
@@ -800,12 +799,13 @@ static inline int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
 
 	spin_lock_irq(&mdev->al_lock);
 	rv = (al_ext->refcnt == 0);
-	if (likely(rv)) lc_del(mdev->act_log, al_ext);
+	if (likely(rv))
+		lc_del(mdev->act_log, al_ext);
 	spin_unlock_irq(&mdev->al_lock);
 
-	MTRACE(TraceTypeALExts,TraceLvlMetrics,
-	       if(unlikely(!rv))
-		       INFO("Waiting for extent in drbd_al_shrink()\n");
+	MTRACE(TraceTypeALExts, TraceLvlMetrics,
+		if (unlikely(!rv))
+			INFO("Waiting for extent in drbd_al_shrink()\n");
 	       );
 
 	return rv;
@@ -821,7 +821,7 @@ void drbd_al_shrink(struct drbd_conf *mdev)
 	struct lc_element *al_ext;
 	int i;
 
-	D_ASSERT( test_bit(__LC_DIRTY, &mdev->act_log->flags) );
+	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
 
 	for (i = 0; i < mdev->act_log->nr_elements; i++) {
 		al_ext = lc_entry(mdev->act_log, i);
@@ -839,11 +839,11 @@ STATIC int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
 
 	if (!inc_local(mdev)) {
 		if (DRBD_ratelimit(5*HZ, 5))
-			DRBD_WARN("Can not update on disk bitmap, local IO disabled.\n");
+			drbd_WARN("Can not update on disk bitmap, local IO disabled.\n");
 		return 1;
 	}
 
-	drbd_bm_write_sect(mdev, udw->enr );
+	drbd_bm_write_sect(mdev, udw->enr);
 	dec_local(mdev);
 
 	kfree(udw);
@@ -853,7 +853,9 @@ STATIC int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
 		case SyncSource:  case SyncTarget:
 		case PausedSyncS: case PausedSyncT:
 			drbd_resync_finished(mdev);
-		default: /* nothing to do */;
+		default:
+			/* nothing to do */
+			break;
 		}
 	}
 	drbd_bcast_sync_progress(mdev);
@@ -907,18 +909,18 @@ STATIC void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			 * since drbd_rs_begin_io() pulled it already in.
 			 *
 			 * But maybe an application write finished, and we set
-			 * something in outside the resync lru_cache in sync.
+			 * something outside the resync lru_cache in sync.
 			 */
 			int rs_left = drbd_bm_e_weight(mdev, enr);
 			if (ext->flags != 0) {
-				DRBD_WARN("changing resync lce: %d[%u;%02lx]"
+				drbd_WARN("changing resync lce: %d[%u;%02lx]"
 				     " -> %d[%u;00]\n",
 				     ext->lce.lc_number, ext->rs_left,
 				     ext->flags, enr, rs_left);
 				ext->flags = 0;
 			}
 			if (ext->rs_failed) {
-				DRBD_WARN("Kicking resync_lru element enr=%u "
+				drbd_WARN("Kicking resync_lru element enr=%u "
 				     "out with rs_failed=%d\n",
 				     ext->lce.lc_number, ext->rs_failed);
 				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
@@ -939,7 +941,7 @@ STATIC void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 				udw->w.cb = w_update_odbm;
 				drbd_queue_work_front(&mdev->data.work, &udw->w);
 			} else {
-				DRBD_WARN("Could not kmalloc an udw\n");
+				drbd_WARN("Could not kmalloc an udw\n");
 				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 			}
 		}
@@ -974,7 +976,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
 		return;
 	}
 	nr_sectors = drbd_get_capacity(mdev->this_bdev);
-	esector = sector + (size>>9) -1;
+	esector = sector + (size >> 9) - 1;
 
 	ERR_IF(sector >= nr_sectors) return;
 	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
@@ -1011,9 +1013,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
 		if (jiffies - mdev->rs_mark_time > HZ*10) {
 			/* should be roling marks,
 			 * but we estimate only anyways. */
-			if ( mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+			if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
 			    mdev->state.conn != PausedSyncT &&
-			    mdev->state.conn != PausedSyncS ) {
+			    mdev->state.conn != PausedSyncS) {
 				mdev->rs_mark_time = jiffies;
 				mdev->rs_mark_left = drbd_bm_total_weight(mdev);
 			}
@@ -1057,7 +1059,7 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
 		return; /* no disk, no metadata, no bitmap to set bits in */
 
 	nr_sectors = drbd_get_capacity(mdev->this_bdev);
-	esector = sector + (size>>9) -1;
+	esector = sector + (size >> 9) - 1;
 
 	ERR_IF(sector >= nr_sectors)
 		goto out;
@@ -1123,7 +1125,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
 
 	if (!bm_ext) {
 		if (rs_flags & LC_STARVING)
-			DRBD_WARN("Have to wait for element"
+			drbd_WARN("Have to wait for element"
 			     " (resync LRU too small?)\n");
 		BUG_ON(rs_flags & LC_DIRTY);
 	}
@@ -1137,7 +1139,8 @@ static inline int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
 	int rv = 0;
 
 	spin_lock_irq(&mdev->al_lock);
-	if (unlikely(enr == mdev->act_log->new_number)) rv = 1;
+	if (unlikely(enr == mdev->act_log->new_number))
+		rv = 1;
 	else {
 		al_ext = lc_find(mdev->act_log, enr);
 		if (al_ext) {
@@ -1176,19 +1179,20 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
 		    (unsigned long long)sector, enr);
 	    );
 
-	sig = wait_event_interruptible( mdev->al_wait,
-			(bm_ext = _bme_get(mdev, enr)) );
+	sig = wait_event_interruptible(mdev->al_wait,
+			(bm_ext = _bme_get(mdev, enr)));
 	if (sig)
 		return 0;
 
-	if (test_bit(BME_LOCKED, &bm_ext->flags)) return 1;
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 1;
 
 	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
-		sig = wait_event_interruptible( mdev->al_wait,
-				!_is_in_al(mdev, enr*AL_EXT_PER_BM_SECT+i) );
+		sig = wait_event_interruptible(mdev->al_wait,
+				!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
 		if (sig) {
 			spin_lock_irq(&mdev->al_lock);
-			if ( lc_put(mdev->resync, &bm_ext->lce) == 0 ) {
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
 				clear_bit(BME_NO_WRITES, &bm_ext->flags);
 				mdev->resync_locked--;
 				wake_up(&mdev->al_wait);
@@ -1287,7 +1291,7 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
 		if (!bm_ext) {
 			const unsigned long rs_flags = mdev->resync->flags;
 			if (rs_flags & LC_STARVING)
-				DRBD_WARN("Have to wait for element"
+				drbd_WARN("Have to wait for element"
 				     " (resync LRU too small?)\n");
 			BUG_ON(rs_flags & LC_DIRTY);
 			goto try_again;
@@ -1357,7 +1361,7 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
 		return;
 	}
 
-	if ( lc_put(mdev->resync, (struct lc_element *)bm_ext) == 0 ) {
+	if (lc_put(mdev->resync, (struct lc_element *)bm_ext) == 0) {
 		clear_bit(BME_LOCKED, &bm_ext->flags);
 		clear_bit(BME_NO_WRITES, &bm_ext->flags);
 		mdev->resync_locked--;
@@ -1379,7 +1383,7 @@ void drbd_rs_cancel_all(struct drbd_conf *mdev)
 
 	spin_lock_irq(&mdev->al_lock);
 
-	if (inc_local_if_state(mdev,Failed)) { /* Makes sure ->resync is there. */
+	if (inc_local_if_state(mdev, Failed)) { /* Makes sure ->resync is there. */
 		lc_reset(mdev->resync);
 		dec_local(mdev);
 	}
@@ -1413,7 +1417,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
 			if (bm_ext->lce.lc_number == LC_FREE)
 				continue;
 			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
-				INFO("dropping %u in drbd_rs_del_all, aparently"
+				INFO("dropping %u in drbd_rs_del_all, apparently"
 				     " got 'synced' by application io\n",
 				     mdev->resync_wenr);
 				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
@@ -1465,7 +1469,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
 		return;
 	}
 	nr_sectors = drbd_get_capacity(mdev->this_bdev);
-	esector = sector + (size>>9) -1;
+	esector = sector + (size >> 9) - 1;
 
 	ERR_IF(sector >= nr_sectors) return;
 	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
diff --git a/ubuntu/drbd/drbd_bitmap.c b/ubuntu/drbd/drbd_bitmap.c
index ef57d3a..0d321ba 100644
--- a/ubuntu/drbd/drbd_bitmap.c
+++ b/ubuntu/drbd/drbd_bitmap.c
@@ -40,7 +40,6 @@
  * Note that since find_first_bit returns int, at the current granularity of
  * the bitmap (4KB per byte), this implementation "only" supports up to
  * 1<<(32+12) == 16 TB...
- * other shortcomings in the meta data area may reduce this even further.
  *
  * we will eventually change the implementation to not allways hold the full
  * bitmap in memory, but only some 'lru_cache' of the on disk bitmap.
@@ -52,12 +51,13 @@
 
 /*
  * NOTE
- *  Access to the *bm is protected by bm_lock.
+ *  Access to the *bm_pages is protected by bm_lock.
  *  It is safe to read the other members within the lock.
  *
  *  drbd_bm_set_bits is called from bio_endio callbacks,
  *  We may be called with irq already disabled,
  *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
  * FIXME
  *  for performance reasons, when we _know_ we have irq disabled, we should
  *  probably introduce some _in_irq variants, so we know to only spin_lock().
@@ -76,7 +76,7 @@
  *  than have two resize operations interfere somewhen.
  */
 struct drbd_bitmap {
-	unsigned long *bm;
+	struct page **bm_pages;
 	spinlock_t bm_lock;
 	/* WARNING unsigned long bm_fo and friends:
 	 * 32bit number of bit offset is just enough for 512 MB bitmap.
@@ -89,6 +89,7 @@ struct drbd_bitmap {
 	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
 	unsigned long bm_bits;
 	size_t   bm_words;
+	size_t   bm_number_of_pages;
 	sector_t bm_dev_capacity;
 	struct semaphore bm_change; /* serializes resize operations */
 
@@ -98,72 +99,129 @@ struct drbd_bitmap {
 	unsigned long  bm_flags;
 
 	/* debugging aid, in case we are still racy somewhere */
-	unsigned long  bm_line;
-	char          *bm_file;
+	char          *bm_why;
+	struct task_struct *bm_task;
 };
 
 /* definition of bits in bm_flags */
 #define BM_LOCKED 0
 #define BM_MD_IO_ERROR (BITS_PER_LONG-1) /* 31? 63? */
 
-void __drbd_bm_lock(struct drbd_conf *mdev, char *file, int line)
+static inline int bm_is_locked(struct drbd_bitmap *b)
 {
-	struct drbd_bitmap *b = mdev->bitmap;
+	return test_bit(BM_LOCKED, &b->bm_flags);
+}
 
-	spin_lock_irq(&b->bm_lock);
-	if (!__test_and_set_bit(BM_LOCKED,&b->bm_flags)) {
-		b->bm_file = file;
-		b->bm_line = line;
-	} else if (DRBD_ratelimit(5*HZ,5)) {
-		ERR("%s:%d: bitmap already locked by %s:%lu\n",
-		    file, line, b->bm_file,b->bm_line);
-		/*
-		dump_stack();
-		ERR("This is no oops, but debug stack trace only.\n");
-		ERR("If you get this often, or in reproducable situations, "
-		    "notify <drbd-devel at linbit.com>\n");
-		*/
-	}
-	spin_unlock_irq(&b->bm_lock);
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!DRBD_ratelimit(5*HZ, 5))
+		return;
+	ERR("FIXME %s in %s, bitmap locked for '%s' by %s\n",
+	    current == mdev->receiver.task ? "receiver" :
+	    current == mdev->asender.task  ? "asender"  :
+	    current == mdev->worker.task   ? "worker"   : current->comm,
+	    func, b->bm_why ?: "?",
+	    b->bm_task == mdev->receiver.task ? "receiver" :
+	    b->bm_task == mdev->asender.task  ? "asender"  :
+	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
 }
 
-void drbd_bm_unlock(struct drbd_conf *mdev)
+void drbd_bm_lock(struct drbd_conf *mdev, char *why)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	spin_lock_irq(&b->bm_lock);
-	if (!__test_and_clear_bit(BM_LOCKED,&mdev->bitmap->bm_flags)) {
-		ERR("bitmap not locked in bm_unlock\n");
-	} else {
-		/* FIXME if we got a "is already locked" previously,
-		 * we unlock here even though we actually MUST NOT do so... */
-		b->bm_file = NULL;
-		b->bm_line = -1;
+	int trylock_failed;
+
+	if (!b) {
+		ERR("FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
 	}
-	spin_unlock_irq(&b->bm_lock);
+
+	trylock_failed = down_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		DBG("%s going to '%s' but bitmap already locked for '%s' by %s\n",
+		    current == mdev->receiver.task ? "receiver" :
+		    current == mdev->asender.task  ? "asender"  :
+		    current == mdev->worker.task   ? "worker"   : "?",
+		    why, b->bm_why ?: "?",
+		    b->bm_task == mdev->receiver.task ? "receiver" :
+		    b->bm_task == mdev->asender.task  ? "asender"  :
+		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+		down(&b->bm_change);
+	}
+	if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
+		ERR("FIXME bitmap already locked in bm_lock\n");
+
+	b->bm_why  = why;
+	b->bm_task = current;
 }
 
-#if DUMP_MD >= 3
-/* debugging aid */
-void bm_end_info(struct drbd_conf *mdev, const char *where)
+void drbd_bm_unlock(struct drbd_conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	size_t w = (b->bm_bits-1) >> LN2_BPL;
+	if (!b) {
+		ERR("FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
 
-	INFO("%s: bm_set=%lu\n", where, b->bm_set);
-	INFO("bm[%d]=0x%lX\n", w, b->bm[w]);
-	w++;
+	if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
+		ERR("FIXME bitmap not locked in bm_unlock\n");
 
-	if (w < b->bm_words) {
-		D_ASSERT(w == b->bm_words -1);
-		INFO("bm[%d]=0x%lX\n", w, b->bm[w]);
-	}
+	b->bm_why  = NULL;
+	b->bm_task = NULL;
+	up(&b->bm_change);
 }
-#else
+
 #define bm_end_info(ignored...)	((void)(0))
+
+#if 0
+#define catch_oob_access_start() do {	\
+	do {				\
+		if ((bm-p_addr) >= PAGE_SIZE/sizeof(long)) { \
+			printk(KERN_ALERT "drbd_bitmap.c:%u %s: p_addr:%p bm:%p %d\n", \
+					__LINE__ , __func__ , p_addr, bm, (bm-p_addr)); \
+			break;		\
+		}
+#define catch_oob_access_end()	\
+	} while (0); } while (0)
+#else
+#define catch_oob_access_start() do {
+#define catch_oob_access_end() } while (0)
 #endif
 
+/* word offset to long pointer */
+STATIC unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+{
+	struct page *page;
+	unsigned long page_nr;
+
+	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	page = b->bm_pages[page_nr];
+
+	return (unsigned long *) kmap_atomic(page, KM_IRQ1);
+}
+
+STATIC void bm_unmap(unsigned long *p_addr)
+{
+	kunmap_atomic(p_addr, KM_IRQ1);
+};
+
 /* long word offset of _bitmap_ sector */
 #define S2W(s)	((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
 
 /*
  * actually most functions herein should take a struct drbd_bitmap*, not a
@@ -177,6 +235,85 @@ void bm_end_info(struct drbd_conf *mdev, const char *where)
  * But it is NOT strictly ok.
  */
 
+STATIC void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long i;
+	if (!pages)
+		return;
+
+	for (i = 0; i < number; i++) {
+		if (!pages[i]) {
+			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+					  "a NULL pointer; i=%lu n=%lu\n",
+					  i, number);
+			continue;
+		}
+		__free_page(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+STATIC struct page **bm_realloc_pages(struct page **old_pages,
+				       unsigned long have,
+				       unsigned long want)
+{
+	struct page** new_pages, *page;
+	unsigned int i, bytes;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* To use kmalloc here is ok, as long as we support 4TB at max...
+	 * otherwise this might become bigger than 128KB, which is
+	 * the maximum for kmalloc.
+	 *
+	 * no, it is not: on 64bit boxes, sizeof(void*) == 8,
+	 * 128MB bitmap @ 4K pages -> 256K of page pointers.
+	 * ==> use vmalloc for now again.
+	 * then again, we could do something like
+	 *   if (nr_pages > watermark) vmalloc else kmalloc :*> ...
+	 * or do cascading page arrays:
+	 *   one page for the page array of the page array,
+	 *   those pages for the real bitmap pages.
+	 *   there we could even add some optimization members,
+	 *   so we won't need to kmap_atomic in bm_find_next_bit just to see
+	 *   that the page has no bits set ...
+	 * or we can try a "huge" page ;-)
+	 */
+	bytes = sizeof(struct page*)*want;
+	new_pages = vmalloc(bytes);
+	if (!new_pages)
+		return NULL;
+
+	memset(new_pages, 0, bytes);
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			if (!(page = alloc_page(GFP_HIGHUSER))) {
+				bm_free_pages(new_pages + have, i - have);
+				vfree(new_pages);
+				return NULL;
+			}
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	return new_pages;
+}
+
 /*
  * called on driver init only. TODO call when a device is created.
  * allocates the drbd_bitmap, and stores it in mdev->bitmap.
@@ -213,7 +350,8 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
 	 *
 	WARN_ON(mdev->bitmap->bm);
 	 */
-	vfree(mdev->bitmap->bm);
+	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
+	vfree(mdev->bitmap->bm_pages);
 	kfree(mdev->bitmap);
 	mdev->bitmap = NULL;
 }
@@ -225,46 +363,75 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
  */
 STATIC int bm_clear_surplus(struct drbd_bitmap *b)
 {
-	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1;
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
 	size_t w = b->bm_bits >> LN2_BPL;
 	int cleared = 0;
+	unsigned long *p_addr, *bm;
 
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
 	if (w < b->bm_words) {
-		cleared = hweight_long(b->bm[w] & ~mask);
-		b->bm[w++] &= mask;
+		catch_oob_access_start();
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		catch_oob_access_end();
+		w++; bm++;
 	}
 
 	if (w < b->bm_words) {
-		cleared += hweight_long(b->bm[w]);
-		b->bm[w++] = 0;
+		catch_oob_access_start();
+		cleared += hweight_long(*bm);
+		*bm = 0;
+		catch_oob_access_end();
 	}
-
+	bm_unmap(p_addr);
 	return cleared;
 }
 
 STATIC void bm_set_surplus(struct drbd_bitmap *b)
 {
-	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1;
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
 	size_t w = b->bm_bits >> LN2_BPL;
+	unsigned long *p_addr, *bm;
 
-	if (w < b->bm_words)
-		b->bm[w++] |= ~mask;
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
+	if (w < b->bm_words) {
+		catch_oob_access_start();
+		*bm |= ~mask;
+		bm++; w++;
+		catch_oob_access_end();
+	}
 
-	if (w < b->bm_words)
-		b->bm[w++] = ~(0UL);
+	if (w < b->bm_words) {
+		catch_oob_access_start();
+		*bm = ~(0UL);
+		catch_oob_access_end();
+	}
+	bm_unmap(p_addr);
 }
 
 STATIC unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
 {
-	unsigned long *bm = b->bm;
-	unsigned long *ep = b->bm + b->bm_words;
+	unsigned long *p_addr, *bm, offset = 0;
 	unsigned long bits = 0;
-
-	while ( bm < ep ) {
+	unsigned long i, do_now;
+
+	while (offset < b->bm_words) {
+		i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		while (i--) {
+			catch_oob_access_start();
 #ifndef __LITTLE_ENDIAN
-		if (swap_endian) *bm = lel_to_cpu(*bm);
+			if (swap_endian)
+				*bm = lel_to_cpu(*bm);
 #endif
-		bits += hweight_long(*bm++);
+			bits += hweight_long(*bm++);
+			catch_oob_access_end();
+		}
+		bm_unmap(p_addr);
+		offset += do_now;
 	}
 
 	return bits;
@@ -272,22 +439,27 @@ STATIC unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endia
 
 static inline unsigned long bm_count_bits(struct drbd_bitmap *b)
 {
-	return __bm_count_bits(b,0);
+	return __bm_count_bits(b, 0);
 }
 
 static inline unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
 {
-	return __bm_count_bits(b,1);
+	return __bm_count_bits(b, 1);
 }
 
-
-void _drbd_bm_recount_bits(struct drbd_conf *mdev, char* file, int line)
+void _drbd_bm_recount_bits(struct drbd_conf *mdev, char *file, int line)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long flags, bits;
 
 	ERR_IF(!b) return;
 
+	/* IMO this should be inside drbd_bm_lock/unlock.
+	 * Unfortunately it is used outside of the locks.
+	 * And I'm not yet sure where we need to place the
+	 * lock/unlock correctly.
+	 */
+
 	spin_lock_irqsave(&b->bm_lock, flags);
 	bits = bm_count_bits(b);
 	if (bits != b->bm_set) {
@@ -298,8 +470,38 @@ void _drbd_bm_recount_bits(struct drbd_conf *mdev, char* file, int line)
 	spin_unlock_irqrestore(&b->bm_lock, flags);
 }
 
+/* offset and len in long words.*/
+STATIC void bm_memset(struct drbd_bitmap * b, size_t offset, int c, size_t len)
+{
+	unsigned long *p_addr, *bm;
+	size_t do_now, end;
+
 #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
 
+	end = offset + len;
+
+	if (end > b->bm_words) {
+		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		return;
+	}
+
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		catch_oob_access_start();
+		if (bm+do_now > p_addr + LWPP) {
+			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       p_addr, bm, (int)do_now);
+			break; /* breaks to after catch_oob_access_end() only! */
+		}
+		memset(bm, c, do_now * sizeof(long));
+		catch_oob_access_end();
+		bm_unmap(p_addr);
+		offset += do_now;
+	}
+}
+
 /*
  * make sure the bitmap has enough room for the attached storage,
  * if neccessary, resize.
@@ -311,14 +513,14 @@ void _drbd_bm_recount_bits(struct drbd_conf *mdev, char* file, int line)
 int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long bits, bytes, words, *nbm, *obm = NULL;
+	unsigned long bits, words, owords, obits, *p_addr, *bm;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
 	int err = 0, growing;
 
 	ERR_IF(!b) return -ENOMEM;
 
-	ERR_IF (down_trylock(&b->bm_change)) {
-		down(&b->bm_change);
-	}
+	drbd_bm_lock(mdev, "resize");
 
 	INFO("drbd_bm_resize called with capacity == %llu\n",
 			(unsigned long long)capacity);
@@ -328,87 +530,93 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
 
 	if (capacity == 0) {
 		spin_lock_irq(&b->bm_lock);
-		obm = b->bm;
-		b->bm = NULL;
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
 		b->bm_fo    =
 		b->bm_set   =
 		b->bm_bits  =
 		b->bm_words =
 		b->bm_dev_capacity = 0;
 		spin_unlock_irq(&b->bm_lock);
-		goto free_obm;
-	} else {
-		bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECTORS_PER_BIT));
+		bm_free_pages(opages, onpages);
+		vfree(opages);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (inc_local(mdev)) {
+		D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12));
+		dec_local(mdev);
+	}
 
-		/* if we would use
-		   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
-		   a 32bit host could present the wrong number of words
-		   to a 64bit host.
-		*/
-		words = ALIGN(bits, 64) >> LN2_BPL;
+	/* one extra long to catch off by one errors */
+	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	have = b->bm_number_of_pages;
+	if (want == have) {
+		D_ASSERT(b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else
+		npages = bm_realloc_pages(b->bm_pages, have, want);
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
 
-		if (inc_local(mdev)) {
-			D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12));
-			dec_local(mdev);
-		}
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {
+		bm_memset(b, owords, 0xff, words-owords);
+		b->bm_set += bits - obits;
+	}
 
-		if (words == b->bm_words) {
-			/* optimize: capacity has changed,
-			 * but only within one long word worth of bits.
-			 * just update the bm_dev_capacity and bm_bits members.
-			 */
-			spin_lock_irq(&b->bm_lock);
-			b->bm_bits    = bits;
-			b->bm_dev_capacity = capacity;
-			b->bm_set -= bm_clear_surplus(b);
-			bm_end_info(mdev, __FUNCTION__ );
-			spin_unlock_irq(&b->bm_lock);
-			goto out;
-		} else {
-			/* one extra long to catch off by one errors */
-			bytes = (words+1)*sizeof(long);
-			nbm = vmalloc(bytes);
-			if (!nbm) {
-				ERR("bitmap: failed to vmalloc %lu bytes\n",
-					bytes);
-				err = -ENOMEM;
-				goto out;
-			}
-		}
-		spin_lock_irq(&b->bm_lock);
-		obm = b->bm;
-		/* brgs. move several MB within spinlock...
-		 * FIXME this should go into userspace! */
-		if (obm) {
-			bm_set_surplus(b);
-			D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC);
-			memcpy(nbm, obm, min_t(size_t, b->bm_words, words)
-								*sizeof(long));
-		}
-		growing = words > b->bm_words;
-		if (growing) {
-			/* set all newly allocated bits
-			 * start at -1, just to be sure. */
-			memset( nbm + (b->bm_words?:1)-1 , 0xff,
-				(words - ((b->bm_words?:1)-1)) * sizeof(long) );
-			b->bm_set  += bits - b->bm_bits;
-		}
-		nbm[words] = DRBD_MAGIC;
-		b->bm = nbm;
-		b->bm_bits  = bits;
-		b->bm_words = words;
-		b->bm_dev_capacity = capacity;
-		bm_clear_surplus(b);
-		if (!growing)
-			b->bm_set = bm_count_bits(b);
-		bm_end_info(mdev, __FUNCTION__ );
-		spin_unlock_irq(&b->bm_lock);
-		INFO("resync bitmap: bits=%lu words=%lu\n", bits, words);
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
 	}
- free_obm:
-	vfree(obm); /* vfree(NULL) is noop */
+
+	p_addr = bm_map_paddr(b, words);
+	bm = p_addr + MLPP(words);
+	catch_oob_access_start();
+	*bm = DRBD_MAGIC;
+	catch_oob_access_end();
+	bm_unmap(p_addr);
+
+	(void)bm_clear_surplus(b);
+	if (!growing)
+		b->bm_set = bm_count_bits(b);
+
+	bm_end_info(mdev, __FUNCTION__);
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)
+		vfree(opages);
+	INFO("resync bitmap: bits=%lu words=%lu\n", bits, words);
+
  out:
-	up(&b->bm_change);
+	drbd_bm_unlock(mdev);
 	return err;
 }
 
@@ -427,6 +635,7 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
 	unsigned long flags;
 
 	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
 	s = b->bm_set;
@@ -439,6 +648,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
 	return b->bm_words;
 }
 
@@ -450,42 +661,52 @@ unsigned long drbd_bm_bits(struct drbd_conf *mdev)
 	return b->bm_bits;
 }
 
-
 /* merge number words from buffer into the bitmap starting at offset.
  * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
  */
-void drbd_bm_merge_lel( struct drbd_conf *mdev, size_t offset, size_t number,
-			unsigned long *buffer )
+void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+			unsigned long *buffer)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *bm;
+	unsigned long *p_addr, *bm;
 	unsigned long word, bits;
-	size_t n = number;
+	size_t end, do_now;
+
+	end = offset + number;
 
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
 	if (number == 0)
 		return;
-	ERR_IF(!b) return;
-	ERR_IF(!b->bm) return;
-	WARN_ON(offset        >= b->bm_words);
-	WARN_ON(offset+number >  b->bm_words);
-	WARN_ON(number > PAGE_SIZE/sizeof(long));
+	WARN_ON(offset >= b->bm_words);
+	WARN_ON(end    >  b->bm_words);
 
 	spin_lock_irq(&b->bm_lock);
-	bm = b->bm + offset;
-	while (n--) {
-		bits = hweight_long(*bm);
-		word = *bm | lel_to_cpu(*buffer++);
-		*bm++ = word;
-		b->bm_set += hweight_long(word) - bits;
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			catch_oob_access_start();
+			bits = hweight_long(*bm);
+			word = *bm | lel_to_cpu(*buffer++);
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;
+			catch_oob_access_end();
+		}
+		bm_unmap(p_addr);
 	}
 	/* with 32bit <-> 64bit cross-platform connect
 	 * this is only correct for current usage,
 	 * where we _know_ that we are 64 bit aligned,
 	 * and know that this function is used in this way, too...
 	 */
-	if (offset+number == b->bm_words) {
+	if (end == b->bm_words) {
 		b->bm_set -= bm_clear_surplus(b);
-		bm_end_info(mdev, __FUNCTION__ );
+		bm_end_info(mdev, __func__);
 	}
 	spin_unlock_irq(&b->bm_lock);
 }
@@ -494,30 +715,39 @@ void drbd_bm_merge_lel( struct drbd_conf *mdev, size_t offset, size_t number,
  * buffer[i] will be little endian unsigned long.
  */
 void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
-		     unsigned long *buffer )
+		     unsigned long *buffer)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *bm;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
 
-	if (number == 0)
-		return;
 	ERR_IF(!b) return;
-	ERR_IF(!b->bm) return;
-	if ( (offset        >= b->bm_words) ||
-	     (offset+number >  b->bm_words) ||
-	     (number > PAGE_SIZE/sizeof(long)) ||
-	     (number <= 0) ) {
-		/* yes, there is "%z", but that gives compiler warnings... */
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))
 		ERR("offset=%lu number=%lu bm_words=%lu\n",
 			(unsigned long)	offset,
 			(unsigned long)	number,
 			(unsigned long) b->bm_words);
-		return;
+	else {
+		while (offset < end) {
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_paddr(b, offset);
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--) {
+				catch_oob_access_start();
+				*buffer++ = cpu_to_lel(*bm++);
+				catch_oob_access_end();
+			}
+			bm_unmap(p_addr);
+		}
 	}
-
-	spin_lock_irq(&b->bm_lock);
-	bm = b->bm + offset;
-	while (number--) *buffer++ = cpu_to_lel(*bm++);
 	spin_unlock_irq(&b->bm_lock);
 }
 
@@ -526,25 +756,39 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	ERR_IF(!b) return;
-	ERR_IF(!b->bm) return;
+	ERR_IF(!b->bm_pages) return;
 
 	spin_lock_irq(&b->bm_lock);
-	memset(b->bm, 0xff, b->bm_words*sizeof(long));
-	bm_clear_surplus(b);
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
 	b->bm_set = b->bm_bits;
 	spin_unlock_irq(&b->bm_lock);
 }
 
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
 static BIO_ENDIO_TYPE bm_async_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error)
 {
 	struct drbd_bitmap *b = bio->bi_private;
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
 	BIO_ENDIO_FN_START;
+
 	/* strange behaviour of some lower level drivers...
 	 * fail the request by clearing the uptodate flag,
 	 * but do not return any error?!
-	 * do we want to DRBD_WARN() on this? */
+	 * do we want to WARN() on this? */
 	if (!error && !uptodate)
 		error = -EIO;
 
@@ -566,10 +810,7 @@ static BIO_ENDIO_TYPE bm_async_io_complete BIO_ENDIO_ARGS(struct bio *bio, int e
 STATIC void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
 {
 	/* we are process context. we always get a bio */
-	/* THINK: do we need GFP_NOIO here? */
 	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
-	struct page *page = vmalloc_to_page((char *)(b->bm)
-						+ (PAGE_SIZE*page_nr));
 	unsigned int len;
 	sector_t on_disk_sector =
 		mdev->bc->md.md_offset + mdev->bc->md.bm_offset;
@@ -585,11 +826,11 @@ STATIC void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
 
 	bio->bi_bdev = mdev->bc->md_bdev;
 	bio->bi_sector = on_disk_sector;
-	bio_add_page(bio, page, len, 0);
+	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
 	bio->bi_private = b;
 	bio->bi_end_io = bm_async_io_complete;
 
-	if (FAULT_ACTIVE(mdev, (rw&WRITE)?DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
 		bio->bi_rw |= rw;
 		bio_endio(bio, -EIO);
 	} else {
@@ -607,20 +848,25 @@ void bm_cpu_to_lel(struct drbd_bitmap *b)
 	 * this may be optimized by using
 	 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
 	 * the following is still not optimal, but better than nothing */
-	const unsigned long *end = b->bm+b->bm_words;
-	unsigned long *bm;
 	if (b->bm_set == 0) {
 		/* no page at all; avoid swap if all is 0 */
-		return;
+		i = b->bm_number_of_pages;
 	} else if (b->bm_set == b->bm_bits) {
-		/* only the last words */
-		bm = end-2;
+		/* only the last page */
+		i = b->bm_number_of_pages -1;
 	} else {
 		/* all pages */
-		bm = b->bm;
+		i = 0;
+	}
+	for (; i < b->bm_number_of_pages; i++) {
+		unsigned long *bm;
+		/* if you'd want to use kmap_atomic, you'd have to disable irq! */
+		p_addr = kmap(b->bm_pages[i]);
+		for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) {
+			*bm = cpu_to_lel(*bm);
+		}
+		kunmap(p_addr);
 	}
-	for (; bm < end; bm++)
-		*bm = cpu_to_lel(*bm);
 }
 # endif
 /* lel_to_cpu == cpu_to_lel */
@@ -638,17 +884,13 @@ STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 	char ppb[10];
 	int err = 0;
 
+	WARN_ON(!bm_is_locked(b));
+
+	/* no spinlock here, the drbd_bm_lock should be enough! */
+
 	bm_words  = drbd_bm_words(mdev);
 	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
 
-	/* OK, I manipulate the bitmap low level,
-	 * and I expect to be the exclusive user.
-	 * If not, I am really in a bad mood...
-	 * to catch such bugs early, make all people who want to access the
-	 * bitmap while I read/write it dereference a NULL pointer :->
-	 */
-	mdev->bitmap = NULL;
-
 	/* on disk bitmap is little endian */
 	if (rw == WRITE)
 		bm_cpu_to_lel(b);
@@ -663,8 +905,11 @@ STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 
 	drbd_blk_run_queue(bdev_get_queue(mdev->bc->md_bdev));
 	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
-	INFO("%s of bitmap took %lu jiffies\n",
-	     rw == READ ? "reading" : "writing", jiffies - now);
+
+	MTRACE(TraceTypeMDIO, TraceLvlSummary,
+	       INFO("%s of bitmap took %lu jiffies\n",
+		    rw == READ ? "reading" : "writing", jiffies - now);
+	       );
 
 	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
 		ALERT("we had at least one MD IO ERROR during bitmap IO\n");
@@ -678,8 +923,7 @@ STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 		/* swap back endianness */
 		bm_lel_to_cpu(b);
 		/* flush bitmap to stable storage */
-		if (!test_bit(MD_NO_BARRIER,&mdev->flags))
-			blkdev_issue_flush(mdev->bc->md_bdev, NULL);
+		drbd_md_flush(mdev);
 	} else /* rw == READ */ {
 		/* just read, if neccessary adjust endianness */
 		b->bm_set = bm_count_bits_swap_endian(b);
@@ -688,12 +932,6 @@ STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 	}
 	now = b->bm_set;
 
-	/* ok, done,
-	 * now it is visible again
-	 */
-
-	mdev->bitmap = b;
-
 	INFO("%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 	     ppsize(ppb, now << (BM_BLOCK_SIZE_B-10)), now);
 
@@ -707,18 +945,7 @@ STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	struct drbd_bitmap *b = mdev->bitmap;
-	int err = 0;
-
-	if (b->bm) {
-		/* bitmap size > 0 */
-		err = bm_rw(mdev, READ);
-
-		if (err == 0)
-			b->bm[b->bm_words] = DRBD_MAGIC;
-	}
-
-	return err;
+	return bm_rw(mdev, READ);
 }
 
 /**
@@ -755,14 +982,14 @@ int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(lo
 #endif
 	if (num_words < S2W(1))
 		memset(page_address(mdev->md_io_page), 0, MD_HARDSECT);
-	drbd_bm_get_lel( mdev, offset, num_words,
-			 page_address(mdev->md_io_page) );
+	drbd_bm_get_lel(mdev, offset, num_words,
+			page_address(mdev->md_io_page));
 	if (!drbd_md_sync_page_io(mdev, mdev->bc, on_disk_sector, WRITE)) {
 		int i;
 		err = -EIO;
-		ERR( "IO ERROR writing bitmap sector %lu "
-		     "(meta-disk sector %llus)\n",
-		     enr, (unsigned long long)on_disk_sector );
+		ERR("IO ERROR writing bitmap sector %lu "
+		    "(meta-disk sector %llus)\n",
+		    enr, (unsigned long long)on_disk_sector);
 		drbd_chk_io_error(mdev, 1, TRUE);
 		drbd_io_error(mdev, TRUE);
 		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
@@ -780,6 +1007,8 @@ void drbd_bm_reset_find(struct drbd_conf *mdev)
 	ERR_IF(!b) return;
 
 	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
 	b->bm_fo = 0;
 	spin_unlock_irq(&b->bm_lock);
 
@@ -788,28 +1017,46 @@ void drbd_bm_reset_find(struct drbd_conf *mdev)
 /* NOTE
  * find_first_bit returns int, we return unsigned long.
  * should not make much difference anyways, but ...
+ *
  * this returns a bit number, NOT a sector!
  */
+#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
 unsigned long drbd_bm_find_next(struct drbd_conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long i = -1UL;
+	unsigned long *p_addr;
+	unsigned long bit_offset; /* bit offset of the mapped page. */
 
 	ERR_IF(!b) return i;
-	ERR_IF(!b->bm) return i;
+	ERR_IF(!b->bm_pages) return i;
 
 	spin_lock_irq(&b->bm_lock);
-	if (b->bm_fo < b->bm_bits)
-		i = find_next_bit(b->bm, b->bm_bits, b->bm_fo);
-	else if (b->bm_fo > b->bm_bits)
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	if (b->bm_fo > b->bm_bits) {
 		ERR("bm_fo=%lu bm_bits=%lu\n", b->bm_fo, b->bm_bits);
-
-	if (i >= b->bm_bits) {
+	} else {
+		while (b->bm_fo < b->bm_bits) {
+			unsigned long offset;
+			bit_offset = b->bm_fo & ~BPP_MASK; /* bit offset of the page */
+			offset = bit_offset >> LN2_BPL;    /* word offset of the page */
+			p_addr = bm_map_paddr(b, offset);
+			i = find_next_bit(p_addr, PAGE_SIZE*8, b->bm_fo & BPP_MASK);
+			bm_unmap(p_addr);
+			if (i < PAGE_SIZE*8) {
+				i = bit_offset + i;
+				if (i >= b->bm_bits)
+					break;
+				b->bm_fo = i+1;
+				goto found;
+			}
+			b->bm_fo = bit_offset + PAGE_SIZE*8;
+		}
 		i = -1UL;
 		/* leave b->bm_fo unchanged. */
-	} else {
-		b->bm_fo = i+1;
 	}
+ found:
 	spin_unlock_irq(&b->bm_lock);
 	return i;
 }
@@ -825,10 +1072,10 @@ void drbd_bm_set_find(struct drbd_conf *mdev, unsigned long i)
 	spin_unlock_irq(&b->bm_lock);
 }
 
-
 int drbd_bm_rs_done(struct drbd_conf *mdev)
 {
-	return (mdev->bitmap->bm_fo >= mdev->bitmap->bm_bits);
+	D_ASSERT(mdev->bitmap);
+	return mdev->bitmap->bm_fo >= mdev->bitmap->bm_bits;
 }
 
 /* returns number of bits actually changed.
@@ -840,22 +1087,37 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
+	unsigned long last_page_nr = -1UL;
 	int c = 0;
+
 	ERR_IF(!b) return 1;
-	ERR_IF(!b->bm) return 1;
+	ERR_IF(!b->bm_pages) return 0;
 
-	spin_lock_irqsave(&b->bm_lock,flags);
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
 	for (bitnr = s; bitnr <= e; bitnr++) {
 		ERR_IF (bitnr >= b->bm_bits) {
 			ERR("bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
 		} else {
+			unsigned long offset = bitnr>>LN2_BPL;
+			unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+			if (page_nr != last_page_nr) {
+				if (p_addr)
+					bm_unmap(p_addr);
+				p_addr = bm_map_paddr(b, offset);
+				last_page_nr = page_nr;
+			}
 			if (val)
-				c += (0 == __test_and_set_bit(bitnr, b->bm));
+				c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
 			else
-				c -= (0 != __test_and_clear_bit(bitnr, b->bm));
+				c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
 		}
 	}
+	if (p_addr)
+		bm_unmap(p_addr);
 	b->bm_set += c;
 	spin_unlock_irqrestore(&b->bm_lock, flags);
 	return c;
@@ -884,13 +1146,20 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr;
 	int i;
+
 	ERR_IF(!b) return 0;
-	ERR_IF(!b->bm) return 0;
+	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
 	if (bitnr < b->bm_bits) {
-		i = test_bit(bitnr, b->bm) ? 1 : 0;
+		unsigned long offset = bitnr>>LN2_BPL;
+		p_addr = bm_map_paddr(b, offset);
+		i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+		bm_unmap(p_addr);
 	} else if (bitnr == b->bm_bits) {
 		i = -1;
 	} else { /* (bitnr > b->bm_bits) */
@@ -907,25 +1176,36 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL, page_nr = -1;
 	unsigned long bitnr;
 	int c = 0;
+	size_t w;
 
 	/* If this is called without a bitmap, that is a bug.  But just to be
 	 * robust in case we screwed up elsewhere, in that case pretend there
 	 * was one dirty bit in the requested area, so we won't try to do a
 	 * local read there (no bitmap probably implies no disk) */
 	ERR_IF(!b) return 1;
-	ERR_IF(!b->bm) return 1;
+	ERR_IF(!b->bm_pages) return 1;
 
-	spin_lock_irqsave(&b->bm_lock,flags);
-	for (bitnr = s; bitnr <=e; bitnr++) {
+	spin_lock_irqsave(&b->bm_lock, flags);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		w = bitnr >> LN2_BPL;
+		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
+			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_paddr(b, w);
+		}
 		ERR_IF (bitnr >= b->bm_bits) {
-			ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits);
+			ERR("bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
 		} else {
-			c += (0 != test_bit(bitnr, b->bm));
+			c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
 		}
 	}
-	spin_unlock_irqrestore(&b->bm_lock,flags);
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
 	return c;
 }
 
@@ -949,18 +1229,28 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
 	struct drbd_bitmap *b = mdev->bitmap;
 	int count, s, e;
 	unsigned long flags;
+	unsigned long *p_addr, *bm;
 
 	ERR_IF(!b) return 0;
-	ERR_IF(!b->bm) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
 	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
 
 	s = S2W(enr);
 	e = min((size_t)S2W(enr+1), b->bm_words);
 	count = 0;
 	if (s < b->bm_words) {
-		const unsigned long *w = b->bm+s;
 		int n = e-s;
-		while (n--) count += hweight_long(*w++);
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (n--) {
+			catch_oob_access_start();
+			count += hweight_long(*bm++);
+			catch_oob_access_end();
+		}
+		bm_unmap(p_addr);
 	} else {
 		ERR("start offset (%d) too large in drbd_bm_e_weight\n", s);
 	}
@@ -975,24 +1265,36 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
 unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
 	unsigned long weight;
-	int count, s, e;
+	int count, s, e, i, do_now;
 	ERR_IF(!b) return 0;
-	ERR_IF(!b->bm) return 0;
+	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
 	weight = b->bm_set;
 
 	s = al_enr * BM_WORDS_PER_AL_EXT;
 	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
+	/* assert that s and e are on the same page */
+	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
+	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
 	count = 0;
 	if (s < b->bm_words) {
-		const unsigned long *w = b->bm+s;
-		int n = e-s;
-		while (n--) count += hweight_long(*w++);
-		n = e-s;
-		memset(b->bm+s, -1, n*sizeof(long));
-		b->bm_set += n*BITS_PER_LONG - count;
+		i = do_now = e-s;
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (i--) {
+			catch_oob_access_start();
+			count += hweight_long(*bm);
+			*bm = -1UL;
+			catch_oob_access_end();
+			bm++;
+		}
+		bm_unmap(p_addr);
+		b->bm_set += do_now*BITS_PER_LONG - count;
 		if (e == b->bm_words)
 			b->bm_set -= bm_clear_surplus(b);
 	} else {
diff --git a/ubuntu/drbd/drbd_buildtag.c b/ubuntu/drbd/drbd_buildtag.c
index f7000b8..2021c1c 100644
--- a/ubuntu/drbd/drbd_buildtag.c
+++ b/ubuntu/drbd/drbd_buildtag.c
@@ -1,7 +1,7 @@
 /* automatically generated. DO NOT EDIT. */
 #include <linux/drbd_config.h>
-const char * drbd_buildtag(void)
+const char *drbd_buildtag(void)
 {
-	return "GIT-hash: 3e69822d3bb4920a8c1bfdf7d647169eba7d2eb4"
-		" build by phil at fat-tyre, 2008-05-30 12:59:17";
+	return "GIT-hash: 9ba8b93e24d842f0dd3fb1f9b90e8348ddb95829"
+		" build by ivoks at ubuntu, 2009-01-17 07:49:56";
 }
diff --git a/ubuntu/drbd/drbd_int.h b/ubuntu/drbd/drbd_int.h
index 5cf453f..7231cb4 100644
--- a/ubuntu/drbd/drbd_int.h
+++ b/ubuntu/drbd/drbd_int.h
@@ -57,23 +57,24 @@
 /* Compatibility for older kernels */
 #ifndef __acquires
 # ifdef __CHECKER__
-#  define __acquires(x) __attribute__((context(x,0,1)))
-#  define __releases(x) __attribute__((context(x,1,0)))
-#  define __acquire(x)  __context__(x,1)
-#  define __release(x)  __context__(x,-1)
-#  define __cond_lock(x,c)      ((c) ? ({ __acquire(x); 1; }) : 0)
+#  define __acquires(x)	__attribute__((context(x,0,1)))
+#  define __releases(x)	__attribute__((context(x,1,0)))
+#  define __acquire(x)	__context__(x,1)
+#  define __release(x)	__context__(x,-1)
+#  define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # else
 #  define __acquires(x)
 #  define __releases(x)
-#  define __acquire(x)  (void)0
-#  define __release(x)  (void)0
+#  define __acquire(x)	(void)0
+#  define __release(x)	(void)0
 #  define __cond_lock(x,c) (c)
 # endif
 #endif
 
 /* module parameter, defined in drbd_main.c */
-extern int minor_count;
+extern unsigned int minor_count;
 extern int allow_oos;
+extern unsigned int cn_idx;
 
 #ifdef DRBD_ENABLE_FAULTS
 extern int enable_faults;
@@ -91,7 +92,7 @@ extern char usermode_helper[];
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 
-// XXX do we need this?
+/* XXX do we need this? */
 #ifndef TRUE
 #define TRUE 1
 #endif
@@ -146,11 +147,11 @@ struct drbd_conf;
  *************************/
 
 /* handy macro: DUMPP(somepointer) */
-#define DUMPP(A)   ERR( #A " = %p in %s:%d\n", (A), __FILE__, __LINE__);
-#define DUMPLU(A)  ERR( #A " = %lu in %s:%d\n", (unsigned long)(A), __FILE__, __LINE__);
-#define DUMPLLU(A) ERR( #A " = %llu in %s:%d\n", (unsigned long long)(A), __FILE__, __LINE__);
-#define DUMPLX(A)  ERR( #A " = %lx in %s:%d\n", (A), __FILE__, __LINE__);
-#define DUMPI(A)   ERR( #A " = %d in %s:%d\n", (int)(A), __FILE__, __LINE__);
+#define DUMPP(A)   ERR(#A " = %p in %s:%d\n", (A), __FILE__, __LINE__);
+#define DUMPLU(A)  ERR(#A " = %lu in %s:%d\n", (unsigned long)(A), __FILE__, __LINE__);
+#define DUMPLLU(A) ERR(#A " = %llu in %s:%d\n", (unsigned long long)(A), __FILE__, __LINE__);
+#define DUMPLX(A)  ERR(#A " = %lx in %s:%d\n", (A), __FILE__, __LINE__);
+#define DUMPI(A)   ERR(#A " = %d in %s:%d\n", (int)(A), __FILE__, __LINE__);
 
 #define DUMPST(A) DUMPLLU((unsigned long long)(A))
 
@@ -168,15 +169,14 @@ struct drbd_conf;
 #define D_DUMPI(A)
 #endif
 
-/* Info: do not remove the spaces around the "," before ##
- *	 Otherwise this is not portable from gcc-2.95 to gcc-3.3 */
 #define PRINTK(level, fmt, args...) \
 	printk(level "drbd%d: " fmt, \
 		mdev->minor , ##args)
 
 #define ALERT(fmt, args...) PRINTK(KERN_ALERT, fmt , ##args)
 #define ERR(fmt, args...)   PRINTK(KERN_ERR, fmt , ##args)
-#define DRBD_WARN(fmt, args...)  PRINTK(KERN_WARNING, fmt , ##args)
+/* nowadays, bug.h defines WARN() itself (a non-crashing BUG()), hence the drbd_ prefix */
+#define drbd_WARN(fmt, args...)  PRINTK(KERN_WARNING, fmt , ##args)
 #define INFO(fmt, args...)  PRINTK(KERN_INFO, fmt , ##args)
 #define DBG(fmt, args...)   PRINTK(KERN_DEBUG, fmt , ##args)
 
@@ -201,8 +201,8 @@ struct drbd_conf;
 		missed = 0;					\
 		toks -= ratelimit_jiffies;			\
 		if (lost)					\
-			DRBD_WARN("%d messages suppressed in %s:%d.\n", \
-				lost , __FILE__ , __LINE__ );	\
+			drbd_WARN("%d messages suppressed in %s:%d.\n", \
+				lost, __FILE__, __LINE__);	\
 		__ret = 1;					\
 	} else {						\
 		missed++;					\
@@ -213,7 +213,7 @@ struct drbd_conf;
 
 
 #ifdef DBG_ASSERTS
-extern void drbd_assert_breakpoint(struct drbd_conf *, char *, char *, int );
+extern void drbd_assert_breakpoint(struct drbd_conf *, char *, char *, int);
 # define D_ASSERT(exp)	if (!(exp)) \
 	 drbd_assert_breakpoint(mdev, #exp, __FILE__, __LINE__)
 #else
@@ -221,9 +221,9 @@ extern void drbd_assert_breakpoint(struct drbd_conf *, char *, char *, int );
 	 ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
 #endif
 #define ERR_IF(exp) if (({				\
-	int _b = (exp)!=0;				\
+	int _b = (exp) != 0;				\
 	if (_b) ERR("%s: (%s) in %s:%d\n",		\
-		__func__, #exp, __FILE__,__LINE__);	\
+		__func__, #exp, __FILE__, __LINE__);	\
 	 _b;						\
 	}))
 
@@ -236,6 +236,7 @@ enum {
     DRBD_FAULT_DT_WR,		/* data            */
     DRBD_FAULT_DT_RD,
     DRBD_FAULT_DT_RA,		/* data read ahead */
+    DRBD_FAULT_AL_EE,		/* alloc ee */
 
     DRBD_FAULT_MAX,
 };
@@ -245,9 +246,9 @@ extern unsigned int
 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
 static inline int
 drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
-    return (fault_rate &&
+    return fault_rate &&
 	    (enable_faults & (1<<type)) &&
-	    _drbd_insert_fault(mdev, type));
+	    _drbd_insert_fault(mdev, type);
 }
 #define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
 
@@ -257,9 +258,9 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
 
 #include <linux/stringify.h>
 /* integer division, round _UP_ to the next integer */
-#define div_ceil(A, B) ( (A)/(B) + ((A)%(B) ? 1 : 0) )
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
 /* usual integer division */
-#define div_floor(A, B) ( (A)/(B) )
+#define div_floor(A, B) ((A)/(B))
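A quick worked example: div_ceil(10, 4) expands to 10/4 + (10%4 ? 1 : 0) = 2 + 1 = 3, while div_floor(10, 4) is plain truncating division and yields 2.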
 
 /*
  * Compatibility Section
@@ -270,7 +271,7 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
 #define RECALC_SIGPENDING()	    recalc_sigpending();
 
 #if defined(DBG_SPINLOCKS) && defined(__SMP__)
-# define MUST_HOLD(lock) if (!spin_is_locked(lock)) { ERR("Not holding lock! in %s\n", __FUNCTION__ ); }
+# define MUST_HOLD(lock) if (!spin_is_locked(lock)) ERR("Not holding lock! in %s\n", __func__);
 #else
 # define MUST_HOLD(lock)
 #endif
@@ -281,6 +282,17 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
 # define HAVE_KERNEL_SENDMSG 0
 #endif
 
+#ifndef uninitialized_var
+/* in upstream since 9490991482a2091a828d997adbc088e24c310a4d
+ * Date:   Sun May 6 14:49:17 2007 -0700 */
+/*
+ * A trick to suppress uninitialized variable warning without generating any
+ * code
+ */
+#define uninitialized_var(x) x = x
+#endif
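A minimal sketch of the intended use; the local variable and the compute_sum() helper are hypothetical, not part of the patch:

	int uninitialized_var(total);	/* expands to: int total = total; */

	if (count > 0)
		total = compute_sum(buf, count);	/* hypothetical helper */
	return count > 0 ? total : 0;	/* gcc would otherwise warn about 'total' */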
+
 
 /*
  * our structs
@@ -290,8 +302,8 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
 	({ typecheck(struct drbd_conf*, x); \
 	  (x)->magic = (long)(x) ^ DRBD_MAGIC; })
 #define IS_VALID_MDEV(x)  \
-	( typecheck(struct drbd_conf*, x) && \
-	  ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0))
+	(typecheck(struct drbd_conf*, x) && \
+	  ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)) : 0))
 
 /* drbd_meta-data.c (still in drbd_main.c) */
 /* 4th incarnation of the disk layout. */
@@ -304,55 +316,59 @@ extern struct drbd_conf **minor_table;
  *********************************************************************/
 
 enum Drbd_Packet_Cmd {
-	Data,
-	DataReply,     /* Response to DataRequest */
-	RSDataReply,   /* Response to RSDataRequest */
-	Barrier,
-	ReportBitMap,
-	BecomeSyncTarget,
-	BecomeSyncSource,
-	UnplugRemote,  /* Used at various times to hint the peer */
-	DataRequest,   /* Used to ask for a data block */
-	RSDataRequest, /* Used to ask for a data block for resync */
-	SyncParam,
-	ReportProtocol,
-	ReportUUIDs,
-	ReportSizes,
-	ReportState,
-	ReportSyncUUID,
-	AuthChallenge,
-	AuthResponse,
-	StateChgRequest,
-
-	FIRST_ASENDER_CMD,
-	Ping = FIRST_ASENDER_CMD,
-	PingAck,
-	RecvAck,      /* Used in protocol B */
-	WriteAck,     /* Used in protocol C */
-	RSWriteAck,   /* Is a WriteAck, additionally call set_in_sync(). */
-	DiscardAck,   /* Used in proto C, two-primaries conflict detection */
-	NegAck,       /* Sent if local disk is unusable */
-	NegDReply,    /* Local disk is broken... */
-	NegRSDReply,  /* Local disk is broken... */
-	BarrierAck,
-	StateChgReply,
-	LAST_ASENDER_CMD = StateChgReply,
-
-	OVRequest,
-	OVReply,
-	OVResult, // Exception to the FIRST/LAST ASENDER_CMD
-
-	MAX_CMD,
-	MayIgnore = 0x100, /* Flag to test if (cmd > MayIgnore) ... */
-	MAX_OPT_CMD,
-
-	/* FIXME
-	 * to get a more useful error message with drbd-8 <-> drbd 0.7.x,
-	 * these could be reimplemented as special case of HandShake. */
-	HandShakeM = 0xfff1, /* First Packet on the MetaSock */
-	HandShakeS = 0xfff2, /* First Packet on the Socket */
-
-	HandShake  = 0xfffe  /* FIXED for the next century! */
+	/* receiver (data socket) */
+	Data              = 0x00,
+	DataReply         = 0x01, /* Response to DataRequest */
+	RSDataReply       = 0x02, /* Response to RSDataRequest */
+	Barrier           = 0x03,
+	ReportBitMap      = 0x04,
+	BecomeSyncTarget  = 0x05,
+	BecomeSyncSource  = 0x06,
+	UnplugRemote      = 0x07, /* Used at various times to hint the peer */
+	DataRequest       = 0x08, /* Used to ask for a data block */
+	RSDataRequest     = 0x09, /* Used to ask for a data block for resync */
+	SyncParam         = 0x0a,
+	ReportProtocol    = 0x0b,
+	ReportUUIDs       = 0x0c,
+	ReportSizes       = 0x0d,
+	ReportState       = 0x0e,
+	ReportSyncUUID    = 0x0f,
+	AuthChallenge     = 0x10,
+	AuthResponse      = 0x11,
+	StateChgRequest   = 0x12,
+
+	/* asender (meta socket) */
+	Ping              = 0x13,
+	PingAck           = 0x14,
+	RecvAck           = 0x15, /* Used in protocol B */
+	WriteAck          = 0x16, /* Used in protocol C */
+	RSWriteAck        = 0x17, /* Is a WriteAck, additionally call set_in_sync(). */
+	DiscardAck        = 0x18, /* Used in proto C, two-primaries conflict detection */
+	NegAck            = 0x19, /* Sent if local disk is unusable */
+	NegDReply         = 0x1a, /* Local disk is broken... */
+	NegRSDReply       = 0x1b, /* Local disk is broken... */
+	BarrierAck        = 0x1c,
+	StateChgReply     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	OVRequest         = 0x1e, /* data socket */
+	OVReply           = 0x1f,
+	OVResult          = 0x20, /* meta socket */
+	CsumRSRequest     = 0x21, /* data socket */
+	RSIsInSync        = 0x22, /* meta socket */
+	SyncParam89       = 0x23, /* data socket, protocol version 89 replacement for SyncParam */
+
+	MAX_CMD           = 0x24,
+	MayIgnore         = 0x100, /* Flag to test if (cmd > MayIgnore) ... */
+	MAX_OPT_CMD       = 0x101,
+
+	/* special command ids for handshake */
+
+	HandShakeM        = 0xfff1, /* First Packet on the MetaSock */
+	HandShakeS        = 0xfff2, /* First Packet on the Socket */
+
+	HandShake         = 0xfffe  /* FIXED for the next century! */
 };
 
 static inline const char *cmdname(enum Drbd_Packet_Cmd cmd)
@@ -372,6 +388,7 @@ static inline const char *cmdname(enum Drbd_Packet_Cmd cmd)
 		[DataRequest]	   = "DataRequest",
 		[RSDataRequest]    = "RSDataRequest",
 		[SyncParam]	   = "SyncParam",
+		[SyncParam89]	   = "SyncParam89",
 		[ReportProtocol]   = "ReportProtocol",
 		[ReportUUIDs]	   = "ReportUUIDs",
 		[ReportSizes]	   = "ReportSizes",
@@ -394,24 +411,19 @@ static inline const char *cmdname(enum Drbd_Packet_Cmd cmd)
 		[OVRequest]        = "OVRequest",
 		[OVReply]          = "OVReply",
 		[OVResult]         = "OVResult",
+		[CsumRSRequest]    = "CsumRSRequest",
+		[RSIsInSync]       = "RSIsInSync",
+		[MAX_CMD]	   = NULL,
 	};
 
-	if (Data > cmd || cmd >= MAX_CMD) {
-	    switch (cmd) {
-	    case HandShakeM:
+	if (cmd == HandShakeM)
 		return "HandShakeM";
-		break;
-	    case HandShakeS:
+	if (cmd == HandShakeS)
 		return "HandShakeS";
-		break;
-	    case HandShake:
+	if (cmd == HandShake)
 		return "HandShake";
-		break;
-	    default:
+	if (cmd >= MAX_CMD)
 		return "Unknown";
-		break;
-	    }
-	}
 	return cmdnames[cmd];
 }
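With the command ids now fixed hex values on the wire, cmdname() can rely on a sparse designated-initializer table; a minimal sketch of that C pattern (entries abbreviated):

	static const char *names[MAX_CMD] = {
		[Data]        = "Data",		/* slot 0x00 */
		[SyncParam89] = "SyncParam89",	/* slot 0x23 */
	};	/* ids without an explicit entry are implicitly NULL */
	const char *n = (cmd < MAX_CMD && names[cmd]) ? names[cmd] : "Unknown";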
 
@@ -527,7 +539,15 @@ struct Drbd_SyncParam_Packet {
 	u32 rate;
 
 	      /* Since protocol version 88 and higher. */
-	char online_verify_alg[0];
+	char verify_alg[0];
+} __attribute((packed));
+
+struct Drbd_SyncParam89_Packet {
+	struct Drbd_Header head;
+	u32 rate;
+	/* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
 } __attribute((packed));
 
 struct Drbd_Protocol_Packet {
@@ -603,7 +623,7 @@ union Drbd_Polymorph_Packet {
 	struct Drbd_BlockAck_Packet	BlockAck;
 	struct Drbd_Barrier_Packet	Barrier;
 	struct Drbd_BarrierAck_Packet	BarrierAck;
-	struct Drbd_SyncParam_Packet	SyncParam;
+	struct Drbd_SyncParam89_Packet	SyncParam89;
 	struct Drbd_Protocol_Packet	Protocol;
 	struct Drbd_Sizes_Packet	Sizes;
 	struct Drbd_GenCnt_Packet	GenCnt;
@@ -628,7 +648,7 @@ struct Drbd_thread {
 	enum Drbd_thread_state t_state;
 	int (*function) (struct Drbd_thread *);
 	struct drbd_conf *mdev;
-	struct mutex task_mutex;
+	int reset_cpu_mask;
 };
 
 static inline enum Drbd_thread_state get_t_state(struct Drbd_thread *thi)
@@ -697,6 +717,23 @@ struct drbd_request;
    read_ee   .. [RS]DataRequest being read
 */
 
+struct drbd_epoch {
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* drbd_epoch flag bits */
+enum {
+	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+	DE_BARRIER_IN_NEXT_EPOCH_DONE,
+	DE_CONTAINS_A_BARRIER,
+	DE_HAVE_BARRIER_NUMBER,
+	DE_IS_FINISHING,
+};
+
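The DE_* enumerators are bit numbers within epoch->flags, so they pair with the atomic bitops; a sketch of the expected usage (illustrative only):

	struct drbd_epoch *epoch = mdev->current_epoch;

	atomic_inc(&epoch->epoch_size);	/* a request joined this epoch */
	set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
	if (test_bit(DE_IS_FINISHING, &epoch->flags))
		; /* no new writes may enter; wait for 'active' to drain */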
 struct Tl_epoch_entry {
 	struct drbd_work    w;
 	struct drbd_conf *mdev;
@@ -704,18 +741,11 @@ struct Tl_epoch_entry {
 	struct hlist_node colision;
 	sector_t sector;
 	unsigned int size;
-	unsigned int barrier_nr;
+	struct drbd_epoch *epoch;
 
 	/* up to here, the struct layout is identical to drbd_request;
 	 * we might be able to use that to our advantage...  */
 
-	unsigned int barrier_nr2;
-	/* If we issue the bio with BIO_RW_BARRIER we have to
-	   send a barrier ACK before we send the ACK to this
-	   write. We store the barrier number in here.
-	   In case the barrier after this write has been coalesced
-	   as well, we set it's barrier_nr into barrier_nr2 */
-
 	unsigned int flags;
 	u64    block_id;
 };
@@ -730,17 +760,18 @@ enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_CONFLICT_PENDING,
 	__EE_MAY_SET_IN_SYNC,
+	__EE_IS_BARRIER,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
 
 /* global flag bits */
 enum {
 	CREATE_BARRIER,		/* next Data is preceeded by a Barrier */
 	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
 	SEND_PING,		/* whether asender should send a ping asap */
-	WRITE_ACK_PENDING,	/* so BarrierAck won't overtake WriteAck */
 	WORK_PENDING,		/* completion flag for drbd_disconnect */
 	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
 	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
@@ -758,12 +789,13 @@ enum {
 	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
 	CONSIDER_RESYNC,
 
-	LL_DEV_NO_FLUSH,	/* blkdev_issue_flush does not work,
-				   so don't even try */
 	MD_NO_BARRIER,		/* meta data device does not support barriers,
 				   so don't even try */
-	BITMAP_IO,		/* Let user IO drain */
+	SUSPEND_IO,		/* suspend application io */
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
@@ -817,11 +849,11 @@ struct drbd_md {
 	 */
 };
 
-// for sync_conf and other types...
+/* for sync_conf and other types... */
 #define NL_PACKET(name, number, fields) struct name { fields };
 #define NL_INTEGER(pn,pr,member) int member;
 #define NL_INT64(pn,pr,member) __u64 member;
-#define NL_BIT(pn,pr,member)   unsigned member : 1;
+#define NL_BIT(pn,pr,member)   unsigned member:1;
 #define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
 #include "linux/drbd_nl.h"
 
@@ -832,7 +864,7 @@ struct drbd_backing_dev {
 	struct file *md_file;
 	struct drbd_md md;
 	struct disk_conf dc; /* The user provided config... */
-	sector_t known_size;
+	sector_t known_size; /* last known size of that backing device */
 };
 
 struct drbd_md_io {
@@ -843,10 +875,18 @@ struct drbd_md_io {
 
 struct bm_io_work {
 	struct drbd_work w;
+	char *why;
 	int (*io_fn)(struct drbd_conf *mdev);
 	void (*done)(struct drbd_conf *mdev, int rv);
 };
 
+enum write_ordering_e {
+	WO_none,
+	WO_drain_io,
+	WO_bdev_flush,
+	WO_bio_barrier
+};
+
 struct drbd_conf {
 #ifdef PARANOIA
 	long magic;
@@ -897,6 +937,7 @@ struct drbd_conf {
 	struct drbd_barrier *unused_spare_barrier; /* for pre-allocation */
 	struct drbd_barrier *newest_barrier;
 	struct drbd_barrier *oldest_barrier;
+	struct list_head out_of_sequence_requests;
 	struct hlist_head *tl_hash;
 	unsigned int tl_hash_s;
 
@@ -912,11 +953,15 @@ struct drbd_conf {
 	unsigned long rs_mark_left;
 	/* marks's time [unit jiffies] */
 	unsigned long rs_mark_time;
-
+	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
 	sector_t ov_position;
-	sector_t ov_last_oos_start; /* Start sector of out of sync range */
-	sector_t ov_last_oos_size;  /* size of out-of-sync range in sectors */
+	/* Start sector of out of sync range. */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
 	unsigned long ov_left;
+	struct crypto_hash *csums_tfm;
 	struct crypto_hash *verify_tfm;
 
 	struct Drbd_thread receiver;
@@ -935,7 +980,10 @@ struct drbd_conf {
 	u64 *p_uuid;
 	/* FIXME clean comments, restructure so it is more obvious which
 	 * members are protected by what */
-	unsigned int epoch_size;
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	enum write_ordering_e write_ordering;
 	struct list_head active_ee; /* IO in progress */
 	struct list_head sync_ee;   /* IO in progress */
 	struct list_head done_ee;   /* send ack */
@@ -979,7 +1027,7 @@ struct drbd_conf {
 	struct mutex state_mutex;
 };
 
-static inline struct drbd_conf *minor_to_mdev(int minor)
+static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
 {
 	struct drbd_conf *mdev;
 
@@ -1042,14 +1090,14 @@ extern int _drbd_request_state(struct drbd_conf *, union drbd_state_t,
 extern int _drbd_set_state(struct drbd_conf *, union drbd_state_t,
 			   enum chg_state_flags, struct completion *done);
 extern void print_st_err(struct drbd_conf *, union drbd_state_t,
-			union drbd_state_t, int );
+			union drbd_state_t, int);
 extern int  drbd_thread_start(struct Drbd_thread *thi);
 extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait);
 #ifdef CONFIG_SMP
-extern void drbd_thread_set_cpu(struct Drbd_thread *thi, cpumask_t cpu_mask);
+extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
 extern cpumask_t drbd_calc_cpu_mask(struct drbd_conf *mdev);
 #else
-#define drbd_thread_set_cpu(A,B) ({})
+#define drbd_thread_current_set_cpu(A) ({})
 #define drbd_calc_cpu_mask(A) CPU_MASK_ALL
 #endif
 extern void drbd_free_resources(struct drbd_conf *mdev);
@@ -1125,9 +1173,11 @@ extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
 extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
 extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 				 int (*io_fn)(struct drbd_conf *),
-				 void (*done)(struct drbd_conf *, int));
+				 void (*done)(struct drbd_conf *, int),
+				 char *why);
 extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
-extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *));
+extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
+extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
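The new "why" argument threads a human-readable reason into the bitmap lock diagnostics (see bm_print_lock_info() earlier in this patch); an illustrative call, with the done-callback name assumed:

	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
			     &after_bm_write_done,	/* hypothetical callback */
			     "set_n_write from StartingSync");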
 
 
 /* Meta data layout
@@ -1135,7 +1185,7 @@ extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf
    * either at the end of the backing device
    * or on a seperate meta data device. */
 
-#define MD_RESERVED_SECT ( 128LU << 11 )  /* 128 MB, unit sectors */
+#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
 /* The following numbers are sectors */
 #define MD_AL_OFFSET 8	    /* 8 Sectors after start of meta area */
 #define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
@@ -1187,7 +1237,7 @@ struct bm_extent {
 #define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SIZE_B)
 /* (9+3) : 512 bytes @ 8 bits; representing 16M storage
  * per sector of on disk bitmap */
-#define BM_EXT_SIZE_B	 (BM_BLOCK_SIZE_B + MD_HARDSECT_B + 3 )  /* = 24 */
+#define BM_EXT_SIZE_B	 (BM_BLOCK_SIZE_B + MD_HARDSECT_B + 3)  /* = 24 */
 #define BM_EXT_SIZE	 (1<<BM_EXT_SIZE_B)
 
 #if (BM_EXT_SIZE_B != 24) || (BM_BLOCK_SIZE_B != 12)
@@ -1206,16 +1256,17 @@ struct bm_extent {
  * _storage_ sector is located in */
 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SIZE_B-9))
 
-/* who much _storage_ sectors we have per bitmap sector */
-#define BM_SECT_PER_EXT     (1ULL << (BM_EXT_SIZE_B-9))
+/* how much _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SIZE_B-9))
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
-#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B) )
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B))
 #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL))
 
 
-#define BM_BLOCKS_PER_BM_EXT_B ( BM_EXT_SIZE_B - BM_BLOCK_SIZE_B )
-#define BM_BLOCKS_PER_BM_EXT_MASK  ( (1<<BM_BLOCKS_PER_BM_EXT_B) - 1 )
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SIZE_B - BM_BLOCK_SIZE_B)
+#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
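With the constants checked above (BM_EXT_SIZE_B = 24), the shift arithmetic works out as:

	BM_EXT_TO_SECT(1)      = 1 << (24 - 9) = 32768 sectors  (16 MB of storage per bitmap extent)
	BM_SECT_TO_EXT(100000) = 100000 >> 15  = 3      (storage sector 100000 lies in extent 3)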
 
 /* I want the packet to fit within one page
  * THINK maybe use a special bitmap header,
@@ -1223,6 +1274,10 @@ struct bm_extent {
  * Do not use PAGE_SIZE here! Use a architecture agnostic constant!
  */
 #define BM_PACKET_WORDS ((4096-sizeof(struct Drbd_Header))/sizeof(long))
+#if (PAGE_SIZE < 4096)
+/* drbd_send_bitmap / receive_bitmap would break horribly */
+#error "PAGE_SIZE too small"
+#endif
 
 /* the extent in "PER_EXTENT" below is an activity log extent
  * we need that many (long words/bytes) to store the bitmap
@@ -1240,11 +1295,11 @@ struct bm_extent {
 
 #define DRBD_MAX_SECTORS_32 (0xffffffffLU)
 #define DRBD_MAX_SECTORS_BM \
-	  ( (MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SIZE_B-9)) )
+	  ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SIZE_B-9)))
 #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
 #define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
 #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
-#elif   !defined(CONFIG_LBD) && BITS_PER_LONG == 32
+#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
 #define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
 #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
 #else
@@ -1266,6 +1321,7 @@ extern int  drbd_bm_init(struct drbd_conf *mdev);
 extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
 extern void drbd_bm_cleanup(struct drbd_conf *mdev);
 extern void drbd_bm_set_all(struct drbd_conf *mdev);
+extern void drbd_bm_clear_all(struct drbd_conf *mdev);
 extern void drbd_bm_reset_find(struct drbd_conf *mdev);
 extern int  drbd_bm_set_bits(
 		struct drbd_conf *mdev, unsigned long s, unsigned long e);
@@ -1292,13 +1348,12 @@ extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
 extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
 
-extern void __drbd_bm_lock(struct drbd_conf *mdev, char *file, int line);
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
 extern void drbd_bm_unlock(struct drbd_conf *mdev);
-#define drbd_bm_lock(mdev)    __drbd_bm_lock(mdev, __FILE__, __LINE__ )
 
 extern void _drbd_bm_recount_bits(struct drbd_conf *mdev, char *file, int line);
 #define drbd_bm_recount_bits(mdev) \
-	_drbd_bm_recount_bits(mdev, __FILE__, __LINE__ )
+	_drbd_bm_recount_bits(mdev, __FILE__, __LINE__)
 extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
 /* drbd_main.c */
 
@@ -1306,7 +1361,6 @@ extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, con
  * because of kmem_cache_t weirdness */
 #include "drbd_wrappers.h"
 
-extern int minor_count;
 extern struct kmem_cache *drbd_request_cache;
 extern struct kmem_cache *drbd_ee_cache;
 extern mempool_t *drbd_request_mempool;
@@ -1345,16 +1399,18 @@ enum {
 	TraceTypeNl	= 0x00000040,
 	TraceTypeALExts = 0x00000080,
 	TraceTypeIntRq  = 0x00000100,
+	TraceTypeMDIO   = 0x00000200,
+	TraceTypeEpochs = 0x00000400,
 };
 
 static inline int
 is_trace(unsigned int type, unsigned int level) {
-	return ((trace_level >= level) && (type & trace_type));
+	return (trace_level >= level) && (type & trace_type);
 }
 static inline int
 is_mdev_trace(struct drbd_conf *mdev, unsigned int type, unsigned int level) {
-	return (is_trace(type, level) &&
-		( ( 1 << mdev_to_minor(mdev)) & trace_devs));
+	return is_trace(type, level) &&
+		((1 << mdev_to_minor(mdev)) & trace_devs);
 }
 
 #define MTRACE(type, lvl, code...) \
@@ -1388,19 +1444,20 @@ extern void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
 			      unsigned int length);
 
 /* Bio printing support */
-extern void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete);
+extern void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete, struct drbd_request *r);
 
 static inline void dump_bio(struct drbd_conf *mdev,
-		struct bio *bio, int complete)
+		struct bio *bio, int complete, struct drbd_request *r)
 {
 	MTRACE(TraceTypeRq, TraceLvlSummary,
-	       _dump_bio("Rq", mdev, bio, complete);
+	       _dump_bio("Rq", mdev, bio, complete, r);
 		);
 }
 
-static inline void dump_internal_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete) {
-	MTRACE(TraceTypeIntRq,TraceLvlSummary,
-	       _dump_bio(pfx, mdev, bio, complete);
+static inline void dump_internal_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete)
+{
+	MTRACE(TraceTypeIntRq, TraceLvlSummary,
+	       _dump_bio(pfx, mdev, bio, complete, NULL);
 		);
 }
 
@@ -1431,11 +1488,19 @@ dump_packet(struct drbd_conf *mdev, struct socket *sock,
 /* drbd_req */
 extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
-extern int drbd_merge_bvec(struct request_queue *, struct bvec_merge_data *bvm, struct bio_vec *);
+extern int drbd_merge_bvec(struct request_queue *q,
+#ifdef HAVE_bvec_merge_data
+		struct bvec_merge_data *bvm,
+#else
+		struct bio *bvm,
+#endif
+		struct bio_vec *bvec);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
 
 /* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_conf *mdev);
+extern void drbd_resume_io(struct drbd_conf *mdev);
 extern char *ppsize(char *buf, unsigned long long size);
 extern sector_t drbd_new_dev_size(struct drbd_conf *,
 		struct drbd_backing_dev *);
@@ -1445,10 +1510,7 @@ extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
 extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
 		int force);
-extern int drbd_ioctl(struct inode *inode, struct file *file,
-		      unsigned int cmd, unsigned long arg);
 enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
-extern long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg);
 extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
 
 /* drbd_worker.c */
@@ -1480,6 +1542,7 @@ extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
 extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
@@ -1492,6 +1555,7 @@ extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 
 extern void resync_timer_fn(unsigned long data);
 
@@ -1510,18 +1574,50 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
 extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_conf *mdev);
 
+/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
+ * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+			char __user *optval, int optlen)
+{
+	int err;
+	if (level == SOL_SOCKET)
+		err = sock_setsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, optval,
+					    optlen);
+	return err;
+}
+
 static inline void drbd_tcp_cork(struct socket *sock)
 {
 	int __user val = 1;
-	tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char __user *)&val, sizeof(val) );
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
 }
 
-static inline void drbd_tcp_flush(struct socket *sock)
+static inline void drbd_tcp_uncork(struct socket *sock)
 {
 	int __user val = 0;
-	tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char __user *)&val, sizeof(val) );
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+			(char __user *)&val, sizeof(val));
 }
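A sketch of how the cork/uncork pair is meant to bracket a batch of small sends so they leave as few TCP segments as possible; the two send calls are illustrative:

	drbd_tcp_cork(mdev->data.socket);	/* hold back partial frames */
	drbd_send_barrier(mdev, b);
	drbd_send_dblock(mdev, req);
	drbd_tcp_uncork(mdev->data.socket);	/* push everything out now */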
 
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
 extern struct file_operations drbd_proc_fops;
@@ -1542,11 +1638,11 @@ extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
 extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
 		int size, const char *file, const unsigned int line);
 #define drbd_set_in_sync(mdev, sector, size) \
-	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__ )
+	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
 		int size, const char *file, const unsigned int line);
 #define drbd_set_out_of_sync(mdev, sector, size) \
-	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__ )
+	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
 extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
 extern void drbd_al_shrink(struct drbd_conf *mdev);
@@ -1573,27 +1669,34 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 #define user_isp_mask 1
 #define aftr_isp_mask 1
 
+/* drbd state debug */
+#if DRBD_DEBUG_STATE_CHANGES
+#define DRBD_STATE_DEBUG_INIT_VAL(s) ({ (s).line = __LINE__; (s).func = __func__; })
+#else
+#define DRBD_STATE_DEBUG_INIT_VAL(s) do { } while (0)
+#endif
+
 #define NS(T, S) \
 	({ union drbd_state_t mask; mask.i = 0; mask.T = T##_mask; mask; }), \
-	({ union drbd_state_t val; val.i = 0; val.T = (S); val; })
+	({ union drbd_state_t val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T = (S); val; })
 #define NS2(T1, S1, T2, S2) \
 	({ union drbd_state_t mask; mask.i = 0; mask.T1 = T1##_mask; \
 	  mask.T2 = T2##_mask; mask; }), \
-	({ union drbd_state_t val; val.i = 0; val.T1 = (S1); \
+	({ union drbd_state_t val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \
 	  val.T2 = (S2); val; })
 #define NS3(T1, S1, T2, S2, T3, S3) \
 	({ union drbd_state_t mask; mask.i = 0; mask.T1 = T1##_mask; \
 	  mask.T2 = T2##_mask; mask.T3 = T3##_mask; mask; }), \
-	({ union drbd_state_t val; val.i = 0; val.T1 = (S1); \
+	({ union drbd_state_t val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \
 	  val.T2 = (S2); val.T3 = (S3); val; })
 
 #define _NS(D, T, S) \
-	D, ({ union drbd_state_t __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
+	D, ({ union drbd_state_t __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T = (S); __ns; })
 #define _NS2(D, T1, S1, T2, S2) \
-	D, ({ union drbd_state_t __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	D, ({ union drbd_state_t __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \
 	__ns.T2 = (S2); __ns; })
 #define _NS3(D, T1, S1, T2, S2, T3, S3) \
-	D, ({ union drbd_state_t __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	D, ({ union drbd_state_t __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \
 	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
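The NS helpers expand to the (mask, val) pair the state functions take; for example, the ProtocolError forcing in tl_release() further down reads drbd_force_state(mdev, NS(conn, ProtocolError)), and a two-field change would look like this (illustrative):

	drbd_request_state(mdev, NS2(disk, Diskless, pdsk, DUnknown));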
 
 static inline void drbd_state_lock(struct drbd_conf *mdev)
@@ -1682,26 +1785,40 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
 	switch (bdev->dc.meta_dev_idx) {
 	case DRBD_MD_INDEX_INTERNAL:
 	case DRBD_MD_INDEX_FLEX_INT:
-		return bdev->md.md_offset + MD_AL_OFFSET -1;
+		return bdev->md.md_offset + MD_AL_OFFSET - 1;
 	case DRBD_MD_INDEX_FLEX_EXT:
 	default:
 		return bdev->md.md_offset + bdev->md.md_size_sect;
 	}
 }
 
-/* returns the capacity we announce to out peer */
+/* returns the capacity we announce to our peer.
+ * we clip ourselves at the various MAX_SECTORS, because if we don't,
+ * current implementation will oops sooner or later */
 static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
 {
+	sector_t s;
 	switch (bdev->dc.meta_dev_idx) {
 	case DRBD_MD_INDEX_INTERNAL:
 	case DRBD_MD_INDEX_FLEX_INT:
-		return drbd_get_capacity(bdev->backing_bdev)
-			? drbd_md_first_sector(bdev)
+		s = drbd_get_capacity(bdev->backing_bdev)
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+					drbd_md_first_sector(bdev))
 			: 0;
+		break;
 	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));
+		break;
 	default:
-		return drbd_get_capacity(bdev->backing_bdev);
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
 	}
+	return s;
 }
 
 /* returns the sector number of our meta data 'super' block */
@@ -1840,7 +1957,7 @@ static inline void inc_ap_pending(struct drbd_conf *mdev)
 		    atomic_read(&mdev->which))
 
 #define dec_ap_pending(mdev)	do {				\
-	typecheck(struct drbd_conf *, mdev);				\
+	typecheck(struct drbd_conf *, mdev);			\
 	if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
 		wake_up(&mdev->misc_wait);			\
 	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
@@ -1857,7 +1974,7 @@ static inline void inc_rs_pending(struct drbd_conf *mdev)
 }
 
 #define dec_rs_pending(mdev)	do {				\
-	typecheck(struct drbd_conf *, mdev);				\
+	typecheck(struct drbd_conf *, mdev);			\
 	atomic_dec(&mdev->rs_pending_cnt);			\
 	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
 
@@ -1876,12 +1993,12 @@ static inline void inc_unacked(struct drbd_conf *mdev)
 }
 
 #define dec_unacked(mdev)	do {				\
-	typecheck(struct drbd_conf *, mdev);				\
+	typecheck(struct drbd_conf *, mdev);			\
 	atomic_dec(&mdev->unacked_cnt);				\
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
 
 #define sub_unacked(mdev, n)	do {				\
-	typecheck(struct drbd_conf *, mdev);				\
+	typecheck(struct drbd_conf *, mdev);			\
 	atomic_sub(n, &mdev->unacked_cnt);			\
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
 
@@ -1928,7 +2045,7 @@ static inline int _inc_local_if_state(struct drbd_conf *mdev, enum drbd_disk_sta
 	int io_allowed;
 
 	atomic_inc(&mdev->local_cnt);
-	io_allowed = (mdev->state.disk >= mins );
+	io_allowed = (mdev->state.disk >= mins);
 	if (!io_allowed)
 		dec_local(mdev);
 	return io_allowed;
@@ -1961,7 +2078,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
 		 * for now, just prevent in-kernel buffer overflow.
 		 */
 		smp_rmb();
-		DRBD_WARN("cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+		drbd_WARN("cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
 				conns_to_name(mdev->state.conn),
 				*bits_left, mdev->rs_total, mdev->rs_failed);
 		*per_mil_done = 0;
@@ -1988,18 +2105,84 @@ static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
 	return mxb;
 }
 
+static inline int drbd_state_is_stable(union drbd_state_t s)
+{
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case StandAlone:
+	case WFConnection:
+	/* ... or there is a well established connection. */
+	case Connected:
+	case SyncSource:
+	case SyncTarget:
+	case VerifyS:
+	case VerifyT:
+	case PausedSyncS:
+	case PausedSyncT:
+		/* maybe stable, look at the disk state */
+		break;
+
+	/* no new io accepted during transitional states
+	 * like handshake or teardown */
+	case Disconnecting:
+	case Unconnected:
+	case Timeout:
+	case BrokenPipe:
+	case NetworkFailure:
+	case ProtocolError:
+	case TearDown:
+	case WFReportParams:
+	case StartingSyncS:
+	case StartingSyncT:
+	case WFBitMapS:
+	case WFBitMapT:
+	case WFSyncUUID:
+	case conn_mask:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case Diskless:
+	case Inconsistent:
+	case Outdated:
+	case Consistent:
+	case UpToDate:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during transitional states */
+	case Attaching:
+	case Failed:
+	case Negotiating:
+	case DUnknown:
+	case disk_mask:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;
+}
+
 static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
 {
-	const unsigned int cs = mdev->state.conn;
-	const unsigned int ds = mdev->state.disk;
 	int mxb = drbd_get_max_buffers(mdev);
+
 	if (mdev->state.susp)
 		return 0;
-	/* to avoid deadlock or bitmap corruption, we need to lock out
-	 * application io during attaching and bitmap exchange */
-	if (Attaching <= ds && ds <= Negotiating)
+	if (test_bit(SUSPEND_IO, &mdev->flags))
 		return 0;
-	if (cs == WFBitMapS || cs == WFBitMapT || cs == WFReportParams || cs == WFSyncUUID)
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states. */
+
+	/* no new io accepted when attaching or detaching the disk */
+	if (!drbd_state_is_stable(mdev->state))
 		return 0;
 
 	/* since some older kernels don't have atomic_add_unless,
@@ -2008,14 +2191,13 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
 		return 0;
 	if (test_bit(BITMAP_IO, &mdev->flags))
 		return 0;
-	atomic_inc(&mdev->ap_bio_cnt);
 	return 1;
 }
 
 /* I'd like to use wait_event_lock_irq,
  * but I'm not sure when it got introduced,
  * and not sure when it has 3 or 4 arguments */
-static inline void inc_ap_bio(struct drbd_conf *mdev)
+static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
 {
 	/* compare with after_state_ch,
 	 * os.conn != WFBitMapS && ns.conn == WFBitMapS */
@@ -2037,6 +2219,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev)
 		finish_wait(&mdev->misc_wait, &wait);
 		spin_lock_irq(&mdev->req_lock);
 	}
+	atomic_add(one_or_two, &mdev->ap_bio_cnt);
 	spin_unlock_irq(&mdev->req_lock);
 }
 
@@ -2046,6 +2229,9 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
 	int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
 
 	D_ASSERT(ap_bio >= 0);
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
 	if (ap_bio < mxb)
 		wake_up(&mdev->misc_wait);
 	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
@@ -2058,8 +2244,9 @@ static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
 {
 	mdev->ed_uuid = val;
 
-	MTRACE(TraceTypeUuid,TraceLvlMetrics,
-	       INFO(" exposed data uuid now %016llX\n",val);
+	MTRACE(TraceTypeUuid, TraceLvlMetrics,
+	       INFO(" exposed data uuid now %016llX\n",
+		    (unsigned long long)val);
 		);
 }
 
@@ -2070,7 +2257,7 @@ static inline int seq_cmp(u32 a, u32 b)
 	 * we'd have to
 	 *  a <<= 8; b <<= 8;
 	 */
-	return ((s32)(a) - (s32)(b));
+	return (s32)(a) - (s32)(b);
 }
 #define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
 #define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
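The signed subtraction keeps the ordering correct across u32 wraparound; a quick worked case:

	seq_cmp(5, 0xfffffffb) = (s32)5 - (s32)0xfffffffb = 5 - (-5) = 10 > 0
	/* so sequence number 5 is correctly seen as newer than 0xfffffffb */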
@@ -2127,4 +2314,19 @@ static inline void drbd_kick_lo(struct drbd_conf *mdev)
 		dec_local(mdev);
 	}
 }
+
+static inline void drbd_md_flush(struct drbd_conf *mdev)
+{
+	int r;
+
+	if (test_bit(MD_NO_BARRIER, &mdev->flags))
+		return;
+
+	r = blkdev_issue_flush(mdev->bc->md_bdev, NULL);
+	if (r) {
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		ERR("meta data flush failed with status %d, disabling md-flushes\n", r);
+	}
+}
+
 #endif
diff --git a/ubuntu/drbd/drbd_main.c b/ubuntu/drbd/drbd_main.c
index aa33ec6..e1e50a3 100644
--- a/ubuntu/drbd/drbd_main.c
+++ b/ubuntu/drbd/drbd_main.c
@@ -69,13 +69,18 @@ int drbd_worker(struct Drbd_thread *);
 int drbd_asender(struct Drbd_thread *);
 
 int drbd_init(void);
-int drbd_open(struct inode *inode, struct file *file);
-int drbd_close(struct inode *inode, struct file *file);
-int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
+#ifdef BD_OPS_USE_FMODE
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static int drbd_release(struct gendisk *gd, fmode_t mode);
+#else
+static int drbd_open(struct inode *inode, struct file *file);
+static int drbd_release(struct inode *inode, struct file *file);
+#endif
+STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 			   union drbd_state_t ns, enum chg_state_flags flags);
-int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-void md_sync_timer_fn(unsigned long data);
+STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+STATIC void md_sync_timer_fn(unsigned long data);
 
 MODULE_AUTHOR("Philipp Reisner <phil at linbit.com>, "
 	      "Lars Ellenberg <lars at linbit.com>");
@@ -89,8 +94,9 @@ MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
 MODULE_PARM_DESC(allow_oos, "DONT USE!");
 /* thanks to these macros, if compiled into the kernel (not-module),
  * this becomes the boot parameter drbd.minor_count */
-module_param(minor_count, int,0444);
-module_param(allow_oos, bool,0);
+module_param(minor_count, uint, 0444);
+module_param(allow_oos, bool, 0);
+module_param(cn_idx, uint, 0444);
 
 #ifdef DRBD_ENABLE_FAULTS
 int enable_faults;
@@ -108,8 +114,9 @@ module_param(fault_devs, int, 0644);
 #endif
 
 /* module parameter, defined */
-int minor_count = 32;
+unsigned int minor_count = 32;
 int allow_oos;
+unsigned int cn_idx = CN_IDX_DRBD;
 
 #ifdef ENABLE_DYNAMIC_TRACE
 int trace_type;		/* Bitmap of trace types to enable */
@@ -153,7 +160,7 @@ wait_queue_head_t drbd_pp_wait;
 STATIC struct block_device_operations drbd_ops = {
 	.owner =   THIS_MODULE,
 	.open =    drbd_open,
-	.release = drbd_close,
+	.release = drbd_release,
 };
 
 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
@@ -162,14 +169,14 @@ STATIC struct block_device_operations drbd_ops = {
 /* When checking with sparse, and this is an inline function, sparse will
    give tons of false positives. When this is a real functions sparse works.
  */
-int _inc_local_if_state(struct drbd_conf* mdev, enum drbd_disk_state mins)
+int _inc_local_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
 {
 	int io_allowed;
 
 	atomic_inc(&mdev->local_cnt);
-	io_allowed = (mdev->state.disk >= mins );
-	if( !io_allowed ) {
-		if(atomic_dec_and_test(&mdev->local_cnt))
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&mdev->local_cnt))
 			wake_up(&mdev->misc_wait);
 	}
 	return io_allowed;
@@ -194,6 +201,7 @@ STATIC int tl_init(struct drbd_conf *mdev)
 
 	mdev->oldest_barrier = b;
 	mdev->newest_barrier = b;
+	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
 
 	mdev->tl_hash = NULL;
 	mdev->tl_hash_s = 0;
@@ -204,6 +212,7 @@ STATIC int tl_init(struct drbd_conf *mdev)
 STATIC void tl_cleanup(struct drbd_conf *mdev)
 {
 	D_ASSERT(mdev->oldest_barrier == mdev->newest_barrier);
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
 	kfree(mdev->oldest_barrier);
 	kfree(mdev->unused_spare_barrier);
 	kfree(mdev->tl_hash);
@@ -245,29 +254,42 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 
 	b = mdev->oldest_barrier;
 
+	/* first some paranoia code */
+	if (b == NULL) {
+		ERR("BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			barrier_nr);
+		goto bail;
+	}
+	if (b->br_number != barrier_nr) {
+		ERR("BAD! BarrierAck #%u received, expected #%u!\n",
+			barrier_nr, b->br_number);
+		goto bail;
+	}
+	if (b->n_req != set_size) {
+		ERR("BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
+			barrier_nr, set_size, b->n_req);
+		goto bail;
+	}
+
 	/* Clean up list of requests processed during current epoch */
 	list_for_each_safe(le, tle, &b->requests) {
 		r = list_entry(le, struct drbd_request, tl_requests);
 		_req_mod(r, barrier_acked, 0);
 	}
-	list_del(&b->requests);
 	/* There could be requests on the list waiting for completion
-	   of the write to the local disk, to avoid corruptions of
-	   slab's data structures we have to remove the lists head */
+	   of the write to the local disk. To avoid corruption of
+	   slab's data structures we have to remove the list's head.
 
-	D_ASSERT(b->br_number == barrier_nr);
-	D_ASSERT(b->n_req == set_size);
+	   Also there could have been a barrier ack out of sequence, overtaking
+	   the write acks - which would be a bug and violate write ordering.
+	   To not deadlock in case we lose connection while such requests are
+	   still pending, we need some way to find them for the
+	   _req_mod(connection_lost_while_pending).
 
-#if 1
-	if (b->br_number != barrier_nr) {
-		DUMPI(b->br_number);
-		DUMPI(barrier_nr);
-	}
-	if (b->n_req != set_size) {
-		DUMPI(b->n_req);
-		DUMPI(set_size);
-	}
-#endif
+	   These have been list_move'd to the out_of_sequence_requests list in
+	   _req_mod(, barrier_acked,) above.
+	   */
+	list_del_init(&b->requests);
 
 	nob = b->next;
 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -283,6 +305,13 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 	}
 
 	spin_unlock_irq(&mdev->req_lock);
+	dec_ap_pending(mdev);
+
+	return;
+
+bail:
+	spin_unlock_irq(&mdev->req_lock);
+	drbd_force_state(mdev, NS(conn, ProtocolError));
 }
 
 
@@ -291,16 +320,14 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 void tl_clear(struct drbd_conf *mdev)
 {
 	struct drbd_barrier *b, *tmp;
-
-	DRBD_WARN("tl_clear()\n");
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+	int new_initial_bnr = net_random();
 
 	spin_lock_irq(&mdev->req_lock);
 
 	b = mdev->oldest_barrier;
-	while ( b ) {
-		struct list_head *le, *tle;
-		struct drbd_request *r;
-
+	while (b) {
 		list_for_each_safe(le, tle, &b->requests) {
 			r = list_entry(le, struct drbd_request, tl_requests);
 			_req_mod(r, connection_lost_while_pending, 0);
@@ -323,7 +350,7 @@ void tl_clear(struct drbd_conf *mdev)
 			INIT_LIST_HEAD(&b->requests);
 			INIT_LIST_HEAD(&b->w.list);
 			b->w.cb = NULL;
-			b->br_number = 4711;
+			b->br_number = new_initial_bnr;
 			b->n_req = 0;
 
 			mdev->oldest_barrier = b;
@@ -332,8 +359,15 @@ void tl_clear(struct drbd_conf *mdev)
 		kfree(b);
 		b = tmp;
 	}
-	D_ASSERT(mdev->newest_barrier == mdev->oldest_barrier);
-	D_ASSERT(mdev->newest_barrier->br_number == 4711);
+
+	/* we expect this list to be empty. */
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+
+	/* but just in case, clean it up anyways! */
+	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(r, connection_lost_while_pending, 0);
+	}
 
 	/* ensure bit indicating barrier is required is clear */
 	clear_bit(CREATE_BARRIER, &mdev->flags);
@@ -380,8 +414,10 @@ int drbd_io_error(struct drbd_conf *mdev, int forcedetach)
 
 	if (mdev->state.conn >= Connected) {
 		ok = drbd_send_state(mdev);
-		if (ok) DRBD_WARN("Notified peer that my disk is broken.\n");
-		else ERR("Sending state in drbd_io_error() failed\n");
+		if (ok)
+			drbd_WARN("Notified peer that my disk is broken.\n");
+		else
+			ERR("Sending state in drbd_io_error() failed\n");
 	}
 
 	/* Make sure we try to flush meta-data to disk - we come
@@ -399,6 +435,12 @@ int drbd_io_error(struct drbd_conf *mdev, int forcedetach)
 	return ok;
 }
 
+#if DRBD_DEBUG_STATE_CHANGES
+static void trace_st(struct drbd_conf *mdev, const unsigned long long seq,
+		const char *func, unsigned int line,
+		const char *name, union drbd_state_t s);
+#endif
+
 /**
  * cl_wide_st_chg:
  * Returns TRUE if this state change should be preformed as a cluster wide
@@ -407,11 +449,11 @@ int drbd_io_error(struct drbd_conf *mdev, int forcedetach)
 STATIC int cl_wide_st_chg(struct drbd_conf *mdev,
 			  union drbd_state_t os, union drbd_state_t ns)
 {
-	return ( os.conn >= Connected && ns.conn >= Connected &&
-		 ( ( os.role != Primary && ns.role == Primary ) ||
-		   ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) ||
-		   ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) ||
-		   ( os.disk != Diskless && ns.disk == Diskless ) ) ) ||
+	return (os.conn >= Connected && ns.conn >= Connected &&
+		 ((os.role != Primary && ns.role == Primary) ||
+		  (os.conn != StartingSyncT && ns.conn == StartingSyncT) ||
+		  (os.conn != StartingSyncS && ns.conn == StartingSyncS) ||
+		  (os.disk != Diskless && ns.disk == Diskless))) ||
 		(os.conn >= Connected && ns.conn == Disconnecting) ||
 		(os.conn == Connected && ns.conn == VerifyS);
 }
@@ -419,6 +461,13 @@ STATIC int cl_wide_st_chg(struct drbd_conf *mdev,
 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
 		      union drbd_state_t mask, union drbd_state_t val)
 {
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0xf0000000LLU;
+	unsigned long seq;
+	unsigned int line = val.line;
+	const char *func = val.func;
+#endif
+
 	unsigned long flags;
 	union drbd_state_t os, ns;
 	int rv;
@@ -426,8 +475,17 @@ int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	os = mdev->state;
 	ns.i = (os.i & ~mask.i) | val.i;
+#if DRBD_DEBUG_STATE_CHANGES
+	seq = ++sseq;
+	trace_st(mdev, seq, func, line, "!os", os);
+	trace_st(mdev, seq, func, line, "!ns", ns);
+	ns.func = NULL;
+#endif
 	rv = _drbd_set_state(mdev, ns, f, NULL);
 	ns = mdev->state;
+#if DRBD_DEBUG_STATE_CHANGES
+	trace_st(mdev, seq, func, line, "=ns", ns);
+#endif
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	return rv;
@@ -462,7 +520,8 @@ STATIC enum set_st_err _req_st_cond(struct drbd_conf *mdev,
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	os = mdev->state;
 	ns.i = (os.i & ~mask.i) | val.i;
-	if ( !cl_wide_st_chg(mdev, os, ns) ) rv = SS_CW_NoNeed;
+	if (!cl_wide_st_chg(mdev, os, ns))
+		rv = SS_CW_NoNeed;
 	if (!rv) {
 		rv = is_valid_state(mdev, ns);
 		if (rv == SS_Success) {
@@ -486,6 +545,13 @@ STATIC int drbd_req_state(struct drbd_conf *mdev,
 			  union drbd_state_t mask, union drbd_state_t val,
 			  enum chg_state_flags f)
 {
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0;
+	unsigned long seq;
+	unsigned int line = val.line;
+	const char *func = val.func;
+#endif
+
 	struct completion done;
 	unsigned long flags;
 	union drbd_state_t os, ns;
@@ -496,10 +562,17 @@ STATIC int drbd_req_state(struct drbd_conf *mdev,
 	if (f & ChgSerialize)
 		mutex_lock(&mdev->state_mutex);
 
-	spin_lock_irqsave(&mdev->req_lock,flags);
+	spin_lock_irqsave(&mdev->req_lock, flags);
 	os = mdev->state;
 	ns.i = (os.i & ~mask.i) | val.i;
 
+#if DRBD_DEBUG_STATE_CHANGES
+	seq = ++sseq;
+	trace_st(mdev, seq, func, line, "?os", os);
+	trace_st(mdev, seq, func, line, "?ns", ns);
+	ns.func = NULL;
+#endif
+
 	if (cl_wide_st_chg(mdev, os, ns)) {
 		rv = is_valid_state(mdev, ns);
 		if (rv == SS_Success)
@@ -513,7 +586,7 @@ STATIC int drbd_req_state(struct drbd_conf *mdev,
 		}
 
 		drbd_state_lock(mdev);
-		if ( !drbd_send_state_req(mdev, mask, val) ) {
+		if (!drbd_send_state_req(mdev, mask, val)) {
 			drbd_state_unlock(mdev);
 			rv = SS_CW_FailedByPeer;
 			if (f & ChgStateVerbose)
@@ -547,7 +620,12 @@ STATIC int drbd_req_state(struct drbd_conf *mdev,
 		wait_for_completion(&done);
 	}
 
-  abort:
+abort:
+#if DRBD_DEBUG_STATE_CHANGES
+	trace_st(mdev, seq, func, line, ":os", os);
+	trace_st(mdev, seq, func, line, ":ns", ns);
+#endif
+
 	if (f & ChgSerialize)
 		mutex_unlock(&mdev->state_mutex);
 
@@ -571,10 +649,39 @@ int _drbd_request_state(struct drbd_conf *mdev,	union drbd_state_t mask,
 	return rv;
 }
 
+#if DRBD_DEBUG_STATE_CHANGES
+static void trace_st(struct drbd_conf *mdev, const unsigned long long seq,
+		const char *func, unsigned int line,
+		const char *name, union drbd_state_t s)
+{
+	const struct task_struct *c = current;
+	const char *context =
+		c == mdev->worker.task ? "worker" :
+		c == mdev->receiver.task ? "receiver" :
+		c == mdev->asender.task ? "asender" : "other";
+
+	DBG(" %8llx [%s] %s:%u %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
+	    seq, context, func, line,
+	    name,
+	    conns_to_name(s.conn),
+	    roles_to_name(s.role),
+	    roles_to_name(s.peer),
+	    disks_to_name(s.disk),
+	    disks_to_name(s.pdsk),
+	    s.susp ? 's' : 'r',
+	    s.aftr_isp ? 'a' : '-',
+	    s.peer_isp ? 'p' : '-',
+	    s.user_isp ? 'u' : '-'
+	    );
+}
+#else
+#define trace_st(...) do { } while (0)
+#endif
 
 STATIC void print_st(struct drbd_conf *mdev, char *name, union drbd_state_t ns)
 {
-	ERR(" %s = { cs:%s st:%s/%s ds:%s/%s %c%c%c%c }\n",
+	ERR(" %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
 	    name,
 	    conns_to_name(ns.conn),
 	    roles_to_name(ns.role),
@@ -602,10 +709,10 @@ void print_st_err(struct drbd_conf *mdev,
 #define peers_to_name roles_to_name
 #define pdsks_to_name disks_to_name
 
-#define susps_to_name(A) ( (A) ? "1" : "0" )
-#define aftr_isps_to_name(A) ( (A) ? "1" : "0" )
-#define peer_isps_to_name(A) ( (A) ? "1" : "0" )
-#define user_isps_to_name(A) ( (A) ? "1" : "0" )
+#define susps_to_name(A)     ((A) ? "1" : "0")
+#define aftr_isps_to_name(A) ((A) ? "1" : "0")
+#define peer_isps_to_name(A) ((A) ? "1" : "0")
+#define user_isps_to_name(A) ((A) ? "1" : "0")
 
 #define PSC(A) \
 	({ if (ns.A != os.A) { \
@@ -628,8 +735,8 @@ int is_valid_state(struct drbd_conf *mdev, union drbd_state_t ns)
 	}
 
 	if (inc_net(mdev)) {
-		if ( !mdev->net_conf->two_primaries &&
-		    ns.role == Primary && ns.peer == Primary )
+		if (!mdev->net_conf->two_primaries &&
+		    ns.role == Primary && ns.peer == Primary)
 			rv = SS_TwoPrimaries;
 		dec_net(mdev);
 	}
@@ -639,29 +746,28 @@ int is_valid_state(struct drbd_conf *mdev, union drbd_state_t ns)
 	else if (ns.role == Secondary && mdev->open_cnt)
 		rv = SS_DeviceInUse;
 
-	else if ( ns.role == Primary && ns.conn < Connected &&
-		 ns.disk < UpToDate ) rv = SS_NoUpToDateDisk;
+	else if (ns.role == Primary && ns.conn < Connected && ns.disk < UpToDate)
+		rv = SS_NoUpToDateDisk;
 
-	else if ( fp >= Resource &&
-		 ns.role == Primary && ns.conn < Connected &&
-		 ns.pdsk >= DUnknown ) rv = SS_PrimaryNOP;
+	else if (fp >= Resource &&
+		 ns.role == Primary && ns.conn < Connected && ns.pdsk >= DUnknown)
+		rv = SS_PrimaryNOP;
 
-	else if ( ns.role == Primary && ns.disk <= Inconsistent &&
-		 ns.pdsk <= Inconsistent ) rv = SS_NoUpToDateDisk;
+	else if (ns.role == Primary && ns.disk <= Inconsistent && ns.pdsk <= Inconsistent)
+		rv = SS_NoUpToDateDisk;
 
-	else if ( ns.conn > Connected &&
-		 ns.disk < UpToDate && ns.pdsk < UpToDate )
+	else if (ns.conn > Connected && ns.disk < UpToDate && ns.pdsk < UpToDate)
 		rv = SS_BothInconsistent;
 
-	else if ( ns.conn > Connected &&
-		 (ns.disk == Diskless || ns.pdsk == Diskless ) )
+	else if (ns.conn > Connected && (ns.disk == Diskless || ns.pdsk == Diskless))
 		rv = SS_SyncingDiskless;
 
-	else if ( (ns.conn == Connected ||
+	else if ((ns.conn == Connected ||
 		  ns.conn == WFBitMapS ||
 		  ns.conn == SyncSource ||
 		  ns.conn == PausedSyncS) &&
-		 ns.disk == Outdated ) rv = SS_ConnectedOutdates;
+		  ns.disk == Outdated)
+		rv = SS_ConnectedOutdates;
 
 	else if( (ns.conn == VerifyS ||
 		  ns.conn == VerifyT) &&
@@ -679,8 +785,9 @@ int is_valid_state_transition(struct drbd_conf *mdev,
 {
 	int rv = SS_Success;
 
-	if ( (ns.conn == StartingSyncT || ns.conn == StartingSyncS ) &&
-	    os.conn > Connected) rv = SS_ResyncRunning;
+	if ((ns.conn == StartingSyncT || ns.conn == StartingSyncS) &&
+	    os.conn > Connected)
+		rv = SS_ResyncRunning;
 
 	if (ns.conn == Disconnecting && os.conn == StandAlone)
 		rv = SS_AlreadyStandAlone;
@@ -688,11 +795,11 @@ int is_valid_state_transition(struct drbd_conf *mdev,
 	if (ns.disk > Attaching && os.disk == Diskless)
 		rv = SS_IsDiskLess;
 
-	if ( ns.conn == WFConnection && os.conn < Unconnected )
-		rv=SS_NoNetConfig;
+	if (ns.conn == WFConnection && os.conn < Unconnected)
+		rv = SS_NoNetConfig;
 
-	if ( ns.disk == Outdated && os.disk < Outdated && os.disk != Attaching)
-		rv=SS_LowerThanOutdated;
+	if (ns.disk == Outdated && os.disk < Outdated && os.disk != Attaching)
+		rv = SS_LowerThanOutdated;
 
 	if (ns.conn == Disconnecting && os.conn == Unconnected)
 		rv = SS_InTransientState;
@@ -700,7 +807,7 @@ int is_valid_state_transition(struct drbd_conf *mdev,
 	if (ns.conn == os.conn && ns.conn == WFReportParams)
 		rv = SS_InTransientState;
 
-	if( (ns.conn == VerifyS || ns.conn == VerifyT) && os.conn < Connected )
+	if ((ns.conn == VerifyS || ns.conn == VerifyT) && os.conn < Connected)
 		rv=SS_NeedConnection;
 
 	if ((ns.conn == VerifyS || ns.conn == VerifyT) &&
@@ -718,16 +825,28 @@ int _drbd_set_state(struct drbd_conf *mdev,
 		    union drbd_state_t ns, enum chg_state_flags flags,
 		    struct completion *done)
 {
+#if DRBD_DEBUG_STATE_CHANGES
+	static unsigned long long sseq = 0xff000000LLU;
+	unsigned long long seq = 0;
+#endif
 	union drbd_state_t os;
 	int rv = SS_Success;
 	int warn_sync_abort = 0;
 	enum fencing_policy fp;
-	struct after_state_chg_work* ascw;
+	struct after_state_chg_work *ascw;
 
 	MUST_HOLD(&mdev->req_lock);
 
 	os = mdev->state;
 
+#if DRBD_DEBUG_STATE_CHANGES
+	if (ns.func) {
+		seq = ++sseq;
+		trace_st(mdev, seq, ns.func, ns.line, "==os", os);
+		trace_st(mdev, seq, ns.func, ns.line, "==ns", ns);
+	}
+#endif
+
 	fp = DontCare;
 	if (inc_local(mdev)) {
 		fp = mdev->bc->dc.fencing;
@@ -736,13 +855,21 @@ int _drbd_set_state(struct drbd_conf *mdev,
 
 	/* Early state sanitising. */
 
+	/* Disallow the invalidate command to connect */
+	if ((ns.conn == StartingSyncS || ns.conn == StartingSyncT) &&
+		os.conn < Connected) {
+		ns.conn = os.conn;
+		ns.pdsk = os.pdsk;
+	}
+
 	/* Disallow network errors to configure a device's network part */
-	if ( (ns.conn >= Timeout && ns.conn <= TearDown ) &&
-	    os.conn <= Disconnecting )
+	if ((ns.conn >= Timeout && ns.conn <= TearDown) &&
+	    os.conn <= Disconnecting)
 		ns.conn = os.conn;
 
-	/* After a network error (+TearDown) only Unconnected can follow */
-	if (os.conn >= Timeout && os.conn <= TearDown && ns.conn != Unconnected)
+	/* After a network error (+TearDown) only Unconnected or Disconnecting can follow */
+	if (os.conn >= Timeout && os.conn <= TearDown &&
+	    ns.conn != Unconnected && ns.conn != Disconnecting)
 		ns.conn = os.conn;
 
 	/* After Disconnecting only StandAlone may follow */
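
These rewrites are the first half of a sanitise-then-validate scheme: impossible requests are silently coerced here, and only the result is run through is_valid_state() / is_valid_state_transition(). Two of the rules, reduced to a standalone sketch with simplified names and values:

	#include <stdio.h>

	enum conn { StandAlone, Disconnecting, Unconnected, Timeout,
		    TearDown, WFConnection, Connected };

	struct st { enum conn conn; int peer_known; };

	static struct st sanitize(struct st os, struct st ns)
	{
		/* a network error may not configure an unconfigured device */
		if (ns.conn >= Timeout && ns.conn <= TearDown &&
		    os.conn <= Disconnecting)
			ns.conn = os.conn;
		/* without a connection the peer's state cannot be known */
		if (ns.conn < Connected)
			ns.peer_known = 0;
		return ns;
	}

	int main(void)
	{
		struct st os = { StandAlone, 0 }, ns = { TearDown, 1 };
		ns = sanitize(os, ns);
		printf("conn=%d peer_known=%d\n", ns.conn, ns.peer_known);
		return 0;
	}
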
@@ -752,22 +879,23 @@ int _drbd_set_state(struct drbd_conf *mdev,
 	if (ns.conn < Connected) {
 		ns.peer_isp = 0;
 		ns.peer = Unknown;
-		if ( ns.pdsk > DUnknown ||
-		     ns.pdsk < Inconsistent ) ns.pdsk = DUnknown;
+		if (ns.pdsk > DUnknown || ns.pdsk < Inconsistent)
+			ns.pdsk = DUnknown;
 	}
 
 	if (ns.conn <= Disconnecting && ns.disk == Diskless)
 		ns.pdsk = DUnknown;
 
 	if (os.conn > Connected && ns.conn > Connected &&
-	     (ns.disk <= Failed || ns.pdsk <= Failed )) {
+	    (ns.disk <= Failed || ns.pdsk <= Failed)) {
 		warn_sync_abort = 1;
 		ns.conn = Connected;
 	}
 
-	if (ns.conn != os.conn && ns.conn >= Connected &&
-	    (ns.disk == Consistent || ns.disk == Outdated)) {
-		switch(ns.conn) {
+	if (ns.conn >= Connected &&
+	    ((ns.disk == Consistent || ns.disk == Outdated) ||
+	     (ns.disk == Negotiating && ns.conn == WFBitMapT))) {
+		switch (ns.conn) {
 		case WFBitMapT:
 		case PausedSyncT:
 			ns.disk = Outdated;
@@ -780,16 +908,16 @@ int _drbd_set_state(struct drbd_conf *mdev,
 			break;
 		case SyncTarget:
 			ns.disk = Inconsistent;
-			DRBD_WARN("Implicit set disk state Inconsistent!\n");
+			drbd_WARN("Implicit set disk state Inconsistent!\n");
 			break;
 		}
 		if (os.disk == Outdated && ns.disk == UpToDate)
-			DRBD_WARN("Implicit set disk from Outdate to UpToDate\n");
+			drbd_WARN("Implicit set disk from Outdate to UpToDate\n");
 	}
 
-	if (ns.conn != os.conn && ns.conn >= Connected &&
+	if (ns.conn >= Connected &&
 	    (ns.pdsk == Consistent || ns.pdsk == Outdated)) {
-		switch(ns.conn) {
+		switch (ns.conn) {
 		case Connected:
 		case WFBitMapT:
 		case PausedSyncT:
@@ -802,11 +930,11 @@ int _drbd_set_state(struct drbd_conf *mdev,
 			break;
 		case SyncSource:
 			ns.pdsk = Inconsistent;
-			DRBD_WARN("Implicit set pdsk Inconsistent!\n");
+			drbd_WARN("Implicit set pdsk Inconsistent!\n");
 			break;
 		}
 		if (os.pdsk == Outdated && ns.pdsk == UpToDate)
-			DRBD_WARN("Implicit set pdsk from Outdate to UpToDate\n");
+			drbd_WARN("Implicit set pdsk from Outdate to UpToDate\n");
 	}
 
 	/* Connection breaks down before we finished "Negotiating" */
@@ -841,10 +969,15 @@ int _drbd_set_state(struct drbd_conf *mdev,
 			ns.conn = SyncTarget;
 	}
 
+#if DRBD_DEBUG_STATE_CHANGES
+	if (ns.func)
+		trace_st(mdev, seq, ns.func, ns.line, "==ns", ns);
+#endif
+
 	if (ns.i == os.i)
 		return SS_NothingToDo;
 
-	if ( !(flags & ChgStateHard) ) {
+	if (!(flags & ChgStateHard)) {
 		/*  pre-state-change checks ; only look at ns  */
 		/* See drbd_state_sw_errors in drbd_strings.c */
 
@@ -853,13 +986,13 @@ int _drbd_set_state(struct drbd_conf *mdev,
 			/* If the old state was illegal as well, then let
 			   this happen...*/
 
-			if ( is_valid_state(mdev, os) == rv ) {
+			if (is_valid_state(mdev, os) == rv) {
 				ERR("Considering state change from bad state. "
 				    "Error would be: '%s'\n",
 				    set_st_err_name(rv));
 				print_st(mdev, "old", os);
 				print_st(mdev, "new", ns);
-				rv = is_valid_state_transition(mdev,ns,os);
+				rv = is_valid_state_transition(mdev, ns, os);
 			}
 		} else
 			rv = is_valid_state_transition(mdev, ns, os);
@@ -872,7 +1005,7 @@ int _drbd_set_state(struct drbd_conf *mdev,
 	}
 
 	if (warn_sync_abort)
-		DRBD_WARN("Resync aborted.\n");
+		drbd_WARN("Resync aborted.\n");
 
 #if DUMP_MD >= 2
 	{
@@ -892,6 +1025,11 @@ int _drbd_set_state(struct drbd_conf *mdev,
 	}
 #endif
 
+#if DRBD_DEBUG_STATE_CHANGES
+	if (ns.func)
+		trace_st(mdev, seq, ns.func, ns.line, ":=ns", ns);
+#endif
+
 	mdev->state.i = ns.i;
 	wake_up(&mdev->misc_wait);
 	wake_up(&mdev->state_wait);
@@ -902,14 +1040,13 @@ int _drbd_set_state(struct drbd_conf *mdev,
 		mod_timer(&mdev->resync_timer, jiffies);
 	}
 
-	if ( (os.conn == PausedSyncT || os.conn == PausedSyncS) &&
-	    (ns.conn == SyncTarget  || ns.conn == SyncSource) ) {
+	if ((os.conn == PausedSyncT || os.conn == PausedSyncS) &&
+	    (ns.conn == SyncTarget  || ns.conn == SyncSource)) {
 		INFO("Syncer continues.\n");
 		mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
 		if (ns.conn == SyncTarget) {
-			if (!test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags)) {
-				mod_timer(&mdev->resync_timer,jiffies);
-			}
+			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
+				mod_timer(&mdev->resync_timer, jiffies);
 			/* This if (!test_bit) is only needed for the case
 			   that a device that has ceased to use its timer,
 			   i.e. it is already in drbd_resync_finished() gets
@@ -917,8 +1054,8 @@ int _drbd_set_state(struct drbd_conf *mdev,
 		}
 	}
 
-	if ( (os.conn == SyncTarget  || os.conn == SyncSource) &&
-	    (ns.conn == PausedSyncT || ns.conn == PausedSyncS) ) {
+	if ((os.conn == SyncTarget  || os.conn == SyncSource) &&
+	    (ns.conn == PausedSyncT || ns.conn == PausedSyncS)) {
 		INFO("Resync suspended\n");
 		mdev->rs_mark_time = jiffies;
 		if (ns.conn == PausedSyncT)
@@ -935,25 +1072,29 @@ int _drbd_set_state(struct drbd_conf *mdev,
 		mdev->rs_mark_time = jiffies;
 		mdev->ov_last_oos_size = 0;
 		mdev->ov_last_oos_start = 0;
-		if(ns.conn == VerifyS) {
+
+		if (ns.conn == VerifyS)
 			mod_timer(&mdev->resync_timer,jiffies);
-		}
 	}
 
-	if(inc_local(mdev)) {
+	if (inc_local(mdev)) {
 		u32 mdf = mdev->bc->md.flags & ~(MDF_Consistent|MDF_PrimaryInd|
 						 MDF_ConnectedInd|MDF_WasUpToDate|
-						 MDF_PeerOutDated );
-		if (test_bit(CRASHED_PRIMARY,&mdev->flags) ||
+						 MDF_PeerOutDated);
+
+		if (test_bit(CRASHED_PRIMARY, &mdev->flags) ||
 		    mdev->state.role == Primary ||
-		    ( mdev->state.pdsk < Inconsistent &&
-		      mdev->state.peer == Primary ) )  mdf |= MDF_PrimaryInd;
-		if (mdev->state.conn > WFReportParams) mdf |= MDF_ConnectedInd;
-		if (mdev->state.disk > Inconsistent)   mdf |= MDF_Consistent;
-		if (mdev->state.disk > Outdated)       mdf |= MDF_WasUpToDate;
-		if (mdev->state.pdsk <= Outdated &&
-		    mdev->state.pdsk >= Inconsistent)  mdf |= MDF_PeerOutDated;
-		if( mdf != mdev->bc->md.flags) {
+		    (mdev->state.pdsk < Inconsistent && mdev->state.peer == Primary))
+			mdf |= MDF_PrimaryInd;
+		if (mdev->state.conn > WFReportParams)
+			mdf |= MDF_ConnectedInd;
+		if (mdev->state.disk > Inconsistent)
+			mdf |= MDF_Consistent;
+		if (mdev->state.disk > Outdated)
+			mdf |= MDF_WasUpToDate;
+		if (mdev->state.pdsk <= Outdated && mdev->state.pdsk >= Inconsistent)
+			mdf |= MDF_PeerOutDated;
+		if (mdf != mdev->bc->md.flags) {
 			mdev->bc->md.flags = mdf;
 			drbd_md_mark_dirty(mdev);
 		}
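
The flag word is recomputed from live state on every transition, and the metadata is marked dirty only if the word actually changed, so metadata writes stay rare. The compare-before-dirty idea in isolation (flag names are stand-ins):

	#include <stdio.h>

	#define F_PRIMARY   (1u << 0)
	#define F_CONNECTED (1u << 1)

	struct md { unsigned flags; int dirty; };

	static void update_flags(struct md *md, int primary, int connected)
	{
		unsigned f = md->flags & ~(F_PRIMARY | F_CONNECTED);

		if (primary)
			f |= F_PRIMARY;
		if (connected)
			f |= F_CONNECTED;
		if (f != md->flags) {	/* only dirty the metadata on real change */
			md->flags = f;
			md->dirty = 1;	/* stands in for drbd_md_mark_dirty() */
		}
	}

	int main(void)
	{
		struct md md = { F_CONNECTED, 0 };
		update_flags(&md, 1, 1);
		printf("flags=%#x dirty=%d\n", md.flags, md.dirty);
		return 0;
	}
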
@@ -989,13 +1130,13 @@ int _drbd_set_state(struct drbd_conf *mdev,
 		ascw->done = done;
 		drbd_queue_work(&mdev->data.work, &ascw->w);
 	} else {
-		DRBD_WARN("Could not kmalloc an ascw\n");
+		drbd_WARN("Could not kmalloc an ascw\n");
 	}
 
 	return rv;
 }
 
-int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	struct after_state_chg_work *ascw;
 
@@ -1028,12 +1169,12 @@ static void abw_start_sync(struct drbd_conf *mdev, int rv)
 	}
 }
 
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
+STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 			   union drbd_state_t ns, enum chg_state_flags flags)
 {
 	enum fencing_policy fp;
 
-	if ( (os.conn != Connected && ns.conn == Connected) ) {
+	if (os.conn != Connected && ns.conn == Connected) {
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
 		if (mdev->p_uuid)
 			mdev->p_uuid[UUID_FLAGS] &= ~((u64)2);
@@ -1048,37 +1189,45 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 	/* Inform userspace about the change... */
 	drbd_bcast_state(mdev, ns);
 
+	if (!(os.role == Primary && os.disk < UpToDate && os.pdsk < UpToDate) &&
+	    (ns.role == Primary && ns.disk < UpToDate && ns.pdsk < UpToDate))
+		drbd_khelper(mdev, "pri-on-incon-degr");
+
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
 	if (fp == Stonith && ns.susp) {
 		/* case1: The outdate peer handler is successful:
 		 * case2: The connection was established again: */
-		if ( (os.pdsk > Outdated  && ns.pdsk <= Outdated) ||
-		     (os.conn < Connected && ns.conn >= Connected) ) {
+		if ((os.pdsk > Outdated  && ns.pdsk <= Outdated) ||
+		    (os.conn < Connected && ns.conn >= Connected)) {
 			tl_clear(mdev);
 			spin_lock_irq(&mdev->req_lock);
 			_drbd_set_state(_NS(mdev, susp, 0), ChgStateVerbose, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
 	}
-	/* Do not change the order of the if above and below... */
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == Diskless && ns.pdsk > Diskless) {      /* attach on the peer */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
 	if (os.conn != WFBitMapS && ns.conn == WFBitMapS)
-		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL);
+		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
 
 	/* Lost contact to peer's copy of the data */
-	if ( (os.pdsk >= Inconsistent &&
-	      os.pdsk != DUnknown &&
-	      os.pdsk != Outdated)
-	&&   (ns.pdsk < Inconsistent ||
-	      ns.pdsk == DUnknown ||
-	      ns.pdsk == Outdated) ) {
+	if ((os.pdsk >= Inconsistent &&
+	     os.pdsk != DUnknown &&
+	     os.pdsk != Outdated)
+	&&  (ns.pdsk < Inconsistent ||
+	     ns.pdsk == DUnknown ||
+	     ns.pdsk == Outdated)) {
+		/* FIXME race with drbd_sync_handshake accessing this! */
 		kfree(mdev->p_uuid);
 		mdev->p_uuid = NULL;
 		if (inc_local(mdev)) {
-			/* generate new uuid, unless we did already */
-			if (ns.role == Primary &&
-			    mdev->bc->md.uuid[Bitmap] == 0)
+			if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 &&
+			    ns.disk >= UpToDate)
 				drbd_uuid_new_current(mdev);
 			if (ns.peer == Primary) {
 				/* Note: The condition ns.peer == Primary implies
@@ -1111,8 +1260,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 	}
 
 	/* Last part of the attaching process ... */
-	if ( ns.conn >= Connected &&
-	     os.disk == Attaching && ns.disk == Negotiating ) {
+	if (ns.conn >= Connected &&
+	    os.disk == Attaching && ns.disk == Negotiating) {
 		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
 		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
 		drbd_send_sizes(mdev);  /* to start sync... */
@@ -1121,35 +1270,30 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 	}
 
 	/* We want to pause/continue resync, tell peer. */
-	if ( ns.conn >= Connected &&
-	     (( os.aftr_isp != ns.aftr_isp ) ||
-	      ( os.user_isp != ns.user_isp )) )
+	if (ns.conn >= Connected &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
 		drbd_send_state(mdev);
 
 	/* In case one of the isp bits got set, suspend other devices. */
-	if ( ( !os.aftr_isp && !os.peer_isp && !os.user_isp) &&
-	     ( ns.aftr_isp || ns.peer_isp || ns.user_isp) )
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
 		suspend_other_sg(mdev);
 
 	/* Make sure the peer gets informed about possible state
 	   changes (ISP bits) while we were in WFReportParams. */
-	if (os.conn == WFReportParams && ns.conn >= Connected) {
+	if (os.conn == WFReportParams && ns.conn >= Connected)
 		drbd_send_state(mdev);
-	}
 
 	/* We are in the progress to start a full sync... */
-	if ( ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) ||
-	     ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) ) {
-		INFO("Queueing bitmap io: about to start a forced full sync\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync);
-	}
+	if ((os.conn != StartingSyncT && ns.conn == StartingSyncT) ||
+	    (os.conn != StartingSyncS && ns.conn == StartingSyncS))
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
 
 	/* We are invalidating our self... */
-	if ( os.conn < Connected && ns.conn < Connected &&
-	       os.disk > Inconsistent && ns.disk == Inconsistent ) {
-		INFO("Queueing bitmap io: invalidate forced full sync\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL);
-	}
+	if (os.conn < Connected && ns.conn < Connected &&
+	    os.disk > Inconsistent && ns.disk == Inconsistent)
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
 
 	if (os.disk > Diskless && ns.disk == Diskless) {
 		/* since inc_local() only works as long as disk>=Inconsistent,
@@ -1166,10 +1310,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 		__no_warn(local, mdev->bc = NULL;);
 	}
 
+	/* Disks got bigger while they were detached */
+	if (ns.disk > Negotiating && ns.pdsk > Negotiating &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+		if (ns.conn == Connected)
+			resync_after_online_grow(mdev);
+	}
+
 	/* A resync finished or aborted, wake paused devices... */
-	if ( (os.conn > Connected && ns.conn <= Connected) ||
-	     (os.peer_isp && !ns.peer_isp) ||
-	     (os.user_isp && !ns.user_isp) )
+	if ((os.conn > Connected && ns.conn <= Connected) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
 		resume_next_sg(mdev);
 
 	/* Upon network connection, we need to start the receiver */
@@ -1178,7 +1329,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state_t os,
 
 	/* Terminate worker thread if we are unconfigured - it will be
 	   restarted as needed... */
-	if (ns.disk == Diskless && ns.conn == StandAlone)
+	if (ns.disk == Diskless && ns.conn == StandAlone && ns.role == Secondary)
 		drbd_thread_stop_nowait(&mdev->worker);
 
 	drbd_md_sync(mdev);
@@ -1189,7 +1340,12 @@ STATIC int drbd_thread_setup(void *arg)
 {
 	struct Drbd_thread *thi = (struct Drbd_thread *) arg;
 	struct drbd_conf *mdev = thi->mdev;
+	long timeout;
 	int retval;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
 
 	daemonize("drbd_thread");
 	D_ASSERT(get_t_state(thi) == Running);
@@ -1198,31 +1354,46 @@ STATIC int drbd_thread_setup(void *arg)
 	thi->task = current;
 	smp_mb();
 	spin_unlock(&thi->t_lock);
+
+	/* stolen from kthread; FIXME we need to convert to kthread api!
+	 * wait for wakeup */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
 	complete(&thi->startstop); /* notify: thi->task is set. */
+	timeout = schedule_timeout(10*HZ);
+	D_ASSERT(timeout != 0);
 
-	while (1) {
-		retval = thi->function(thi);
-		if (get_t_state(thi) != Restarting) break;
+restart:
+	retval = thi->function(thi);
+
+	spin_lock(&thi->t_lock);
+
+	/* if the receiver has been "Exiting", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes Unconnected,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "Restarting" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees Exiting, and can remap to Restarting,
+	 * or thread_start sees None, and can proceed as normal.
+	 */
+
+	if (thi->t_state == Restarting) {
+		INFO("Restarting %s thread\n", me);
 		thi->t_state = Running;
+		spin_unlock(&thi->t_lock);
+		goto restart;
 	}
 
-	mutex_lock(&thi->task_mutex);
-	spin_lock(&thi->t_lock);
 	thi->task = NULL;
 	thi->t_state = None;
 	smp_mb();
-	spin_unlock(&thi->t_lock);
-	mutex_unlock(&thi->task_mutex);
 
 	/* THINK maybe two different completions? */
 	complete(&thi->startstop); /* notify: thi->task unset. */
+	INFO("Terminating %s thread\n", me);
+	spin_unlock(&thi->t_lock);
 
-	INFO("Terminating %s thread\n",
-	     thi == &mdev->receiver ? "receiver" :
-	     thi == &mdev->asender  ? "asender"  :
-	     thi == &mdev->worker   ? "worker"   : "NONSENSE");
-
-	// Release mod reference taken when thread was started
+	/* Release mod reference taken when thread was started */
 	module_put(THIS_MODULE);
 	return retval;
 }
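
The restart logic above hinges on doing the t_state test and the follow-up assignment under one lock; otherwise a restart request racing with thread exit is lost between "function returned" and "t_state = None". A pthreads rendition of just that decision (a mutex standing in for the spinlock; not the driver's code):

	#include <pthread.h>

	enum t_state { None, Running, Exiting, Restarting };

	struct thr {
		pthread_mutex_t lock;
		enum t_state t_state;
	};

	/* called by the thread itself when its function returns;
	 * nonzero means "loop again" because a restart raced in */
	static int exit_or_restart(struct thr *t)
	{
		int again;

		pthread_mutex_lock(&t->lock);
		again = (t->t_state == Restarting);
		t->t_state = again ? Running : None;
		pthread_mutex_unlock(&t->lock);
		return again;
	}
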
@@ -1235,22 +1406,23 @@ STATIC void drbd_thread_init(struct drbd_conf *mdev, struct Drbd_thread *thi,
 	thi->t_state = None;
 	thi->function = func;
 	thi->mdev = mdev;
-	mutex_init(&thi->task_mutex);
 }
 
 int drbd_thread_start(struct Drbd_thread *thi)
 {
 	int pid;
 	struct drbd_conf *mdev = thi->mdev;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
 
 	spin_lock(&thi->t_lock);
 
-	if (thi->t_state == None) {
+	switch (thi->t_state) {
+	case None:
 		INFO("Starting %s thread (from %s [%d])\n",
-		     thi == &mdev->receiver ? "receiver" :
-		     thi == &mdev->asender  ? "asender"  :
-		     thi == &mdev->worker   ? "worker"   : "NONSENSE",
-		     current->comm, current->pid);
+				me, current->comm, current->pid);
 
 		/* Get ref on module for thread - this is released when thread exits */
 		if (!try_module_get(THIS_MODULE)) {
@@ -1261,6 +1433,7 @@ int drbd_thread_start(struct Drbd_thread *thi)
 
 		init_completion(&thi->startstop);
 		D_ASSERT(thi->task == NULL);
+		thi->reset_cpu_mask = 1;
 		thi->t_state = Running;
 		spin_unlock(&thi->t_lock);
 		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
@@ -1275,10 +1448,23 @@ int drbd_thread_start(struct Drbd_thread *thi)
 		}
 		/* waits until thi->task is set */
 		wait_for_completion(&thi->startstop);
-		D_ASSERT(thi->task);
-		D_ASSERT(get_t_state(thi) == Running);
-	} else {
+		if (thi->t_state != Running)
+			ERR("ASSERT FAILED: %s t_state == %d expected %d.\n",
+					me, thi->t_state, Running);
+		if (thi->task)
+			wake_up_process(thi->task);
+		else
+			ERR("ASSERT FAILED thi->task is NULL where it should be set!?\n");
+		break;
+	case Exiting:
+		thi->t_state = Restarting;
+		INFO("Restarting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
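+		/* fall through */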
+	case Running:
+	case Restarting:
+	default:
 		spin_unlock(&thi->t_lock);
+		break;
 	}
 
 	return TRUE;
@@ -1289,6 +1475,10 @@ void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait)
 {
 	struct drbd_conf *mdev = thi->mdev;
 	enum Drbd_thread_state ns = restart ? Restarting : Exiting;
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
 
 	spin_lock(&thi->t_lock);
 
@@ -1311,9 +1501,8 @@ void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait)
 
 		thi->t_state = ns;
 		smp_mb();
+		init_completion(&thi->startstop);
 		if (thi->task != current) {
-			if (wait)
-				init_completion(&thi->startstop);
 			force_sig(DRBD_SIGKILL, thi->task);
 		} else
 			D_ASSERT(!wait);
@@ -1325,7 +1514,9 @@ void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait)
 		wait_for_completion(&thi->startstop);
 		spin_lock(&thi->t_lock);
 		D_ASSERT(thi->task == NULL);
-		D_ASSERT(thi->t_state == None);
+		if (thi->t_state != None)
+			ERR("ASSERT FAILED: %s t_state == %d expected %d.\n",
+					me, thi->t_state, None);
 		spin_unlock(&thi->t_lock);
 	}
 }
@@ -1357,16 +1548,29 @@ cpumask_t drbd_calc_cpu_mask(struct drbd_conf *mdev)
 	return (cpumask_t) CPU_MASK_ALL; /* Never reached. */
 }
 
-void drbd_thread_set_cpu(struct Drbd_thread *thi, cpumask_t cpu_mask)
-{
-	struct task_struct *p;
-
-	mutex_lock(&thi->task_mutex);
-	p = thi->task;
-	if (p) set_cpus_allowed(p, cpu_mask);
-	mutex_unlock(&thi->task_mutex);
+/* modifies the cpu mask of the _current_ thread,
+ * call in the "main loop" of _all_ threads.
+ * no need for any mutex, current won't die prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+{
+	struct task_struct *p = current;
+	struct Drbd_thread *thi =
+		p == mdev->asender.task  ? &mdev->asender  :
+		p == mdev->receiver.task ? &mdev->receiver :
+		p == mdev->worker.task   ? &mdev->worker   :
+		NULL;
+	ERR_IF(thi == NULL)
+		return;
+	if (!thi->reset_cpu_mask)
+		return;
+	thi->reset_cpu_mask = 0;
+	/* preempt_disable();
+	   That was a kernel that warned about a call to smp_processor_id() while preempt
+	   was not disabled. It seems that this was fixed in mainline. */
+	set_cpus_allowed(p, mdev->cpu_mask);
+	/* preempt_enable(); */
 }
-
 #endif
 
 /* the appropriate socket mutex must be held already */
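
drbd_thread_current_set_cpu() gets away without the old task_mutex because each thread only ever adjusts its own affinity from inside its own main loop; current cannot exit underneath itself. The userspace analogue, assuming Linux with glibc:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>

	static void pin_self_to_cpu(int cpu)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		/* only the calling thread's affinity is touched: no locking */
		pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
	}
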
@@ -1386,7 +1590,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 	dump_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__);
 	sent = drbd_send(mdev, sock, h, size, msg_flags);
 
-	ok = ( sent == size );
+	ok = (sent == size);
 	if (!ok)
 		ERR("short sent %s size=%d sent=%d\n",
 		    cmdname(cmd), (int)size, sent);
@@ -1437,10 +1641,10 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum Drbd_Packet_Cmd cmd, char *data,
 
 	dump_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__);
 
-	ok = ( sizeof(h) ==
-		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0) );
-	ok = ok && ( size ==
-		drbd_send(mdev, mdev->data.socket, data, size, 0) );
+	ok = (sizeof(h) ==
+		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
+	ok = ok && (size ==
+		drbd_send(mdev, mdev->data.socket, data, size, 0));
 
 	drbd_put_data_sock(mdev);
 
@@ -1449,26 +1653,43 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum Drbd_Packet_Cmd cmd, char *data,
 
 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
 {
-	struct Drbd_SyncParam_Packet *p;
+	struct Drbd_SyncParam89_Packet *p;
+	struct socket *sock;
 	int size, rv;
+	const int apv = mdev->agreed_pro_version;
 
-	size = sizeof(struct Drbd_SyncParam_Packet);
+	size = apv <= 87 ? sizeof(struct Drbd_SyncParam_Packet)
+	     : apv == 88 ? sizeof(struct Drbd_SyncParam_Packet)
+	                   + strlen(mdev->sync_conf.verify_alg) + 1
+	     : /* 89 */    sizeof(struct Drbd_SyncParam89_Packet);
 
-	if (mdev->agreed_pro_version >= 88)
-		size += strlen(mdev->sync_conf.verify_alg) + 1;
+	/* used from admin command context and receiver/worker context.
+	 * to avoid kmalloc, grab the socket right here,
+	 * then use the pre-allocated sbuf there */
+	down(&mdev->data.mutex);
+	sock = mdev->data.socket;
 
-	p = kmalloc(size, GFP_KERNEL);
-	if (p == NULL)
-		return 0;
+	if (likely(sock != NULL)) {
+		enum Drbd_Packet_Cmd cmd = apv >= 89 ? SyncParam89 : SyncParam;
 
-	p->rate      = cpu_to_be32(sc->rate);
+		p = &mdev->data.sbuf.SyncParam89;
 
-	if (mdev->agreed_pro_version >= 88)
-		strcpy(p->online_verify_alg,mdev->sync_conf.verify_alg);
+		/* initialize verify_alg and csums_alg */
+		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+		p->rate = cpu_to_be32(sc->rate);
+
+		if (apv >= 88)
+			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
+		if (apv >= 89)
+			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
+
+		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
+	} else
+		rv = 0; /* not ok */
+
+	up(&mdev->data.mutex);
 
-	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, SyncParam,
-			   (struct Drbd_Header *)p, size);
-	kfree(p);
 	return rv;
 }
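
drbd_send_sync_param() now sizes its packet by the agreed protocol version: up to 87 the short struct, exactly 88 the short struct plus one NUL-terminated algorithm name, 89 and later the fixed 89 layout. The size computation alone, with illustrative struct layouts (not the real wire format):

	#include <stddef.h>
	#include <string.h>

	struct sp_old { unsigned rate; };                 /* apv <= 87 */
	struct sp_89  { unsigned rate;
			char verify_alg[64], csums_alg[64]; };    /* apv >= 89 */

	static size_t sync_param_size(int apv, const char *verify_alg)
	{
		if (apv <= 87)
			return sizeof(struct sp_old);
		if (apv == 88)	/* v88 appends just the NUL-terminated name */
			return sizeof(struct sp_old) + strlen(verify_alg) + 1;
		return sizeof(struct sp_89);
	}
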
 
@@ -1525,7 +1746,7 @@ int drbd_send_uuids(struct drbd_conf *mdev)
 	dec_local(mdev);
 
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, ReportUUIDs,
-			     (struct Drbd_Header*)&p, sizeof(p));
+			     (struct Drbd_Header *)&p, sizeof(p));
 }
 
 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
@@ -1593,7 +1814,7 @@ int drbd_send_state(struct drbd_conf *mdev)
 
 	if (likely(sock != NULL)) {
 		ok = _drbd_send_cmd(mdev, sock, ReportState,
-				    (struct Drbd_Header*)&p, sizeof(p), 0);
+				    (struct Drbd_Header *)&p, sizeof(p), 0);
 	}
 
 	up(&mdev->data.mutex);
@@ -1637,12 +1858,18 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
 
 	ERR_IF(!mdev->bitmap) return FALSE;
 
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	p = (struct Drbd_Header *) __get_free_page(GFP_NOIO);
+	if (!p) {
+		ERR("failed to allocate one page buffer in %s\n", __func__);
+		return FALSE;
+	}
 	bm_words = drbd_bm_words(mdev);
-	p  = vmalloc(PAGE_SIZE); /* sleeps. cannot fail. */
 	buffer = (unsigned long *)p->payload;
 
 	if (inc_local(mdev)) {
-		if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) {
+		if (drbd_md_test_flag(mdev->bc, MDF_FullSync)) {
 			INFO("Writing the whole bitmap, MDF_FullSync was set.\n");
 			drbd_bm_set_all(mdev);
 			if (drbd_bm_write(mdev)) {
@@ -1650,9 +1877,8 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
 				 * but otherwise process as per normal - need to tell other
 				 * side that a full resync is required! */
 				ERR("Failed to write bitmap to disk!\n");
-			}
-			else {
-				drbd_md_clear_flag(mdev,MDF_FullSync);
+			} else {
+				drbd_md_clear_flag(mdev, MDF_FullSync);
 				drbd_md_sync(mdev);
 			}
 		}
@@ -1664,7 +1890,7 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
 	 * some such algorithms in the kernel anyways.
 	 */
 	do {
-		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i );
+		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words - bm_i);
 		want = num_words * sizeof(long);
 		if (want)
 			drbd_bm_get_lel(mdev, bm_i, num_words, buffer);
@@ -1673,7 +1899,7 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
 		bm_i += num_words;
 	} while (ok && want);
 
-	vfree(p);
+	free_page((unsigned long) p);
 	return ok;
 }
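
Replacing vmalloc() with a single page works because the bitmap was already streamed in page-bounded chunks; no allocation ever needs to exceed PAGE_SIZE. The chunking loop as a standalone program (a small constant stands in for BM_PACKET_WORDS):

	#include <stdio.h>

	#define WORDS_PER_PACKET 4	/* stands in for BM_PACKET_WORDS */

	static void send_all(size_t bm_words)
	{
		size_t i = 0, n;

		do {	/* like the driver, a final empty packet ends the stream */
			n = bm_words - i < WORDS_PER_PACKET
			  ? bm_words - i : WORDS_PER_PACKET;
			printf("packet with %zu words\n", n);
			i += n;
		} while (n);
	}

	int main(void)
	{
		send_all(10);	/* 4 + 4 + 2 + terminating 0 */
		return 0;
	}
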
 
@@ -1696,6 +1922,8 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
 	p.barrier  = barrier_nr;
 	p.set_size = cpu_to_be32(set_size);
 
+	if (mdev->state.conn < Connected)
+		return FALSE;
 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, BarrierAck,
 			(struct Drbd_Header *)&p, sizeof(p));
 	return ok;
@@ -1752,18 +1980,17 @@ int drbd_send_ack(struct drbd_conf *mdev,
 			      e->block_id);
 }
 
+/* This function misuses the block_id field to signal if the blocks
+ * are in sync or not. */
 int drbd_send_ack_ex(struct drbd_conf *mdev, enum Drbd_Packet_Cmd cmd,
 		     sector_t sector, int blksize, u64 block_id)
 {
-   /* This function misuses the block_id field to signal if the blocks
-      are is sync or not. */
-	return _drbd_send_ack(mdev,cmd,
+	return _drbd_send_ack(mdev, cmd,
 			      cpu_to_be64(sector),
 			      cpu_to_be32(blksize),
 			      cpu_to_be64(block_id));
 }
 
-
 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
 		       sector_t sector, int size, u64 block_id)
 {
@@ -1781,7 +2008,6 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
 	return ok;
 }
 
-
 int drbd_send_drequest_csum(struct drbd_conf *mdev,
 			    sector_t sector,int size,
 			    void *digest, int digest_size,
@@ -1796,12 +2022,12 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
 
 	p.head.magic   = BE_DRBD_MAGIC;
 	p.head.command = cpu_to_be16(cmd);
-	p.head.length  = cpu_to_be16( sizeof(p)-sizeof(struct Drbd_Header) + digest_size );
+	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct Drbd_Header) + digest_size);
 
 	down(&mdev->data.mutex);
 
-	ok = ( sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),0) );
-	ok = ok&& ( digest_size == drbd_send(mdev,mdev->data.socket,digest,digest_size,0) );
+	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
+	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
 
 	up(&mdev->data.mutex);
 
@@ -1822,7 +2048,6 @@ int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size)
 	return ok;
 }
 
-
 /* called on sndtimeo
  * returns FALSE if we should retry,
  * TRUE if we think connection is dead
@@ -1911,15 +2136,16 @@ int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
 	 * doh. it triggered. so XFS _IS_ really kaputt ...
 	 * oh well...
 	 */
-	if ( (page_count(page) < 1) || PageSlab(page) ) {
+	if ((page_count(page) < 1) || PageSlab(page)) {
 		/* e.g. XFS meta- & log-data is in slab pages, which have a
 		 * page_count of 0 and/or have PageSlab() set...
 		 */
 #ifdef SHOW_SENDPAGE_USAGE
 		++fallback;
 #endif
-		sent =  _drbd_no_send_page(mdev, page, offset, size);
-		if (likely(sent > 0)) len -= sent;
+		sent = _drbd_no_send_page(mdev, page, offset, size);
+		if (likely(sent > 0))
+			len -= sent;
 		goto out;
 	}
 
@@ -1936,7 +2162,7 @@ int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
 				continue;
 		}
 		if (sent <= 0) {
-			DRBD_WARN("%s: size=%d len=%d sent=%d\n",
+			drbd_WARN("%s: size=%d len=%d sent=%d\n",
 			     __func__, (int)size, len, sent);
 			break;
 		}
@@ -1997,13 +2223,13 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 
 	p.head.magic   = BE_DRBD_MAGIC;
 	p.head.command = cpu_to_be16(Data);
-	p.head.length  = cpu_to_be16( sizeof(p)
-			-sizeof(struct Drbd_Header)+dgs+req->size);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct Drbd_Header) + dgs + req->size);
 
 	p.sector   = cpu_to_be64(req->sector);
 	p.block_id = (unsigned long)req;
-	p.seq_num  = cpu_to_be32( req->seq_num =
-				  atomic_add_return(1, &mdev->packet_seq) );
+	p.seq_num  = cpu_to_be32(req->seq_num =
+				 atomic_add_return(1, &mdev->packet_seq));
 	dp_flags = 0;
 
 	/* NOTE: no need to check if barriers supported here as we would
@@ -2055,8 +2281,8 @@ int drbd_send_block(struct drbd_conf *mdev, enum Drbd_Packet_Cmd cmd,
 
 	p.head.magic   = BE_DRBD_MAGIC;
 	p.head.command = cpu_to_be16(cmd);
-	p.head.length  = cpu_to_be16( sizeof(p)
-			-sizeof(struct Drbd_Header) + dgs + e->size);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct Drbd_Header) + dgs + e->size);
 
 	p.sector   = cpu_to_be64(e->sector);
 	p.block_id = e->block_id;
@@ -2064,7 +2290,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum Drbd_Packet_Cmd cmd,
 
 	/* Only called by our kernel thread.
 	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
-	 * in response to ioctl or module unload.
+	 * in response to an admin command or module unload.
 	 */
 	if (!drbd_get_data_sock(mdev))
 		return 0;
@@ -2148,7 +2374,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
  * otherwise wake_asender() might interrupt some send_*Ack !
  */
 #if !HAVE_KERNEL_SENDMSG
-		rv = sock_sendmsg(sock, &msg, iov.iov_len );
+		rv = sock_sendmsg(sock, &msg, iov.iov_len);
 #else
 		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
 #endif
@@ -2198,22 +2424,27 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
 	return sent;
 }
 
-int drbd_open(struct inode *inode, struct file *file)
+#ifdef BD_OPS_USE_FMODE
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+#else
+static int drbd_open(struct inode *inode, struct file *file)
+#endif
 {
-	struct drbd_conf *mdev;
+#ifdef BD_OPS_USE_FMODE
+	struct drbd_conf *mdev = bdev->bd_disk->private_data;
+#else
+	int mode = file->f_mode;
+	struct drbd_conf *mdev = inode->i_bdev->bd_disk->private_data;
+#endif
 	unsigned long flags;
 	int rv = 0;
 
-	mdev = minor_to_mdev(MINOR(inode->i_rdev));
-	if (!mdev)
-		return -ENODEV;
-
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	/* to have a stable mdev->state.role
 	 * and no race with updating open_cnt */
 
 	if (mdev->state.role != Primary) {
-		if (file->f_mode & FMODE_WRITE)
+		if (mode & FMODE_WRITE)
 			rv = -EROFS;
 		else if (!allow_oos)
 			rv = -EMEDIUMTYPE;
@@ -2226,25 +2457,21 @@ int drbd_open(struct inode *inode, struct file *file)
 	return rv;
 }
 
-int drbd_close(struct inode *inode, struct file *file)
+#ifdef BD_OPS_USE_FMODE
+static int drbd_release(struct gendisk *gd, fmode_t mode)
 {
-	/* do not use *file (May be NULL, in case of a unmount :-) */
-	struct drbd_conf *mdev;
-
-	mdev = minor_to_mdev(MINOR(inode->i_rdev));
-	if (!mdev)
-		return -ENODEV;
-
-	/*
-	printk(KERN_ERR "drbd: close(inode=%p,file=%p)"
-	       "current=%p,minor=%d,wc=%d\n", inode, file, current, minor,
-	       inode->i_writecount);
-	*/
-
+	struct drbd_conf *mdev = gd->private_data;
 	mdev->open_cnt--;
-
 	return 0;
 }
+#else
+static int drbd_release(struct inode *inode, struct file *file)
+{
+	struct drbd_conf *mdev = inode->i_bdev->bd_disk->private_data;
+	mdev->open_cnt--;
+	return 0;
+}
+#endif
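
The open/release rework uses the common compat recipe for the fmode_t conversion of block_device_operations: one function body, two prototypes, selected by a macro (BD_OPS_USE_FMODE) probed at build time. A skeleton with a hypothetical device type and helper:

	#ifdef BD_OPS_USE_FMODE
	static int my_open(struct block_device *bdev, fmode_t mode)
	{
		struct my_dev *dev = bdev->bd_disk->private_data;
	#else
	static int my_open(struct inode *inode, struct file *file)
	{
		int mode = file->f_mode;
		struct my_dev *dev = inode->i_bdev->bd_disk->private_data;
	#endif
		/* shared body from here on */
		return my_dev_do_open(dev, mode & FMODE_WRITE);
	}
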
 
 STATIC void drbd_unplug_fn(struct request_queue *q)
 {
@@ -2286,9 +2513,17 @@ STATIC void drbd_set_defaults(struct drbd_conf *mdev)
 	mdev->sync_conf.rate       = DRBD_RATE_DEF;
 	mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
 	mdev->state = (union drbd_state_t) {
-		{ Secondary, Unknown, StandAlone, Diskless, DUnknown, 0 } };
+		{ .role = Secondary,
+		  .peer = Unknown,
+		  .conn = StandAlone,
+		  .disk = Diskless,
+		  .pdsk = DUnknown,
+		  .susp = 0
+		} };
 }
 
+int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+
 void drbd_init_set_defaults(struct drbd_conf *mdev)
 {
 	/* the memset(,0,) did most of this.
@@ -2327,6 +2562,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	spin_lock_init(&mdev->al_lock);
 	spin_lock_init(&mdev->req_lock);
 	spin_lock_init(&mdev->peer_seq_lock);
+	spin_lock_init(&mdev->epoch_lock);
 
 	INIT_LIST_HEAD(&mdev->active_ee);
 	INIT_LIST_HEAD(&mdev->sync_ee);
@@ -2339,9 +2575,11 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&mdev->resync_work.list);
 	INIT_LIST_HEAD(&mdev->unplug_work.list);
 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
+	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
 	mdev->resync_work.cb  = w_resync_inactive;
 	mdev->unplug_work.cb  = w_send_write_hint;
 	mdev->md_sync_work.cb = w_md_sync;
+	mdev->bm_io_work.w.cb = w_bitmap_io;
 	init_timer(&mdev->resync_timer);
 	init_timer(&mdev->md_sync_timer);
 	mdev->resync_timer.function = resync_timer_fn;
@@ -2360,7 +2598,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
 
 	mdev->agreed_pro_version = PRO_VERSION_MAX;
-
+	mdev->write_ordering = WO_bio_barrier;
 #ifdef __arch_um__
 	INFO("mdev = 0x%p\n", mdev);
 #endif
@@ -2409,15 +2647,24 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	 * oldest_barrier
 	 */
 
-	D_ASSERT(mdev->receiver.t_state == None);
+	if (mdev->receiver.t_state != None)
+		ERR("ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				mdev->receiver.t_state);
 
-	if (mdev->verify_tfm) {
-		crypto_free_hash(mdev->verify_tfm);
-		mdev->verify_tfm=NULL;
-	}
+	crypto_free_hash(mdev->csums_tfm);
+	mdev->csums_tfm = NULL;
+
+	crypto_free_hash(mdev->verify_tfm);
+	mdev->verify_tfm = NULL;
+
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = NULL;
+
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = NULL;
 	/* no need to lock it, I'm the only thread alive */
-	if (mdev->epoch_size !=  0)
-		ERR("epoch_size:%d\n", mdev->epoch_size);
+	if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
+		ERR("epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
 	mdev->al_writ_cnt  =
 	mdev->bm_writ_cnt  =
 	mdev->read_cnt     =
@@ -2510,12 +2757,12 @@ STATIC int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
-	drbd_request_mempool = mempool_create( number,
+	drbd_request_mempool = mempool_create(number,
 		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
 	if (drbd_request_mempool == NULL)
 		goto Enomem;
 
-	drbd_ee_mempool = mempool_create( number,
+	drbd_ee_mempool = mempool_create(number,
 		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
 	if (drbd_request_mempool == NULL)
 		goto Enomem;
@@ -2563,15 +2810,13 @@ STATIC void drbd_cleanup(void)
 	drbd_nl_cleanup();
 
 	if (minor_table) {
-		if (drbd_proc) {
+		if (drbd_proc)
 			remove_proc_entry("drbd", NULL);
-			drbd_proc = NULL;
-		}
 		i = minor_count;
 		while (i--) {
-			struct drbd_conf        *mdev  = minor_to_mdev(i);
+			struct drbd_conf *mdev = minor_to_mdev(i);
 			struct gendisk  **disk = &mdev->vdisk;
-			struct request_queue **q    = &mdev->rq_queue;
+			struct request_queue **q = &mdev->rq_queue;
 
 			if (!mdev)
 				continue;
@@ -2591,6 +2836,8 @@ STATIC void drbd_cleanup(void)
 				bdput(mdev->this_bdev);
 
 			tl_cleanup(mdev);
+			if (mdev->bitmap)
+				drbd_bm_cleanup(mdev);
 			if (mdev->resync)
 				lc_free(mdev->resync);
 
@@ -2647,6 +2894,8 @@ STATIC void drbd_cleanup(void)
 			kfree(mdev->int_dig_out);
 			kfree(mdev->int_dig_in);
 			kfree(mdev->int_dig_vv);
+
+			kfree(mdev->current_epoch);
 		}
 		drbd_destroy_mempools();
 	}
@@ -2684,7 +2933,7 @@ struct drbd_conf *drbd_new_device(int minor)
 		goto Enomem;
 	mdev->vdisk = disk;
 
-	set_disk_ro( disk, TRUE );
+	set_disk_ro(disk, TRUE);
 
 	disk->queue = q;
 	disk->major = DRBD_MAJOR;
@@ -2699,6 +2948,7 @@ struct drbd_conf *drbd_new_device(int minor)
 	mdev->this_bdev->bd_contains = mdev->this_bdev;
 
 	blk_queue_make_request(q, drbd_make_request_26);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
 	q->queue_lock = &mdev->req_lock; /* needed since we use */
 		/* plugging on a queue, that actually has no requests! */
@@ -2708,12 +2958,20 @@ struct drbd_conf *drbd_new_device(int minor)
 	if (!mdev->md_io_page)
 		goto Enomem;
 
-	if (!tl_init(mdev)) goto Enomem;
+	if (drbd_bm_init(mdev))
+		goto Enomem;
+	/* no need to lock access, we are still initializing the module. */
+	if (!tl_init(mdev))
+		goto Enomem;
 
 	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
 	if (!mdev->app_reads_hash)
 		goto Enomem;
 
+	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!mdev->current_epoch)
+		goto Enomem;
+	INIT_LIST_HEAD(&mdev->current_epoch->list);
+	mdev->epochs = 1;
+
 	return mdev;
 
  Enomem:
@@ -2721,6 +2979,7 @@ struct drbd_conf *drbd_new_device(int minor)
 		kfree(mdev->app_reads_hash);
 		if (mdev->md_io_page)
 			__free_page(mdev->md_io_page);
+		kfree(mdev->current_epoch);
 		kfree(mdev);
 	}
 	return NULL;
@@ -2735,7 +2994,6 @@ int __init drbd_init(void)
 	       THIS_MODULE, THIS_MODULE->module_core);
 #endif
 
-	/* FIXME should be a compile time assert */
 	if (sizeof(struct Drbd_HandShake_Packet) != 80) {
 		printk(KERN_ERR
 		       "drbd: never change the size or layout "
@@ -2774,6 +3032,7 @@ int __init drbd_init(void)
 
 	init_waitqueue_head(&drbd_pp_wait);
 
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
 	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
 				GFP_KERNEL);
 	if (!minor_table)
@@ -2787,13 +3046,14 @@ int __init drbd_init(void)
 	/*
 	 * register with procfs
 	 */
-	drbd_proc = proc_create("drbd",  S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+	drbd_proc = create_proc_entry("drbd",  S_IFREG | S_IRUGO , NULL);
 
 	if (!drbd_proc)	{
 		printk(KERN_ERR "drbd: unable to register proc file\n");
 		goto Enomem;
 	}
 
+	drbd_proc->proc_fops = &drbd_proc_fops;
 	drbd_proc->owner = THIS_MODULE;
 #else
 # error "Currently drbd depends on the proc file system (CONFIG_PROC_FS)"
@@ -2890,14 +3150,18 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	sector_t sector;
 	int i;
 
-	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return;
+	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+		return;
 	del_timer(&mdev->md_sync_timer);
 
 	/* We use here Failed and not Attaching because we try to write
 	 * metadata even if we detach due to a disk failure! */
-	if (!inc_local_if_state(mdev, Failed)) return;
+	if (!inc_local_if_state(mdev, Failed))
+		return;
 
-	INFO("Writing meta data super block now.\n");
+	MTRACE(TraceTypeMDIO, TraceLvlSummary,
+	       INFO("Writing meta data super block now.\n");
+	       );
 
 	down(&mdev->md_io_mutex);
 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
@@ -2950,7 +3214,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	struct meta_data_on_disk *buffer;
 	int i, rv = NoError;
 
-	if (!inc_local_if_state(mdev, Attaching)) return MDIOError;
+	if (!inc_local_if_state(mdev, Attaching))
+		return MDIOError;
 
 	down(&mdev->md_io_mutex);
 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
@@ -3023,7 +3288,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 void drbd_md_mark_dirty(struct drbd_conf *mdev)
 {
 	set_bit(MD_DIRTY, &mdev->flags);
-	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ );
+	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
 }
 
 
@@ -3031,7 +3296,7 @@ STATIC void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
 {
 	int i;
 
-	for ( i = History_start ; i < History_end ; i++ ) {
+	for (i = History_start; i < History_end; i++) {
 		mdev->bc->md.uuid[i+1] = mdev->bc->md.uuid[i];
 
 		MTRACE(TraceTypeUuid, TraceLvlAll,
@@ -3089,7 +3354,8 @@ void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
 	   working substitute, to avoid repetitive generating
 	   of new current UUIDs in case we loose connection
 	   and reconnect in a loop. */
-	if (mdev->bc->md.flags & MDF_FullSync) return;
+	if (mdev->bc->md.flags & MDF_FullSync)
+		return;
 	INFO("Creating new current UUID [no BitMap]\n");
 	get_random_bytes(&uuid, sizeof(u64));
 	drbd_uuid_set(mdev, Current, uuid);
@@ -3132,7 +3398,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
 			);
 	} else {
 		if (mdev->bc->md.uuid[Bitmap])
-			DRBD_WARN("bm UUID already set");
+			drbd_WARN("bm UUID already set");
 
 		mdev->bc->md.uuid[Bitmap] = val;
 		mdev->bc->md.uuid[Bitmap] &= ~((u64)1);
@@ -3171,21 +3437,43 @@ int drbd_bmio_set_n_write(struct drbd_conf *mdev)
 	return rv;
 }
 
-STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+/**
+ * drbd_bmio_clear_n_write:
+ * Is an io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() that clears
+ * all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (inc_local_if_state(mdev, Attaching)) {
+		drbd_bm_clear_all(mdev);
+		rv = drbd_bm_write(mdev);
+		dec_local(mdev);
+	}
+
+	return rv;
+}
+
+int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	struct bm_io_work *work = (struct bm_io_work *)w;
 	int rv;
 
-	D_ASSERT(atomic_read(&mdev->ap_bio_cnt)==0);
+	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
 
-	drbd_bm_lock(mdev);
+	drbd_bm_lock(mdev, work->why);
 	rv = work->io_fn(mdev);
 	drbd_bm_unlock(mdev);
 
 	clear_bit(BITMAP_IO, &mdev->flags);
 	wake_up(&mdev->misc_wait);
 
-        if (work->done) work->done(mdev, rv);
+	if (work->done)
+		work->done(mdev, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+	work->why = NULL;
 
 	return 1;
 }
@@ -3194,54 +3482,57 @@ STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
  * drbd_queue_bitmap_io:
  * Queues an IO operation on the whole bitmap.
  * While IO on the bitmap happens we freeze application IO, thus we ensure
- * that drbd_set_out_of_sync() can not be called. This function might be
- * called from the worker thread and other contexts.
+ * that drbd_set_out_of_sync() can not be called.
+ * This function MUST ONLY be called from worker context.
+ * BAD API ALERT!
+ * It MUST NOT be used while a previous such work is still pending!
  */
 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 			  int (*io_fn)(struct drbd_conf *),
-			  void (*done)(struct drbd_conf *, int))
+			  void (*done)(struct drbd_conf *, int),
+			  char *why)
 {
-	unsigned long flags;
+	D_ASSERT(current == mdev->worker.task);
 
+	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
 	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
+	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
+	if (mdev->bm_io_work.why)
+		ERR("FIXME going to queue '%s' but '%s' still pending?\n",
+			why, mdev->bm_io_work.why);
 
-	mdev->bm_io_work.w.cb = w_bitmap_io;
 	mdev->bm_io_work.io_fn = io_fn;
 	mdev->bm_io_work.done = done;
+	mdev->bm_io_work.why = why;
 
-	spin_lock_irqsave(&mdev->req_lock, flags);
-	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
 	set_bit(BITMAP_IO, &mdev->flags);
 	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
-		set_bit(BITMAP_IO_QUEUED, &mdev->flags);
-		drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+		if (list_empty(&mdev->bm_io_work.w.list)) {
+			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+		} else
+			ERR("FIXME avoided double queuing bm_io_work\n");
 	}
-	spin_unlock_irqrestore(&mdev->req_lock, flags);
 }
 
 /**
  * drbd_bitmap_io:
  * Does an IO operation on the bitmap, freezing application IO while that
- * IO operations runs. This functions might not be called from the context
- * of the worker thread.
+ * IO operation runs. This function MUST NOT be called from worker context.
  */
-int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *))
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
 {
 	int rv;
 
 	D_ASSERT(current != mdev->worker.task);
-	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
 
-	set_bit(BITMAP_IO_QUEUED, &mdev->flags);
-	set_bit(BITMAP_IO, &mdev->flags);
-	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+	drbd_suspend_io(mdev);
 
-	drbd_bm_lock(mdev);
+	drbd_bm_lock(mdev, why);
 	rv = io_fn(mdev);
 	drbd_bm_unlock(mdev);
 
-	clear_bit(BITMAP_IO, &mdev->flags);
-	wake_up(&mdev->misc_wait);
+	drbd_resume_io(mdev);
 
 	return rv;
 }
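
Between them the two entry points split by caller context: the worker queues work (and must not still have a previous request pending), every other context calls synchronously behind drbd_suspend_io()/drbd_resume_io(). A usage sketch, assuming a valid mdev and an io_fn from this file:

	if (current == mdev->worker.task)
		/* worker context: queue, completion runs via work->done */
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
				     &abw_start_sync, "example: forced full sync");
	else
		/* any other context: synchronous, application IO suspended */
		drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
			       "example: forced full sync");
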
@@ -3249,7 +3540,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *))
 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
 {
 	MUST_HOLD(mdev->req_lock);
-	if ( (mdev->bc->md.flags & flag) != flag) {
+	if ((mdev->bc->md.flags & flag) != flag) {
 		drbd_md_mark_dirty(mdev);
 		mdev->bc->md.flags |= flag;
 	}
@@ -3258,26 +3549,26 @@ void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
 {
 	MUST_HOLD(mdev->req_lock);
-	if ( (mdev->bc->md.flags & flag) != 0 ) {
+	if ((mdev->bc->md.flags & flag) != 0) {
 		drbd_md_mark_dirty(mdev);
 		mdev->bc->md.flags &= ~flag;
 	}
 }
 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
 {
-	return ((bdev->md.flags & flag) != 0);
+	return (bdev->md.flags & flag) != 0;
 }
 
-void md_sync_timer_fn(unsigned long data)
+STATIC void md_sync_timer_fn(unsigned long data)
 {
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 
 	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
 }
 
-int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
-	DRBD_WARN("md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_WARN("md_sync_timer expired! Worker calls drbd_md_sync().\n");
 	drbd_md_sync(mdev);
 
 	return 1;
@@ -3325,7 +3616,7 @@ _drbd_fault_str(unsigned int type) {
 		"Data read ahead",
 	};
 
-	return (type < DRBD_FAULT_MAX)? _faults[type] : "**Unknown**";
+	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
 }
 
 unsigned int
@@ -3342,7 +3633,7 @@ _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
 		fault_count++;
 
 		if (printk_ratelimit())
-			DRBD_WARN("***Simulating %s failure\n",
+			drbd_WARN("***Simulating %s failure\n",
 				_drbd_fault_str(type));
 	}
 
@@ -3370,7 +3661,7 @@ STATIC char *_drbd_uuid_str(unsigned int idx)
 void drbd_print_uuid(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
 	INFO(" uuid[%s] now %016llX\n",
-		_drbd_uuid_str(idx), mdev->bc->md.uuid[idx]);
+	     _drbd_uuid_str(idx), (unsigned long long)mdev->bc->md.uuid[idx]);
 }
 
 
@@ -3546,24 +3837,23 @@ do { \
 	if (trace_level >= TraceLvlAll) { \
 		INFO("%s:%d: %s [%d] %s %s " fmt , \
 		     file, line, current->comm, current->pid, \
-		     sockname, recv?"<<<":">>>", \
-		     ## args ); \
-	} \
-	else { \
+		     sockname, recv ? "<<<" : ">>>" , \
+		     ## args); \
+	} else { \
 		INFO("%s %s " fmt, sockname, \
-		     recv?"<<<":">>>", \
-		     ## args ); \
+		     recv ? "<<<" : ">>>" , \
+		     ## args); \
 	} \
 } while (0)
 
 STATIC char *_dump_block_id(u64 block_id, char *buff)
 {
-    if (is_syncer_block_id(block_id))
-	strcpy(buff, "SyncerId");
-    else
-	sprintf(buff, "%llx", block_id);
+	if (is_syncer_block_id(block_id))
+		strcpy(buff, "SyncerId");
+	else
+		sprintf(buff, "%llx", (unsigned long long)block_id);
 
-    return buff;
+	return buff;
 }
 
 void
@@ -3631,14 +3921,21 @@ _dump_packet(struct drbd_conf *mdev, struct socket *sock,
 		INFOP("%s (barrier %u)\n", cmdname(cmd), p->Barrier.barrier);
 		break;
 
+	case SyncParam:
+	case SyncParam89:
+		INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n",
+			cmdname(cmd), be32_to_cpu(p->SyncParam89.rate),
+			p->SyncParam89.verify_alg, p->SyncParam89.csums_alg);
+		break;
+
 	case ReportUUIDs:
 		INFOP("%s Curr:%016llX, Bitmap:%016llX, "
 		      "HisSt:%016llX, HisEnd:%016llX\n",
 		      cmdname(cmd),
-		      be64_to_cpu(p->GenCnt.uuid[Current]),
-		      be64_to_cpu(p->GenCnt.uuid[Bitmap]),
-		      be64_to_cpu(p->GenCnt.uuid[History_start]),
-		      be64_to_cpu(p->GenCnt.uuid[History_end]));
+		      (unsigned long long)be64_to_cpu(p->GenCnt.uuid[Current]),
+		      (unsigned long long)be64_to_cpu(p->GenCnt.uuid[Bitmap]),
+		      (unsigned long long)be64_to_cpu(p->GenCnt.uuid[History_start]),
+		      (unsigned long long)be64_to_cpu(p->GenCnt.uuid[History_end]));
 		break;
 
 	case ReportSizes:
@@ -3687,7 +3984,7 @@ _dump_packet(struct drbd_conf *mdev, struct socket *sock,
 
 /* Debug routine to dump info about bio */
 
-void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete)
+void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int complete, struct drbd_request *r)
 {
 #ifdef CONFIG_LBD
 #define SECTOR_FORMAT "%Lx"
@@ -3698,6 +3995,7 @@ void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int com
 
 	unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT);
 	char *faddr = (char *)(lowaddr);
+	char rb[sizeof(void*)*2+6] = { 0, };
 	struct bio_vec *bvec;
 	int segno;
 
@@ -3706,19 +4004,23 @@ void _dump_bio(const char *pfx, struct drbd_conf *mdev, struct bio *bio, int com
 	const int biobarrier = (rw & (1<<BIO_RW_BARRIER));
 	const int biosync    = (rw & (1<<BIO_RW_SYNC));
 
-	INFO("%s %s:%s%s%s Bio:%p - %soffset " SECTOR_FORMAT ", size %x\n",
-	     complete? "<<<":">>>",
+	if (r)
+		sprintf(rb,"Req:%p ", r);
+
+	INFO("%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n",
+	     complete ? "<<<" : ">>>",
 	     pfx,
-	     biorw==WRITE?"Write":"Read",
-	     biobarrier?":B":"",
-	     biosync?":S":"",
+	     biorw == WRITE ? "Write" : "Read",
+	     biobarrier ? " : B" : "",
+	     biosync ? " : S" : "",
 	     bio,
-	     complete? (drbd_bio_uptodate(bio)? "Success, ":"Failed, ") : "",
+	     rb,
+	     complete ? (drbd_bio_uptodate(bio) ? "Success, " : "Failed, ") : "",
 	     bio->bi_sector << SECTOR_SHIFT,
 	     bio->bi_size);
 
 	if (trace_level >= TraceLvlMetrics &&
-	    ((biorw == WRITE) ^ complete) ) {
+	    ((biorw == WRITE) ^ complete)) {
 		printk(KERN_DEBUG "  ind     page   offset   length\n");
 		__bio_for_each_segment(bvec, bio, segno, 0) {
 			printk(KERN_DEBUG "  [%d] %p %8.8x %8.8x\n", segno,
diff --git a/ubuntu/drbd/drbd_nl.c b/ubuntu/drbd/drbd_nl.c
index e7fb0cd..335618b 100644
--- a/ubuntu/drbd/drbd_nl.c
+++ b/ubuntu/drbd/drbd_nl.c
@@ -52,9 +52,9 @@ STATIC int name ## _from_tags (struct drbd_conf *mdev, \
 	int tag; \
 	int dlen; \
 	\
-	while ( (tag = *tags++) != TT_END ) { \
+	while ((tag = *tags++) != TT_END) { \
 		dlen = *tags++; \
-		switch ( tag_number(tag) ) { \
+		switch (tag_number(tag)) { \
 		fields \
 		default: \
 			if (tag & T_MANDATORY) { \
@@ -80,12 +80,17 @@ STATIC int name ## _from_tags (struct drbd_conf *mdev, \
 		 break;
 #define NL_STRING(pn, pr, member, len) \
 	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
+		if (dlen > len) { \
+			ERR("arg too long: %s (%u wanted, max len: %u bytes)\n", \
+				#member, dlen, (unsigned int)len); \
+			return 0; \
+		} \
 		 arg->member ## _len = dlen; \
 		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
 		 break;
 #include "linux/drbd_nl.h"
 
-// Generate the struct to tag_list functions
+/* Generate the struct to tag_list functions */
 #define NL_PACKET(name, number, fields) \
 STATIC unsigned short* \
 name ## _to_tags (struct drbd_conf *mdev, \
@@ -159,7 +164,7 @@ STATIC void nl_trace_reply(void *data)
 	printk(KERN_INFO "drbd%d: "
 	       "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n",
 	       nlp->minor,
-	       nlp->packet_type == P_nl_after_last_packet?
+	       nlp->packet_type == P_nl_after_last_packet ?
 		   "Empty-Reply" : nl_packet_name(nlp->packet_type),
 	       nlp->packet_type,
 	       req->seq, req->ack, req->len);
@@ -169,6 +174,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 {
 	char mb[12];
 	char *argv[] = {usermode_helper, cmd, mb, NULL };
+	int ret;
 	static char *envp[] = { "HOME=/",
 				"TERM=linux",
 				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
@@ -176,10 +182,23 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 
 	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
 
-	INFO("helper command: %s %s\n", usermode_helper, cmd);
+	INFO("helper command: %s %s %s\n", usermode_helper, cmd, mb);
 
 	drbd_bcast_ev_helper(mdev, cmd);
-	return call_usermodehelper(usermode_helper, argv, envp, 1);
+	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	if (ret)
+		drbd_WARN("helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		INFO("helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
 }
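
The shifting above decodes a wait()-style status word: the helper's exit code sits in bits 8..15 of the return value, while a negative return is a kernel-side error. system() packs its status the same way, so the decoding can be exercised in userspace:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		int ret = system("exit 7");

		if (ret < 0)
			perror("system");
		else
			printf("raw=0x%x exit code=%u\n", ret, (ret >> 8) & 0xff);
		return 0;
	}
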
 
 enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
@@ -195,14 +214,14 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 		fp = mdev->bc->dc.fencing;
 		dec_local(mdev);
 	} else {
-		DRBD_WARN("Not outdating peer, I'm not even Consistent myself.\n");
+		drbd_WARN("Not fencing peer, I'm not even Consistent myself.\n");
 		return mdev->state.pdsk;
 	}
 
 	if (fp == Stonith)
-		_drbd_request_state(mdev, NS(susp,1), ChgWaitComplete);
+		_drbd_request_state(mdev, NS(susp, 1), ChgWaitComplete);
 
-	r = drbd_khelper(mdev, "outdate-peer");
+	r = drbd_khelper(mdev, "fence-peer");
 
 	switch ((r>>8) & 0xff) {
 	case 3: /* peer is inconsistent */
@@ -222,24 +241,24 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 		 * This is useful when an unconnected Secondary is asked to
 		 * become Primary, but finds the other peer being active. */
 		ex_to_string = "peer is active";
-		DRBD_WARN("Peer is primary, outdating myself.\n");
+		drbd_WARN("Peer is primary, outdating myself.\n");
 		nps = DUnknown;
 		_drbd_request_state(mdev, NS(disk, Outdated), ChgWaitComplete);
 		break;
 	case 7:
 		if (fp != Stonith)
-			ERR("outdate-peer() = 7 && fencing != Stonith !!!\n");
+			ERR("fence-peer() = 7 && fencing != Stonith !!!\n");
 		ex_to_string = "peer was stonithed";
 		nps = Outdated;
 		break;
 	default:
 		/* The script is broken ... */
 		nps = DUnknown;
-		ERR("outdate-peer helper broken, returned %d\n", (r>>8)&0xff);
+		ERR("fence-peer helper broken, returned %d\n", (r>>8)&0xff);
 		return nps;
 	}
 
-	INFO("outdate-peer helper returned %d (%s)\n",
+	INFO("fence-peer helper returned %d (%s)\n",
 			(r>>8) & 0xff, ex_to_string);
 	return nps;
 }
@@ -263,6 +282,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 	val.i  = 0; val.role  = new_role;
 
 	while (try++ < max_tries) {
+		DRBD_STATE_DEBUG_INIT_VAL(val);
 		r = _drbd_request_state(mdev, mask, val, ChgWaitComplete);
 
 		/* in case we first succeeded to outdate,
@@ -273,17 +293,17 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 			continue;
 		}
 
-		if( r == SS_NoUpToDateDisk && force &&
-		    ( mdev->state.disk == Inconsistent ||
-		      mdev->state.disk == Outdated ) ) {
+		if (r == SS_NoUpToDateDisk && force &&
+		    (mdev->state.disk == Inconsistent ||
+		     mdev->state.disk == Outdated)) {
 			mask.disk = disk_mask;
 			val.disk  = UpToDate;
 			forced = 1;
 			continue;
 		}
 
-		if ( r == SS_NoUpToDateDisk &&
-		    mdev->state.disk == Consistent ) {
+		if (r == SS_NoUpToDateDisk &&
+		    mdev->state.disk == Consistent) {
 			D_ASSERT(mdev->state.pdsk == DUnknown);
 			nps = drbd_try_outdate_peer(mdev);
 
@@ -304,7 +324,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 			nps = drbd_try_outdate_peer(mdev);
 
 			if (force && nps > Outdated) {
-				DRBD_WARN("Forced into split brain situation!\n");
+				drbd_WARN("Forced into split brain situation!\n");
 				nps = Outdated;
 			}
 
@@ -316,13 +336,14 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		if (r == SS_TwoPrimaries) {
 			/* Maybe the peer is detected as dead very soon...
 			   retry at most once more in this case. */
-			set_current_state(TASK_INTERRUPTIBLE);
+			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
 			if (try < max_tries)
-				try = max_tries -1;
+				try = max_tries - 1;
 			continue;
 		}
 		if (r < SS_Success) {
+			DRBD_STATE_DEBUG_INIT_VAL(val);
 			r = _drbd_request_state(mdev, mask, val,
 						ChgStateVerbose + ChgWaitComplete);
 			if (r < SS_Success)
@@ -332,7 +353,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 	}
 
 	if (forced)
-		DRBD_WARN("Forced to consider local data as UpToDate!\n");
+		drbd_WARN("Forced to consider local data as UpToDate!\n");
 
 	fsync_bdev(mdev->this_bdev);
 
@@ -346,7 +367,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 	 * */
 
 	if (new_role == Secondary) {
-		set_disk_ro(mdev->vdisk, TRUE );
+		set_disk_ro(mdev->vdisk, TRUE);
 		if (inc_local(mdev)) {
 			mdev->bc->md.uuid[Current] &= ~(u64)1;
 			dec_local(mdev);
@@ -356,9 +377,9 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 			mdev->net_conf->want_lose = 0;
 			dec_net(mdev);
 		}
-		set_disk_ro(mdev->vdisk, FALSE );
+		set_disk_ro(mdev->vdisk, FALSE);
 		if (inc_local(mdev)) {
-			if ( ((mdev->state.conn < Connected ||
+			if (((mdev->state.conn < Connected ||
 			       mdev->state.pdsk <= Failed)
 			      && mdev->bc->md.uuid[Bitmap] == 0) || forced)
 				drbd_uuid_new_current(mdev);
@@ -368,8 +389,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		}
 	}
 
-	if ((new_role == Secondary) && inc_local(mdev) )
-	{
+	if ((new_role == Secondary) && inc_local(mdev)) {
 		drbd_al_to_on_disk_bm(mdev);
 		dec_local(mdev);
 	}
@@ -463,7 +483,7 @@ char *ppsize(char *buf, unsigned long long size)
 	/* Needs 9 bytes at max. */
 	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 	int base = 0;
-	while (size >= 10000 ) {
+	while (size >= 10000) {
 		/* shift + round */
 		size = (size >> 10) + !!(size & (1<<9));
 		base++;
@@ -473,6 +493,34 @@ char *ppsize(char *buf, unsigned long long size)
 	return buf;
 }
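
A standalone replay (not from this patch) of the loop above, showing the
shift-plus-round behaviour for an input of 2097152 (2 GB expressed in KB):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 2097152;
		int base = 0;
		while (size >= 10000) {
			/* shift + round, exactly as in ppsize() */
			size = (size >> 10) + !!(size & (1 << 9));
			base++;
		}
		printf("%llu %c\n", size, "KMGTPE"[base]); /* "2048 M" */
		return 0;
	}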
 
+/* there is still a theoretical deadlock when called from receiver
+ * on an Inconsistent Primary:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  Primary Inconsistent, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  Primary Inconsistent, and SyncTarget:
+ *  peer may not initiate a resize.
+ */
+void drbd_suspend_io(struct drbd_conf *mdev)
+{
+	int in_flight;
+	set_bit(SUSPEND_IO, &mdev->flags);
+	in_flight = atomic_read(&mdev->ap_bio_cnt);
+	if (in_flight)
+		wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_conf *mdev)
+{
+	clear_bit(SUSPEND_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
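
The pair above is a standard quiesce pattern: publish a "suspended" flag so
new requests park themselves, then wait until the in-flight counter drains
to zero; resume clears the flag and wakes everyone on the same wait queue.
A minimal user-space analogue (hypothetical names, one condition variable
playing the role of misc_wait):

	#include <pthread.h>
	#include <stdatomic.h>

	static atomic_int in_flight;
	static atomic_bool suspended;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t misc_wait = PTHREAD_COND_INITIALIZER;

	/* submitters: wait for !suspended before bumping in_flight */
	void suspend_io(void)
	{
		atomic_store(&suspended, 1);	/* new IO must now park */
		pthread_mutex_lock(&lock);
		while (atomic_load(&in_flight) != 0)
			pthread_cond_wait(&misc_wait, &lock);
		pthread_mutex_unlock(&lock);
	}

	void complete_io(void)	/* called as each request finishes */
	{
		pthread_mutex_lock(&lock);
		if (atomic_fetch_sub(&in_flight, 1) == 1)
			pthread_cond_broadcast(&misc_wait);
		pthread_mutex_unlock(&lock);
	}

	void resume_io(void)
	{
		atomic_store(&suspended, 0);
		pthread_cond_broadcast(&misc_wait); /* wake parked submitters */
	}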
+
 /**
  * drbd_determin_dev_size:
  * Evaluates all constraints and sets our correct device size.
@@ -488,8 +536,20 @@ enum determin_dev_size_enum drbd_determin_dev_size(struct drbd_conf *mdev) __mus
 	char ppb[10];
 
 	int md_moved, la_size_changed;
-	enum determin_dev_size_enum rv=unchanged;
+	enum determin_dev_size_enum rv = unchanged;
 
+	/* race:
+	 * application request passes inc_ap_bio,
+	 * but then cannot get an AL-reference.
+	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	 *
+	 * to avoid that:
+	 * Suspend IO right here.
+	 * still lock the act_log to not trigger ASSERTs there.
+	 */
+	drbd_suspend_io(mdev);
+
+	/* no wait necessary anymore, actually we could assert that */
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
 	prev_first_sect = drbd_md_first_sector(mdev->bc);
@@ -501,8 +561,8 @@ enum determin_dev_size_enum drbd_determin_dev_size(struct drbd_conf *mdev) __mus
 
 	size = drbd_new_dev_size(mdev, mdev->bc);
 
-	if ( drbd_get_capacity(mdev->this_bdev) != size ||
-	    drbd_bm_capacity(mdev) != size ) {
+	if (drbd_get_capacity(mdev->this_bdev) != size ||
+	    drbd_bm_capacity(mdev) != size) {
 		int err;
 		err = drbd_bm_resize(mdev, size);
 		if (unlikely(err)) {
@@ -531,27 +591,29 @@ enum determin_dev_size_enum drbd_determin_dev_size(struct drbd_conf *mdev) __mus
 
 	la_size_changed = (la_size != mdev->bc->md.la_size_sect);
 
-	/* LGE: flexible device size!! is this the right thing to test? */
 	md_moved = prev_first_sect != drbd_md_first_sector(mdev->bc)
 		|| prev_size	   != mdev->bc->md.md_size_sect;
 
 	if (md_moved) {
-		DRBD_WARN("Moving meta-data.\n");
+		drbd_WARN("Moving meta-data.\n");
 		/* assert: (flexible) internal meta data */
 	}
 
 	if (la_size_changed || md_moved) {
 		drbd_al_shrink(mdev); /* All extents inactive. */
 		INFO("Writing the whole bitmap, size changed\n");
-		rv = drbd_bitmap_io(mdev, &drbd_bm_write);
+		rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed");
 		drbd_md_mark_dirty(mdev);
 	}
 
-	if (size > la_size) rv = grew;
-	if (size < la_size) rv = shrunk;
+	if (size > la_size)
+		rv = grew;
+	if (size < la_size)
+		rv = shrunk;
 out:
 	lc_unlock(mdev->act_log);
 	wake_up(&mdev->al_wait);
+	drbd_resume_io(mdev);
 
 	return rv;
 }
@@ -615,8 +677,8 @@ STATIC int drbd_check_al_size(struct drbd_conf *mdev)
 	ERR_IF(mdev->sync_conf.al_extents < 7)
 		mdev->sync_conf.al_extents = 127;
 
-	if ( mdev->act_log &&
-	     mdev->act_log->nr_elements == mdev->sync_conf.al_extents )
+	if (mdev->act_log &&
+	    mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
 		return 0;
 
 	in_use = 0;
@@ -705,7 +767,7 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
 	       );
 
 	if (b->merge_bvec_fn)
-		DRBD_WARN("Backing device's merge_bvec_fn() = %p\n",
+		drbd_WARN("Backing device's merge_bvec_fn() = %p\n",
 		     b->merge_bvec_fn);
 	INFO("max_segment_size ( = BIO size ) = %u\n", q->max_segment_size);
 
@@ -723,6 +785,9 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 			     struct drbd_nl_cfg_reply *reply)
 {
 	enum ret_codes retcode;
+	enum determin_dev_size_enum dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
 	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
 	struct inode *inode, *inode2;
 	struct lru_cache *resync_lru = NULL;
@@ -735,20 +800,20 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		goto fail;
 	}
 
-       /*
-        * We may have gotten here very quickly from a detach. Wait for a bit
-        * then fail.
-        */
+	/*
+	 * We may have gotten here very quickly from a detach. Wait for a bit
+	 * then fail.
+	 */
 	while (1) {
 		__no_warn(local, nbc = mdev->bc; );
 		if (nbc == NULL)
 			break;
 		if (ntries++ >= 5) {
-			DRBD_WARN("drbd_nl_disk_conf: mdev->bc not NULL.\n");
+			drbd_WARN("drbd_nl_disk_conf: mdev->bc not NULL.\n");
 			retcode = HaveDiskConfig;
 			goto fail;
 		}
-		set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ/10);
 	}
 
@@ -758,7 +823,9 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		goto fail;
 	}
 
-	if ( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_local(mdev) ) {
+	memset(&nbc->md, 0, sizeof(struct drbd_md));
+
+	if (!(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_local(mdev)) {
 		memcpy(&nbc->dc, &mdev->bc->dc, sizeof(struct disk_conf));
 		dec_local(mdev);
 	} else {
@@ -842,42 +909,70 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (bd_claim(nbc->md_bdev,
 		     (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
 		      nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) ?
-		     (void *)mdev : (void *) drbd_m_holder )) {
+		     (void *)mdev : (void *) drbd_m_holder)) {
 		retcode = MDMounted;
 		goto release_bdev_fail;
 	}
 
-	if ( (nbc->backing_bdev == nbc->md_bdev) !=
-	     (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
-	      nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) ) {
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
 		retcode = LDMDInvalid;
 		goto release_bdev2_fail;
 	}
 
-	if ((drbd_get_capacity(nbc->backing_bdev)) < nbc->dc.disk_size) {
+	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+		ERR("max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) nbc->dc.disk_size);
 		retcode = LDDeviceTooSmall;
 		goto release_bdev2_fail;
 	}
 
-/* TODO check whether backing device size is within plausible limits */
+	if (nbc->dc.meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors)
+		drbd_WARN("truncating very big lower level device "
+		     "to currently maximum possible %llu sectors\n",
+		     (unsigned long long) max_possible_sectors);
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = MDDeviceTooSmall;
+		drbd_WARN("refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto release_bdev2_fail;
+	}
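
A standalone arithmetic check (not from this patch) of the "at least one
MB" floor used above for flexible meta data:

	#include <assert.h>

	int main(void)
	{
		unsigned long min_sectors = 2 << 10;	/* as in the patch */
		/* sectors are 512 bytes: 2048 * 512 B == 1 MB exactly */
		assert(min_sectors * 512 == 1024 * 1024);
		return 0;
	}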
 
 	/* Make sure the new disk is big enough
 	 * (we may currently be Primary with no local disk...) */
-	if (drbd_get_capacity(nbc->backing_bdev) <
-	    drbd_get_capacity(mdev->this_bdev) ) {
+	if (drbd_get_max_capacity(nbc) <
+	    drbd_get_capacity(mdev->this_bdev)) {
 		retcode = LDDeviceTooSmall;
 		goto release_bdev2_fail;
 	}
 
 	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
 
+	drbd_suspend_io(mdev);
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
 	retcode = _drbd_request_state(mdev, NS(disk, Attaching), ChgStateVerbose);
-	if (retcode < SS_Success )
+	drbd_resume_io(mdev);
+	if (retcode < SS_Success)
 		goto release_bdev2_fail;
 
-	if (!inc_local_if_state(mdev, Attaching)) {
+	if (!inc_local_if_state(mdev, Attaching))
 		goto force_diskless;
-	}
 
 	drbd_thread_start(&mdev->worker);
 	drbd_md_set_sector_offsets(mdev, nbc);
@@ -904,6 +999,7 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	/* Prevent shrinking of consistent devices ! */
 	if (drbd_md_test_flag(nbc, MDF_Consistent) &&
 	   drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
+		drbd_WARN("refusing to truncate a consistent device\n");
 		retcode = LDDeviceTooSmall;
 		goto force_diskless_dec;
 	}
@@ -915,11 +1011,6 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
 	/* Reset the "barriers don't work" bits here, then force meta data to
 	 * be written, to ensure we determine if barriers are supported. */
-	if (nbc->dc.no_disk_flush)
-		set_bit(LL_DEV_NO_FLUSH, &mdev->flags);
-	else
-		clear_bit(LL_DEV_NO_FLUSH, &mdev->flags);
-
 	if (nbc->dc.no_md_flush)
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 	else
@@ -935,6 +1026,9 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	nbc = NULL;
 	resync_lru = NULL;
 
+	mdev->write_ordering = WO_bio_barrier;
+	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+
 	if (drbd_md_test_flag(mdev->bc, MDF_PrimaryInd))
 		set_bit(CRASHED_PRIMARY, &mdev->flags);
 	else
@@ -967,25 +1061,27 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * degraded but active "cluster" after a certain timeout.
 	 */
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
-	if ( mdev->state.role != Primary &&
+	if (mdev->state.role != Primary &&
 	     drbd_md_test_flag(mdev->bc, MDF_PrimaryInd) &&
-	    !drbd_md_test_flag(mdev->bc, MDF_ConnectedInd) )
+	    !drbd_md_test_flag(mdev->bc, MDF_ConnectedInd))
 		set_bit(USE_DEGR_WFC_T, &mdev->flags);
 
-	if (drbd_determin_dev_size(mdev) == dev_size_error) {
+	dd = drbd_determin_dev_size(mdev);
+	if (dd == dev_size_error) {
 		retcode = VMallocFailed;
 		goto force_diskless_dec;
-	}
+	} else if (dd == grew)
+		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
 
 	if (drbd_md_test_flag(mdev->bc, MDF_FullSync)) {
 		INFO("Assuming that all blocks are out of sync "
 		     "(aka FullSync)\n");
-		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write)) {
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
 			retcode = MDIOError;
 			goto force_diskless_dec;
 		}
 	} else {
-		if (drbd_bitmap_io(mdev, &drbd_bm_read) < 0) {
+		if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
 			retcode = MDIOError;
 			goto force_diskless_dec;
 		}
@@ -1020,7 +1116,7 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		ns.pdsk = Outdated;
 
 	if ( ns.disk == Consistent &&
-	    ( ns.pdsk == Outdated || mdev->bc->dc.fencing == DontCare ) )
+	    (ns.pdsk == Outdated || mdev->bc->dc.fencing == DontCare))
 		ns.disk = UpToDate;
 
 	/* All tests on MDF_PrimaryInd, MDF_ConnectedInd,
@@ -1036,6 +1132,7 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		ns.disk = Negotiating;
 	}
 
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
 	rv = _drbd_set_state(mdev, ns, ChgStateVerbose, NULL);
 	ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
@@ -1043,8 +1140,10 @@ STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (rv < SS_Success)
 		goto force_diskless_dec;
 
-	if(mdev->state.role == Primary) mdev->bc->md.uuid[Current] |=  (u64)1;
-	else                            mdev->bc->md.uuid[Current] &= ~(u64)1;
+	if (mdev->state.role == Primary)
+		mdev->bc->md.uuid[Current] |=  (u64)1;
+	else
+		mdev->bc->md.uuid[Current] &= ~(u64)1;
 
 	drbd_md_mark_dirty(mdev);
 	drbd_md_sync(mdev);
@@ -1085,7 +1184,7 @@ STATIC int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	fsync_bdev(mdev->this_bdev);
 	reply->ret_code = drbd_request_state(mdev, NS(disk, Diskless));
 
-	set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_INTERRUPTIBLE);
 	schedule_timeout(HZ/20); /* 50ms; Time for worker to finally terminate */
 
 	return 0;
@@ -1121,7 +1220,7 @@ STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
-	if ( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_net(mdev)) {
+	if (!(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_net(mdev)) {
 		memcpy(new_conf, mdev->net_conf, sizeof(struct net_conf));
 		dec_net(mdev);
 	} else {
@@ -1169,13 +1268,13 @@ STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		odev = minor_to_mdev(i);
 		if (!odev || odev == mdev)
 			continue;
-		if ( inc_net(odev)) {
-			if ( M_ADDR(new_conf) == M_ADDR(odev->net_conf) &&
-			    M_PORT(new_conf) == M_PORT(odev->net_conf) )
+		if (inc_net(odev)) {
+			if (M_ADDR(new_conf) == M_ADDR(odev->net_conf) &&
+			    M_PORT(new_conf) == M_PORT(odev->net_conf))
 				retcode = LAAlreadyInUse;
 
 			if (O_ADDR(new_conf) == O_ADDR(odev->net_conf) &&
-			   O_PORT(new_conf) == O_PORT(odev->net_conf) )
+			    O_PORT(new_conf) == O_PORT(odev->net_conf))
 				retcode = OAAlreadyInUse;
 
 			dec_net(odev);
@@ -1215,7 +1314,7 @@ STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 		if (crypto_tfm_alg_type(crypto_hash_tfm(integrity_w_tfm)) != CRYPTO_ALG_TYPE_DIGEST) {
 			retcode=IntegrityAlgNotDigest;
-			goto fail;				
+			goto fail;
 		}
 
 		integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
@@ -1236,7 +1335,7 @@ STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	}
 
 	ns = new_conf->max_buffers/8;
-	if (new_conf->two_primaries && ( mdev->ee_hash_s != ns ) ) {
+	if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
 		new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
 		if (!new_ee_hash) {
 			retcode = KMallocFailed;
@@ -1376,14 +1475,13 @@ STATIC int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 			drbd_force_state(mdev, NS(conn, Disconnecting));
 			retcode = SS_Success;
 		}
-
 	}
 
 	if (retcode < SS_Success)
 		goto fail;
 
 	if (wait_event_interruptible(mdev->state_wait,
-				     mdev->state.conn != Disconnecting) ) {
+				     mdev->state.conn != Disconnecting)) {
 		/* Do not test for mdev->state.conn == StandAlone, since
 		   someone else might connect us in the mean time! */
 		retcode = GotSignal;
@@ -1406,12 +1504,12 @@ void resync_after_online_grow(struct drbd_conf *mdev)
 	if (mdev->state.role != mdev->state.peer)
 		iass = (mdev->state.role == Primary);
 	else
-		iass = test_bit(DISCARD_CONCURRENT,&mdev->flags);
+		iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
 
 	if (iass)
-		drbd_start_resync(mdev,SyncSource);
+		drbd_start_resync(mdev, SyncSource);
 	else
-		drbd_request_state(mdev,NS(conn,WFSyncUUID));
+		_drbd_request_state(mdev, NS(conn, WFSyncUUID), ChgStateVerbose + ChgSerialize);
 }
 
 STATIC int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1433,8 +1531,8 @@ STATIC int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
-	if ( mdev->state.role == Secondary &&
-	     mdev->state.peer == Secondary) {
+	if (mdev->state.role == Secondary &&
+	    mdev->state.peer == Secondary) {
 		retcode = APrimaryNodeNeeded;
 		goto fail;
 	}
@@ -1458,7 +1556,7 @@ STATIC int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
-	if (mdev->state.conn == Connected && ( dd != unchanged || ldsc) ) {
+	if (mdev->state.conn == Connected && (dd != unchanged || ldsc)) {
 		drbd_send_uuids(mdev);
 		drbd_send_sizes(mdev);
 		if (dd == grew)
@@ -1474,11 +1572,13 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 			       struct drbd_nl_cfg_reply *reply)
 {
 	int retcode = NoError;
-	struct syncer_conf sc;
-	struct drbd_conf *odev;
 	int err;
-	struct crypto_hash *verify_tfm = NULL, *old_verify_tfm = NULL;
 	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct drbd_conf *odev;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	struct syncer_conf sc;
 	cpumask_t n_cpu_mask = CPU_MASK_NONE;
 
 	memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
@@ -1496,7 +1596,7 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	}
 
 	if (sc.after != -1) {
-		if ( sc.after < -1 || minor_to_mdev(sc.after) == NULL ) {
+		if (sc.after < -1 || minor_to_mdev(sc.after) == NULL) {
 			retcode = SyncAfterInvalid;
 			goto fail;
 		}
@@ -1512,6 +1612,32 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		}
 	}
 
+	/* re-sync running */
+	rsr = (mdev->state.conn == SyncSource ||
+	       mdev->state.conn == SyncTarget ||
+	       mdev->state.conn == PausedSyncS ||
+	       mdev->state.conn == PausedSyncT);
+
+	if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
+		retcode = CSUMSResyncRunning;
+		goto fail;
+	}
+
+	if (!rsr && sc.csums_alg[0]) {
+		csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(csums_tfm)) {
+			csums_tfm = NULL;
+			retcode = CSUMSAlgNotAvail;
+			goto fail;
+		}
+
+		if (crypto_tfm_alg_type(crypto_hash_tfm(csums_tfm)) != CRYPTO_ALG_TYPE_DIGEST) {
+			retcode = CSUMSAlgNotDigest;
+			goto fail;
+		}
+	}
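+
+	/* The csums transform above and the verify transform below follow
+	 * the same allocate-and-validate shape; a condensed sketch of that
+	 * pattern as a hypothetical helper (not in this patch), using only
+	 * the crypto calls the patch itself uses:
+	 *
+	 * static struct crypto_hash *alloc_digest(const char *alg,
+	 *					   enum ret_codes *rc,
+	 *					   enum ret_codes not_avail,
+	 *					   enum ret_codes not_digest)
+	 * {
+	 *	struct crypto_hash *tfm;
+	 *
+	 *	tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	 *	if (IS_ERR(tfm)) {
+	 *		*rc = not_avail;
+	 *		return NULL;
+	 *	}
+	 *	if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
+	 *	    != CRYPTO_ALG_TYPE_DIGEST) {
+	 *		crypto_free_hash(tfm);
+	 *		*rc = not_digest;
+	 *		return NULL;
+	 *	}
+	 *	return tfm;
+	 * }
+	 */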
+
+	/* online verify running */
 	ovr = (mdev->state.conn == VerifyS || mdev->state.conn == VerifyT);
 
 	if (ovr) {
@@ -1525,12 +1651,12 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
 		if (IS_ERR(verify_tfm)) {
 			verify_tfm = NULL;
-			retcode=VERIFYAlgNotAvail;
+			retcode = VERIFYAlgNotAvail;
 			goto fail;
 		}
 
 		if (crypto_tfm_alg_type(crypto_hash_tfm(verify_tfm)) != CRYPTO_ALG_TYPE_DIGEST) {
-			retcode=VERIFYAlgNotDigest;
+			retcode = VERIFYAlgNotDigest;
 			goto fail;
 		}
 	}
@@ -1538,7 +1664,7 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	if (sc.cpu_mask[0] != 0) {
 		err = __bitmap_parse(sc.cpu_mask, 32, 0, (unsigned long *)&n_cpu_mask, NR_CPUS);
 		if (err) {
-			DRBD_WARN("__bitmap_parse() failed with %d\n", err);
+			drbd_WARN("__bitmap_parse() failed with %d\n", err);
 			retcode = CPUMaskParseFailed;
 			goto fail;
 		}
@@ -1556,15 +1682,20 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	spin_lock(&mdev->peer_seq_lock);
 	/* lock against receive_SyncParam() */
 	mdev->sync_conf = sc;
+
+	if (!rsr) {
+		crypto_free_hash(mdev->csums_tfm);
+		mdev->csums_tfm = csums_tfm;
+		csums_tfm = NULL;
+	}
+
 	if (!ovr) {
-		old_verify_tfm = mdev->verify_tfm;
+		crypto_free_hash(mdev->verify_tfm);
 		mdev->verify_tfm = verify_tfm;
 		verify_tfm = NULL;
 	}
 	spin_unlock(&mdev->peer_seq_lock);
 
-	crypto_free_hash(old_verify_tfm);
-
 	if (inc_local(mdev)) {
 		wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 		drbd_al_shrink(mdev);
@@ -1588,13 +1719,14 @@ STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 
 	if (!cpus_equal(mdev->cpu_mask, n_cpu_mask)) {
 		mdev->cpu_mask = n_cpu_mask;
-		n_cpu_mask = drbd_calc_cpu_mask(mdev);
-		drbd_thread_set_cpu(&mdev->receiver, n_cpu_mask);
-		drbd_thread_set_cpu(&mdev->worker, n_cpu_mask);
-		drbd_thread_set_cpu(&mdev->asender, n_cpu_mask);
+		mdev->cpu_mask = drbd_calc_cpu_mask(mdev);
+		mdev->receiver.reset_cpu_mask = 1;
+		mdev->asender.reset_cpu_mask = 1;
+		mdev->worker.reset_cpu_mask = 1;
 	}
 
 fail:
+	crypto_free_hash(csums_tfm);
 	crypto_free_hash(verify_tfm);
 	reply->ret_code = retcode;
 	return 0;
@@ -1707,11 +1839,24 @@ STATIC int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 STATIC int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			     struct drbd_nl_cfg_reply *reply)
 {
-	unsigned short *tl;
+	unsigned short *tl = reply->tag_list;
+	union drbd_state_t s = mdev->state;
+	unsigned long rs_left;
+	unsigned int res;
 
-	tl = reply->tag_list;
+	tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
 
-	tl = get_state_to_tags(mdev, (struct get_state *)&mdev->state, tl);
+	/* no local ref, no bitmap, no syncer progress. */
+	if (s.conn >= SyncSource && s.conn <= PausedSyncT) {
+		if (inc_local(mdev)) {
+			drbd_get_syncer_progress(mdev, &rs_left, &res);
+			*tl++ = T_sync_progress;
+			*tl++ = sizeof(int);
+			memcpy(tl, &res, sizeof(int));
+			tl = (unsigned short *)((char *)tl + sizeof(int));
+			dec_local(mdev);
+		}
+	}
 	*tl++ = TT_END; /* Close the tag list */
 
 	return (int)((char *)tl - (char *)reply->tag_list);
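
The reply is a flat tag list: a 16-bit tag number, a 16-bit payload length,
the payload itself, closed by TT_END. A hypothetical helper (not in this
patch) encoding one int-valued tag, mirroring the open-coded sequence above:

	static unsigned short *tl_put_int(unsigned short *tl,
					  unsigned short tag, int val)
	{
		*tl++ = tag;			/* 16-bit tag number */
		*tl++ = sizeof(int);		/* 16-bit payload length */
		memcpy(tl, &val, sizeof(int));	/* unaligned-safe copy */
		return (unsigned short *)((char *)tl + sizeof(int));
	}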
@@ -1768,6 +1913,52 @@ STATIC int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 }
 
 
+STATIC int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NoError;
+	int err;
+
+	struct new_c_uuid args;
+
+	memset(&args, 0, sizeof(struct new_c_uuid));
+	if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = UnknownMandatoryTag;
+		return 0;
+	}
+
+	mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+
+	if (mdev->state.conn >= Connected) {
+		retcode = MayNotBeConnected;
+		goto out;
+	}
+
+	if (!inc_local(mdev)) {
+		retcode = HaveNoDiskConfig;
+		goto out;
+	}
+
+	drbd_uuid_set(mdev, Bitmap, 0); /* Rotate Bitmap to History 1, etc... */
+	drbd_uuid_new_current(mdev); /* New current, previous to Bitmap */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+		if (err) {
+			ERR("Writing bitmap failed with %d\n",err);
+			retcode = MDIOError;
+		}
+	}
+
+	drbd_md_sync(mdev);
+	dec_local(mdev);
+out:
+	mutex_unlock(&mdev->state_mutex);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
 STATIC struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
 {
 	struct drbd_conf *mdev;
@@ -1801,7 +1992,7 @@ STATIC struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
 struct cn_handler_struct {
 	int (*function)(struct drbd_conf *,
 			 struct drbd_nl_cfg_req *,
-			 struct drbd_nl_cfg_reply* );
+			 struct drbd_nl_cfg_reply *);
 	int reply_body_size;
 };
 
@@ -1826,13 +2017,14 @@ static struct cn_handler_struct cnd_table[] = {
 				    sizeof(struct disk_conf_tag_len_struct) +
 				    sizeof(struct net_conf_tag_len_struct) },
 	[ P_get_state ]		= { &drbd_nl_get_state,
-				    sizeof(struct get_state_tag_len_struct) },
+				    sizeof(struct get_state_tag_len_struct) +
+				    sizeof(struct sync_progress_tag_len_struct)	},
 	[ P_get_uuids ]		= { &drbd_nl_get_uuids,
 				    sizeof(struct get_uuids_tag_len_struct) },
-	[ P_get_timeout_flag ]	=
-		{ &drbd_nl_get_timeout_flag,
-		  sizeof(struct get_timeout_flag_tag_len_struct)},
-	[ P_start_ov ]          = { &drbd_nl_start_ov,          0 },
+	[ P_get_timeout_flag ]	= { &drbd_nl_get_timeout_flag,
+				    sizeof(struct get_timeout_flag_tag_len_struct)},
+	[ P_start_ov ]		= { &drbd_nl_start_ov,		0 },
+	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
 };
 
 STATIC void drbd_connector_callback(void *data)
@@ -1867,6 +2059,13 @@ STATIC void drbd_connector_callback(void *data)
 	}
 
 	cm = cnd_table + nlp->packet_type;
+
+	/* This may happen if packet number is 0: */
+	if (cm->function == NULL) {
+		retcode = UnknownNetLinkPacket;
+		goto fail;
+	}
+
 	reply_size += cm->reply_body_size;
 
 	cn_reply = kmalloc(reply_size, GFP_KERNEL);
@@ -1972,7 +2171,7 @@ void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state_t state)
 		(struct drbd_nl_cfg_reply *)cn_reply->data;
 	unsigned short *tl = reply->tag_list;
 
-	/* DRBD_WARN("drbd_bcast_state() got called\n"); */
+	/* drbd_WARN("drbd_bcast_state() got called\n"); */
 
 	tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
 	*tl++ = TT_END; /* Close the tag list */
@@ -2007,7 +2206,7 @@ void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
 	unsigned short *tl = reply->tag_list;
 	int str_len;
 
-	/* DRBD_WARN("drbd_bcast_state() got called\n"); */
+	/* drbd_WARN("drbd_bcast_ev_helper() got called\n"); */
 
 	str_len = strlen(helper_name)+1;
 	*tl++ = T_helper;
@@ -2148,12 +2347,31 @@ void drbd_bcast_sync_progress(struct drbd_conf *mdev)
 	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
 }
 
+#ifdef NETLINK_ROUTE6
+int __init cn_init(void);
+void __exit cn_fini(void);
+#endif
+
 int __init drbd_nl_init(void)
 {
-	static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD };
-	int err;
+	static struct cb_id cn_id_drbd;
+	int err, try = 10;
+
+#ifdef NETLINK_ROUTE6
+	/* pre 2.6.16 */
+	err = cn_init();
+	if (err)
+		return err;
+#endif
+	cn_id_drbd.val = CN_VAL_DRBD;
+	do {
+		cn_id_drbd.idx = cn_idx;
+		err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
+		if (!err)
+			break;
+		cn_idx = (cn_idx + CN_IDX_STEP);
+	} while (try--);
 
-	err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
 	if (err) {
 		printk(KERN_ERR "drbd: cn_drbd failed to register\n");
 		return err;
@@ -2164,13 +2382,20 @@ int __init drbd_nl_init(void)
 
 void drbd_nl_cleanup(void)
 {
-	static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD };
+	static struct cb_id cn_id_drbd;
+
+	cn_id_drbd.idx = cn_idx;
+	cn_id_drbd.val = CN_VAL_DRBD;
 
 	cn_del_callback(&cn_id_drbd);
+
+#ifdef NETLINK_ROUTE6
+	/* pre 2.6.16 */
+	cn_fini();
+#endif
 }
 
-void drbd_nl_send_reply( struct cn_msg *req,
-			 int ret_code)
+void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
 {
 	char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
 	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
diff --git a/ubuntu/drbd/drbd_proc.c b/ubuntu/drbd/drbd_proc.c
index c1e3a23..c8593a7 100644
--- a/ubuntu/drbd/drbd_proc.c
+++ b/ubuntu/drbd/drbd_proc.c
@@ -38,7 +38,7 @@
 #include "drbd_int.h"
 #include "lru_cache.h" /* for lc_sprintf_stats */
 
-int drbd_proc_open(struct inode *inode, struct file *file);
+STATIC int drbd_proc_open(struct inode *inode, struct file *file);
 
 
 struct proc_dir_entry *drbd_proc;
@@ -80,11 +80,11 @@ STATIC void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	if (mdev->rs_total > 0x100000L)
 		seq_printf(seq, "(%lu/%lu)M\n\t",
 			    (unsigned long) Bit2KB(rs_left) >> 10,
-			    (unsigned long) Bit2KB(mdev->rs_total) >> 10 );
+			    (unsigned long) Bit2KB(mdev->rs_total) >> 10);
 	else
 		seq_printf(seq, "(%lu/%lu)K\n\t",
 			    (unsigned long) Bit2KB(rs_left),
-			    (unsigned long) Bit2KB(mdev->rs_total) );
+			    (unsigned long) Bit2KB(mdev->rs_total));
 
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
@@ -154,20 +154,34 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v)
 	const char *sn;
 	struct drbd_conf *mdev;
 
+	static char write_ordering_chars[] = {
+		[WO_none] = 'n',
+		[WO_drain_io] = 'd',
+		[WO_bdev_flush] = 'f',
+		[WO_bio_barrier] = 'b',
+	};
+
 	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
 		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
 
 	/*
 	  cs .. connection state
-	  st .. node state (local/remote)
-	  ld .. local data consistentency
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol
+	     various flags
 	  ns .. network send
 	  nr .. network receive
 	  dw .. disk write
 	  dr .. disk read
-	  pe .. pending (waiting for ack)
-	  ua .. unack'd (still need to send ack)
-	  al .. access log write count
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", BarrierAck pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
 	*/
 
 	for (i = 0; i < minor_count; i++) {
@@ -178,19 +192,20 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v)
 		}
 		if (hole) {
 			hole = 0;
-			seq_printf( seq, "\n");
+			seq_printf(seq, "\n");
 		}
 
 		sn = conns_to_name(mdev->state.conn);
 
-		if ( mdev->state.conn == StandAlone &&
-		     mdev->state.disk == Diskless) {
-			seq_printf( seq, "%2d: cs:Unconfigured\n", i);
+		if (mdev->state.conn == StandAlone &&
+		    mdev->state.disk == Diskless &&
+		    mdev->state.role == Secondary) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
-			seq_printf( seq,
-			   "%2d: cs:%s st:%s/%s ds:%s/%s %c %c%c%c%c\n"
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c\n"
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
-			   "lo:%d pe:%d ua:%d ap:%d",
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
 			   i, sn,
 			   roles_to_name(mdev->state.role),
 			   roles_to_name(mdev->state.peer),
@@ -212,23 +227,23 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v)
 			   atomic_read(&mdev->ap_pending_cnt) +
 			   atomic_read(&mdev->rs_pending_cnt),
 			   atomic_read(&mdev->unacked_cnt),
-			   atomic_read(&mdev->ap_bio_cnt)
+			   atomic_read(&mdev->ap_bio_cnt),
+			   mdev->epochs,
+			   write_ordering_chars[mdev->write_ordering]
 			);
 			seq_printf(seq, " oos:%lu\n",
-				   drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B - 10));
+				   Bit2KB(drbd_bm_total_weight(mdev)));
 		}
-		if ( mdev->state.conn == SyncSource ||
-		     mdev->state.conn == SyncTarget )
+		if (mdev->state.conn == SyncSource ||
+		    mdev->state.conn == SyncTarget)
 			drbd_syncer_progress(mdev, seq);
 
-		if ( mdev->state.conn == VerifyS ||
-		     mdev->state.conn == VerifyT ) {
+		if (mdev->state.conn == VerifyS || mdev->state.conn == VerifyT)
 			seq_printf(seq,"\t%3d%%      %lu/%lu\n",
 				   (int)((mdev->rs_total-mdev->ov_left) /
 					 (mdev->rs_total/100+1)),
 				   mdev->rs_total - mdev->ov_left,
 				   mdev->rs_total);
-		}
 
 #ifdef ENABLE_DYNAMIC_TRACE
 		if (proc_details >= 1 && inc_local_if_state(mdev, Failed)) {
@@ -249,7 +264,7 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-int drbd_proc_open(struct inode *inode, struct file *file)
+STATIC int drbd_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, drbd_seq_show, PDE(inode)->data);
 }
diff --git a/ubuntu/drbd/drbd_receiver.c b/ubuntu/drbd/drbd_receiver.c
index a0c6886..5047cf0 100644
--- a/ubuntu/drbd/drbd_receiver.c
+++ b/ubuntu/drbd/drbd_receiver.c
@@ -46,12 +46,49 @@
 #include <linux/vmalloc.h>
 #include <linux/random.h>
 #ifdef HAVE_LINUX_SCATTERLIST_H
+/* 2.6.11 (suse 9.3, fc4) does not include requisites
+ * from linux/scatterlist.h :( */
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 #include <linux/scatterlist.h>
 #endif
 #include <linux/drbd.h>
 #include "drbd_int.h"
 #include "drbd_req.h"
 
+struct flush_work {
+	struct drbd_work w;
+	struct drbd_epoch *epoch;
+};
+
+enum epoch_event {
+	EV_put,
+	EV_got_barrier_nr,
+	EV_barrier_done,
+	EV_became_last,
+	EV_cleanup = 32, /* used as flag */
+};
+
+enum finish_epoch {
+	FE_still_live,
+	FE_destroyed,
+	FE_recycled,
+};
+
+STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
+STATIC int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+static inline struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	struct drbd_epoch *prev;
+	spin_lock(&mdev->epoch_lock);
+	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+	if (prev == epoch || prev == mdev->current_epoch)
+		prev = NULL;
+	spin_unlock(&mdev->epoch_lock);
+	return prev;
+}
+
 #ifdef DBG_ASSERTS
 void drbd_assert_breakpoint(struct drbd_conf *mdev, char *exp,
 			    char *file, int line)
@@ -60,7 +97,7 @@ void drbd_assert_breakpoint(struct drbd_conf *mdev, char *exp,
 }
 #endif
 
-#define GFP_TRY	( __GFP_HIGHMEM | __GFP_NOWARN )
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
 
 /**
  * drbd_bp_alloc: Returns a page. Fails only if a signal comes in.
@@ -126,11 +163,15 @@ STATIC struct page *drbd_pp_alloc(struct drbd_conf *mdev, gfp_t gfp_mask)
 		 * unless, of course, someone signalled us.
 		 */
 		if (signal_pending(current)) {
-			DRBD_WARN("drbd_pp_alloc interrupted!\n");
+			drbd_WARN("drbd_pp_alloc interrupted!\n");
 			finish_wait(&drbd_pp_wait, &wait);
 			return NULL;
 		}
 		drbd_kick_lo(mdev);
+		if (!(gfp_mask & __GFP_WAIT)) {
+			finish_wait(&drbd_pp_wait, &wait);
+			return NULL;
+		}
 		schedule();
 	}
 	finish_wait(&drbd_pp_wait, &wait);
@@ -198,15 +239,17 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 	unsigned int ds;
 	int i;
 
-	e = mempool_alloc(drbd_ee_mempool, gfp_mask);
+	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 	if (!e) {
-		ERR("alloc_ee: Allocation of an EE failed\n");
+		if (!(gfp_mask & __GFP_NOWARN))
+			ERR("alloc_ee: Allocation of an EE failed\n");
 		return NULL;
 	}
 
-	bio = bio_alloc(GFP_KERNEL, div_ceil(data_size, PAGE_SIZE));
+	bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
 	if (!bio) {
-		ERR("alloc_ee: Allocation of a bio failed\n");
+		if (!(gfp_mask & __GFP_NOWARN))
+			ERR("alloc_ee: Allocation of a bio failed\n");
 		goto fail1;
 	}
 
@@ -217,7 +260,8 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 	while (ds) {
 		page = drbd_pp_alloc(mdev, gfp_mask);
 		if (!page) {
-			ERR("alloc_ee: Allocation of a page failed\n");
+			if (!(gfp_mask & __GFP_NOWARN))
+				ERR("alloc_ee: Allocation of a page failed\n");
 			goto fail2;
 		}
 		if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
@@ -228,15 +272,20 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 
 			q = bdev_get_queue(bio->bi_bdev);
 			if (q->merge_bvec_fn) {
+#ifdef HAVE_bvec_merge_data
 				struct bvec_merge_data bvm = {
 					.bi_bdev = bio->bi_bdev,
 					.bi_sector = bio->bi_sector,
 					.bi_size = bio->bi_size,
 					.bi_rw = bio->bi_rw,
 				};
-				ERR("merge_bvec_fn() = %d\n",
-				    q->merge_bvec_fn(q, &bvm,
-					  &bio->bi_io_vec[bio->bi_vcnt]));
+				int l = q->merge_bvec_fn(q, &bvm,
+						&bio->bi_io_vec[bio->bi_vcnt]);
+#else
+				int l = q->merge_bvec_fn(q, bio,
+						&bio->bi_io_vec[bio->bi_vcnt]);
+#endif
+				ERR("merge_bvec_fn() = %d\n", l);
 			}
 
 			/* dump more of the bio. */
@@ -244,7 +293,6 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 			DUMPI(bio->bi_vcnt);
 			DUMPI(bio->bi_size);
 			DUMPI(bio->bi_phys_segments);
-			DUMPI(bio->bi_hw_segments);
 
 			goto fail2;
 			break;
@@ -252,7 +300,7 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 		ds -= min_t(int, ds, PAGE_SIZE);
 	}
 
-	D_ASSERT( data_size == bio->bi_size);
+	D_ASSERT(data_size == bio->bi_size);
 
 	bio->bi_private = e;
 	e->mdev = mdev;
@@ -262,8 +310,7 @@ struct Tl_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 	e->private_bio = bio;
 	e->block_id = id;
 	INIT_HLIST_NODE(&e->colision);
-	e->barrier_nr = 0;
-	e->barrier_nr2 = 0;
+	e->epoch = NULL;
 	e->flags = 0;
 
 	MTRACE(TraceTypeEE, TraceLvlAll,
@@ -339,7 +386,8 @@ STATIC void reclaim_net_ee(struct drbd_conf *mdev)
 
 	list_for_each_safe(le, tle, &mdev->net_ee) {
 		e = list_entry(le, struct Tl_epoch_entry, w.list);
-		if ( drbd_bio_has_active_page(e->private_bio) ) break;
+		if (drbd_bio_has_active_page(e->private_bio))
+			break;
 		list_del(le);
 		drbd_free_ee(mdev, e);
 	}
@@ -349,7 +397,7 @@ STATIC void reclaim_net_ee(struct drbd_conf *mdev)
 /*
  * This function is called from _asender only_
  * but see also comments in _req_mod(,barrier_acked)
- * and receive_Barrier_no_tcq.
+ * and receive_Barrier.
  *
  * Move entries from net_ee to done_ee, if ready.
  * Grab done_ee, call all callbacks, free the entries.
@@ -360,7 +408,6 @@ STATIC int drbd_process_done_ee(struct drbd_conf *mdev)
 	LIST_HEAD(work_list);
 	struct Tl_epoch_entry *e, *t;
 	int ok = 1;
-	int do_clear_bit = test_bit(WRITE_ACK_PENDING, &mdev->flags);
 
 	spin_lock_irq(&mdev->req_lock);
 	reclaim_net_ee(mdev);
@@ -377,11 +424,10 @@ STATIC int drbd_process_done_ee(struct drbd_conf *mdev)
 			    (unsigned long long)e->sector, e->size, e);
 			);
 		/* list_del not necessary, next/prev members not touched */
-		if (e->w.cb(mdev, &e->w, 0) == 0) ok = 0;
+		if (e->w.cb(mdev, &e->w, 0) == 0)
+			ok = 0;
 		drbd_free_ee(mdev, e);
 	}
-	if (do_clear_bit)
-		clear_bit(WRITE_ACK_PENDING, &mdev->flags);
 	wake_up(&mdev->ee_wait);
 
 	return ok;
@@ -394,6 +440,7 @@ void _drbd_clear_done_ee(struct drbd_conf *mdev)
 {
 	struct list_head *le;
 	struct Tl_epoch_entry *e;
+	struct drbd_epoch *epoch;
 	int n = 0;
 
 	MUST_HOLD(&mdev->req_lock);
@@ -408,7 +455,17 @@ void _drbd_clear_done_ee(struct drbd_conf *mdev)
 		|| is_syncer_block_id(e->block_id))
 			++n;
 
-		if (!hlist_unhashed(&e->colision)) hlist_del_init(&e->colision);
+		if (!hlist_unhashed(&e->colision))
+			hlist_del_init(&e->colision);
+
+		if (e->epoch) {
+			if (e->flags & EE_IS_BARRIER) {
+				epoch = previous_epoch(mdev, e->epoch);
+				if (epoch)
+					drbd_may_finish_epoch(mdev, epoch, EV_barrier_done + EV_cleanup);
+			}
+			drbd_may_finish_epoch(mdev, e->epoch, EV_put + EV_cleanup);
+		}
 		drbd_free_ee(mdev, e);
 	}
 
@@ -439,33 +496,55 @@ void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 	spin_unlock_irq(&mdev->req_lock);
 }
 
-STATIC struct socket *drbd_accept(struct drbd_conf *mdev, struct socket *sock)
+#ifdef DEFINE_SOCK_CREATE_KERN
+/* if there is no sock_create_kern,
+ * sock_create_lite is also missing */
+int sock_create_lite(int family, int type, int protocol, struct socket **res)
 {
-	struct socket *newsock;
 	int err = 0;
+	struct socket *sock = NULL;
 
-	err = sock->ops->listen(sock, 5);
-	if (err)
-		goto out;
+	sock = sock_alloc();
+	if (!sock)
+		err = -ENOMEM;
+	else
+		sock->type = type;
 
-	if (sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock))
-		goto out;
+	*res = sock;
+	return err;
+}
+#endif
 
-	newsock->type = sock->type;
-	newsock->ops  = sock->ops;
+/* see also kernel_accept; which is only present since 2.6.18.
+ * also we want to log which part of it failed, exactly */
+STATIC int drbd_accept(struct drbd_conf *mdev, const char **what,
+		struct socket *sock, struct socket **newsock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
 
-	err = newsock->ops->accept(sock, newsock, 0);
+	*what = "listen";
+	err = sock->ops->listen(sock, 5);
 	if (err < 0)
-		goto out_release;
+		goto out;
 
-	return newsock;
+	*what = "sock_create_lite";
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		goto out;
+
+	*what = "accept";
+	err = sock->ops->accept(sock, *newsock, 0);
+	if (err < 0) {
+		sock_release(*newsock);
+		*newsock = NULL;
+		goto out;
+	}
+	(*newsock)->ops  = sock->ops;
 
-out_release:
-	sock_release(newsock);
 out:
-	if (err != -EAGAIN && err != -EINTR)
-		ERR("accept failed! %d\n", err);
-	return NULL;
+	return err;
 }
 
 STATIC int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
@@ -546,17 +625,21 @@ STATIC int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
 
 STATIC struct socket *drbd_try_connect(struct drbd_conf *mdev)
 {
-	int err;
+	const char *what;
 	struct socket *sock;
-	struct sockaddr_in src_in;
-
-	if (!inc_net(mdev)) return NULL;
+	struct sockaddr_in6 src_in6;
+	int err;
+	int disconnect_on_error = 1;
 
-	err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (err) {
-		dec_net(mdev);
-		ERR("sock_creat(..)=%d\n", err);
+	if (!inc_net(mdev))
 		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
 	}
 
 	sock->sk->sk_rcvtimeo =
@@ -569,29 +652,53 @@ STATIC struct socket *drbd_try_connect(struct drbd_conf *mdev)
 	* Make sure to use 0 as portnumber, so linux selects
 	*  a free one dynamically.
 	*/
-	memcpy(&src_in, &(mdev->net_conf->my_addr), sizeof(struct sockaddr_in));
-	src_in.sin_port = 0;
+	memcpy(&src_in6, mdev->net_conf->my_addr,
+	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
+	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 
+	what = "bind before connect";
 	err = sock->ops->bind(sock,
-			      (struct sockaddr *) &src_in,
-			      sizeof(struct sockaddr_in));
-	if (err) {
-		ERR("Unable to bind source sock (%d)\n", err);
-		sock_release(sock);
-		sock = NULL;
-		dec_net(mdev);
-		return sock;
-	}
+			      (struct sockaddr *) &src_in6,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
 
+	/* connect may fail, peer not yet available.
+	 * stay WFConnection, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
 	err = sock->ops->connect(sock,
 				 (struct sockaddr *)mdev->net_conf->peer_addr,
 				 mdev->net_conf->peer_addr_len, 0);
 
-	if (err) {
-		sock_release(sock);
-		sock = NULL;
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+#if 0
+			DBG("%s failure ignored, err = %d\n",
+					what, err);
+#endif
+			disconnect_on_error = 0;
+			break;
+		default:
+			ERR("%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			drbd_force_state(mdev, NS(conn, Disconnecting));
 	}
-
 	dec_net(mdev);
 	return sock;
 }
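
The switch above classifies connect() failures: transient conditions
(timeouts, signals, peer not up yet) keep us quietly retrying in
WFConnection, anything else logs and forces Disconnecting. The same test
as a hypothetical kernel-style predicate (not in this patch):

	/* nonzero if the connect error is worth silently retrying */
	static int connect_err_is_transient(int err)
	{
		switch (-err) {
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN: case EHOSTUNREACH:
			return 1;
		default:
			return 0;
		}
	}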
@@ -599,37 +706,51 @@ STATIC struct socket *drbd_try_connect(struct drbd_conf *mdev)
 STATIC struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 {
 	int err;
-	struct socket *sock, *sock2;
+	struct socket *s_estab = NULL, *s_listen;
+	const char *what;
 
-	if (!inc_net(mdev)) return NULL;
+	if (!inc_net(mdev))
+		return NULL;
 
-	err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock2);
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &s_listen);
 	if (err) {
-		dec_net(mdev);
-		ERR("sock_creat(..)=%d\n", err);
-		return NULL;
+		s_listen = NULL;
+		goto out;
 	}
 
-	sock2->sk->sk_reuse    = 1; /* SO_REUSEADDR */
-	sock2->sk->sk_rcvtimeo =
-	sock2->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_rcvtimeo =
+	s_listen->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
 
-	err = sock2->ops->bind(sock2,
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen,
 			      (struct sockaddr *) mdev->net_conf->my_addr,
 			      mdev->net_conf->my_addr_len);
-	dec_net(mdev);
+	if (err < 0)
+		goto out;
 
-	if (err) {
-		ERR("Unable to bind sock2 (%d)\n", err);
-		sock_release(sock2);
-		drbd_force_state(mdev, NS(conn, Disconnecting));
-		return NULL;
-	}
+	err = drbd_accept(mdev, &what, s_listen, &s_estab);
 
-	sock = drbd_accept(mdev, sock2);
-	sock_release(sock2);
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			ERR("%s failed, err = %d\n", what, err);
+			drbd_force_state(mdev, NS(conn, Disconnecting));
+		}
+#if 0
+		else {
+			DBG("%s failure ignored, err = %d, not Disconnecting\n",
+					what, err);
+		}
+#endif
+	}
+	dec_net(mdev);
 
-	return sock;
+	return s_estab;
 }
 
 int drbd_do_handshake(struct drbd_conf *mdev);
@@ -661,7 +782,7 @@ STATIC enum Drbd_Packet_Cmd drbd_recv_fp(struct drbd_conf *mdev, struct socket *
  * Tests if the connection behind the socket still exists. If not it frees
  * the socket.
  */
-STATIC int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
 {
 	int rr;
 	char tb[4];
@@ -695,7 +816,7 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
 		ERR("CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
 
-	if (drbd_request_state(mdev, NS(conn, WFConnection)) < SS_Success )
+	if (drbd_request_state(mdev, NS(conn, WFConnection)) < SS_Success)
 		return -2;
 
 	clear_bit(DISCARD_CONCURRENT, &mdev->flags);
@@ -710,7 +831,7 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 			if (s || ++try >= 3)
 				break;
 			/* give the other side time to call bind() & listen() */
-			set_current_state(TASK_INTERRUPTIBLE);
+			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ / 10);
 		}
 
@@ -730,11 +851,12 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 		}
 
 		if (sock && msock) {
-			set_current_state(TASK_INTERRUPTIBLE);
+			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ / 10);
 			ok = drbd_socket_okay(mdev, &sock);
 			ok = drbd_socket_okay(mdev, &msock) && ok;
-			if (ok) break;
+			if (ok)
+				break;
 		}
 
 		s = drbd_wait_for_connect(mdev);
@@ -752,7 +874,7 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 				set_bit(DISCARD_CONCURRENT, &mdev->flags);
 				break;
 			default:
-				DRBD_WARN("Error receiving initial packet\n");
+				drbd_WARN("Error receiving initial packet\n");
 				sock_release(s);
 			}
 		}
@@ -774,7 +896,8 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 		if (sock && msock) {
 			ok = drbd_socket_okay(mdev, &sock);
 			ok = drbd_socket_okay(mdev, &msock) && ok;
-			if (ok) break;
+			if (ok)
+				break;
 		}
 	} while (1);
 
@@ -784,23 +907,38 @@ STATIC int drbd_connect(struct drbd_conf *mdev)
 	sock->sk->sk_allocation = GFP_NOIO;
 	msock->sk->sk_allocation = GFP_NOIO;
 
-	sock->sk->sk_priority = TC_PRIO_BULK;
-	/* FIXME fold to limits. should be done in drbd_ioctl */
-	sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
-	sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size;
+	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	if (mdev->net_conf->sndbuf_size) {
+		/* FIXME fold to limits. should be done during configuration */
+		/* this is setsockopt SO_SNDBUFFORCE and SO_RCVBUFFORCE,
+		 * done directly. */
+		sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
+		sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK;
+	}
+
+#if 0 /* don't pin the msock bufsize, autotuning should work better */
+	msock->sk->sk_sndbuf = 2*32767;
+	msock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+#endif
+
 	/* NOT YET ...
 	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	 * first set it to the HandShake timeout, which is hardcoded for now: */
 	sock->sk->sk_sndtimeo =
 	sock->sk->sk_rcvtimeo = 2*HZ;
-	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK;
 
-	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
-	msock->sk->sk_sndbuf = 2*32767;
 	msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
 
+	/* we don't want delays.
+	 * we use TCP_CORK where appropriate, though */
+	drbd_tcp_nodelay(sock);
+	drbd_tcp_nodelay(msock);
+
 	mdev->data.socket = sock;
 	mdev->meta.socket = msock;
 	mdev->last_received = jiffies;
@@ -846,13 +984,13 @@ STATIC int drbd_recv_header(struct drbd_conf *mdev, struct Drbd_Header *h)
 
 	r = drbd_recv(mdev, h, sizeof(*h));
 
-	if (unlikely( r != sizeof(*h) )) {
+	if (unlikely(r != sizeof(*h))) {
 		ERR("short read expecting header on sock: r=%d\n", r);
 		return FALSE;
 	};
 	h->command = be16_to_cpu(h->command);
 	h->length  = be16_to_cpu(h->length);
-	if (unlikely( h->magic != BE_DRBD_MAGIC )) {
+	if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 		ERR("magic?? on data m: 0x%lx c: %d l: %d\n",
 		    (long)be32_to_cpu(h->magic),
 		    h->command, h->length);
@@ -863,11 +1001,267 @@ STATIC int drbd_recv_header(struct drbd_conf *mdev, struct Drbd_Header *h)
 	return TRUE;
 }
 
-STATIC int receive_Barrier_no_tcq(struct drbd_conf *mdev, struct Drbd_Header *h)
+STATIC enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
 {
 	int rv;
-	int epoch_size;
+
+	if (mdev->write_ordering >= WO_bdev_flush && inc_local(mdev)) {
+		rv = blkdev_issue_flush(mdev->bc->backing_bdev, NULL);
+		if (rv) {
+			ERR("local disk flush failed with status %d\n",rv);
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			drbd_bump_write_ordering(mdev, WO_drain_io);
+		}
+		dec_local(mdev);
+	}
+
+	return drbd_may_finish_epoch(mdev, epoch, EV_barrier_done);
+}
+
+/**
+ * w_flush: Worker callback that issues the flush for its epoch and
+ * drops the reference it holds, possibly finishing the epoch.
+ */
+STATIC int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct flush_work *fw = (struct flush_work *)w;
+	struct drbd_epoch *epoch = fw->epoch;
+
+	kfree(w);
+
+	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+		drbd_flush_after_epoch(mdev, epoch);
+
+	drbd_may_finish_epoch(mdev, epoch, EV_put |
+			      (mdev->state.conn < Connected ? EV_cleanup : 0));
+
+	return 1;
+}
+
+/**
+ * drbd_may_finish_epoch: Checks if an epoch can be closed and therefore might
+ * close and/or free the epoch object.
+ */
+STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int finish, epoch_size;
+	struct drbd_epoch *next_epoch;
+	int schedule_flush = 0;
+	enum finish_epoch rv = FE_still_live;
+
+	static char *epoch_event_str[] = {
+		[EV_put] = "put",
+		[EV_got_barrier_nr] = "got_barrier_nr",
+		[EV_barrier_done] = "barrier_done",
+		[EV_became_last] = "became_last",
+	};
+
+	spin_lock(&mdev->epoch_lock);
+	do {
+		next_epoch = NULL;
+		finish = 0;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_cleanup) {
+		case EV_put:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_got_barrier_nr:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+			/* Special case: If we just switched from WO_bio_barrier to
+			   WO_bdev_flush we should not finish the current epoch */
+			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+			    mdev->write_ordering != WO_bio_barrier &&
+			    epoch == mdev->current_epoch)
+				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+			break;
+		case EV_barrier_done:
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
+			break;
+		case EV_became_last:
+			/* nothing to do */
+			break;
+		}
+
+		MTRACE(TraceTypeEpochs, TraceLvlAll,
+		       INFO("Update epoch  %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n",
+			    epoch, epoch->barrier_nr, epoch_size, atomic_read(&epoch->active),
+			    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-',
+			    test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-',
+			    test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-',
+			    test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-',
+			    epoch_event_str[ev]);
+			);
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
+		    epoch->list.prev == &mdev->current_epoch->list &&
+		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+			/* Nearly all conditions are met to finish that epoch... */
+			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+			    mdev->write_ordering == WO_none ||
+			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+			    ev & EV_cleanup) {
+				finish = 1;
+				set_bit(DE_IS_FINISHING, &epoch->flags);
+			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+				 mdev->write_ordering == WO_bio_barrier) {
+				atomic_inc(&epoch->active);
+				schedule_flush = 1;
+			}
+		}
+		if (finish) {
+			if (!(ev & EV_cleanup)) {
+				spin_unlock(&mdev->epoch_lock);
+				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
+				spin_lock(&mdev->epoch_lock);
+			}
+			dec_unacked(mdev);
+
+			if (mdev->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_became_last | (ev & EV_cleanup);
+				mdev->epochs--;
+				MTRACE(TraceTypeEpochs, TraceLvlSummary,
+				       INFO("Freeing epoch %p/%d { size=%d } nr_epochs=%d\n",
+					    epoch, epoch->barrier_nr, epoch_size, mdev->epochs);
+					);
+				kfree(epoch);
+
+				if (rv == FE_still_live)
+					rv = FE_destroyed;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is already zero */
+				if (rv == FE_still_live)
+					rv = FE_recycled;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&mdev->epoch_lock);
+
+	if (schedule_flush) {
+		struct flush_work *fw;
+		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
+		if (fw) {
+			MTRACE(TraceTypeEpochs, TraceLvlMetrics,
+			       INFO("Schedul flush %p/%d { size=%d } nr_epochs=%d\n",
+				    epoch, epoch->barrier_nr, epoch_size, mdev->epochs);
+				);
+			fw->w.cb = w_flush;
+			fw->epoch = epoch;
+			drbd_queue_work(&mdev->data.work, &fw->w);
+		} else {
+			drbd_WARN("Could not kmalloc a flush_work obj\n");
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+			/* That is not a recursion, only one level */
+			drbd_may_finish_epoch(mdev, epoch, EV_barrier_done);
+			drbd_may_finish_epoch(mdev, epoch, EV_put);
+		}
+	}
+
+	return rv;
+}
+
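A note for reviewers on the finish condition above: stripped of the locking
and of the list-linkage check, it reduces to a small pure predicate. The
sketch below is a hedged userspace model (field and function names are
illustrative, not the driver's; compiles with any C compiler):

	#include <stdio.h>

	enum wo { WO_none, WO_drain_io, WO_bdev_flush, WO_bio_barrier };

	struct epoch_model {
		int size;             /* writes received in this epoch */
		int active;           /* writes still in flight */
		int have_barrier_nr;  /* peer's Barrier packet seen */
		int barrier_done;     /* barrier in next epoch completed */
		int contains_barrier;
		int is_finishing;
	};

	/* mirrors drbd_may_finish_epoch(): finish once the epoch is
	 * non-empty, quiesced and numbered, and the write ordering
	 * (or a forced cleanup) makes that safe */
	static int may_finish(const struct epoch_model *e, enum wo wo, int cleanup)
	{
		if (e->size == 0 || e->active != 0 ||
		    !e->have_barrier_nr || e->is_finishing)
			return 0;
		return e->barrier_done || wo == WO_none ||
		       (e->size == 1 && e->contains_barrier) || cleanup;
	}

	int main(void)
	{
		struct epoch_model e = { .size = 4, .have_barrier_nr = 1,
					 .barrier_done = 1 };
		printf("may finish: %d\n", may_finish(&e, WO_bio_barrier, 0));
		return 0;
	}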
+/**
+ * drbd_bump_write_ordering: The current mdev->write_ordering method turned
+ * out not to work on the backing block device. Try the next allowed method.
+ */
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+{
+	enum write_ordering_e pwo;
+	static char *write_ordering_str[] = {
+		[WO_none] = "none",
+		[WO_drain_io] = "drain",
+		[WO_bdev_flush] = "flush",
+		[WO_bio_barrier] = "barrier",
+	};
+
+	pwo = mdev->write_ordering;
+	wo = min(pwo, wo);
+	if (wo == WO_bio_barrier && mdev->bc->dc.no_disk_barrier)
+		wo = WO_bdev_flush;
+	if (wo == WO_bdev_flush && mdev->bc->dc.no_disk_flush)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && mdev->bc->dc.no_disk_drain)
+		wo = WO_none;
+	mdev->write_ordering = wo;
+	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+		INFO("Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+}
+
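The fallback chain here is monotonic: min(pwo, wo) guarantees a method can
only ever be demoted, and each disabled capability knocks it down one more
step. A minimal userspace model of that chain (the no_* flags stand in for
the dc.no_disk_* settings used above; the enum order is an assumption):

	#include <stdio.h>

	enum wo { WO_none, WO_drain_io, WO_bdev_flush, WO_bio_barrier };

	struct caps { int no_barrier, no_flush, no_drain; };

	static enum wo bump(enum wo cur, enum wo want, const struct caps *c)
	{
		enum wo wo = want < cur ? want : cur;	/* never promote */
		if (wo == WO_bio_barrier && c->no_barrier)
			wo = WO_bdev_flush;
		if (wo == WO_bdev_flush && c->no_flush)
			wo = WO_drain_io;
		if (wo == WO_drain_io && c->no_drain)
			wo = WO_none;
		return wo;
	}

	int main(void)
	{
		struct caps c = { .no_barrier = 1 };
		/* a device without barrier support drops to flush: prints 2 */
		printf("%d\n", bump(WO_bio_barrier, WO_bio_barrier, &c));
		return 0;
	}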
+/**
+ * w_e_reissue: In case the IO subsystem delivered an error for a BIO with the
+ * BIO_RW_BARRIER flag set, retry that bio with the barrier flag cleared.
+ */
+int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
+{
+	struct Tl_epoch_entry *e = (struct Tl_epoch_entry *)w;
+	struct bio* bio = e->private_bio;
+
+	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+	   so that we can finish that epoch in drbd_may_finish_epoch().
+	   That is necessary if we already have a long chain of Epochs, before
+	   we realize that BIO_RW_BARRIER is actually not supported */
+
+	/* As long as the -EOPNOTSUPP on the barrier is reported immediately,
+	   this will never trigger. If it is reported late, we will just
+	   print that warning and continue correctly for all future requests
+	   with WO_bdev_flush */
+	if (previous_epoch(mdev, e->epoch))
+		drbd_WARN("Write ordering was not enforced (one time event)\n");
+
+	/* prepare bio for re-submit,
+	 * re-init volatile members */
+	/* we still have a local reference,
+	 * inc_local was done in receive_Data. */
+	bio->bi_bdev = mdev->bc->backing_bdev;
+	bio->bi_sector = e->sector;
+	bio->bi_size = e->size;
+	bio->bi_idx = 0;
+
+	bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	bio->bi_flags |= 1 << BIO_UPTODATE;
+
+	/* don't know whether this is necessary: */
+	bio->bi_phys_segments = 0;
+	bio->bi_next = NULL;
+
+	/* these should be unchanged: */
+	/* bio->bi_end_io = drbd_endio_write_sec; */
+	/* bio->bi_vcnt = whatever; */
+
+	e->w.cb = e_end_block;
+
+	/* This is no longer a barrier request. */
+	bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
+
+	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
+
+	return 1;
+}
+
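The function above boils down to a single-retry rule: if a barrier write
fails, clear the barrier bit and send the identical request down once more.
A hedged userspace model of just that rule (no real block layer; the
submit() stub and the flag value are invented for the example):

	#include <stdio.h>
	#include <errno.h>

	#define RW_BARRIER (1UL << 0)	/* stands in for BIO_RW_BARRIER */

	/* stand-in for a lower layer that rejects barrier requests */
	static int submit(unsigned long rw)
	{
		return (rw & RW_BARRIER) ? -EOPNOTSUPP : 0;
	}

	/* retry exactly once without the barrier flag */
	static int submit_with_fallback(unsigned long rw)
	{
		int err = submit(rw);
		if (err == -EOPNOTSUPP && (rw & RW_BARRIER))
			err = submit(rw & ~RW_BARRIER);
		return err;
	}

	int main(void)
	{
		printf("%d\n", submit_with_fallback(RW_BARRIER)); /* 0 */
		return 0;
	}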
+STATIC int receive_Barrier(struct drbd_conf *mdev, struct Drbd_Header *h)
+{
+	int rv, issue_flush;
 	struct Drbd_Barrier_Packet *p = (struct Drbd_Barrier_Packet *)h;
+	struct drbd_epoch *epoch;
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 
@@ -879,42 +1273,71 @@ STATIC int receive_Barrier_no_tcq(struct drbd_conf *mdev, struct Drbd_Header *h)
 	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
 		drbd_kick_lo(mdev);
 
-	spin_lock_irq(&mdev->req_lock);
-	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-	epoch_size = mdev->epoch_size;
-	mdev->epoch_size = 0;
-	spin_unlock_irq(&mdev->req_lock);
+	mdev->current_epoch->barrier_nr = p->barrier;
+	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_got_barrier_nr);
 
 	/* BarrierAck may imply that the corresponding extent is dropped from
 	 * the activity log, which means it would not be resynced in case the
 	 * Primary crashes now.
-	 * Just waiting for write_completion is not enough,
-	 * better flush to make sure it is all on stable storage. */
-	if (!test_bit(LL_DEV_NO_FLUSH, &mdev->flags) && inc_local(mdev)) {
-		rv = blkdev_issue_flush(mdev->bc->backing_bdev, NULL);
-		dec_local(mdev);
-		if (rv == -EOPNOTSUPP) /* don't try again */
-			set_bit(LL_DEV_NO_FLUSH, &mdev->flags);
-		if (rv)
-			ERR("local disk flush failed with status %d\n",rv);
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (mdev->write_ordering) {
+	case WO_bio_barrier:
+	case WO_none:
+		if (rv == FE_recycled)
+			return TRUE;
+		break;
+
+	case WO_bdev_flush:
+	case WO_drain_io:
+		D_ASSERT(rv == FE_still_live);
+		set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+		if (rv == FE_recycled)
+			return TRUE;
+
+		/* The asender will send all the ACKs and barrier ACKs out, since
+		   all EEs moved from the active_ee to the done_ee. We need to
+		   provide a new epoch object for the EEs that come in soon */
+		break;
 	}
 
-	/* FIXME CAUTION! receiver thread sending via msock.
-	 * to make sure this BarrierAck will not be received before the asender
-	 * had a chance to send all the write acks corresponding to this epoch,
-	 * wait_for that bit to clear... */
-	set_bit(WRITE_ACK_PENDING, &mdev->flags);
-	wake_asender(mdev);
-	rv = wait_event_interruptible(mdev->ee_wait,
-			      !test_bit(WRITE_ACK_PENDING, &mdev->flags));
+	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!epoch) {
+		drbd_WARN("Allocation of an epoch failed, slowing down\n");
+		/* note: epoch is NULL here; flag the still-current epoch */
+		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+						&mdev->current_epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		if (issue_flush) {
+			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+			if (rv == FE_recycled)
+				return TRUE;
+		}
 
-	if (rv == 0 && mdev->state.conn >= Connected)
-		rv = drbd_send_b_ack(mdev, p->barrier, epoch_size);
-	else
-		rv = 0;
-	dec_unacked(mdev);
+		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
 
-	return rv;
+		return TRUE;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&mdev->epoch_lock);
+	if (atomic_read(&mdev->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &mdev->current_epoch->list);
+		mdev->current_epoch = epoch;
+		mdev->epochs++;
+		MTRACE(TraceTypeEpochs, TraceLvlMetrics,
+		       INFO("Allocat epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs);
+			);
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	return TRUE;
 }
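The tail of receive_Barrier() is the classic optimistic-allocation pattern:
kmalloc outside the spinlock, re-check the condition under the lock, and
throw the object away if another path recycled the current epoch meanwhile.
A generic hedged sketch of the pattern, with a pthread mutex standing in
for the spinlock (all names invented for the example; link with -lpthread):

	#include <stdio.h>
	#include <stdlib.h>
	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static int cur_epoch_size;	/* current_epoch->epoch_size */

	static void new_epoch_after_barrier(void)
	{
		int *epoch = malloc(sizeof(*epoch)); /* no lock held */
		if (!epoch)
			return;	/* the driver falls back to flush + drain */

		pthread_mutex_lock(&lock);
		if (cur_epoch_size != 0) {
			/* still non-empty: install the new epoch */
			cur_epoch_size = 0;
			printf("new epoch installed\n");
		} else {
			/* lost the race: current epoch was recycled */
			free(epoch);
			epoch = NULL;
			printf("recycled, dropping allocation\n");
		}
		pthread_mutex_unlock(&lock);
		free(epoch);	/* model only; the driver keeps it linked */
	}

	int main(void)
	{
		cur_epoch_size = 3;
		new_epoch_after_barrier();
		return 0;
	}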
 
 /* used from receive_RSDataReply (recv_resync_read)
@@ -936,7 +1359,7 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
 	if (dgs) {
 		rr = drbd_recv(mdev, dig_in, dgs);
 		if (rr != dgs) {
-			DRBD_WARN("short read receiving data digest: read %d expected %d\n",
+			drbd_WARN("short read receiving data digest: read %d expected %d\n",
 			     rr, dgs);
 			return NULL;
 		}
@@ -956,9 +1379,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
 		page = bvec->bv_page;
 		rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
 		kunmap(page);
-		if ( rr != min_t(int, ds, PAGE_SIZE) ) {
+		if (rr != min_t(int, ds, PAGE_SIZE)) {
 			drbd_free_ee(mdev, e);
-			DRBD_WARN("short read receiving data: read %d expected %d\n",
+			drbd_WARN("short read receiving data: read %d expected %d\n",
 			     rr, min_t(int, ds, PAGE_SIZE));
 			return NULL;
 		}
@@ -993,9 +1416,9 @@ STATIC int drbd_drain_block(struct drbd_conf *mdev, int data_size)
 	data = kmap(page);
 	while (data_size) {
 		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
-		if ( rr != min_t(int, data_size, PAGE_SIZE) ) {
+		if (rr != min_t(int, data_size, PAGE_SIZE)) {
 			rv = 0;
-			DRBD_WARN("short read receiving data: read %d expected %d\n",
+			drbd_WARN("short read receiving data: read %d expected %d\n",
 			     rr, min_t(int, data_size, PAGE_SIZE));
 			break;
 		}
@@ -1031,7 +1454,7 @@ STATIC int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
 	if (dgs) {
 		rr = drbd_recv(mdev, dig_in, dgs);
 		if (rr != dgs) {
-			DRBD_WARN("short read receiving data reply digest: read %d expected %d\n",
+			drbd_WARN("short read receiving data reply digest: read %d expected %d\n",
 			     rr, dgs);
 			return 0;
 		}
@@ -1040,7 +1463,7 @@ STATIC int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
 	data_size -= dgs;
 
 	bio = req->master_bio;
-	D_ASSERT( sector == bio->bi_sector );
+	D_ASSERT(sector == bio->bi_sector);
 
 	bio_for_each_segment(bvec, bio, i) {
 		expect = min_t(int, data_size, bvec->bv_len);
@@ -1049,7 +1472,7 @@ STATIC int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
 			     expect);
 		kunmap(bvec->bv_page);
 		if (rr != expect) {
-			DRBD_WARN("short read receiving data reply: "
+			drbd_WARN("short read receiving data reply: "
 			     "read %d expected %d\n",
 			     rr, expect);
 			return 0;
@@ -1080,7 +1503,7 @@ STATIC int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
 
 	D_ASSERT(hlist_unhashed(&e->colision));
 
-	if (likely( drbd_bio_uptodate(e->private_bio) )) {
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
 		drbd_set_in_sync(mdev, sector, e->size);
 		ok = drbd_send_ack(mdev, RSWriteAck, e);
 	} else {
@@ -1142,9 +1565,6 @@ STATIC int receive_DataReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 	header_size = sizeof(*p) - sizeof(*h);
 	data_size   = h->length  - header_size;
 
-	/* I expect a block to be a multiple of 512 byte,
-	 * and no more than DRBD_MAX_SEGMENT_SIZE.
-	 * is this too restrictive?  */
 	ERR_IF(data_size == 0) return FALSE;
 
 	if (drbd_recv(mdev, h->payload, header_size) != header_size)
@@ -1184,6 +1604,8 @@ STATIC int receive_RSDataReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 	header_size = sizeof(*p) - sizeof(*h);
 	data_size   = h->length  - header_size;
 
+	ERR_IF(data_size == 0) return FALSE;
+
 	if (drbd_recv(mdev, h->payload, header_size) != header_size)
 		return FALSE;
 
@@ -1214,17 +1636,20 @@ STATIC int receive_RSDataReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 
 /* e_end_block() is called via drbd_process_done_ee().
  * this means this function only runs in the asender thread
- *
- * for a broken example implementation of the TCQ barrier version of
- * e_end_block see older revisions...
  */
 STATIC int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	struct Tl_epoch_entry *e = (struct Tl_epoch_entry *)w;
 	sector_t sector = e->sector;
-	/* unsigned int epoch_size; */
+	struct drbd_epoch *epoch;
 	int ok = 1, pcmd;
 
+	if (e->flags & EE_IS_BARRIER) {
+		epoch = previous_epoch(mdev, e->epoch);
+		if (epoch)
+			drbd_may_finish_epoch(mdev, epoch, EV_barrier_done);
+	}
+
 	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
 		if (likely(drbd_bio_uptodate(e->private_bio))) {
 			pcmd = (mdev->state.conn >= SyncSource &&
@@ -1263,6 +1688,8 @@ STATIC int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 		D_ASSERT(hlist_unhashed(&e->colision));
 	}
 
+	drbd_may_finish_epoch(mdev, e->epoch, EV_put);
+
 	return ok;
 }
 
@@ -1310,6 +1737,8 @@ STATIC int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
 static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
 {
 	DEFINE_WAIT(wait);
+	unsigned int p_seq;
+	long timeout;
 	int ret = 0;
 	spin_lock(&mdev->peer_seq_lock);
 	for (;;) {
@@ -1320,9 +1749,15 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
 			ret = -ERESTARTSYS;
 			break;
 		}
+		p_seq = mdev->peer_seq;
 		spin_unlock(&mdev->peer_seq_lock);
-		schedule();
+		timeout = schedule_timeout(30*HZ);
 		spin_lock(&mdev->peer_seq_lock);
+		if (timeout == 0 && p_seq == mdev->peer_seq) {
+			ret = -ETIMEDOUT;
+			ERR("ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+			break;
+		}
 	}
 	finish_wait(&mdev->seq_wait, &wait);
 	if (mdev->peer_seq+1 == packet_seq)
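The schedule_timeout() change above turns an unbounded sleep into a stall
detector: snapshot peer_seq, sleep at most 30 s, and give up only if the
timer expired and the sequence number never moved. The same idea in a
hedged condvar-based userspace model (3 s instead of 30, link with
-lpthread; all names are illustrative):

	#include <stdio.h>
	#include <pthread.h>
	#include <time.h>
	#include <errno.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static unsigned int peer_seq;

	/* wait for peer_seq >= want; -ETIMEDOUT if it stalls */
	static int wait_peer_seq(unsigned int want)
	{
		int ret = 0;
		pthread_mutex_lock(&lock);
		while (peer_seq < want) {
			unsigned int snap = peer_seq;	/* snapshot */
			struct timespec ts;
			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += 3;
			int rc = pthread_cond_timedwait(&cond, &lock, &ts);
			if (rc == ETIMEDOUT && peer_seq == snap) {
				ret = -ETIMEDOUT;	/* no progress */
				break;
			}
		}
		pthread_mutex_unlock(&lock);
		return ret;
	}

	int main(void)
	{
		printf("%d\n", wait_peer_seq(1)); /* nobody signals */
		return 0;
	}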
@@ -1339,8 +1774,6 @@ STATIC int receive_Data(struct drbd_conf *mdev, struct Drbd_Header *h)
 	struct Drbd_Data_Packet *p = (struct Drbd_Data_Packet *)h;
 	int header_size, data_size;
 	int rw = WRITE;
-	unsigned int barrier_nr = 0;
-	unsigned int epoch_size = 0;
 	u32 dp_flags;
 
 	/* FIXME merge this code dups into some helper function */
@@ -1365,7 +1798,7 @@ STATIC int receive_Data(struct drbd_conf *mdev, struct Drbd_Header *h)
 		spin_unlock(&mdev->peer_seq_lock);
 
 		drbd_send_ack_dp(mdev, NegAck, p);
-		mdev->epoch_size++; /* spin lock ? */
+		atomic_inc(&mdev->current_epoch->epoch_size);
 		return drbd_drain_block(mdev, data_size);
 	}
 
@@ -1377,9 +1810,44 @@ STATIC int receive_Data(struct drbd_conf *mdev, struct Drbd_Header *h)
 	}
 
 	e->private_bio->bi_end_io = drbd_endio_write_sec;
-	e->private_bio->bi_rw = WRITE;
 	e->w.cb = e_end_block;
 
+	spin_lock(&mdev->epoch_lock);
+	e->epoch = mdev->current_epoch;
+	atomic_inc(&e->epoch->epoch_size);
+	atomic_inc(&e->epoch->active);
+
+	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
+		struct drbd_epoch *epoch;
+		/* Issue a barrier if we start a new epoch, and the previous epoch
+		   was not an epoch containing a single request which already was
+		   a barrier. */
+		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
+		if (epoch == e->epoch) {
+			MTRACE(TraceTypeEpochs, TraceLvlMetrics,
+			       INFO("Add barrier   %p/%d\n",
+				    epoch, epoch->barrier_nr);
+				);
+			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+			rw |= (1<<BIO_RW_BARRIER);
+			e->flags |= EE_IS_BARRIER;
+		} else {
+			if (atomic_read(&epoch->epoch_size) > 1 ||
+			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+				MTRACE(TraceTypeEpochs, TraceLvlMetrics,
+				       INFO("Add barrier   %p/%d, setting bi in %p/%d\n",
+					    e->epoch, e->epoch->barrier_nr,
+					    epoch, epoch->barrier_nr);
+					);
+				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+				rw |= (1<<BIO_RW_BARRIER);
+				e->flags |= EE_IS_BARRIER;
+			}
+		}
+	}
+	spin_unlock(&mdev->epoch_lock);
+
 	dp_flags = be32_to_cpu(p->dp_flags);
 	if (dp_flags & DP_HARDBARRIER)
 		rw |= (1<<BIO_RW_BARRIER);
@@ -1479,7 +1947,6 @@ STATIC int receive_Data(struct drbd_conf *mdev, struct Drbd_Header *h)
 				ALERT("Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
 				     (unsigned long long)sector);
 				inc_unacked(mdev);
-				mdev->epoch_size++;
 				e->w.cb = e_send_discard_ack;
 				list_add_tail(&e->w.list, &mdev->done_ee);
 
@@ -1519,74 +1986,9 @@ STATIC int receive_Data(struct drbd_conf *mdev, struct Drbd_Header *h)
 		finish_wait(&mdev->misc_wait, &wait);
 	}
 
-	/* when using TCQ:
-	 * note that, when using tagged command queuing, we may
-	 * have more than one reorder domain "active" at a time.
-	 *
-	 * THINK:
-	 * do we have any guarantees that we get the completion
-	 * events of the different reorder domains in order?
-	 * or does the api only "guarantee" that the events
-	 * _happened_ in order, but eventually the completion
-	 * callbacks are shuffeled again?
-	 *
-	 * note that I wonder about the order in which the
-	 * callbacks are run, I am reasonable confident that the
-	 * actual completion happens in order.
-	 *
-	 * - can it happen that the tagged write completion is
-	 *   called even though not all of the writes before it
-	 *   have run their completion callback?
-	 * - can it happen that some completion callback of some
-	 *   write after the tagged one is run, even though the
-	 *   callback of the tagged one itself is still pending?
-	 *
-	 * if this can happen, we either need to drop our "debug
-	 * assertion" about the epoch size and just trust our code
-	 * and the layers below us (nah, won't do that).
-	 *
-	 * or we need to replace the "active_ee" list by some sort
-	 * of "transfer log" on the receiving side, too, which
-	 * uses epoch counters per reorder domain.
-	 */
-
-	/* when using tcq:
-	 * if we got a barrier packet before, but at that time the active_ee
-	 * was not yet empty, we just "remembered" this barrier request.
-	 *
-	 * if this is the first data packet since that barrier, maybe meanwhile
-	 * all previously active writes have been completed?
-	 * if so, send the b_ack right now
-	 * (though, maybe rather move it into the e_end_block callback,
-	 * where it would be sent as soon as possible).
-	 *
-	 * otherwise, tag the write with the barrier number, so it
-	 * will trigger the b_ack before its own ack.
-	 */
-	if (mdev->next_barrier_nr) {
-		/* only when using TCQ */
-		if (list_empty(&mdev->active_ee)) {
-			barrier_nr = mdev->next_barrier_nr;
-			epoch_size = mdev->epoch_size;
-			mdev->epoch_size = 0;
-		} else {
-			e->barrier_nr = mdev->next_barrier_nr;
-		}
-		rw |= (1<<BIO_RW_BARRIER);
-		mdev->next_barrier_nr = 0;
-	}
 	list_add(&e->w.list, &mdev->active_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
-	if (barrier_nr) {
-		/* only when using TCQ
-		 * maybe rather move it into the e_end_block callback,
-		 * where it would be sent as soon as possible).
-		 */
-		(void)drbd_send_b_ack(mdev,
-					cpu_to_be32(barrier_nr), epoch_size);
-	}
-
 	switch (mdev->net_conf->wire_protocol) {
 	case DRBD_PROT_C:
 		inc_unacked(mdev);
@@ -1655,7 +2057,7 @@ STATIC int receive_DataRequest(struct drbd_conf *mdev, struct Drbd_Header *h)
 				(unsigned long long)sector, size);
 		return FALSE;
 	}
-	if ( sector + (size>>9) > capacity) {
+	if (sector + (size>>9) > capacity) {
 		ERR("%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
 				(unsigned long long)sector, size);
 		return FALSE;
@@ -1705,10 +2107,11 @@ STATIC int receive_DataRequest(struct drbd_conf *mdev, struct Drbd_Header *h)
 		break;
 
 	case OVReply:
+	case CsumRSRequest:
 		fault_type = DRBD_FAULT_RS_RD;
 		digest_size = h->length - brps ;
 		di = kmalloc(sizeof(*di) + digest_size ,GFP_KERNEL);
-		if(!di) {
+		if (!di) {
 			dec_local(mdev);
 			drbd_free_ee(mdev,e);
 			return 0;
@@ -1725,8 +2128,23 @@ STATIC int receive_DataRequest(struct drbd_conf *mdev, struct Drbd_Header *h)
 		}
 
 		e->block_id = (u64)(unsigned long)di;
-		e->w.cb = w_e_end_ov_reply;
-		dec_rs_pending(mdev);
+		if (h->command == CsumRSRequest) {
+			D_ASSERT(mdev->agreed_pro_version >= 89);
+			e->w.cb = w_e_end_csum_rs_req;
+		} else if (h->command == OVReply) {
+			e->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(mdev);
+			break;
+		}
+
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted, probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			drbd_free_ee(mdev, e);
+			kfree(di);
+			dec_local(mdev);
+			return FALSE;
+		}
 		break;
 
 	case OVRequest:
@@ -1748,7 +2166,7 @@ STATIC int receive_DataRequest(struct drbd_conf *mdev, struct Drbd_Header *h)
 		break;
 
 
-	default:; /* avoid compiler warning */
+	default:
 		ERR("unexpected command (%s) in receive_DataRequest\n",
 		    cmdname(h->command));
 		fault_type = DRBD_FAULT_MAX;
@@ -1783,7 +2201,7 @@ STATIC int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
 	ch_peer = mdev->p_uuid[UUID_SIZE];
 	ch_self = mdev->comm_bm_set;
 
-	switch ( mdev->net_conf->after_sb_0p ) {
+	switch (mdev->net_conf->after_sb_0p) {
 	case Consensus:
 	case DiscardSecondary:
 	case CallHelper:
@@ -1799,7 +2217,7 @@ STATIC int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
 		if (self == 0 && peer == 1) { rv =  1; break; }
 		if (self == 1 && peer == 0) { rv = -1; break; }
 		/* Else fall through to one of the other strategies... */
-		DRBD_WARN("Discard younger/older primary did not found a decision\n"
+		drbd_WARN("Discard younger/older primary could not find a decision\n"
 		     "Using discard-least-changes instead\n");
 	case DiscardZeroChg:
 		if (ch_peer == 0 && ch_self == 0) {
@@ -1813,7 +2231,7 @@ STATIC int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
 		if (mdev->net_conf->after_sb_0p == DiscardZeroChg)
 			break;
 	case DiscardLeastChg:
-		if	( ch_self < ch_peer )
+		if	(ch_self < ch_peer)
 			rv = -1;
 		else if (ch_self > ch_peer)
 			rv =  1;
@@ -1839,7 +2257,7 @@ STATIC int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
 	self = mdev->bc->md.uuid[Bitmap] & 1;
 	peer = mdev->p_uuid[Bitmap] & 1;
 
-	switch ( mdev->net_conf->after_sb_1p ) {
+	switch (mdev->net_conf->after_sb_1p) {
 	case DiscardYoungerPri:
 	case DiscardOlderPri:
 	case DiscardLeastChg:
@@ -1868,7 +2286,7 @@ STATIC int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
 			if (self != SS_Success) {
 				drbd_khelper(mdev, "pri-lost-after-sb");
 			} else {
-				DRBD_WARN("Sucessfully gave up primary role.\n");
+				drbd_WARN("Successfully gave up primary role.\n");
 				rv = hg;
 			}
 		} else
@@ -1885,7 +2303,7 @@ STATIC int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
 	self = mdev->bc->md.uuid[Bitmap] & 1;
 	peer = mdev->p_uuid[Bitmap] & 1;
 
-	switch ( mdev->net_conf->after_sb_2p ) {
+	switch (mdev->net_conf->after_sb_2p) {
 	case DiscardYoungerPri:
 	case DiscardOlderPri:
 	case DiscardLeastChg:
@@ -1907,7 +2325,7 @@ STATIC int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
 			if (self != SS_Success) {
 				drbd_khelper(mdev, "pri-lost-after-sb");
 			} else {
-				DRBD_WARN("Sucessfully gave up primary role.\n");
+				drbd_WARN("Successfully gave up primary role.\n");
 				rv = hg;
 			}
 		} else
@@ -1919,12 +2337,16 @@ STATIC int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
 
 STATIC void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid)
 {
+	if (!uuid) {
+		INFO("%s uuid info vanished while I was looking!\n", text);
+		return;
+	}
 	INFO("%s %016llX:%016llX:%016llX:%016llX\n",
 	     text,
-	     uuid[Current],
-	     uuid[Bitmap],
-	     uuid[History_start],
-	     uuid[History_end]);
+	     (unsigned long long)uuid[Current],
+	     (unsigned long long)uuid[Bitmap],
+	     (unsigned long long)uuid[History_start],
+	     (unsigned long long)uuid[History_end]);
 }
 
 /*
@@ -1950,19 +2372,19 @@ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 	    peer == UUID_JUST_CREATED) return 0;
 
 	*rule_nr = 2;
-	if ( (self == UUID_JUST_CREATED || self == (u64)0) &&
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
 	     peer != UUID_JUST_CREATED) return -2;
 
 	*rule_nr = 3;
 	if ( self != UUID_JUST_CREATED &&
-	     (peer == UUID_JUST_CREATED || peer == (u64)0) ) return 2;
+	    (peer == UUID_JUST_CREATED || peer == (u64)0)) return 2;
 
 	*rule_nr = 4;
 	if (self == peer) { /* Common power [off|failure] */
 		int rct, dc; /* roles at crash time */
 
 		rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
-			( mdev->p_uuid[UUID_FLAGS] & 2 );
+			(mdev->p_uuid[UUID_FLAGS] & 2);
 		/* lowest bit is set when we were primary,
 		 * next bit (weight 2) is set when peer was primary */
 
@@ -1985,7 +2407,7 @@ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 		return -1;
 
 	*rule_nr = 6;
-	for ( i = History_start ; i <= History_end ; i++ ) {
+	for (i = History_start; i <= History_end; i++) {
 		peer = mdev->p_uuid[i] & ~((u64)1);
 		if (self == peer)
 			return -2;
@@ -1998,7 +2420,7 @@ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 		return 1;
 
 	*rule_nr = 8;
-	for ( i = History_start ; i <= History_end ; i++ ) {
+	for (i = History_start; i <= History_end; i++) {
 		self = mdev->bc->md.uuid[i] & ~((u64)1);
 		if (self == peer)
 			return 2;
@@ -2007,12 +2429,13 @@ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 	*rule_nr = 9;
 	self = mdev->bc->md.uuid[Bitmap] & ~((u64)1);
 	peer = mdev->p_uuid[Bitmap] & ~((u64)1);
-	if (self == peer && self != ((u64)0) ) return 100;
+	if (self == peer && self != ((u64)0))
+		return 100;
 
 	*rule_nr = 10;
-	for ( i = History_start ; i <= History_end ; i++ ) {
+	for (i = History_start; i <= History_end; i++) {
 		self = mdev->p_uuid[i] & ~((u64)1);
-		for ( j = History_start ; j <= History_end ; j++ ) {
+		for (j = History_start; j <= History_end; j++) {
 			peer = mdev->p_uuid[j] & ~((u64)1);
 			if (self == peer)
 				return -100;
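Since drbd_uuid_compare()'s return convention is spread over several hunks,
a reading aid may help: the sign says which side becomes sync source, the
magnitude says how much work is needed. The mapping below is inferred from
the checks above and from drbd_sync_handshake() further down; it is a
summary, not driver code:

	#include <stdio.h>
	#include <stdlib.h>

	static const char *action(int hg)
	{
		if (hg == -1000)
			return "unrelated data, disconnect";
		if (abs(hg) >= 100)
			return "split brain, consult after-sb policy";
		if (hg == 0)
			return "in sync";
		if (hg > 0)
			return abs(hg) == 2 ? "full sync, we are source"
					    : "resync, we are source";
		return abs(hg) == 2 ? "full sync, we are target"
				    : "resync, we are target";
	}

	int main(void)
	{
		int samples[] = { 0, 1, -2, 100, -1000 };
		for (int i = 0; i < 5; i++)
			printf("hg=%5d -> %s\n", samples[i], action(samples[i]));
		return 0;
	}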
@@ -2025,7 +2448,7 @@ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 /* drbd_sync_handshake() returns the new conn state on success, or
    conn_mask (-1) on failure.
  */
-STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, 
+STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
 					   enum drbd_disk_state peer_disk) __must_hold(local)
 {
 	int hg, rule_nr;
@@ -2038,20 +2461,18 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 
 	hg = drbd_uuid_compare(mdev, &rule_nr);
 
-	MTRACE(TraceTypeUuid, TraceLvlSummary,
-	       INFO("drbd_sync_handshake:\n");
-	       drbd_uuid_dump(mdev, "self", mdev->bc->md.uuid);
-	       drbd_uuid_dump(mdev, "peer", mdev->p_uuid);
-	       INFO("uuid_compare()=%d by rule %d\n", hg, rule_nr);
-	    );
+	INFO("drbd_sync_handshake:\n");
+	drbd_uuid_dump(mdev, "self", mdev->bc->md.uuid);
+	drbd_uuid_dump(mdev, "peer", mdev->p_uuid);
+	INFO("uuid_compare()=%d by rule %d\n", hg, rule_nr);
 
 	if (hg == -1000) {
 		ALERT("Unrelated data, aborting!\n");
 		return conn_mask;
 	}
 
-	if ( (mydisk == Inconsistent && peer_disk > Inconsistent) ||
-	    (peer_disk == Inconsistent && mydisk > Inconsistent) )  {
+	if ((mydisk == Inconsistent && peer_disk > Inconsistent) ||
+	    (peer_disk == Inconsistent && mydisk > Inconsistent)) {
 		int f = (hg == -100) || abs(hg) == 2;
 		hg = mydisk > Inconsistent ? 1 : -1;
 		if (f)
@@ -2060,7 +2481,7 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 		     hg > 0 ? "source" : "target");
 	}
 
-	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp) ) {
+	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
 		int pcount = (mdev->state.role == Primary)
 			   + (peer_role == Primary);
 		int forced = (hg == -100);
@@ -2076,12 +2497,12 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 			hg = drbd_asb_recover_2p(mdev);
 			break;
 		}
-		if ( abs(hg) < 100 ) {
-			DRBD_WARN("Split-Brain detected, %d primaries, "
+		if (abs(hg) < 100) {
+			drbd_WARN("Split-Brain detected, %d primaries, "
 			     "automatically solved. Sync from %s node\n",
-			     pcount, (hg < 0) ? "peer":"this");
+			     pcount, (hg < 0) ? "peer" : "this");
 			if (forced) {
-				DRBD_WARN("Doing a full sync, since"
+				drbd_WARN("Doing a full sync, since"
 				     " UUIDs where ambiguous.\n");
 				drbd_uuid_dump(mdev, "self", mdev->bc->md.uuid);
 				drbd_uuid_dump(mdev, "peer", mdev->p_uuid);
@@ -2096,10 +2517,10 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 		if (!mdev->net_conf->want_lose && (mdev->p_uuid[UUID_FLAGS]&1))
 			hg = 1;
 
-		if ( abs(hg) < 100 )
-			DRBD_WARN("Split-Brain detected, manually solved. "
+		if (abs(hg) < 100)
+			drbd_WARN("Split-Brain detected, manually solved. "
 			     "Sync from %s node\n",
-			     (hg < 0) ? "peer":"this");
+			     (hg < 0) ? "peer" : "this");
 	}
 
 	if (hg == -100) {
@@ -2116,7 +2537,7 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 	}
 
 	if (hg < 0 && /* by intention we do not use mydisk here. */
-	    mdev->state.role == Primary && mdev->state.disk >= Consistent ) {
+	    mdev->state.role == Primary && mdev->state.disk >= Consistent) {
 		switch (mdev->net_conf->rr_conflict) {
 		case CallHelper:
 			drbd_khelper(mdev, "pri-lost");
@@ -2125,14 +2546,14 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 			ERR("I shall become SyncTarget, but I am primary!\n");
 			return conn_mask;
 		case Violently:
-			DRBD_WARN("Becoming SyncTarget, violating the stable-data"
+			drbd_WARN("Becoming SyncTarget, violating the stable-data"
 			     "assumption\n");
 		}
 	}
 
 	if (abs(hg) >= 2) {
 		INFO("Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
-		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write))
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
 			return conn_mask;
 	}
 
@@ -2157,12 +2578,14 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 STATIC int cmp_after_sb(enum after_sb_handler peer, enum after_sb_handler self)
 {
 	/* DiscardRemote - DiscardLocal is valid */
-	if ( (peer == DiscardRemote && self == DiscardLocal) ||
-	    (self == DiscardRemote && peer == DiscardLocal) ) return 0;
+	if ((peer == DiscardRemote && self == DiscardLocal) ||
+	    (self == DiscardRemote && peer == DiscardLocal))
+		return 0;
 
 	/* any other things with DiscardRemote or DiscardLocal are invalid */
-	if ( peer == DiscardRemote || peer == DiscardLocal ||
-	    self == DiscardRemote || self == DiscardLocal ) return 1;
+	if (peer == DiscardRemote || peer == DiscardLocal ||
+	    self == DiscardRemote || self == DiscardLocal)
+		return 1;
 
 	/* everything else is valid if they are equal on both sides. */
 	if (peer == self)
@@ -2178,7 +2601,7 @@ STATIC int receive_protocol(struct drbd_conf *mdev, struct Drbd_Header *h)
 	int header_size, data_size;
 	int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
 	int p_want_lose, p_two_primaries;
-	char p_integrity_alg[SHARED_SECRET_MAX];
+	char p_integrity_alg[SHARED_SECRET_MAX] = "";
 
 	header_size = sizeof(*p) - sizeof(*h);
 	data_size   = h->length  - header_size;
@@ -2198,17 +2621,17 @@ STATIC int receive_protocol(struct drbd_conf *mdev, struct Drbd_Header *h)
 		goto disconnect;
 	}
 
-	if ( cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p) ) {
+	if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
 		ERR("incompatible after-sb-0pri settings\n");
 		goto disconnect;
 	}
 
-	if ( cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p) ) {
+	if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
 		ERR("incompatible after-sb-1pri settings\n");
 		goto disconnect;
 	}
 
-	if ( cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p) ) {
+	if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
 		ERR("incompatible after-sb-2pri settings\n");
 		goto disconnect;
 	}
@@ -2245,60 +2668,145 @@ disconnect:
 	return FALSE;
 }
 
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ *         ERR_PTR(error) if something goes wrong
+ *         or the crypto hash ptr, if it worked out ok. */
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+		const char *alg, const char *name)
+{
+	struct crypto_hash *tfm;
+
+	if (!alg[0])
+		return NULL;
+
+	tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		ERR("Can not allocate \"%s\" as %s (reason: %ld)\n",
+				alg, name, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) != CRYPTO_ALG_TYPE_DIGEST) {
+		crypto_free_hash(tfm);
+		ERR("\"%s\" is not a digest (%s)\n", alg, name);
+		return ERR_PTR(-EINVAL);
+	}
+	return tfm;
+}
+
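Callers must respect the helper's three-way contract: NULL means the
feature was not requested, an ERR_PTR means it was requested but unusable,
and anything else is a usable tfm, so both the IS_ERR() and the NULL check
are needed. A hedged userspace model of that contract (the ERR_PTR macros
are simplified stand-ins for the kernel's):

	#include <stdio.h>
	#include <errno.h>

	#define ERR_PTR(err)	((void *)(long)(err))
	#define IS_ERR(p)	((unsigned long)(p) >= (unsigned long)-4095)

	static void *alloc_digest_safe(const char *alg)
	{
		if (!alg[0])
			return NULL;			/* not requested */
		if (alg[0] == '!')			/* fake failure */
			return ERR_PTR(-EINVAL);
		return (void *)alg;			/* "live" handle */
	}

	int main(void)
	{
		const char *names[] = { "", "!bogus", "sha1" };
		for (int i = 0; i < 3; i++) {
			void *tfm = alloc_digest_safe(names[i]);
			if (IS_ERR(tfm))
				printf("'%s' -> error, disconnect\n", names[i]);
			else if (!tfm)
				printf("'%s' -> feature stays off\n", names[i]);
			else
				printf("'%s' -> installed\n", names[i]);
		}
		return 0;
	}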
 STATIC int receive_SyncParam(struct drbd_conf *mdev, struct Drbd_Header *h)
 {
 	int ok = TRUE;
-	struct Drbd_SyncParam_Packet *p = (struct Drbd_SyncParam_Packet *)h;
-	int header_size, data_size;
-	char p_verify_alg[SHARED_SECRET_MAX];
-	struct crypto_hash *verify_tfm = NULL, *old_verify_tfm;
+	struct Drbd_SyncParam89_Packet *p = (struct Drbd_SyncParam89_Packet *)h;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	const int apv = mdev->agreed_pro_version;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct Drbd_SyncParam_Packet)
+		    : apv == 88 ? sizeof(struct Drbd_SyncParam_Packet)
+					+ SHARED_SECRET_MAX
+		    : /* 89 */    sizeof(struct Drbd_SyncParam89_Packet);
+
+	if (h->length > exp_max_sz) {
+		ERR("SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    h->length, exp_max_sz);
+		return FALSE;
+	}
 
-	header_size = sizeof(*p) - sizeof(*h);
-	data_size   = h->length  - header_size;
+	if (apv <= 88) {
+		header_size = sizeof(struct Drbd_SyncParam_Packet) - sizeof(*h);
+		data_size   = h->length  - header_size;
+	} else /* apv >= 89 */ {
+		header_size = sizeof(struct Drbd_SyncParam89_Packet) - sizeof(*h);
+		data_size   = h->length  - header_size;
+		D_ASSERT(data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
 
 	if (drbd_recv(mdev, h->payload, header_size) != header_size)
 		return FALSE;
 
 	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
 
-	if (mdev->agreed_pro_version >= 88) {
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX) {
+				ERR("verify-alg too long, "
+				    "peer wants %u, accepting only %u bytes\n",
+						data_size, SHARED_SECRET_MAX);
+				return FALSE;
+			}
 
-		if (drbd_recv(mdev, p_verify_alg, data_size) != data_size)
-			return FALSE;
+			if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
+				return FALSE;
 
-		p_verify_alg[SHARED_SECRET_MAX-1] = 0;
-		if (strcpy(mdev->sync_conf.verify_alg, p_verify_alg)) {
-			if (strlen(p_verify_alg)) {
-				verify_tfm = crypto_alloc_hash(p_verify_alg, 0,
-							       CRYPTO_ALG_ASYNC);
-				if (IS_ERR(verify_tfm)) {
-					ERR("Can not allocate \"%s\" as verify-alg\n",
-					    p_verify_alg);
-					return FALSE;
-				}
+			/* we expect a NUL-terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL-terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
 
-				if (crypto_tfm_alg_type(crypto_hash_tfm(verify_tfm)) !=
-				    CRYPTO_ALG_TYPE_DIGEST) {
-					crypto_free_hash(verify_tfm);
-					ERR("\"%s\" is not a digest (verify-alg)\n",
-					    p_verify_alg);
-					return FALSE;
-				}
+		if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+			if (mdev->state.conn == WFReportParams) {
+				ERR("Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.verify_alg, p->verify_alg);
+				goto disconnect;
 			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm))
+				goto disconnect;
+		}
 
-			spin_lock(&mdev->peer_seq_lock);
-			/* lock against drbd_nl_syncer_conf() */
-			strcpy(mdev->sync_conf.verify_alg, p_verify_alg);
-			mdev->sync_conf.verify_alg_len = strlen(p_verify_alg) + 1;
-			old_verify_tfm = mdev->verify_tfm;
-			mdev->verify_tfm = verify_tfm;
-			spin_unlock(&mdev->peer_seq_lock);
+		if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+			if (mdev->state.conn == WFReportParams) {
+				ERR("Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm))
+				goto disconnect;
+		}
 
-			crypto_free_hash(old_verify_tfm);
+
+		spin_lock(&mdev->peer_seq_lock);
+		/* lock against drbd_nl_syncer_conf() */
+		if (verify_tfm) {
+			strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
+			mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
+			crypto_free_hash(mdev->verify_tfm);
+			mdev->verify_tfm = verify_tfm;
+			INFO("using verify-alg: \"%s\"\n", p->verify_alg);
+		}
+		if (csums_tfm) {
+			strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
+			mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
+			crypto_free_hash(mdev->csums_tfm);
+			mdev->csums_tfm = csums_tfm;
+			INFO("using csums-alg: \"%s\"\n", p->csums_alg);
 		}
+		spin_unlock(&mdev->peer_seq_lock);
 	}
 
 	return ok;
+disconnect:
+	crypto_free_hash(verify_tfm);
+	drbd_force_state(mdev, NS(conn, Disconnecting));
+	return FALSE;
 }
 
 STATIC void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
@@ -2315,8 +2823,8 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
 	if (a == 0 || b == 0)
 		return;
 	d = (a > b) ? (a - b) : (b - a);
-	if ( d > (a>>3) || d > (b>>3))
-		DRBD_WARN("Considerable difference in %s: %llus vs. %llus\n", s,
+	if (d > (a>>3) || d > (b>>3))
+		drbd_WARN("Considerable difference in %s: %llus vs. %llus\n", s,
 		     (unsigned long long)a, (unsigned long long)b);
 }
 
@@ -2349,7 +2857,7 @@ STATIC int receive_sizes(struct drbd_conf *mdev, struct Drbd_Header *h)
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 	if (inc_local(mdev)) {
 		warn_if_differ_considerably(mdev, "lower level device sizes",
-			   p_size, drbd_get_capacity(mdev->bc->backing_bdev));
+			   p_size, drbd_get_max_capacity(mdev->bc));
 		warn_if_differ_considerably(mdev, "user requested size",
 					    p_usize, mdev->bc->dc.disk_size);
 
@@ -2372,7 +2880,7 @@ STATIC int receive_sizes(struct drbd_conf *mdev, struct Drbd_Header *h)
 		if (drbd_new_dev_size(mdev, mdev->bc) <
 		   drbd_get_capacity(mdev->this_bdev) &&
 		   mdev->state.disk >= Outdated &&
-		   mdev->state.conn < Connected ) {
+		   mdev->state.conn < Connected) {
 			ERR("The peer's disk size is too small!\n");
 			drbd_force_state(mdev, NS(conn, Disconnecting));
 			mdev->bc->dc.disk_size = my_usize;
@@ -2386,7 +2894,8 @@ STATIC int receive_sizes(struct drbd_conf *mdev, struct Drbd_Header *h)
 	if (inc_local(mdev)) {
 		dd = drbd_determin_dev_size(mdev);
 		dec_local(mdev);
-		if (dd == dev_size_error) return FALSE;
+		if (dd == dev_size_error)
+			return FALSE;
 		drbd_md_sync(mdev);
 	} else {
 		/* I am diskless, need to accept the peer's size. */
@@ -2424,18 +2933,18 @@ STATIC int receive_sizes(struct drbd_conf *mdev, struct Drbd_Header *h)
 	}
 
 	if (mdev->state.conn > WFReportParams) {
-		if ( be64_to_cpu(p->c_size) !=
-		    drbd_get_capacity(mdev->this_bdev) || ldsc ) {
+		if (be64_to_cpu(p->c_size) !=
+		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
 			/* we have different sizes, probabely peer
 			 * needs to know my new size... */
 			drbd_send_sizes(mdev);
 		}
-		if (dd == grew && mdev->state.conn == Connected &&
-		    mdev->state.pdsk >= Inconsistent &&
-		    mdev->state.disk >= Inconsistent) {
-			/* With disk >= Inconsistent we take care to not get
-			   here during an attach while we are connected. */
-			resync_after_online_grow(mdev);
+		if (dd == grew && mdev->state.conn == Connected) {
+			if (mdev->state.pdsk >= Inconsistent &&
+			    mdev->state.disk >= Inconsistent)
+				resync_after_online_grow(mdev);
+			else
+				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
 		}
 	}
 
@@ -2461,15 +2970,20 @@ STATIC int receive_uuids(struct drbd_conf *mdev, struct Drbd_Header *h)
 	mdev->p_uuid = p_uuid;
 
 	if (mdev->state.conn < Connected &&
-	    mdev->state.disk < Outdated &&
+	    mdev->state.disk < Inconsistent &&
 	    mdev->state.role == Primary &&
 	    (mdev->ed_uuid & ~((u64)1)) != (p_uuid[Current] & ~((u64)1))) {
 		ERR("Can only connect to data with current UUID=%016llX\n",
 		    (unsigned long long)mdev->ed_uuid);
-		drbd_force_state(mdev,NS(conn,Disconnecting));
+		drbd_force_state(mdev, NS(conn, Disconnecting));
 		return FALSE;
 	}
 
+	/* Before we test for the disk state, we should wait until any
+	   ongoing cluster-wide state change has finished. That is important
+	   if we are primary and detaching from our disk: we need to see the
+	   new disk state... */
+	wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
 	if (mdev->state.conn >= Connected && mdev->state.disk < Inconsistent)
 		drbd_set_ed_uuid(mdev, p_uuid[Current]);
 
@@ -2501,7 +3015,7 @@ STATIC union drbd_state_t convert_state(union drbd_state_t ps)
 	ms.role = ps.peer;
 	ms.pdsk = ps.disk;
 	ms.disk = ps.pdsk;
-	ms.peer_isp = ( ps.aftr_isp | ps.user_isp );
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
 
 	return ms;
 }
@@ -2528,9 +3042,10 @@ STATIC int receive_req_state(struct drbd_conf *mdev, struct Drbd_Header *h)
 	mask = convert_state(mask);
 	val = convert_state(val);
 
+	DRBD_STATE_DEBUG_INIT_VAL(val);
 	rv = drbd_change_state(mdev, ChgStateVerbose, mask, val);
 
-	drbd_send_sr_reply(mdev,rv);
+	drbd_send_sr_reply(mdev, rv);
 	drbd_md_sync(mdev);
 
 	return TRUE;
@@ -2567,7 +3082,7 @@ STATIC int receive_state(struct drbd_conf *mdev, struct Drbd_Header *h)
 		nconn = Connected;
 
 	if (mdev->p_uuid && peer_state.disk >= Negotiating &&
-	    inc_local_if_state(mdev, Negotiating) ) {
+	    inc_local_if_state(mdev, Negotiating)) {
 		int cr; /* consider resync */
 
 		cr  = (oconn < Connected);
@@ -2577,7 +3092,8 @@ STATIC int receive_state(struct drbd_conf *mdev, struct Drbd_Header *h)
 		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); /* peer forced */
 		cr |= (oconn == Connected && peer_state.conn > Connected);
 
-		if (cr) nconn=drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+		if (cr)
+			nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
 
 		dec_local(mdev);
 		if (nconn == conn_mask) {
@@ -2603,9 +3119,10 @@ STATIC int receive_state(struct drbd_conf *mdev, struct Drbd_Header *h)
 	ns.conn = nconn;
 	ns.peer = peer_state.role;
 	ns.pdsk = real_peer_disk;
-	ns.peer_isp = ( peer_state.aftr_isp | peer_state.user_isp );
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 	if ((nconn == Connected || nconn == WFBitMapS) && ns.disk == Negotiating)
 		ns.disk = mdev->new_state_tmp.disk;
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
 	rv = _drbd_set_state(mdev, ns, ChgStateVerbose | ChgStateHard, NULL);
 	ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
@@ -2615,16 +3132,15 @@ STATIC int receive_state(struct drbd_conf *mdev, struct Drbd_Header *h)
 		return FALSE;
 	}
 
-	if (oconn > WFReportParams ) {
-		if (nconn > Connected && peer_state.conn <= Connected) {
-			// we want resync, peer has not yet decided to sync...
+	if (oconn > WFReportParams) {
+		if (nconn > Connected && peer_state.conn <= Connected &&
+		    peer_state.disk != Negotiating) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays this is only used when forcing a node into the primary
+			   role and setting its disk to UpToDate along with that */
 			drbd_send_uuids(mdev);
 			drbd_send_state(mdev);
 		}
-		else if (nconn == Connected && peer_state.disk == Negotiating) {
-			// peer is waiting for us to respond...
-			drbd_send_state(mdev);
-		}
 	}
 
 	mdev->net_conf->want_lose = 0;
@@ -2639,9 +3155,9 @@ STATIC int receive_sync_uuid(struct drbd_conf *mdev, struct Drbd_Header *h)
 {
 	struct Drbd_SyncUUID_Packet *p = (struct Drbd_SyncUUID_Packet *)h;
 
-	wait_event( mdev->misc_wait,
-		    mdev->state.conn < Connected ||
-		    mdev->state.conn == WFSyncUUID);
+	wait_event(mdev->misc_wait,
+		   mdev->state.conn < Connected ||
+		   mdev->state.conn == WFSyncUUID);
 
 	/* D_ASSERT( mdev->state.conn == WFSyncUUID ); */
 
@@ -2652,7 +3168,7 @@ STATIC int receive_sync_uuid(struct drbd_conf *mdev, struct Drbd_Header *h)
 	/* Here the _drbd_uuid_ functions are right, current should
 	   _not_ be rotated into the history */
 	if (inc_local_if_state(mdev, Negotiating)) {
-		_drbd_uuid_set(mdev, Current,be64_to_cpu(p->uuid));
+		_drbd_uuid_set(mdev, Current, be64_to_cpu(p->uuid));
 		_drbd_uuid_set(mdev, Bitmap, 0UL);
 
 		drbd_start_resync(mdev, SyncTarget);
@@ -2680,14 +3196,20 @@ STATIC int receive_bitmap(struct drbd_conf *mdev, struct Drbd_Header *h)
 
 	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 
-	drbd_bm_lock(mdev);
+	drbd_bm_lock(mdev, "receive bitmap");
 
 	bm_words = drbd_bm_words(mdev);
 	bm_i	 = 0;
-	buffer	 = vmalloc(BM_PACKET_WORDS*sizeof(long));
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	buffer	 = (unsigned long *) __get_free_page(GFP_NOIO);
+	if (!buffer) {
+		ERR("failed to allocate one page buffer in %s\n", __func__);
+		goto out;
+	}
 
 	while (1) {
-		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i );
+		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i);
 		want = num_words * sizeof(long);
 		ERR_IF(want != h->length) goto out;
 		if (want == 0)
@@ -2703,22 +3225,26 @@ STATIC int receive_bitmap(struct drbd_conf *mdev, struct Drbd_Header *h)
 		D_ASSERT(h->command == ReportBitMap);
 	}
 
-	if (mdev->state.conn == WFBitMapS) {
-		drbd_start_resync(mdev, SyncSource);
-	} else if (mdev->state.conn == WFBitMapT) {
+	if (mdev->state.conn == WFBitMapT) {
 		ok = !drbd_send_bitmap(mdev);
-		if (!ok) goto out;
-		ok = drbd_request_state(mdev, NS(conn, WFSyncUUID));
-		D_ASSERT( ok == SS_Success );
-	} else {
-		ERR("unexpected cstate (%s) in receive_bitmap\n",
+		if (!ok)
+			goto out;
+		/* Omit ChgOrdered with this state transition to avoid deadlocks. */
+		ok = _drbd_request_state(mdev, NS(conn, WFSyncUUID), ChgStateVerbose);
+		D_ASSERT(ok == SS_Success);
+	} else if (mdev->state.conn != WFBitMapS) {
+		/* admin may have requested Disconnecting,
+		 * other threads may have noticed network errors */
+		INFO("unexpected cstate (%s) in receive_bitmap\n",
 		    conns_to_name(mdev->state.conn));
 	}
 
 	ok = TRUE;
  out:
 	drbd_bm_unlock(mdev);
-	vfree(buffer);
+	if (ok && mdev->state.conn == WFBitMapS)
+		drbd_start_resync(mdev, SyncSource);
+	free_page((unsigned long) buffer);
 	return ok;
 }
 
@@ -2728,8 +3254,8 @@ STATIC int receive_skip(struct drbd_conf *mdev, struct Drbd_Header *h)
 	static char sink[128];
 	int size, want, r;
 
-	DRBD_WARN("skipping unknown optional packet type %d, l: %d!\n",
-	     h->command, h->length );
+	drbd_WARN("skipping unknown optional packet type %d, l: %d!\n",
+	     h->command, h->length);
 
 	size = h->length;
 	while (size > 0) {
@@ -2738,14 +3264,19 @@ STATIC int receive_skip(struct drbd_conf *mdev, struct Drbd_Header *h)
 		ERR_IF(r < 0) break;
 		size -= r;
 	}
-	return (size == 0);
+	return size == 0;
 }
 
 STATIC int receive_UnplugRemote(struct drbd_conf *mdev, struct Drbd_Header *h)
 {
 	if (mdev->state.disk >= Inconsistent)
 		drbd_kick_lo(mdev);
-	return TRUE; /* cannot fail. */
+
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	drbd_tcp_quickack(mdev->data.socket);
+
+	return TRUE;
 }
 
 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct Drbd_Header *);
@@ -2754,12 +3285,13 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
 	[Data]		   = receive_Data,
 	[DataReply]	   = receive_DataReply,
 	[RSDataReply]	   = receive_RSDataReply,
-	[Barrier]	   = receive_Barrier_no_tcq,
+	[Barrier]	   = receive_Barrier,
 	[ReportBitMap]	   = receive_bitmap,
 	[UnplugRemote]	   = receive_UnplugRemote,
 	[DataRequest]	   = receive_DataRequest,
 	[RSDataRequest]    = receive_DataRequest,
 	[SyncParam]	   = receive_SyncParam,
+	[SyncParam89]	   = receive_SyncParam,
 	[ReportProtocol]   = receive_protocol,
 	[ReportUUIDs]	   = receive_uuids,
 	[ReportSizes]	   = receive_sizes,
@@ -2768,6 +3300,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
 	[ReportSyncUUID]   = receive_sync_uuid,
 	[OVRequest]        = receive_DataRequest,
 	[OVReply]          = receive_DataRequest,
+	[CsumRSRequest]    = receive_DataRequest,
 	/* anything missing from this table is in
 	 * the asender_tbl, see get_asender_cmd */
 	[MAX_CMD]	   = NULL,
@@ -2782,6 +3315,7 @@ STATIC void drbdd(struct drbd_conf *mdev)
 	struct Drbd_Header *header = &mdev->data.rbuf.head;
 
 	while (get_t_state(&mdev->receiver) == Running) {
+		drbd_thread_current_set_cpu(mdev);
 		if (!drbd_recv_header(mdev, header))
 			break;
 
@@ -2817,10 +3351,9 @@ STATIC void drbdd(struct drbd_conf *mdev)
 STATIC void drbd_fail_pending_reads(struct drbd_conf *mdev)
 {
 	struct hlist_head *slot;
-	struct hlist_node *n;
+	struct hlist_node *pos;
+	struct hlist_node *tmp;
 	struct drbd_request *req;
-	struct list_head *le;
-	LIST_HEAD(workset);
 	int i;
 
 	/*
@@ -2829,19 +3362,22 @@ STATIC void drbd_fail_pending_reads(struct drbd_conf *mdev)
 	spin_lock_irq(&mdev->req_lock);
 	for (i = 0; i < APP_R_HSIZE; i++) {
 		slot = mdev->app_reads_hash+i;
-		hlist_for_each_entry(req, n, slot, colision) {
-			list_add(&req->w.list, &workset);
+		hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
+			/* it may (but should not any longer!)
+			 * be on the work queue; if that assert triggers,
+			 * we need to also grab the
+			 * spin_lock_irq(&mdev->data.work.q_lock);
+			 * and list_del_init here. */
+			D_ASSERT(list_empty(&req->w.list));
+			_req_mod(req, connection_lost_while_pending, 0);
 		}
 	}
-	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+	for (i = 0; i < APP_R_HSIZE; i++)
+		if (!hlist_empty(mdev->app_reads_hash+i))
+			drbd_WARN("ASSERT FAILED: app_reads_hash[%d].first: "
+				"%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
 
-	while (!list_empty(&workset)) {
-		le = workset.next;
-		req = list_entry(le, struct drbd_request, w.list);
-		list_del(le);
-
-		_req_mod(req, connection_lost_while_pending, 0);
-	}
+	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 	spin_unlock_irq(&mdev->req_lock);
 }
 
@@ -2851,13 +3387,17 @@ STATIC void drbd_disconnect(struct drbd_conf *mdev)
 	enum fencing_policy fp;
 	union drbd_state_t os, ns;
 	int rv = SS_UnknownError;
+	unsigned int i;
 
-	D_ASSERT(mdev->state.conn < Connected);
-	if (mdev->state.conn == StandAlone) return;
+	if (mdev->state.conn == StandAlone)
+		return;
 	/* FIXME verify that:
 	 * the state change magic prevents us from becoming >= Connected again
 	 * while we are still cleaning up.
 	 */
+	if (mdev->state.conn >= WFConnection)
+		ERR("ASSERT FAILED cstate = %s, expected < WFConnection\n",
+				conns_to_name(mdev->state.conn));
 
 	/* asender does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&mdev->asender);
@@ -2874,11 +3414,6 @@ STATIC void drbd_disconnect(struct drbd_conf *mdev)
 	reclaim_net_ee(mdev);
 	spin_unlock_irq(&mdev->req_lock);
 
-	/* FIXME: fail pending reads?
-	 * when we are configured for freeze io,
-	 * we could retry them once we un-freeze. */
-	drbd_fail_pending_reads(mdev);
-
 	/* We do not have data structures that would allow us to
 	 * get the rs_pending_cnt down to 0 again.
 	 *  * On SyncTarget we do not have any data structures describing
@@ -2916,6 +3451,11 @@ STATIC void drbd_disconnect(struct drbd_conf *mdev)
 	if (!mdev->state.susp)
 		tl_clear(mdev);
 
+	/* FIXME: fail pending reads?
+	 * when we are configured for freeze io,
+	 * we could retry them once we un-freeze. */
+	drbd_fail_pending_reads(mdev);
+
 	INFO("Connection closed\n");
 
 	drbd_md_sync(mdev);
@@ -2939,20 +3479,38 @@ STATIC void drbd_disconnect(struct drbd_conf *mdev)
 		/* Do not restart in case we are Disconnecting */
 		ns = os;
 		ns.conn = Unconnected;
+		DRBD_STATE_DEBUG_INIT_VAL(ns);
 		rv = _drbd_set_state(mdev, ns, ChgStateVerbose, NULL);
 	}
 	spin_unlock_irq(&mdev->req_lock);
 
 	if (os.conn == Disconnecting) {
-		wait_event( mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0 );
+		struct hlist_head *h;
+		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+
+		/* we must not free the tl_hash
+		 * while application io is still on the fly */
+		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
 
+		spin_lock_irq(&mdev->req_lock);
+		/* paranoia code */
+		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+			if (h->first)
+				ERR("ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+						(int)(h - mdev->ee_hash), h->first);
 		kfree(mdev->ee_hash);
 		mdev->ee_hash = NULL;
 		mdev->ee_hash_s = 0;
 
+		/* paranoia code */
+		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+			if (h->first)
+				ERR("ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+						(int)(h - mdev->tl_hash), h->first);
 		kfree(mdev->tl_hash);
 		mdev->tl_hash = NULL;
 		mdev->tl_hash_s = 0;
+		spin_unlock_irq(&mdev->req_lock);
 
 		crypto_free_hash(mdev->cram_hmac_tfm);
 		mdev->cram_hmac_tfm = NULL;
@@ -2964,17 +3522,21 @@ STATIC void drbd_disconnect(struct drbd_conf *mdev)
 
 	/* they do trigger all the time.
 	 * hm. why won't tcp release the page references,
-	 * we already released the socket!?
-	D_ASSERT(atomic_read(&mdev->pp_in_use) == 0);
-	D_ASSERT(list_empty(&mdev->net_ee));
-	 */
+	 * we already released the socket!? */
+	i = atomic_read(&mdev->pp_in_use);
+	if (i)
+		DBG("pp_in_use = %u, expected 0\n", i);
+	if (!list_empty(&mdev->net_ee))
+		DBG("net_ee not empty!\n");
+
 	D_ASSERT(list_empty(&mdev->read_ee));
 	D_ASSERT(list_empty(&mdev->active_ee));
 	D_ASSERT(list_empty(&mdev->sync_ee));
 	D_ASSERT(list_empty(&mdev->done_ee));
 
 	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
-	mdev->epoch_size = 0;
+	atomic_set(&mdev->current_epoch->epoch_size, 0);
+	D_ASSERT(list_empty(&mdev->current_epoch->list));
 }
 
 /*
@@ -3035,14 +3597,14 @@ int drbd_do_handshake(struct drbd_conf *mdev)
 		return 0;
 
 	if (p->head.command != HandShake) {
-		ERR( "expected HandShake packet, received: %s (0x%04x)\n",
-		     cmdname(p->head.command), p->head.command );
+		ERR("expected HandShake packet, received: %s (0x%04x)\n",
+		     cmdname(p->head.command), p->head.command);
 		return -1;
 	}
 
 	if (p->head.length != expect) {
-		ERR( "expected HandShake length: %u, received: %u\n",
-		     expect, p->head.length );
+		ERR("expected HandShake length: %u, received: %u\n",
+		     expect, p->head.length);
 		return -1;
 	}
 
@@ -3058,10 +3620,12 @@ int drbd_do_handshake(struct drbd_conf *mdev)
 
 	p->protocol_min = be32_to_cpu(p->protocol_min);
 	p->protocol_max = be32_to_cpu(p->protocol_max);
-	if(p->protocol_max == 0) p->protocol_max = p->protocol_min;
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
 
-	if (PRO_VERSION_MAX < p->protocol_min ) goto incompat;
-	if (PRO_VERSION_MIN > p->protocol_max ) goto incompat;
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
 
 	mdev->agreed_pro_version = min_t(int,PRO_VERSION_MAX,p->protocol_max);
 
@@ -3073,7 +3637,7 @@ int drbd_do_handshake(struct drbd_conf *mdev)
  incompat:
 	ERR("incompatible DRBD dialects: "
 	    "I support %d-%d, peer supports %d-%d\n",
-	    PRO_VERSION_MIN,PRO_VERSION_MAX, 
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
 	    p->protocol_min, p->protocol_max);
 	return -1;
 }
@@ -3081,8 +3645,8 @@ int drbd_do_handshake(struct drbd_conf *mdev)
 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
 int drbd_do_auth(struct drbd_conf *mdev)
 {
-	ERR( "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
-	ERR( "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	ERR("This kernel was built without CONFIG_CRYPTO_HMAC.\n");
+	ERR("You need to disable 'cram-hmac-alg' in drbd.conf.\n");
 	return 0;
 }
 #else
@@ -3122,14 +3686,14 @@ int drbd_do_auth(struct drbd_conf *mdev)
 		goto fail;
 
 	if (p.command != AuthChallenge) {
-		ERR( "expected AuthChallenge packet, received: %s (0x%04x)\n",
-		     cmdname(p.command), p.command );
+		ERR("expected AuthChallenge packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
 		rv = 0;
 		goto fail;
 	}
 
 	if (p.length > CHALLENGE_LEN*2) {
-		ERR( "expected AuthChallenge payload too big.\n");
+		ERR("expected AuthChallenge payload too big.\n");
 		rv = 0;
 		goto fail;
 	}
@@ -3162,7 +3726,7 @@ int drbd_do_auth(struct drbd_conf *mdev)
 
 	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
 	if (rv) {
-		ERR( "crypto_hash_digest() failed with %d\n", rv);
+		ERR("crypto_hash_digest() failed with %d\n", rv);
 		rv = 0;
 		goto fail;
 	}
@@ -3176,14 +3740,14 @@ int drbd_do_auth(struct drbd_conf *mdev)
 		goto fail;
 
 	if (p.command != AuthResponse) {
-		ERR( "expected AuthResponse packet, received: %s (0x%04x)\n",
-		     cmdname(p.command), p.command );
+		ERR("expected AuthResponse packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
 		rv = 0;
 		goto fail;
 	}
 
 	if (p.length != resp_size) {
-		ERR( "expected AuthResponse payload of wrong size\n" );
+		ERR("expected AuthResponse payload of wrong size\n");
 		rv = 0;
 		goto fail;
 	}
@@ -3207,7 +3771,7 @@ int drbd_do_auth(struct drbd_conf *mdev)
 
 	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
 	if (rv) {
-		ERR( "crypto_hash_digest() failed with %d\n", rv);
+		ERR("crypto_hash_digest() failed with %d\n", rv);
 		rv = 0;
 		goto fail;
 	}
@@ -3234,20 +3798,21 @@ STATIC int drbdd_init(struct Drbd_thread *thi)
 	int h;
 
 	sprintf(current->comm, "drbd%d_receiver", minor);
-	set_cpus_allowed(current, drbd_calc_cpu_mask(mdev));
+
 	INFO("receiver (re)started\n");
 
 	do {
 		h = drbd_connect(mdev);
 		if (h == 0) {
 			drbd_disconnect(mdev);
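+			/* set the task state first, otherwise
+			 * schedule_timeout() returns immediately */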
+			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
 		}
 		if (h == -1) {
-			DRBD_WARN("Discarding network configuration.\n");
+			drbd_WARN("Discarding network configuration.\n");
 			drbd_force_state(mdev, NS(conn, Disconnecting));
 		}
-	} while ( h == 0 );
+	} while (h == 0);
 
 	if (h > 0) {
 		if (inc_net(mdev)) {
@@ -3258,15 +3823,6 @@ STATIC int drbdd_init(struct Drbd_thread *thi)
 
 	drbd_disconnect(mdev);
 
-	/* Ensure that the thread state fits to our connection state. */
-	if (mdev->state.conn == Unconnected) {
-		ERR_IF( mdev->receiver.t_state != Restarting )
-			drbd_thread_restart_nowait(&mdev->receiver);
-	} else if (mdev->state.conn == StandAlone) {
-		ERR_IF( mdev->receiver.t_state != Exiting )
-			drbd_thread_stop_nowait(&mdev->receiver);
-	}
-
 	INFO("receiver terminated\n");
 	return 0;
 }
@@ -3283,8 +3839,8 @@ STATIC int got_RqSReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 		set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
 	} else {
 		set_bit(CL_ST_CHG_FAIL, &mdev->flags);
-		ERR("Requested state change failed by peer: %s\n",
-		    set_st_err_name(retcode));
+		ERR("Requested state change failed by peer: %s (%d)\n",
+		    set_st_err_name(retcode), retcode);
 	}
 	wake_up(&mdev->state_wait);
 
@@ -3305,6 +3861,25 @@ STATIC int got_PingAck(struct drbd_conf *mdev, struct Drbd_Header *h)
 	return TRUE;
 }
 
+STATIC int got_IsInSync(struct drbd_conf *mdev, struct Drbd_Header *h)
+{
+	struct Drbd_BlockAck_Packet *p = (struct Drbd_BlockAck_Packet *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+
+	D_ASSERT(mdev->agreed_pro_version >= 89);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
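+	/* the peer's checksum matched ours: no data needs to be resent,
+	 * just mark the blocks in sync and account them as equal-csum */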
+	drbd_rs_complete_io(mdev, sector);
+	drbd_set_in_sync(mdev, sector, blksize);
+	/* rs_same_csum is supposed to count in units of BM_BLOCK_SIZE */
+	mdev->rs_same_csum += (blksize >> BM_BLOCK_SIZE_B);
+	dec_rs_pending(mdev);
+
+	return TRUE;
+}
+
 STATIC int got_BlockAck(struct drbd_conf *mdev, struct Drbd_Header *h)
 {
 	struct drbd_request *req;
@@ -3314,7 +3889,7 @@ STATIC int got_BlockAck(struct drbd_conf *mdev, struct Drbd_Header *h)
 
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
-	if ( is_syncer_block_id(p->block_id)) {
+	if (is_syncer_block_id(p->block_id)) {
 		drbd_set_in_sync(mdev, sector, blksize);
 		dec_rs_pending(mdev);
 	} else {
@@ -3364,7 +3939,7 @@ STATIC int got_NegAck(struct drbd_conf *mdev, struct Drbd_Header *h)
 	struct drbd_request *req;
 
 	if (DRBD_ratelimit(5*HZ, 5))
-		DRBD_WARN("Got NegAck packet. Peer is in troubles?\n");
+		drbd_WARN("Got NegAck packet. Peer is in troubles?\n");
 
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
@@ -3405,16 +3980,13 @@ STATIC int got_NegDReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 		return FALSE;
 	}
 
-	/* FIXME explicitly warn if protocol != C */
-
-	ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n",
-	    (unsigned long long)sector, be32_to_cpu(p->blksize));
-
 	_req_mod(req, neg_acked, 0);
 	spin_unlock_irq(&mdev->req_lock);
 
-	/* "ugly and wrong" but what can we do !? */
-	drbd_khelper(mdev, "pri-on-incon-degr");
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+	    (unsigned long long)sector, be32_to_cpu(p->blksize));
 
 	return TRUE;
 }
@@ -3429,6 +4001,8 @@ STATIC int got_NegRSDReply(struct drbd_conf *mdev, struct Drbd_Header *h)
 	size = be32_to_cpu(p->blksize);
 	D_ASSERT(p->block_id == ID_SYNCER);
 
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
 	dec_rs_pending(mdev);
 
 	if (inc_local_if_state(mdev, Failed)) {
@@ -3445,12 +4019,10 @@ STATIC int got_BarrierAck(struct drbd_conf *mdev, struct Drbd_Header *h)
 	struct Drbd_BarrierAck_Packet *p = (struct Drbd_BarrierAck_Packet *)h;
 
 	tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
-	dec_ap_pending(mdev);
 
 	return TRUE;
 }
 
-
 STATIC int got_OVResult(struct drbd_conf *mdev, struct Drbd_Header* h)
 {
 	struct Drbd_BlockAck_Packet *p = (struct Drbd_BlockAck_Packet*)h;
@@ -3461,6 +4033,8 @@ STATIC int got_OVResult(struct drbd_conf *mdev, struct Drbd_Header* h)
 	sector = be64_to_cpu(p->sector);
 	size = be32_to_cpu(p->blksize);
 
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
 	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) {
 		drbd_ov_oos_found(mdev, sector, size);
 	} else ov_oos_print(mdev);
@@ -3469,7 +4043,7 @@ STATIC int got_OVResult(struct drbd_conf *mdev, struct Drbd_Header* h)
 	dec_rs_pending(mdev);
 
 	if (--mdev->ov_left == 0) {
-		w = kmalloc(sizeof(w), GFP_KERNEL);
+		w = kmalloc(sizeof(*w), GFP_KERNEL);
 		if (w) {
 			w->cb = w_ov_finished;
 			drbd_queue_work_front(&mdev->data.work, w);
@@ -3481,13 +4055,12 @@ STATIC int got_OVResult(struct drbd_conf *mdev, struct Drbd_Header* h)
 	return TRUE;
 }
 
-
 struct asender_cmd {
 	size_t pkt_size;
 	int (*process)(struct drbd_conf *mdev, struct Drbd_Header *h);
 };
 
-static struct asender_cmd* get_asender_cmd(int cmd)
+static struct asender_cmd *get_asender_cmd(int cmd)
 {
 	static struct asender_cmd asender_tbl[] = {
 		/* anything missing from this table is in
@@ -3500,22 +4073,15 @@ static struct asender_cmd* get_asender_cmd(int cmd)
 	[RSWriteAck]	= { sizeof(struct Drbd_BlockAck_Packet), got_BlockAck },
 	[DiscardAck]	= { sizeof(struct Drbd_BlockAck_Packet), got_BlockAck },
 	[NegAck]	= { sizeof(struct Drbd_BlockAck_Packet), got_NegAck },
-	[NegDReply]	=
-		{ sizeof(struct Drbd_BlockAck_Packet), got_NegDReply },
-	[NegRSDReply]	=
-		{ sizeof(struct Drbd_BlockAck_Packet), got_NegRSDReply},
-	[OVResult]  = { sizeof(struct Drbd_BlockAck_Packet),  got_OVResult },
-
-	[BarrierAck]	=
-		{ sizeof(struct Drbd_BarrierAck_Packet), got_BarrierAck },
-	[StateChgReply] =
-		{ sizeof(struct Drbd_RqS_Reply_Packet), got_RqSReply },
+	[NegDReply]	= { sizeof(struct Drbd_BlockAck_Packet), got_NegDReply },
+	[NegRSDReply]	= { sizeof(struct Drbd_BlockAck_Packet), got_NegRSDReply},
+	[OVResult]	= { sizeof(struct Drbd_BlockAck_Packet), got_OVResult },
+	[BarrierAck]	= { sizeof(struct Drbd_BarrierAck_Packet), got_BarrierAck },
+	[StateChgReply] = { sizeof(struct Drbd_RqS_Reply_Packet), got_RqSReply },
+	[RSIsInSync]	= { sizeof(struct Drbd_BlockAck_Packet), got_IsInSync },
+	[MAX_CMD]	= { 0, NULL },
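+	/* the [MAX_CMD] sentinel sizes the table, so any command without
+	 * an explicit entry reads as { 0, NULL } */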
 	};
-	if (cmd == OVResult)
-		return &asender_tbl[cmd];
-	if (cmd < FIRST_ASENDER_CMD)
-		return NULL;
-	if (cmd > LAST_ASENDER_CMD)
+	if (cmd > MAX_CMD)
 		return NULL;
 	return &asender_tbl[cmd];
 }
@@ -3526,7 +4092,7 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 	struct Drbd_Header *h = &mdev->meta.rbuf.head;
 	struct asender_cmd *cmd = NULL;
 
-	int rv,len;
+	int rv, len;
 	void *buf    = h;
 	int received = 0;
 	int expect   = sizeof(struct Drbd_Header);
@@ -3537,30 +4103,44 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 	current->policy = SCHED_RR;  /* Make this a realtime task! */
 	current->rt_priority = 2;    /* more important than all other tasks */
 
-	set_cpus_allowed(current, drbd_calc_cpu_mask(mdev));
-
 	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
 		if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
 			ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
 			mdev->meta.socket->sk->sk_rcvtimeo =
 				mdev->net_conf->ping_timeo*HZ/10;
 		}
 
+		/* conditionally cork;
+		 * it may hurt latency if we cork without much to send */
+		if (!mdev->net_conf->no_cork &&
+		    3 < atomic_read(&mdev->unacked_cnt))
+			drbd_tcp_cork(mdev->meta.socket);
 		while (1) {
+			clear_bit(SIGNAL_ASENDER, &mdev->flags);
+			flush_signals(current);
 			if (!drbd_process_done_ee(mdev)) {
 				ERR("process_done_ee() = NOT_OK\n");
 				goto reconnect;
 			}
+			/* to avoid race with newly queued ACKs */
 			set_bit(SIGNAL_ASENDER, &mdev->flags);
 			spin_lock_irq(&mdev->req_lock);
 			empty = list_empty(&mdev->done_ee);
 			spin_unlock_irq(&mdev->req_lock);
-			if (empty && !test_bit(WRITE_ACK_PENDING, &mdev->flags))
+			/* new ack may have been queued right here,
+			 * but then there is also a signal pending,
+			 * and we start over... */
+			if (empty)
 				break;
-			clear_bit(SIGNAL_ASENDER, &mdev->flags);
-			flush_signals(current);
 		}
-		drbd_tcp_flush(mdev->meta.socket);
+		/* but unconditionally uncork unless disabled */
+		if (!mdev->net_conf->no_cork)
+			drbd_tcp_uncork(mdev->meta.socket);
+
+		/* short circuit, recv_msg would return EINTR anyway. */
+		if (signal_pending(current))
+			continue;
 
 		rv = drbd_recv_short(mdev, mdev->meta.socket,
 				     buf, expect-received, 0);
@@ -3568,8 +4148,6 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 
 		flush_signals(current);
 
-		drbd_tcp_cork(mdev->meta.socket);
-
 		/* Note:
 		 * -EINTR	 (on meta) we got a signal
 		 * -EAGAIN	 (on meta) rcvtimeo expired
@@ -3587,8 +4165,8 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 			ERR("meta connection shut down by peer.\n");
 			goto reconnect;
 		} else if (rv == -EAGAIN) {
-			if ( mdev->meta.socket->sk->sk_rcvtimeo ==
-			    mdev->net_conf->ping_timeo*HZ/10 ) {
+			if (mdev->meta.socket->sk->sk_rcvtimeo ==
+			    mdev->net_conf->ping_timeo*HZ/10) {
 				ERR("PingAck did not arrive in time.\n");
 				goto reconnect;
 			}
@@ -3601,8 +4179,8 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 			goto reconnect;
 		}
 
-		if (received == expect && cmd == NULL ) {
-			if (unlikely( h->magic != BE_DRBD_MAGIC )) {
+		if (received == expect && cmd == NULL) {
+			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
 				ERR("magic?? on meta m: 0x%lx c: %d l: %d\n",
 				    (long)be32_to_cpu(h->magic),
 				    h->command, h->length);
@@ -3618,15 +4196,16 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 			}
 			expect = cmd->pkt_size;
 			ERR_IF(len != expect-sizeof(struct Drbd_Header)) {
-				dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__);
+				dump_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
 				DUMPI(expect);
 				goto reconnect;
 			}
 		}
 		if (received == expect) {
 			D_ASSERT(cmd != NULL);
-			dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__);
-			if (!cmd->process(mdev,h)) goto reconnect;
+			dump_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
+			if (!cmd->process(mdev, h))
+				goto reconnect;
 
 			buf	 = h;
 			received = 0;
@@ -3636,12 +4215,12 @@ STATIC int drbd_asender(struct Drbd_thread *thi)
 	}
 
 	if (0) {
-	reconnect:
-		drbd_force_state(mdev,NS(conn, NetworkFailure));
+reconnect:
+		drbd_force_state(mdev, NS(conn, NetworkFailure));
 	}
 	if (0) {
-	disconnect:
-		drbd_force_state(mdev,NS(conn, Disconnecting));
+disconnect:
+		drbd_force_state(mdev, NS(conn, Disconnecting));
 	}
 	clear_bit(SIGNAL_ASENDER, &mdev->flags);
 
diff --git a/ubuntu/drbd/drbd_req.c b/ubuntu/drbd/drbd_req.c
index d7bcfe6..8615a84 100644
--- a/ubuntu/drbd/drbd_req.c
+++ b/ubuntu/drbd/drbd_req.c
@@ -122,12 +122,23 @@ STATIC void _print_req_mod(struct drbd_request *req, enum drbd_req_event what)
 static inline void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
+#ifndef __disk_stat_inc
+	int cpu;
+#endif
 
 	MUST_HOLD(&mdev->req_lock)
+#ifdef __disk_stat_inc
 	__disk_stat_inc(mdev->vdisk, ios[rw]);
 	__disk_stat_add(mdev->vdisk, sectors[rw], bio_sectors(bio));
 	disk_round_stats(mdev->vdisk);
 	mdev->vdisk->in_flight++;
+#else
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
+	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+	mdev->vdisk->part0.in_flight++;
+#endif
 }
 
 /* Update disk stats when completing request upwards */
@@ -135,11 +146,21 @@ static inline void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request
 {
 	int rw = bio_data_dir(req->master_bio);
 	unsigned long duration = jiffies - req->start_time;
+#ifndef __disk_stat_inc
+	int cpu;
+#endif
 
 	MUST_HOLD(&mdev->req_lock)
+#ifdef __disk_stat_add
 	__disk_stat_add(mdev->vdisk, ticks[rw], duration);
 	disk_round_stats(mdev->vdisk);
 	mdev->vdisk->in_flight--;
+#else
+	cpu = part_stat_lock();
+	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
+	part_round_stats(cpu, &mdev->vdisk->part0);
+	part_stat_unlock();
+#endif
 }
 
 #endif
@@ -182,8 +203,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
 			if (inc_local_if_state(mdev, Failed)) {
 				drbd_al_complete_io(mdev, req->sector);
 				dec_local(mdev);
-			} else {
-				DRBD_WARN("Should have called drbd_al_complete_io(, %llu), "
+			} else if (DRBD_ratelimit(5*HZ, 3)) {
+				drbd_WARN("Should have called drbd_al_complete_io(, %llu), "
 				     "but my Disk seems to have failed :(\n",
 				     (unsigned long long) req->sector);
 			}
@@ -202,7 +223,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
 	 * until the next barrier ack? */
 
 	if (rw == WRITE &&
-	    (( s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+	    ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
 		if (!(req->w.list.next == LIST_POISON1 ||
 		      list_empty(&req->w.list))) {
 			/* DEBUG ASSERT only; if this triggers, we
@@ -305,7 +326,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
 static void _complete_master_bio(struct drbd_conf *mdev,
 	struct drbd_request *req, int error)
 {
-	dump_bio(mdev, req->master_bio, 1);
+	dump_bio(mdev, req->master_bio, 1, req);
 	bio_endio(req->master_bio, error);
 	req->master_bio = NULL;
 	dec_ap_bio(mdev);
@@ -381,8 +402,8 @@ void _req_may_be_done(struct drbd_request *req, int error)
 		/* Update disk stats */
 		_drbd_end_io_acct(mdev, req);
 
-		_complete_master_bio(mdev,req,
-				     ok ? 0 : ( error ? error : -EIO ) );
+		_complete_master_bio(mdev, req,
+				     ok ? 0 : (error ? error : -EIO));
 	} else {
 		/* only WRITE requests can end up here without a master_bio */
 		rw = WRITE;
@@ -517,7 +538,7 @@ void _req_mod(struct drbd_request *req, enum drbd_req_event what, int error)
 
 	switch (what) {
 	default:
-		ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__ );
+		ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
 		return;
 
 	/* does not happen...
@@ -688,10 +709,7 @@ void _req_mod(struct drbd_request *req, enum drbd_req_event what, int error)
 	 * we may not finish the request just yet.
 	 */
 	case send_canceled:
-		/* may be a write, may be a remote read */
-		if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
-		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
-		/* fall through */
+		/* fall through: treat it like send_failed */
 	case send_failed:
 		/* real cleanup will be done from tl_clear.  just update flags
 		 * so it is no longer marked as on the worker queue */
@@ -703,8 +721,8 @@ void _req_mod(struct drbd_request *req, enum drbd_req_event what, int error)
 
 	case handed_over_to_network:
 		/* assert something? */
-		if ( bio_data_dir(req->master_bio) == WRITE &&
-		     mdev->net_conf->wire_protocol == DRBD_PROT_A ) {
+		if (bio_data_dir(req->master_bio) == WRITE &&
+		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
 			/* this is what is dangerous about protocol A:
 			 * pretend it was sucessfully written on the peer.
 			 * FIXME in case we get a local io-error in
@@ -795,6 +813,7 @@ void _req_mod(struct drbd_request *req, enum drbd_req_event what, int error)
 			 * we won't be able to clean them up... */
 			_print_rq_state(req,
 				"FIXME (barrier_acked but pending)");
+			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
 		}
 		D_ASSERT(req->rq_state & RQ_NET_SENT);
 		req->rq_state |= RQ_NET_DONE;
@@ -831,7 +850,7 @@ STATIC int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
 		return 0;
 	/* state.disk == Inconsistent   We will have a look at the BitMap */
 	nr_sectors = drbd_get_capacity(mdev->this_bdev);
-	esector = sector + (size>>9) -1;
+	esector = sector + (size >> 9) - 1;
 
 	D_ASSERT(sector  < nr_sectors);
 	D_ASSERT(esector < nr_sectors);
@@ -839,7 +858,7 @@ STATIC int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
 	sbnr = BM_SECT_TO_BIT(sector);
 	ebnr = BM_SECT_TO_BIT(esector);
 
-	return (0 == drbd_bm_count_bits(mdev, sbnr, ebnr));
+	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
 }
 
 /*
@@ -870,9 +889,7 @@ STATIC int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 	int local, remote;
 	int err = -EIO;
 
-	/* allocate outside of all locks; get a "reference count" (ap_bio_cnt)
-	 * to avoid races with the disconnect/reconnect code.  */
-	inc_ap_bio(mdev);
+	/* allocate outside of all locks */
 	req = drbd_req_new(mdev, bio);
 	if (!req) {
 		dec_ap_bio(mdev);
@@ -883,7 +900,7 @@ STATIC int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 		return 0;
 	}
 
-	dump_bio(mdev, bio, 0);
+	dump_bio(mdev, bio, 0, req);
 
 	local = inc_local(mdev);
 	if (!local) {
@@ -948,12 +965,12 @@ STATIC int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 		drbd_al_begin_io(mdev, sector);
 
 	remote = remote && (mdev->state.pdsk == UpToDate ||
-			    ( mdev->state.pdsk == Inconsistent &&
-			      mdev->state.conn >= Connected ) );
+			    (mdev->state.pdsk == Inconsistent &&
+			     mdev->state.conn >= Connected));
 
 	if (!(local || remote)) {
 		ERR("IO ERROR: neither local nor remote disk\n");
-		goto fail_and_free_req;
+		goto fail_free_complete;
 	}
 
 	/* For WRITE request, we have to make sure that we have an
@@ -970,7 +987,7 @@ allocate_barrier:
 		if (!b) {
 			ERR("Failed to alloc barrier.\n");
 			err = -ENOMEM;
-			goto fail_and_free_req;
+			goto fail_free_complete;
 		}
 	}
 
@@ -980,14 +997,14 @@ allocate_barrier:
 	/* FIXME race with drbd_disconnect and tl_clear? */
 	if (remote) {
 		remote = (mdev->state.pdsk == UpToDate ||
-			    ( mdev->state.pdsk == Inconsistent &&
-			      mdev->state.conn >= Connected ) );
+			    (mdev->state.pdsk == Inconsistent &&
+			     mdev->state.conn >= Connected));
 		if (!remote)
-			DRBD_WARN("lost connection while grabbing the req_lock!\n");
+			drbd_WARN("lost connection while grabbing the req_lock!\n");
 		if (!(local || remote)) {
 			ERR("IO ERROR: neither local nor remote disk\n");
 			spin_unlock_irq(&mdev->req_lock);
-			goto fail_and_free_req;
+			goto fail_free_complete;
 		}
 	}
 
@@ -1019,7 +1036,7 @@ allocate_barrier:
 	 * make sure that, if this is a write request and it triggered a
 	 * barrier packet, this request is queued within the same spinlock. */
 	if (remote && mdev->unused_spare_barrier &&
-            test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
 		_tl_add_barrier(mdev, mdev->unused_spare_barrier);
 		mdev->unused_spare_barrier = NULL;
 	} else {
@@ -1095,9 +1112,9 @@ allocate_barrier:
 
 		dump_internal_bio("Pri", mdev, req->private_bio, 0);
 
-		if (FAULT_ACTIVE(mdev, rw==WRITE ? DRBD_FAULT_DT_WR :
-				       (rw==READ ? DRBD_FAULT_DT_RD :
-				                   DRBD_FAULT_DT_RA) ))
+		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+				     : rw == READ  ? DRBD_FAULT_DT_RD
+				     :               DRBD_FAULT_DT_RA))
 			bio_endio(req->private_bio, -EIO);
 		else
 			generic_make_request(req->private_bio);
@@ -1109,10 +1126,20 @@ allocate_barrier:
 
 	return 0;
 
+fail_free_complete:
+	if (rw == WRITE && local)
+		drbd_al_complete_io(mdev, sector);
 fail_and_free_req:
-	kfree(b);
+	if (local) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		dec_local(mdev);
+	}
 	bio_endio(bio, err);
 	drbd_req_free(req);
+	dec_ap_bio(mdev);
+	kfree(b);
+
 	return 0;
 }
 
@@ -1129,7 +1156,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 		return 1;
 
 	if (mdev->state.role != Primary &&
-		( !allow_oos || is_write) ) {
+		(!allow_oos || is_write)) {
 		if (DRBD_ratelimit(5*HZ, 5)) {
 			ERR("Process %s[%u] tried to %s; "
 			    "since we are not in Primary state, "
@@ -1149,8 +1176,7 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 	 * to serialize state changes, this is racy, since we may lose
 	 * the connection *after* we test for the cstate.
 	 */
-	if ( mdev->state.disk < UpToDate &&
-	     mdev->state.conn < Connected) {
+	if (mdev->state.disk < UpToDate && mdev->state.pdsk < UpToDate) {
 		if (DRBD_ratelimit(5*HZ, 5))
 			ERR("Sorry, I have no access to good data anymore.\n");
 		/*
@@ -1181,7 +1207,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 	 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
 	 */
 	if (unlikely(bio_barrier(bio) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
-		/* DRBD_WARN("Rejecting barrier request as underlying device does not support\n"); */
+		/* drbd_WARN("Rejecting barrier request as underlying device does not support\n"); */
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -1190,7 +1216,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 	 * what we "blindly" assume:
 	 */
 	D_ASSERT(bio->bi_size > 0);
-	D_ASSERT( (bio->bi_size & 0x1ff) == 0);
+	D_ASSERT((bio->bi_size & 0x1ff) == 0);
 	D_ASSERT(bio->bi_idx == 0);
 
 	/* to make some things easier, force allignment of requests within the
@@ -1198,15 +1224,20 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 	s_enr = bio->bi_sector >> HT_SHIFT;
 	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
 
-	if (unlikely(s_enr != e_enr)) {
-	if (bio->bi_vcnt != 1 || bio->bi_idx != 0) {
+	if (likely(s_enr == e_enr)) {
+		inc_ap_bio(mdev, 1);
+		return drbd_make_request_common(mdev, bio);
+	}
+
+	/* can this bio be split generically?
+	 * Maybe add our own split-arbitrary-bios function. */
+	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
 		/* rather error out here than BUG in bio_split */
 		ERR("bio would need to, but cannot, be split: "
 		    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
 		    bio->bi_vcnt, bio->bi_idx, bio->bi_size,
 		    (unsigned long long)bio->bi_sector);
 		bio_endio(bio, -EINVAL);
-		return 0;
 	} else {
 		/* This bio crosses some boundary, so we have to split it. */
 		struct bio_pair *bp;
@@ -1219,18 +1250,29 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 		 * first_sectors = 64 - (262269 & 63) = 3
 		 */
 		const sector_t sect = bio->bi_sector;
-		const int sps = 1<<HT_SHIFT; /* sectors per slot */
-		const int mask = sps -1;
+		const int sps = 1 << HT_SHIFT; /* sectors per slot */
+		const int mask = sps - 1;
 		const sector_t first_sectors = sps - (sect & mask);
-		bp = bio_split(bio, bio_split_pool, first_sectors);
-		drbd_make_request_26(q, &bp->bio1);
-		drbd_make_request_26(q, &bp->bio2);
+		bp = bio_split(bio,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+				bio_split_pool,
+#endif
+				first_sectors);
+
+		/* we need to get a "reference count" (ap_bio_cnt)
+		 * to avoid races with the disconnect/reconnect/suspend code.
+		 * In case we need to split the bio here, we need to get two references
+		 * atomically, otherwise we might deadlock when trying to submit the
+		 * second one! */
+		inc_ap_bio(mdev, 2);
+
+		D_ASSERT(e_enr == s_enr + 1);
+
+		drbd_make_request_common(mdev, &bp->bio1);
+		drbd_make_request_common(mdev, &bp->bio2);
 		bio_pair_release(bp);
-		return 0;
-	}
 	}
-
-	return drbd_make_request_common(mdev,bio);
+	return 0;
 }
 
 /* This is called by bio_add_page().  With this function we reduce
@@ -1246,7 +1288,13 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
  * cross extent boundaries.  those are dealt with (bio_split) in
  * drbd_make_request_26.
  */
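+/* kernels since 2.6.27 hand us a struct bvec_merge_data instead of the
+ * bio itself; see HAVE_bvec_merge_data in drbd_wrappers.h */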
-int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+int drbd_merge_bvec(struct request_queue *q,
+#ifdef HAVE_bvec_merge_data
+		struct bvec_merge_data *bvm,
+#else
+		struct bio *bvm,
+#endif
+		struct bio_vec *bvec)
 {
 	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
 	unsigned int bio_offset =
diff --git a/ubuntu/drbd/drbd_req.h b/ubuntu/drbd/drbd_req.h
index 89535c0..f1da62a 100644
--- a/ubuntu/drbd/drbd_req.h
+++ b/ubuntu/drbd/drbd_req.h
@@ -175,8 +175,7 @@ enum drbd_req_state_bits {
 	__RQ_NET_SENT,
 
 	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
-	 * in (C) this happens when WriteAck is received,
-	 * in (B,A) when the corresponding BarrierAck is received */
+	 * basically this means the corresponding BarrierAck was received */
 	__RQ_NET_DONE,
 
 	/* whether or not we know (C) or pretend (B,A) that the write
@@ -235,8 +234,7 @@ static inline struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
 	struct drbd_request *req;
 
 	hlist_for_each_entry(req, n, slot, colision) {
-		if ((unsigned long)req == (unsigned long)id)
-		{
+		if ((unsigned long)req == (unsigned long)id) {
 			if (req->sector != sector) {
 				ERR("_ack_id_to_req: found req %p but it has "
 				    "wrong sector (%llus versus %llus)\n", req,
@@ -269,8 +267,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
 	struct drbd_request *req;
 
 	hlist_for_each_entry(req, n, slot, colision) {
-		if ((unsigned long)req == (unsigned long)id)
-		{
+		if ((unsigned long)req == (unsigned long)id) {
 			D_ASSERT(req->sector == sector);
 			return req;
 		}
@@ -297,6 +294,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
 		req->start_time  = jiffies;
 		INIT_HLIST_NODE(&req->colision);
 		INIT_LIST_HEAD(&req->tl_requests);
+		INIT_LIST_HEAD(&req->w.list);
 
 		bio->bi_private  = req;
 		bio->bi_end_io   = drbd_endio_pri;
@@ -312,7 +310,7 @@ static inline void drbd_req_free(struct drbd_request *req)
 
 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
 {
-	return !( ( s1 + (l1>>9) <= s2 ) || ( s1 >= s2 + (l2>>9) ) );
+	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
 }
 
 /* aparently too large to be inlined...
diff --git a/ubuntu/drbd/drbd_strings.c b/ubuntu/drbd/drbd_strings.c
index a6c77ef..d2941c6 100644
--- a/ubuntu/drbd/drbd_strings.c
+++ b/ubuntu/drbd/drbd_strings.c
@@ -110,7 +110,7 @@ const char *disks_to_name(enum drbd_disk_state s)
 
 const char *set_st_err_name(enum set_st_err err)
 {
-	return err < SS_NotSupported ? "TOO_SMALL" :
+	return err <= SS_AfterLastError ? "TOO_SMALL" :
 	       err > SS_TwoPrimaries ? "TOO_LARGE"
 			: drbd_state_sw_errors[-err];
 }
diff --git a/ubuntu/drbd/drbd_worker.c b/ubuntu/drbd/drbd_worker.c
index 645005a..053eecd 100644
--- a/ubuntu/drbd/drbd_worker.c
+++ b/ubuntu/drbd/drbd_worker.c
@@ -38,6 +38,11 @@
 #include <linux/slab.h>
 #include <linux/random.h>
 #ifdef HAVE_LINUX_SCATTERLIST_H
+/* 2.6.11 (suse 9.3, fc4) does not include requisites
+ * from linux/scatterlist.h :( */
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 #include <linux/scatterlist.h>
 #endif
 
@@ -70,7 +75,6 @@ BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error)
 	struct drbd_md_io *md_io;
 
 	BIO_ENDIO_FN_START;
-
 	/* error parameter ignored:
 	 * drbd_md_sync_page_io explicitly tests bio_uptodate(bio); */
 
@@ -81,7 +85,6 @@ BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error)
 	dump_internal_bio("Md", md_io->mdev, bio, 1);
 
 	complete(&md_io->event);
-
 	BIO_ENDIO_FN_RETURN;
 }
 
@@ -103,7 +106,7 @@ BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __
 		/* strange behaviour of some lower level drivers...
 		 * fail the request by clearing the uptodate flag,
 		 * but do not return any error?!
-		 * do we want to DRBD_WARN() on this? */
+		 * do we want to drbd_WARN() on this? */
 		error = -EIO;
 	}
 
@@ -114,7 +117,8 @@ BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	mdev->read_cnt += e->size >> 9;
 	list_del(&e->w.list);
-	if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait);
+	if (list_empty(&mdev->read_ee))
+		wake_up(&mdev->ee_wait);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	drbd_chk_io_error(mdev, error, FALSE);
@@ -150,10 +154,23 @@ BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error) _
 		/* strange behaviour of some lower level drivers...
 		 * fail the request by clearing the uptodate flag,
 		 * but do not return any error?!
-		 * do we want to DRBD_WARN() on this? */
+		 * do we want to drbd_WARN() on this? */
 		error = -EIO;
 	}
 
+	/* error == -ENOTSUPP would be a better test,
+	 * alas it is not reliable */
+	if (error && e->flags & EE_IS_BARRIER) {
+		drbd_bump_write_ordering(mdev, WO_bdev_flush);
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		list_del(&e->w.list);
+		e->w.cb = w_e_reissue;
+		__release(local); /* Actually happens in w_e_reissue. */
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+		drbd_queue_work(&mdev->data.work, &e->w);
+		BIO_ENDIO_FN_RETURN;
+	}
+
 	D_ASSERT(e->block_id != ID_VACANT);
 
 	dump_internal_bio("Sec", mdev, bio, 1);
@@ -182,9 +199,6 @@ BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error) _
 	 * done from "drbd_process_done_ee" within the appropriate w.cb
 	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
 
-	if (!is_syncer_req)
-		mdev->epoch_size++;
-
 	do_wake = is_syncer_req
 		? list_empty(&mdev->sync_ee)
 		: list_empty(&mdev->active_ee);
@@ -223,7 +237,7 @@ BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error)
 		/* strange behaviour of some lower level drivers...
 		 * fail the request by clearing the uptodate flag,
 		 * but do not return any error?!
-		 * do we want to DRBD_WARN() on this? */
+		 * do we want to drbd_WARN() on this? */
 		error = -EIO;
 	}
 
@@ -238,7 +252,6 @@ BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error)
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	_req_mod(req, what, error);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
-
 	BIO_ENDIO_FN_RETURN;
 }
 
@@ -261,7 +274,8 @@ int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	drbd_req_free(req);
 
 	ok = drbd_io_error(mdev, FALSE);
-	if (unlikely(!ok)) ERR("Sending in w_io_error() failed\n");
+	if (unlikely(!ok))
+		ERR("Sending in w_io_error() failed\n");
 	return ok;
 }
 
@@ -272,15 +286,14 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	/* FIXME this is ugly. we should not detach for read io-error,
 	 * but try to WRITE the DataReply to the failed location,
 	 * to give the disk the chance to relocate that block */
-	drbd_io_error(mdev,FALSE); /* tries to schedule a detach and notifies peer */
+	drbd_io_error(mdev, FALSE); /* tries to schedule a detach and notifies peer */
 
 	spin_lock_irq(&mdev->req_lock);
-	if ( cancel ||
-	     mdev->state.conn < Connected ||
-	     mdev->state.pdsk <= Inconsistent ) {
+	if (cancel ||
+	    mdev->state.conn < Connected ||
+	    mdev->state.pdsk <= Inconsistent) {
 		_req_mod(req, send_canceled, 0); /* FIXME freeze? ... */
 		spin_unlock_irq(&mdev->req_lock);
-		drbd_khelper(mdev, "pri-on-incon-degr"); /* FIXME REALLY? */
 		ALERT("WE ARE LOST. Local IO failure, no peer.\n");
 		return 1;
 	}
@@ -316,8 +329,82 @@ STATIC void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bi
 	crypto_hash_final(&desc, digest);
 }
 
-int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work* w,int cancel);
+STATIC int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct Tl_epoch_entry *e = (struct Tl_epoch_entry *)w;
+	int digest_size;
+	void *digest;
+	int ok;
+
+	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		return 1;
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+		digest = kmalloc(digest_size, GFP_KERNEL);
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
 
+			inc_rs_pending(mdev);
+			ok = drbd_send_drequest_csum(mdev,
+						     e->sector,
+						     e->size,
+						     digest,
+						     digest_size,
+						     CsumRSRequest);
+			kfree(digest);
+		} else {
+			ERR("kmalloc() of digest failed.\n");
+			ok = 0;
+		}
+	} else {
+		drbd_io_error(mdev, FALSE);
+		ok = 1;
+	}
+
+	drbd_free_ee(mdev, e);
+
+	if (unlikely(!ok))
+		ERR("drbd_send_drequest(..., csum) failed\n");
+	return ok;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
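+/* submit a local read of the given block; on completion w_e_send_csum
+ * hashes it and sends a CsumRSRequest.  Returns 1 on success, 0 on
+ * local disk failure, 2 if allocation failed (caller retries later). */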
+STATIC int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	struct Tl_epoch_entry *e;
+
+	if (!inc_local(mdev))
+		return 0;
+
+	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) {
+		/* drop the reference taken by inc_local() above,
+		 * as the allocation-failure path below does */
+		dec_local(mdev);
+		return 2;
+	}
+
+	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
+	if (!e) {
+		dec_local(mdev);
+		return 2;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	e->private_bio->bi_end_io = drbd_endio_read_sec;
+	e->private_bio->bi_rw = READ;
+	e->w.cb = w_e_send_csum;
+
+	mdev->read_cnt += size >> 9;
+	drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
+
+	return 1;
+}
+
+int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
 
 void resync_timer_fn(unsigned long data)
 {
@@ -329,9 +416,10 @@ void resync_timer_fn(unsigned long data)
 
 	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
 		queue = 1;
-		if(mdev->state.conn == VerifyS ) {
+		if (mdev->state.conn == VerifyS)
 			mdev->resync_work.cb = w_make_ov_request;
-		} else mdev->resync_work.cb = w_make_resync_request;
+		else
+			mdev->resync_work.cb = w_make_resync_request;
 	} else {
 		queue = 0;
 		mdev->resync_work.cb = w_resync_inactive;
@@ -358,7 +446,8 @@ int w_make_resync_request(struct drbd_conf *mdev,
 
 	PARANOIA_BUG_ON(w != &mdev->resync_work);
 
-	if (unlikely(cancel)) return 1;
+	if (unlikely(cancel))
+		return 1;
 
 	if (unlikely(mdev->state.conn < Connected)) {
 		ERR("Confused in w_make_resync_request()! cstate < Connected");
@@ -408,7 +497,7 @@ next_sector:
 			goto requeue;
 		}
 
-		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0 )) {
+		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
 			drbd_rs_complete_io(mdev, sector);
 			goto next_sector;
 		}
@@ -432,22 +521,23 @@ next_sector:
 				break;
 
 			/* Be always aligned */
-			if (sector & ((1<<(align+3))-1) )
+			if (sector & ((1<<(align+3))-1))
 				break;
 
 			/* do not cross extent boundaries */
-			if (( (bit+1) & BM_BLOCKS_PER_BM_EXT_MASK ) == 0)
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
 				break;
 			/* now, is it actually dirty, after all?
 			 * caution, drbd_bm_test_bit is tri-state for some
 			 * obscure reason; ( b == 0 ) would get the out-of-band
 			 * only accidentally right because of the "oddly sized"
 			 * adjustment below */
-			if ( drbd_bm_test_bit(mdev, bit+1) != 1 )
+			if (drbd_bm_test_bit(mdev, bit+1) != 1)
 				break;
 			bit++;
 			size += BM_BLOCK_SIZE;
-			if ( (BM_BLOCK_SIZE<<align) <= size) align++;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
 			i++;
 		}
 		/* if we merged some,
@@ -459,13 +549,26 @@ next_sector:
 		/* adjust very last sectors, in case we are oddly sized */
 		if (sector + (size>>9) > capacity)
 			size = (capacity-sector)<<9;
-		inc_rs_pending(mdev);
-		if (!drbd_send_drequest(mdev, RSDataRequest,
-				       sector, size, ID_SYNCER)) {
-			ERR("drbd_send_drequest() failed, aborting...\n");
-			dec_rs_pending(mdev);
-			dec_local(mdev);
-			return 0;
+		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+			switch (read_for_csum(mdev, sector, size)) {
+			case 0: /* Disk failure*/
+				dec_local(mdev);
+				return 0;
+			case 2: /* Allocation failed */
+				drbd_rs_complete_io(mdev, sector);
+				drbd_bm_set_find(mdev, BM_SECT_TO_BIT(sector));
+				goto requeue;
+			/* case 1: everything ok */
+			}
+		} else {
+			inc_rs_pending(mdev);
+			if (!drbd_send_drequest(mdev, RSDataRequest,
+					       sector, size, ID_SYNCER)) {
+				ERR("drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(mdev);
+				dec_local(mdev);
+				return 0;
+			}
 		}
 	}
 
@@ -487,7 +590,7 @@ next_sector:
 	return 1;
 }
 
-int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work* w,int cancel)
+int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	int number,i,size;
 	sector_t sector;
@@ -557,10 +660,11 @@ STATIC int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
 
 int drbd_resync_finished(struct drbd_conf *mdev)
 {
-	unsigned long db,dt,dbdt;
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
 	union drbd_state_t os, ns;
 	struct drbd_work *w;
-	int art = 0;
+	char *khelper_cmd = NULL;
 
 	/* Remove all elements from the resync LRU. Since future actions
 	 * might set bits in the (main) bitmap, then the entries in the
@@ -572,7 +676,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 		 * is not finished by now).   Retry in 100ms. */
 
 		drbd_kick_lo(mdev);
-		set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ / 10);
 		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
 		if (w) {
@@ -606,17 +710,36 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 
 	INFO("%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
 	     (os.conn == VerifyS || os.conn == VerifyT) ?
-	     "Online verify ": "Resync",
+	     "Online verify " : "Resync",
 	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
 
+	n_oos = drbd_bm_total_weight(mdev);
+
 	if (os.conn == VerifyS || os.conn == VerifyT) {
-		if (drbd_bm_total_weight(mdev)) {
+		if (n_oos) {
 			ALERT("Online verify found %lu %dk block out of sync!\n",
-			      drbd_bm_total_weight(mdev),BM_BLOCK_SIZE/1024);
-			drbd_khelper(mdev,"out-of-sync");
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
 		}
 	} else {
-		D_ASSERT((drbd_bm_total_weight(mdev)-mdev->rs_failed) == 0);
+		D_ASSERT((n_oos - mdev->rs_failed) == 0);
+
+		if (os.conn == SyncTarget || os.conn == PausedSyncT)
+			khelper_cmd = "after-resync-target";
+
+		if (mdev->csums_tfm && mdev->rs_total) {
+			const unsigned long s = mdev->rs_same_csum;
+			const unsigned long t = mdev->rs_total;
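+			/* integer percentage; for large totals divide t
+			 * first so s*100 cannot overflow an unsigned long */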
+			const int ratio =
+				(t == 0)     ? 0 :
+				(t < 100000) ? ((s*100)/t) : (s/(t/100));
+			INFO("%u %% had equal check sums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total));
+		}
 	}
 
 	if (mdev->rs_failed) {
@@ -656,26 +779,25 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 		}
 	}
 
-	art = os.conn == SyncTarget || os.conn == PausedSyncT;
-
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
 	_drbd_set_state(mdev, ns, ChgStateVerbose, NULL);
-  out_unlock:
+out_unlock:
 	spin_unlock_irq(&mdev->req_lock);
 	dec_local(mdev);
-  out:
+out:
 	mdev->rs_total  = 0;
 	mdev->rs_failed = 0;
 	mdev->rs_paused = 0;
 
 	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
-		DRBD_WARN("Writing the whole bitmap, due to failed kmalloc\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL);
+		drbd_WARN("Writing the whole bitmap, due to failed kmalloc\n");
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
 	}
 
 	drbd_bm_recount_bits(mdev);
 
-	if (art)
-		drbd_khelper(mdev, "after-resync-target");
+	if (khelper_cmd)
+		drbd_khelper(mdev, khelper_cmd);
 
 	return 1;
 }
@@ -720,7 +842,8 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 	spin_unlock_irq(&mdev->req_lock);
 
-	if (unlikely(!ok)) ERR("drbd_send_block() failed\n");
+	if (unlikely(!ok))
+		ERR("drbd_send_block() failed\n");
 	return ok;
 }
 
@@ -744,7 +867,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	if (likely(drbd_bio_uptodate(e->private_bio))) {
-		if (likely( mdev->state.pdsk >= Inconsistent )) {
+		if (likely(mdev->state.pdsk >= Inconsistent)) {
 			inc_rs_pending(mdev);
 			ok = drbd_send_block(mdev, RSDataReply, e);
 		} else {
@@ -777,10 +900,77 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 	spin_unlock_irq(&mdev->req_lock);
 
-	if (unlikely(!ok)) ERR("drbd_send_block() failed\n");
+	if (unlikely(!ok))
+		ERR("drbd_send_block() failed\n");
 	return ok;
 }
 
+int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct Tl_epoch_entry *e = (struct Tl_epoch_entry *)w;
+	struct digest_info *di;
+	int digest_size;
+	void *digest = NULL;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	drbd_rs_complete_io(mdev, e->sector);
+
+	di = (struct digest_info *)(unsigned long)e->block_id;
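+	/* for CsumRSRequest, block_id carries a pointer to the peer's digest */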
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (mdev->csums_tfm) {
+			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+			D_ASSERT(digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_KERNEL);
+		}
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(mdev, e->sector, e->size);
+			mdev->rs_same_csum++;
+			ok = drbd_send_ack(mdev, RSIsInSync, e);
+		} else {
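+			/* checksums differ: transfer the full block after all */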
+			inc_rs_pending(mdev);
+			e->block_id = ID_SYNCER;
+			ok = drbd_send_block(mdev, RSDataReply, e);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, NegRSDReply, e);
+		if (DRBD_ratelimit(5*HZ, 5))
+			ERR("Sending NegRSDReply. I guess it gets messy.\n");
+		drbd_io_error(mdev, FALSE);
+	}
+
+	dec_unacked(mdev);
+
+	kfree(di);
+
+	spin_lock_irq(&mdev->req_lock);
+	if (drbd_bio_has_active_page(e->private_bio)) {
+		/* This might happen if sendpage() has not finished */
+		list_add_tail(&e->w.list, &mdev->net_ee);
+	} else {
+		drbd_free_ee(mdev, e);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (unlikely(!ok))
+		ERR("drbd_send_block/ack() failed\n");
+	return ok;
+}
 
 int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
@@ -887,7 +1077,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	return ok;
 }
 
-
 int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	clear_bit(WORK_PENDING, &mdev->flags);
@@ -968,17 +1157,13 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	ok = drbd_send_drequest(mdev, DataRequest, req->sector, req->size,
 				(unsigned long)req);
 
-	if (ok) {
-		req_mod(req, handed_over_to_network, 0);
-	} else {
-		/* ?? we set Timeout or BrokenPipe in drbd_send() */
+	if (!ok) {
+		/* ?? we set Timeout or BrokenPipe in drbd_send();
+		 * so this is probably redundant */
 		if (mdev->state.conn >= Connected)
 			drbd_force_state(mdev, NS(conn, NetworkFailure));
-		/* req_mod(req, send_failed); we should not fail it here,
-		 * we might have to "freeze" on disconnect.
-		 * handled by req_mod(req, connection_lost_while_pending);
-		 * in drbd_fail_pending_reads soon enough. */
 	}
+	req_mod(req, ok ? handed_over_to_network : send_failed, 0);
 
 	return ok;
 }
@@ -1024,10 +1209,11 @@ STATIC int _drbd_may_sync_now(struct drbd_conf *mdev)
 			return 1;
 		odev = minor_to_mdev(odev->sync_conf.after);
 		ERR_IF(!odev) return 1;
-		if ( (odev->state.conn >= SyncSource &&
+		if ((odev->state.conn >= SyncSource &&
 		     odev->state.conn <= PausedSyncT) ||
 		    odev->state.aftr_isp || odev->state.peer_isp ||
-		    odev->state.user_isp ) return 0;
+		    odev->state.user_isp)
+			return 0;
 	}
 }
 
@@ -1035,7 +1221,7 @@ STATIC int _drbd_may_sync_now(struct drbd_conf *mdev)
  * _drbd_pause_after:
  * Finds all devices that may not resync now, and causes them to
  * pause their resynchronisation.
- * Called from process context only ( ioctl and after_state_ch ).
+ * Called from process context only (admin command and after_state_ch).
  */
 STATIC int _drbd_pause_after(struct drbd_conf *mdev)
 {
@@ -1049,8 +1235,8 @@ STATIC int _drbd_pause_after(struct drbd_conf *mdev)
 		if (odev->state.conn == StandAlone && odev->state.disk == Diskless)
 			continue;
 		if (!_drbd_may_sync_now(odev))
-			rv |= ( _drbd_set_state(_NS(odev, aftr_isp, 1), ChgStateHard, NULL)
-				!= SS_NothingToDo ) ;
+			rv |= (_drbd_set_state(_NS(odev, aftr_isp, 1), ChgStateHard, NULL)
+				!= SS_NothingToDo);
 	}
 
 	return rv;
@@ -1060,7 +1246,7 @@ STATIC int _drbd_pause_after(struct drbd_conf *mdev)
  * _drbd_resume_next:
  * Finds all devices that can resume resynchronisation
  * process, and causes them to resume.
- * Called from process context only ( ioctl and worker ).
+ * Called from process context only (admin command and worker).
  */
 STATIC int _drbd_resume_next(struct drbd_conf *mdev)
 {
@@ -1114,7 +1300,7 @@ void drbd_alter_sa(struct drbd_conf *mdev, int na)
  * drbd_start_resync:
  * @side: Either SyncSource or SyncTarget
  * Start the resync process. Called from process context only,
- * either ioctl or drbd_receiver.
+ * either admin command or drbd_receiver.
  * Note, this function might bring you directly into one of the
  * PausedSync* states.
  */
@@ -1125,7 +1311,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 
 	MTRACE(TraceTypeResync, TraceLvlSummary,
 	       INFO("Resync starting: side=%s\n",
-		    side == SyncTarget?"SyncTarget":"SyncSource");
+		    side == SyncTarget ? "SyncTarget" : "SyncSource");
 	    );
 
 	drbd_bm_recount_bits(mdev);
@@ -1178,6 +1364,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 	else /* side == SyncSource */
 		ns.pdsk = Inconsistent;
 
+	DRBD_STATE_DEBUG_INIT_VAL(ns);
 	r = _drbd_set_state(mdev, ns, ChgStateVerbose, NULL);
 	ns = mdev->state;
 
@@ -1191,6 +1378,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		mdev->rs_paused    = 0;
 		mdev->rs_start     =
 		mdev->rs_mark_time = jiffies;
+		mdev->rs_same_csum = 0;
 		_drbd_pause_after(mdev);
 	}
 	drbd_global_unlock();
@@ -1225,20 +1413,20 @@ int drbd_worker(struct Drbd_thread *thi)
 	int intr = 0, i;
 
 	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
-	set_cpus_allowed(current, drbd_calc_cpu_mask(mdev));
 
 	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
 
 		if (down_trylock(&mdev->data.work.s)) {
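+			/* queue ran empty: uncork so batched packets go out
+			 * now, and cork again after we pick up new work */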
 			down(&mdev->data.mutex);
-			if (mdev->data.socket)
-				drbd_tcp_flush(mdev->data.socket);
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
+				drbd_tcp_uncork(mdev->data.socket);
 			up(&mdev->data.mutex);
 
 			intr = down_interruptible(&mdev->data.work.s);
 
 			down(&mdev->data.mutex);
-			if (mdev->data.socket)
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
 				drbd_tcp_cork(mdev->data.socket);
 			up(&mdev->data.mutex);
 		}
@@ -1251,7 +1439,8 @@ int drbd_worker(struct Drbd_thread *thi)
 			break;
 		}
 
-		if (get_t_state(thi) != Running) break;
+		if (get_t_state(thi) != Running)
+			break;
 		/* With this break, we have done a down() but not consumed
 		   the entry from the list. The cleanup code takes care of
 		   this...   */
@@ -1277,8 +1466,8 @@ int drbd_worker(struct Drbd_thread *thi)
 		list_del_init(&w->list);
 		spin_unlock_irq(&mdev->data.work.q_lock);
 
-		if (!w->cb(mdev, w, mdev->state.conn < Connected )) {
-			/* DRBD_WARN("worker: a callback failed! \n"); */
+		if (!w->cb(mdev, w, mdev->state.conn < Connected)) {
+			/* drbd_WARN("worker: a callback failed! \n"); */
 			if (mdev->state.conn >= Connected)
 				drbd_force_state(mdev,
 						NS(conn, NetworkFailure));
@@ -1314,6 +1503,9 @@ int drbd_worker(struct Drbd_thread *thi)
 	 * device.. */
 
 	D_ASSERT(mdev->state.disk == Diskless && mdev->state.conn == StandAlone);
+	/* _drbd_set_state only uses stop_nowait.
+	 * wait here for the Exiting receiver. */
+	drbd_thread_stop(&mdev->receiver);
 	drbd_mdev_cleanup(mdev);
 
 	INFO("worker terminated\n");
diff --git a/ubuntu/drbd/drbd_wrappers.h b/ubuntu/drbd/drbd_wrappers.h
index cb8e199..74afbab 100644
--- a/ubuntu/drbd/drbd_wrappers.h
+++ b/ubuntu/drbd/drbd_wrappers.h
@@ -47,7 +47,8 @@ static inline int drbd_bio_has_active_page(struct bio *bio)
 	int i;
 
 	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (page_count(bvec->bv_page) > 1) return 1;
+		if (page_count(bvec->bv_page) > 1)
+			return 1;
 	}
 
 	return 0;
@@ -64,11 +65,11 @@ static inline int drbd_bio_has_active_page(struct bio *bio)
 #else
 #define BIO_ENDIO_TYPE void
 #define BIO_ENDIO_ARGS(b,e) (b,e)
-#define BIO_ENDIO_FN_START while(0) {}
+#define BIO_ENDIO_FN_START do {} while (0)
 #define BIO_ENDIO_FN_RETURN return
 #endif
 
-// bi_end_io handlers
+/* bi_end_io handlers */
 extern BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error);
 extern BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error);
 extern BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error);
@@ -80,6 +81,11 @@ extern BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error);
 #define kmem_cache_create(N,S,A,F,C) kmem_cache_create(N,S,A,F,C,NULL)
 #endif
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+# undef HAVE_bvec_merge_data
+# define HAVE_bvec_merge_data 1
+#endif
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
 static inline void sg_set_page(struct scatterlist *sg, struct page *page,
 			       unsigned int len, unsigned int offset)
@@ -101,6 +107,10 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
 
 #endif
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+# define BD_OPS_USE_FMODE
+#endif
+
 /*
  * used to submit our private bio
  */
@@ -152,7 +162,7 @@ static inline void drbd_plug_device(struct drbd_conf *mdev)
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
 static inline void drbd_unregister_blkdev(unsigned int major, const char *name)
 {
-	int ret = unregister_blkdev(major,name);
+	int ret = unregister_blkdev(major, name);
 	if (ret)
 		printk(KERN_ERR "drbd: unregister of device failed\n");
 }
@@ -176,7 +186,7 @@ static __inline__ int atomic_add_return(int i, atomic_t *v)
 
 static __inline__ int atomic_sub_return(int i, atomic_t *v)
 {
-	return atomic_add_return(-i,v);
+	return atomic_add_return(-i, v);
 }
 
 #define atomic_inc_return(v)  (atomic_add_return(1,v))
@@ -212,7 +222,7 @@ no_xadd: /* Legacy 386 processor */
 
 static __inline__ int atomic_sub_return(int i, atomic_t *v)
 {
-	return atomic_add_return(-i,v);
+	return atomic_add_return(-i, v);
 }
 
 #define atomic_inc_return(v)  (atomic_add_return(1,v))
@@ -251,11 +261,12 @@ crypto_alloc_hash(char *alg_name, u32 type, u32 mask)
 	struct crypto_hash *ch;
 	char *closing_bracket;
 
-	// "hmac(xxx)" is in alg_name we need that xxx.
-	closing_bracket = strchr(alg_name,')');
-	if(!closing_bracket) {
-		ch = kmalloc(sizeof(struct crypto_hash),GFP_KERNEL);
-		if(!ch) return ERR_PTR(-ENOMEM);
+	/* "hmac(xxx)" is in alg_name we need that xxx. */
+	closing_bracket = strchr(alg_name, ')');
+	if (!closing_bracket) {
+		ch = kmalloc(sizeof(struct crypto_hash), GFP_KERNEL);
+		if (!ch)
+			return ERR_PTR(-ENOMEM);
 		ch->base = crypto_alloc_tfm(alg_name, 0);
 		if (ch->base == NULL) {
 			kfree(ch);
@@ -263,10 +274,12 @@ crypto_alloc_hash(char *alg_name, u32 type, u32 mask)
 		}
 		return ch;
 	}
-	if(closing_bracket-alg_name < 6) return ERR_PTR(-ENOENT);
+	if (closing_bracket - alg_name < 6)
+		return ERR_PTR(-ENOENT);
 
-	ch = kmalloc(sizeof(struct crypto_hash),GFP_KERNEL);
-	if(!ch) return ERR_PTR(-ENOMEM);
+	ch = kmalloc(sizeof(struct crypto_hash), GFP_KERNEL);
+	if (!ch)
+		return ERR_PTR(-ENOMEM);
 
 	*closing_bracket = 0;
 	ch->base = crypto_alloc_tfm(alg_name + 5, 0);
@@ -281,7 +294,7 @@ crypto_alloc_hash(char *alg_name, u32 type, u32 mask)
 }
 
 static inline int
-crypto_hash_setkey(struct crypto_hash *hash,const u8 *key,unsigned int keylen)
+crypto_hash_setkey(struct crypto_hash *hash, const u8 *key, unsigned int keylen)
 {
 	hash->key = key;
 	hash->keylen = keylen;
@@ -294,7 +307,7 @@ crypto_hash_digest(struct hash_desc *desc, struct scatterlist *sg,
 		   unsigned int nbytes, u8 *out)
 {
 
-	crypto_hmac(desc->tfm->base, (u8*)desc->tfm->key,
+	crypto_hmac(desc->tfm->base, (u8 *)desc->tfm->key,
 		    &desc->tfm->keylen, sg, 1 /* ! */ , out);
 	/* ! this is not generic. Would need to convert nbytes -> nsg */
 
@@ -303,8 +316,9 @@ crypto_hash_digest(struct hash_desc *desc, struct scatterlist *sg,
 
 static inline void crypto_free_hash(struct crypto_hash *tfm)
 {
-	if (!tfm) return;
-	crypto_free_tfm(tfm->base); 
+	if (!tfm)
+		return;
+	crypto_free_tfm(tfm->base);
 	kfree(tfm);
 }
 
@@ -345,8 +359,9 @@ static inline int crypto_hash_final(struct hash_desc *desc, u8 *out)
 #ifdef NEED_BACKPORT_OF_KZALLOC
 static inline void *kzalloc(size_t size, int flags)
 {
-	void *rv = kmalloc(size,flags);
-	if(rv) memset(rv,0,size);
+	void *rv = kmalloc(size, flags);
+	if (rv)
+		memset(rv, 0, size);
 
 	return rv;
 }
@@ -440,3 +455,13 @@ static inline int backport_bitmap_parse(const char *buf, unsigned int buflen,
 #define KERNEL_HAS_GFP_T
 typedef unsigned gfp_t;
 #endif
+
+/* struct kvec didn't exist before 2.6.8; this is an ugly
+ * #define to work around it ... - jt */
+
+#ifndef KERNEL_HAS_KVEC
+#define kvec iovec
+#endif
diff --git a/ubuntu/drbd/linux/drbd.h b/ubuntu/drbd/linux/drbd.h
index 3dfe59b..9f89b0f 100644
--- a/ubuntu/drbd/linux/drbd.h
+++ b/ubuntu/drbd/linux/drbd.h
@@ -26,6 +26,7 @@
 #ifndef DRBD_H
 #define DRBD_H
 #include <linux/drbd_config.h>
+#include <linux/connector.h>
 
 #include <asm/types.h>
 
@@ -37,7 +38,7 @@
 #include <sys/wait.h>
 #include <limits.h>
 
-/* Altough the Linux source code makes a difference between 
+/* Although the Linux source code makes a difference between
    generic endiness and the bitfields' endianess, there is no
    architecture as of Linux-2.6.24-rc4 where the bitfileds' endianess
    does not match the generic endianess. */
@@ -139,6 +140,7 @@ enum ret_codes {
 	CSUMSResyncRunning,
 	VERIFYIsRunning,
 	DataOfWrongCurrent,
+	MayNotBeConnected,
 
 	/* insert new ones above this line */
 	AfterLastRetCode,
@@ -188,9 +190,7 @@ enum drbd_conns {
 	WFBitMapT,
 	WFSyncUUID,
 
-	/* The distance between original state and pause
-	 * state must be the same for source and target. (+2)
-	 * All SyncStates are tested with this comparison
+	/* All SyncStates are tested with this comparison
 	 * xx >= SyncSource && xx <= PausedSyncT */
 	SyncSource,
 	SyncTarget,
@@ -226,30 +226,37 @@ union drbd_state_t {
  */
 	struct {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-		unsigned role : 2 ;   /* 3/4	 primary/secondary/unknown */
-		unsigned peer : 2 ;   /* 3/4	 primary/secondary/unknown */
-		unsigned conn : 5 ;   /* 17/32	 cstates */
-		unsigned disk : 4 ;   /* 8/16	 from Diskless to UpToDate */
-		unsigned pdsk : 4 ;   /* 8/16	 from Diskless to UpToDate */
-		unsigned susp : 1 ;   /* 2/2	 IO suspended  no/yes */
-		unsigned aftr_isp : 1 ; /* isp .. imposed sync pause */
-		unsigned peer_isp : 1 ;
-		unsigned user_isp : 1 ;
-		unsigned _pad : 11;   /* 0	 unused */
+		unsigned role:2;   /* 3/4	 primary/secondary/unknown */
+		unsigned peer:2;   /* 3/4	 primary/secondary/unknown */
+		unsigned conn:5;   /* 17/32	 cstates */
+		unsigned disk:4;   /* 8/16	 from Diskless to UpToDate */
+		unsigned pdsk:4;   /* 8/16	 from Diskless to UpToDate */
+		unsigned susp:1;   /* 2/2	 IO suspended  no/yes */
+		unsigned aftr_isp:1; /* isp .. imposed sync pause */
+		unsigned peer_isp:1;
+		unsigned user_isp:1;
+		unsigned _pad:11;   /* 0	 unused */
 #elif defined(__BIG_ENDIAN_BITFIELD)
-		unsigned _pad : 11;   /* 0	 unused */
-		unsigned user_isp : 1 ;
-		unsigned peer_isp : 1 ;
-		unsigned aftr_isp : 1 ; /* isp .. imposed sync pause */
-		unsigned susp : 1 ;   /* 2/2	 IO suspended  no/yes */
-		unsigned pdsk : 4 ;   /* 8/16	 from Diskless to UpToDate */
-		unsigned disk : 4 ;   /* 8/16	 from Diskless to UpToDate */
-		unsigned conn : 5 ;   /* 17/32	 cstates */
-		unsigned peer : 2 ;   /* 3/4	 primary/secondary/unknown */
-		unsigned role : 2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned _pad:11;   /* 0	 unused */
+		unsigned user_isp:1;
+		unsigned peer_isp:1;
+		unsigned aftr_isp:1; /* isp .. imposed sync pause */
+		unsigned susp:1;   /* 2/2	 IO suspended  no/yes */
+		unsigned pdsk:4;   /* 8/16	 from Diskless to UpToDate */
+		unsigned disk:4;   /* 8/16	 from Diskless to UpToDate */
+		unsigned conn:5;   /* 17/32	 cstates */
+		unsigned peer:2;   /* 3/4	 primary/secondary/unknown */
+		unsigned role:2;   /* 3/4	 primary/secondary/unknown */
 #else
 # error "this endianness is not supported"
 #endif
+#ifndef DRBD_DEBUG_STATE_CHANGES
+#define DRBD_DEBUG_STATE_CHANGES 0
+#endif
+#if DRBD_DEBUG_STATE_CHANGES
+		unsigned int line;
+		const char *func;
+#endif
 	};
 	unsigned int i;
 };
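
Since the bitfields and the plain "unsigned int i" share storage, a whole
device state can be snapshotted, tweaked field by field, and compared in
a single integer operation. A hedged sketch of the idiom (variable names
are mine; the real transition logic lives in drbd_main.c):

	union drbd_state_t os, ns;

	os.i = mdev->state.i;	/* grab the entire packed state at once */
	ns = os;
	ns.conn = Connected;	/* adjust one field */
	if (ns.i != os.i)
		;		/* only then is a transition needed */
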
@@ -278,9 +285,9 @@ enum set_st_err {
 	SS_NotSupported = -17,      /* drbd-8.2 only */
 	SS_InTransientState = -18,  /* Retry after the next state change */
 	SS_ConcurrentStChg = -19,   /* Concurrent cluster side state change! */
+	SS_AfterLastError = -20,    /* Keep this at bottom */
 };
 
-
 /* from drbd_strings.c */
 extern const char *conns_to_name(enum drbd_conns);
 extern const char *roles_to_name(enum drbd_role);
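
The new SS_AfterLastError sentinel makes range checks on state-change
error codes possible. A minimal sketch (the helper is mine, not part of
the patch):

	/* valid error codes run from -1 down to SS_AfterLastError + 1 */
	static inline int ss_err_valid(int err)
	{
		return err < 0 && err > SS_AfterLastError;
	}
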
@@ -335,8 +342,14 @@ enum UuidIndex {
 
 /* The following line should be moved over to linux/connector.h
  * when the time comes */
-//#define CN_IDX_DRBD			0x5
-//#define CN_VAL_DRBD			0x1
+#ifndef CN_IDX_DRBD
+# define CN_IDX_DRBD			0x6
+/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
+#endif
+#define CN_VAL_DRBD			0x1
+
+/* For searching a vacant cn_idx value */
+#define CN_IDX_STEP			6977
 
 struct drbd_nl_cfg_req {
 	int packet_type;
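
CN_IDX_STEP documents the stride used when probing for a vacant
connector index; since 6977 is odd, repeated stepping modulo 2^16
eventually visits every index. A hedged sketch of the idea (the in-use
predicate is hypothetical; the real search lives in the netlink setup
code, not in this hunk):

	unsigned int idx = CN_IDX_DRBD;

	while (cn_idx_in_use(idx))		/* hypothetical test */
		idx = (idx + CN_IDX_STEP) & 0xffff;
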
diff --git a/ubuntu/drbd/linux/drbd_config.h b/ubuntu/drbd/linux/drbd_config.h
index 731d3f1..285bafa 100644
--- a/ubuntu/drbd/linux/drbd_config.h
+++ b/ubuntu/drbd/linux/drbd_config.h
@@ -22,10 +22,10 @@
 
 extern const char *drbd_buildtag(void);
 
-#define REL_VERSION "8.2.6"
+#define REL_VERSION "8.3.0"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 88
+#define PRO_VERSION_MAX 89
 
 #ifndef __CHECKER__   /* for a sparse run, we need all STATICs */
 #define DBG_ALL_SYMBOLS /* no static functs, improves quality of OOPS traces */
@@ -86,4 +86,14 @@ extern const char *drbd_buildtag(void);
 //#define NEED_SG_SET_BUF
 #define HAVE_LINUX_SCATTERLIST_H
 
+/* Some vendor kernels < 2.6.7 might define msleep in one
+ * way or another. */
+
+#define KERNEL_HAS_MSLEEP
+
+/* Some kernels < 2.6.8 do not have struct kvec,
+ * others do. */
+
+#define KERNEL_HAS_KVEC
+
 #endif
diff --git a/ubuntu/drbd/linux/drbd_limits.h b/ubuntu/drbd/linux/drbd_limits.h
index 7388aa9..7e286d1 100644
--- a/ubuntu/drbd/linux/drbd_limits.h
+++ b/ubuntu/drbd/linux/drbd_limits.h
@@ -65,8 +65,8 @@
 #define DRBD_MAX_EPOCH_SIZE_DEF 2048
 
  /* I don't think that a TCP send buffer of more than 10M is useful */
-#define DRBD_SNDBUF_SIZE_MIN  1
-#define DRBD_SNDBUF_SIZE_MAX  10000000
+#define DRBD_SNDBUF_SIZE_MIN  0
+#define DRBD_SNDBUF_SIZE_MAX  (10<<20)
 #define DRBD_SNDBUF_SIZE_DEF  (2*65535)
 
   /* @4k PageSize -> 128kB - 512MB */
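
Two observations on the sndbuf change above (my reading, not from the
patch description): the new minimum of 0 presumably pairs with the
auto_sndbuf_size bit added to net_conf in drbd_nl.h further down, with 0
meaning "let the kernel autotune"; and (10<<20) = 10 * 2^20 = 10485760
bytes, i.e. a true 10M, where the old literal 10000000 fell slightly
short.
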
@@ -89,7 +89,8 @@
 /* syncer { */
   /* FIXME allow rate to be zero? */
 #define DRBD_RATE_MIN 1
-#define DRBD_RATE_MAX 700000
+/* allow for channel bonding of 10 GbE links, or other fast hardware */
+#define DRBD_RATE_MAX (4 << 20)
 #define DRBD_RATE_DEF 250  /* kb/second */
 
  /* less than 7 would hit performance unnecessarily.
@@ -107,10 +108,11 @@
 
 /* drbdsetup XY resize -d Z
  * you are free to reduce the device size to nothing, if you want to.
- * but more than 3998G are currently not possible */
+ * but the upper limit, with a 64bit kernel, enough RAM and flexible
+ * meta data, is currently 16 TB. */
 /* DRBD_MAX_SECTORS */
 #define DRBD_DISK_SIZE_SECT_MIN  0
-#define DRBD_DISK_SIZE_SECT_MAX  ((128LLU*1024*2 - 72)*512LLU*8*8)
+#define DRBD_DISK_SIZE_SECT_MAX  (16 * (2LLU << 30))
 #define DRBD_DISK_SIZE_SECT_DEF  0 /* = disabled = no user size... */
 
 #define DRBD_ON_IO_ERROR_DEF PassOn
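
A quick sanity check of the new maximum (my arithmetic, not part of the
patch): (2LLU << 30) is 2^31, so the limit is 16 * 2^31 = 2^35 sectors;
at 512 bytes per sector that is 2^35 * 2^9 = 2^44 bytes = 16 TiB,
matching the "16 TB" in the comment above.
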
diff --git a/ubuntu/drbd/linux/drbd_nl.h b/ubuntu/drbd/linux/drbd_nl.h
index b89a575..210fd0e 100644
--- a/ubuntu/drbd/linux/drbd_nl.h
+++ b/ubuntu/drbd/linux/drbd_nl.h
@@ -29,9 +29,12 @@ NL_PACKET(disk_conf, 3,
 	NL_BIT(		54,	T_MAY_IGNORE,	no_md_flush)
 	  /*  55 max_bio_size was available in 8.2.6rc2 */
 	NL_INTEGER(	56,	T_MAY_IGNORE,	max_bio_bvecs)
+	NL_BIT(		57,	T_MAY_IGNORE,	no_disk_barrier)
+	NL_BIT(		58,	T_MAY_IGNORE,	no_disk_drain)
 )
 
 NL_PACKET(detach, 4, )
+
 NL_PACKET(net_conf, 5,
 	NL_STRING(	8,	T_MANDATORY,	my_addr,	128)
 	NL_STRING(	9,	T_MANDATORY,	peer_addr,	128)
@@ -52,9 +55,13 @@ NL_PACKET(net_conf, 5,
 	NL_INTEGER(	26,	T_MAY_IGNORE,	after_sb_2p)
 	NL_INTEGER(	39,	T_MAY_IGNORE,	rr_conflict)
 	NL_INTEGER(	40,	T_MAY_IGNORE,	ping_timeo)
+	  /* 59 addr_family was available in GIT, never released */
+	NL_BIT(		60,	T_MANDATORY,	mind_af)
 	NL_BIT(		27,	T_MAY_IGNORE,	want_lose)
 	NL_BIT(		28,	T_MAY_IGNORE,	two_primaries)
 	NL_BIT(		41,	T_MAY_IGNORE,	always_asbp)
+	NL_BIT(		61,	T_MAY_IGNORE,	no_cork)
+	NL_BIT(		62,	T_MANDATORY,	auto_sndbuf_size)
 )
 
 NL_PACKET(disconnect, 6, )
@@ -69,6 +76,7 @@ NL_PACKET(syncer_conf, 8,
 	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
 	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
+	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
 )
 
 NL_PACKET(invalidate, 9, )
@@ -96,8 +104,7 @@ NL_PACKET(call_helper, 20,
 	NL_STRING(	38,	T_MAY_IGNORE,	helper,		32)
 )
 
-/* Tag nr 42 already allocated in drbd-8.1 development.
- * Packet numbers 21 and 22 already in drbd-8.1 development. */
+/* Tag nr 42 already allocated in drbd-8.1 development. */
 
 NL_PACKET(sync_progress, 23,
 	NL_INTEGER(	43,	T_MAY_IGNORE,	sync_progress)
@@ -115,6 +122,10 @@ NL_PACKET(dump_ee, 24,
 NL_PACKET(start_ov, 25,
 )
 
+NL_PACKET(new_c_uuid, 26,
+	NL_BIT(		63,	T_MANDATORY,	clear_bm)
+)
+
 #undef NL_PACKET
 #undef NL_INTEGER
 #undef NL_INT64
diff --git a/ubuntu/drbd/linux/drbd_tag_magic.h b/ubuntu/drbd/linux/drbd_tag_magic.h
index 652f2f5..fcdff84 100644
--- a/ubuntu/drbd/linux/drbd_tag_magic.h
+++ b/ubuntu/drbd/linux/drbd_tag_magic.h
@@ -23,7 +23,7 @@ enum packet_types {
 #define NL_INT64(pn, pr, member)		\
 	__u64 member; int tag_and_len ## member;
 #define NL_BIT(pn, pr, member)		\
-	unsigned char member : 1; int tag_and_len ## member;
+	unsigned char member:1; int tag_and_len ## member;
 #define NL_STRING(pn, pr, member, len)	\
 	unsigned char member[len]; int member ## _len; \
 	int tag_and_len ## member;
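
For reference, my own expansion of the adjusted macro, using a field
declared in drbd_nl.h above:

	NL_BIT(63, T_MANDATORY, clear_bm)

becomes, via token pasting:

	unsigned char clear_bm:1; int tag_and_lenclear_bm;
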
diff --git a/ubuntu/drbd/lru_cache.c b/ubuntu/drbd/lru_cache.c
index 4fe0560..d26b333 100644
--- a/ubuntu/drbd/lru_cache.c
+++ b/ubuntu/drbd/lru_cache.c
@@ -65,7 +65,7 @@ static inline void lc_init(struct lru_cache *lc,
 		e = lc_entry(lc, i);
 		e->lc_number = LC_FREE;
 		list_add(&e->list, &lc->free);
-		// memset(,0,) did the rest of init for us
+		/* memset(,0,) did the rest of init for us */
 	}
 }
 
@@ -81,7 +81,7 @@ struct lru_cache *lc_alloc(const char *name, unsigned int e_count,
 	size_t bytes;
 
 	BUG_ON(!e_count);
-	e_size = max(sizeof(struct lc_element),e_size);
+	e_size = max(sizeof(struct lc_element), e_size);
 	bytes = size_of_lc(e_count, e_size);
 	lc = vmalloc(bytes);
 	if (lc)
@@ -109,7 +109,7 @@ void lc_reset(struct lru_cache *lc)
 			lc->nr_elements, lc->element_size, lc->lc_private);
 }
 
-size_t	lc_printf_stats(struct seq_file *seq, struct lru_cache* lc)
+size_t	lc_printf_stats(struct seq_file *seq, struct lru_cache *lc)
 {
 	/* NOTE:
 	 * total calls to lc_get are
@@ -155,7 +155,8 @@ STATIC struct lc_element *lc_evict(struct lru_cache *lc)
 	struct list_head  *n;
 	struct lc_element *e;
 
-	if (list_empty(&lc->lru)) return NULL;
+	if (list_empty(&lc->lru))
+		return NULL;
 
 	n = lc->lru.prev;
 	e = list_entry(n, struct lc_element, list);
@@ -189,7 +190,8 @@ STATIC struct lc_element *lc_get_unused_element(struct lru_cache *lc)
 {
 	struct list_head *n;
 
-	if (list_empty(&lc->free)) return lc_evict(lc);
+	if (list_empty(&lc->free))
+		return lc_evict(lc);
 
 	n = lc->free.next;
 	list_del(n);
@@ -198,8 +200,10 @@ STATIC struct lc_element *lc_get_unused_element(struct lru_cache *lc)
 
 STATIC int lc_unused_element_available(struct lru_cache *lc)
 {
-	if (!list_empty(&lc->free)) return 1; /* something on the free list */
-	if (!list_empty(&lc->lru)) return 1;  /* something to evict */
+	if (!list_empty(&lc->free))
+		return 1; /* something on the free list */
+	if (!list_empty(&lc->lru))
+		return 1;  /* something to evict */
 
 	return 0;
 }
@@ -368,7 +372,7 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
 	e->lc_number = enr;
 
 	hlist_del_init(&e->colision);
-	hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc, enr) );
+	hlist_add_head(&e->colision, lc->slot + lc_hash_fn(lc, enr));
 	list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
 }
 
@@ -376,7 +380,7 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
  * lc_dump: Dump a complete LRU cache to seq in textual form.
  */
 void lc_dump(struct lru_cache *lc, struct seq_file *seq, char *utext,
-	     void (*detail) (struct seq_file *, struct lc_element *) )
+	     void (*detail) (struct seq_file *, struct lc_element *))
 {
 	unsigned int nr_elements = lc->nr_elements;
 	struct lc_element *e;
@@ -386,11 +390,11 @@ void lc_dump(struct lru_cache *lc, struct seq_file *seq, char *utext,
 	for (i = 0; i < nr_elements; i++) {
 		e = lc_entry(lc, i);
 		if (e->lc_number == LC_FREE) {
-			seq_printf(seq, "\t%2d: FREE\n", i );
+			seq_printf(seq, "\t%2d: FREE\n", i);
 		} else {
 			seq_printf(seq, "\t%2d: %4u %4u    ", i,
 				   e->lc_number,
-				   e->refcnt );
+				   e->refcnt);
 			detail(seq, e);
 		}
 	}
diff --git a/ubuntu/drbd/lru_cache.h b/ubuntu/drbd/lru_cache.h
index 35fe2be..d56efc2 100644
--- a/ubuntu/drbd/lru_cache.h
+++ b/ubuntu/drbd/lru_cache.h
@@ -116,7 +116,7 @@ struct seq_file;
 extern size_t lc_printf_stats(struct seq_file *seq, struct lru_cache *lc);
 
 void lc_dump(struct lru_cache *lc, struct seq_file *seq, char *utext,
-	     void (*detail) (struct seq_file *, struct lc_element *) );
+	     void (*detail) (struct seq_file *, struct lc_element *));
 
 /* This can be used to stop lc_get from changing the set of active elements.
  * Note that the reference counts and order on the lru list may still change.
@@ -136,12 +136,12 @@ static inline void lc_unlock(struct lru_cache *lc)
 static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
 {
 	struct lc_element *e = lc_find(lc, enr);
-	return (e && e->refcnt);
+	return e && e->refcnt;
 }
 
 #define LC_FREE (-1U)
 
-#define lc_e_base(lc)  ((char *) ( (lc)->slot + (lc)->nr_elements ) )
+#define lc_e_base(lc)  ((char *)((lc)->slot + (lc)->nr_elements))
 #define lc_entry(lc, i) ((struct lc_element *) \
 		       (lc_e_base(lc) + (i)*(lc)->element_size))
 #define lc_index_of(lc, e) (((char *)(e) - lc_e_base(lc))/(lc)->element_size)
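
Since the element table sits directly after the hash slots, lc_entry()
and lc_index_of() are exact inverses. A minimal sketch (assuming a valid
lru_cache *lc with at least six elements):

	struct lc_element *e = lc_entry(lc, 5);	/* base + 5 * element_size */

	BUG_ON(lc_index_of(lc, e) != 5);	/* (e - base) / element_size */
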
-- 
1.6.0.4

