From 723ebcb9f9803cd6a62bbb85fbeb1206d0e63c64 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 19:06:03 +0200 Subject: [PATCH 01/19] changed android@localhost to dzo@martin --- scripts/mkcompile_h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 41bc7a236..d5428594f 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -75,8 +75,8 @@ UTS_TRUNCATE="cut -b -$UTS_LEN" #/* < DTS2011052606009 jiaxianghong 20110527 begin */ #/* < DTS2011030103387 niguodong 20110415 begin */ - echo \#define LINUX_COMPILE_BY \"android\" - echo \#define LINUX_COMPILE_HOST \"localhost\" + echo \#define LINUX_COMPILE_BY \"dzo\" + echo \#define LINUX_COMPILE_HOST \"martin\" #/* DTS2011030103387 niguodong 20110415 end > */ #/* < DTS2011052606009 jiaxianghong 20110527 end */ From cff6a3e34c3ff37f8b0cb4d0c6584a6269885c09 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:04:29 +0200 Subject: [PATCH 02/19] updated ext3 and ext4 filesystems from HTC G2 CM9 kernel --- fs/ext3/ialloc.c | 8 +++-- fs/ext3/inode.c | 41 +++++++++++++++++++---- fs/ext4/Kconfig | 5 +++ fs/ext4/balloc.c | 3 +- fs/ext4/bitmap.c | 8 ++--- fs/ext4/ext4.h | 16 +++++++-- fs/ext4/ext4_jbd2.h | 56 ++++++++++++++++---------------- fs/ext4/extents.c | 10 ++++-- fs/ext4/ialloc.c | 17 ++++++---- fs/ext4/inode.c | 66 ++++++++++++++++++++++++------------- fs/ext4/ioctl.c | 13 +++++--- fs/ext4/mballoc.c | 26 +++++++++++++-- fs/ext4/namei.c | 8 ++--- fs/ext4/page-io.c | 30 ++++++++++++++++- fs/ext4/super.c | 79 ++++++++++++++++++++++++++++++++++++--------- fs/ext4/xattr.c | 13 ++++++-- 16 files changed, 296 insertions(+), 103 deletions(-) mode change 100755 => 100644 fs/ext4/ialloc.c diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc436..0b3da7cc8 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -561,8 +561,12 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23c3..0aedb27fe 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1568,7 +1568,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1642,7 +1648,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1684,7 +1696,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. 
The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; @@ -2995,6 +3013,8 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; again: /* we can't allow multiple procs in here at once, its a bit racey */ @@ -3032,7 +3052,11 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -3048,8 +3072,11 @@ static int ext3_do_update_inode(handle_t *handle, if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -3102,6 +3129,8 @@ static int ext3_do_update_inode(handle_t *handle, ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9ed1bb1f3..8a595e01f 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -83,3 +83,8 @@ config EXT4_DEBUG If you select Y here, then you will be able to turn on debugging with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + +config EXT4_E2FSCK_RECOVER + bool "EXT4 e2fsck recovery support" + depends on EXT4_FS + default n diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f69495..ebe95f565 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -514,7 +514,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) if (bitmap_bh == NULL) continue; - x = ext4_count_free(bitmap_bh, sb->s_blocksize); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_BLOCKS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_blks_count(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81ac..012faaaec 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,15 +15,13 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { unsigned int i, sum = 0; - if (!map) - return 0; for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; + sum += nibblemap[bitmap[i] & 0xf] + + nibblemap[(bitmap[i] >> 4) & 0xf]; return sum; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 354619a1a..7fb3bc7d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -175,6 +175,7 @@ struct 
mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 struct ext4_io_page { struct page *p_page; @@ -357,8 +358,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) @@ -1215,6 +1215,12 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + /* workqueue for rebooting oem-22 to run e2fsck */ + struct work_struct reboot_work; + struct workqueue_struct *recover_wq; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1713,7 +1719,7 @@ struct mmpd_data { # define NORET_AND noreturn, /* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, @@ -2178,6 +2184,10 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +extern void ext4_e2fsck(struct super_block *sb); +#endif + /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1da..95af6f878 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,43 +261,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, 
EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f3aacb320..57694395c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -341,6 +341,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -1993,7 +1995,11 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; - BUG_ON(len == 0); + WARN_ON(len == 0); + if (len == 0) { + EXT4_ERROR_INODE(inode, "extent.ee_len = 0"); + return; + } spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; @@ -2844,7 +2850,7 @@ static int ext4_split_extent_at(handle_t *handle, if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); + ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(inode, path, ex); err = ext4_ext_dirty(handle, inode, path + depth); goto out; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c old mode 100755 new mode 100644 index eaa10168e..29272de30 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1021,8 +1021,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -1189,7 +1193,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; @@ -1275,10 +1280,10 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, * used inodes so we need to skip blocks with used inodes in * inode table. 
*/ -/* yanzhijun for remount system */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = (EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp))/sbi->s_inodes_per_block; + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { ext4_error(sb, "Something is wrong with group %u\n" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c94774c32..18fee6dae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -190,9 +190,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); ext4_ioend_wait(inode); if (inode->i_nlink) { @@ -1137,6 +1134,15 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } + if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " + "with only %d reserved metadata blocks\n", __func__, + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks); + WARN_ON(1); + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; + } + /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; @@ -2129,8 +2135,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* skip page if block allocation undone */ - if (buffer_delay(bh) || buffer_unwritten(bh)) + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; @@ -3212,13 +3221,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -3234,7 +3244,7 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) { @@ -3510,12 +3520,17 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); - else { + } else { ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, @@ -3913,18 +3928,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if 
(ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e7..4cbe1c2c9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -35,7 +35,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) handle_t *handle = NULL; int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; if (!inode_owner_or_capable(inode)) @@ -112,9 +112,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; - ei->i_flags = flags; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0f1be7f16..67ee3c92b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -335,6 +335,9 @@ * object * */ +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +extern int ext4_debug_level; +#endif static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_ext_cachep; @@ -1315,6 +1318,12 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_free_blocks_double(inode, e4b, first, count); e4b->bd_info->bb_free += count; +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: (%s) group %d free %d block, remain %d\n", + sb->s_id, e4b->bd_group, count, + e4b->bd_info->bb_free); +#endif if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; @@ -1460,6 +1469,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_mark_used_double(e4b, start, len); e4b->bd_info->bb_free -= len; +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: (%s) group %d mark %d block, remain %d\n", + e4b->bd_sb->s_id, e4b->bd_group, len, + e4b->bd_info->bb_free); +#endif if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; @@ -2255,6 +2270,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i]->bb_free = ext4_free_blks_count(sb, desc); } +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: group %d bb_free %d\n", + group, meta_group_info[i]->bb_free); +#endif INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); @@ -2528,6 +2548,9 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); 
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2575,8 +2598,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -4583,6 +4604,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, */ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); if (!new_entry) { + ext4_mb_unload_buddy(&e4b); err = -ENOMEM; goto error_return; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 458a394f6..3d36d5a1e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1589,7 +1589,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1615,7 +1615,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1866,7 +1866,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); @@ -2540,7 +2540,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97e5e98fd..d99d74aca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; - mutex_lock(&inode->i_mutex); + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. + */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } ret = ext4_end_io_nolock(io); if (ret < 0) { mutex_unlock(&inode->i_mutex); @@ -389,6 +405,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, block_end = block_start + blocksize; if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. 
"A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." + */ + zero_user_segment(page, block_start, block_end); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 111ed9d3c..e3bcdf645 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,6 +43,9 @@ #include #include +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +#include +#endif #include "ext4.h" #include "ext4_jbd2.h" @@ -83,6 +86,10 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +int ext4_debug_level = 0; +#endif + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -419,6 +426,11 @@ static void ext4_handle_error(struct super_block *sb) if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); + +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (!ext4_debug_level && test_opt(sb, ERRORS_RO)) + ext4_e2fsck(sb); +#endif } void __ext4_error(struct super_block *sb, const char *function, @@ -433,10 +445,47 @@ void __ext4_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); + save_error_info(sb, function, line); ext4_handle_error(sb); } +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +static void ext4_reboot(struct work_struct *work) +{ + printk(KERN_ERR "%s: reboot to run e2fsck\n", __func__); + kernel_restart("oem-22"); +} + +void ext4_e2fsck(struct super_block *sb) +{ + static int reboot; + struct workqueue_struct *wq; + struct ext4_sb_info *sb_info; + if (reboot) + return; + printk(KERN_ERR "%s\n", __func__); + reboot = 1; + sb_info = EXT4_SB(sb); + if (!sb_info) { + printk(KERN_ERR "%s: no sb_info\n", __func__); + reboot = 0; + return; + } + sb_info->recover_wq = create_workqueue("ext4-recover"); + if (!sb_info->recover_wq) { + printk(KERN_ERR "EXT4-fs: failed to create recover workqueue\n"); + reboot = 0; + return; + } + + INIT_WORK(&sb_info->reboot_work, ext4_reboot); + wq = sb_info->recover_wq; + /* queue the work to reboot */ + queue_work(wq, &sb_info->reboot_work); +} +#endif + void ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) 
@@ -859,6 +908,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -1113,9 +1163,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",block_validity"); if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_inode_table"); + seq_puts(seq, ",noinit_itable"); else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_inode_table=%u", + seq_printf(seq, ",init_itable=%u", (unsigned) sbi->s_li_wait_mult); ext4_show_quota_options(seq, sb); @@ -1291,8 +1341,7 @@ enum { Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, - Opt_discard, Opt_nodiscard, - Opt_init_inode_table, Opt_noinit_inode_table, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, }; static const match_table_t tokens = { @@ -1365,9 +1414,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, - {Opt_init_inode_table, "init_itable=%u"}, - {Opt_init_inode_table, "init_itable"}, - {Opt_noinit_inode_table, "noinit_itable"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1844,7 +1893,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_dioread_lock: clear_opt(sb, DIOREAD_NOLOCK); break; - case Opt_init_inode_table: + case Opt_init_itable: set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) @@ -1855,7 +1904,7 @@ static int parse_options(char *options, struct super_block *sb, return 0; sbi->s_li_wait_mult = option; break; - case Opt_noinit_inode_table: + case Opt_noinit_itable: clear_opt(sb, INIT_INODE_TABLE); break; default: @@ -1958,17 +2007,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + @@ -3314,7 +3362,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; - sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_itb_per_group = (sbi->s_inodes_per_group + sbi->s_inodes_per_block - 1)/ sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); sbi->s_sbh = bh; @@ -3620,7 +3668,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > 
EXT4_GOOD_OLD_INODE_SIZE) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc97..c2865cc31 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -487,18 +487,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -820,8 +821,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; From 862a013432daeb9a7902c16e76193060a9db3274 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:06:37 +0200 Subject: [PATCH 03/19] updated kgsl and adreno drivers from HTC G2 CM9 kernel --- drivers/gpu/msm/adreno.c | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.c diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100755 new mode 100644 index 36b6e82db..8320003d2 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,6 +114,7 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ + .ib_check_level = 0, }; @@ -273,9 +274,12 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ + /* + * Fix target freeze issue by adding TLB flush for each submit + * on A20X based targets. + */ if (adreno_is_a20x(adreno_dev)) flags |= KGSL_MMUFLAGS_TLBFLUSH; - /* * If possible, then set the state via the command stream to avoid * a CPU idle. 
Otherwise, use the default setstate which uses register @@ -641,6 +645,8 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; + struct kgsl_context *context; + int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -709,6 +715,24 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; + /* + * Set the reset status of all contexts to + * INNOCENT_CONTEXT_RESET_EXT except for the bad context + * since thats the guilty party + */ + while ((context = idr_get_next(&device->context_idr, &next))) { + if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != + context->reset_status) { + if (context->devctxt != drawctxt) + context->reset_status = + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; + else + context->reset_status = + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; + } + next = next + 1; + } + /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -871,15 +895,13 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ + msecs_to_jiffies(adreno_dev->wait_timeout); unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -887,8 +909,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? 
((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -901,7 +921,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } - /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -968,7 +987,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -995,8 +1014,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, - sizeof(unsigned int)); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1040,7 +1058,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - const struct kgsl_memdesc *memdesc; + struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); From 4851316e18de7cbb5cb40b371171a793df974864 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:12:34 +0200 Subject: [PATCH 04/19] updated kgsl and adreno drivers from HTC G2 CM9 kernel (2) --- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 +++++-- drivers/gpu/msm/adreno_debugfs.c | 2 + drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 ++- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 ++++++++++++++++- drivers/gpu/msm/adreno_snapshot.c | 129 ++++++++-- drivers/gpu/msm/kgsl.c | 275 ++++++++++++---------- drivers/gpu/msm/kgsl.h | 63 +++-- drivers/gpu/msm/kgsl_cffdump.c | 184 --------------- drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 12 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++--- drivers/gpu/msm/kgsl_pwrctrl.h | 2 - drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++++++++---- drivers/gpu/msm/kgsl_sharedmem.h | 68 ++++-- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 +++-- include/linux/msm_kgsl.h | 13 +- 24 files changed, 910 insertions(+), 534 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.h mode change 100755 => 100644 drivers/gpu/msm/adreno_a2xx.c mode change 100755 => 100644 drivers/gpu/msm/adreno_postmortem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_gpummu.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.h mode change 100755 => 100644 drivers/gpu/msm/z180.c mode change 100755 => 100644 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100755 
new mode 100644 index af5bf51ea..1259507d9 --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,6 +46,8 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 +#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 + enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -74,12 +76,16 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; + unsigned int ib_check_level; }; struct adreno_gpudev { + /* keeps track of when we need to execute the draw workaround code */ + int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); + void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -99,7 +105,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100755 new mode 100644 index 5ce9cf85b..62628e4a3 --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,11 +1421,61 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } +static void a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) +{ + struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[11]; + unsigned int *cmds = &cmd[0]; + + if (adreno_is_a225(adreno_dev)) { + adreno_dev->gpudev->ctx_switches_since_last_draw++; + /* If there have been > than + * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context + * switches w/o gmem being saved then we need to execute + * this workaround */ + if (adreno_dev->gpudev->ctx_switches_since_last_draw > + ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; + else + return; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. + */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + } else { + /* On Adreno 20x/220, if the events for shader space reuse + * gets dropped, the CP block would wait indefinitely. + * Sending CP_SET_SHADER_BASES packet unblocks the CP from + * this wait. 
+ */ + *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); + *cmds++ = adreno_encode_istore_size(adreno_dev) + | adreno_dev->pix_shader_start; + } + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], cmds - cmd); +} + static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[22]; if (context == NULL) return; @@ -1470,33 +1520,11 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a225(adreno_dev)) { - unsigned int *cmds = &cmd[0]; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. - */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], 11); - } + } else if (adreno_is_a2xx(adreno_dev)) + a2xx_drawctxt_workaround(adreno_dev); } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1757,6 +1785,7 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, + .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index c1b9e4ce2..566efa1aa 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,6 +345,8 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); + debugfs_create_u32("ib_check", 0644, device->d_debugfs, + &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index 206a678ee..f0b5741b5 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,8 +243,13 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) + if (adreno_dev->drawctxt_active == drawctxt) { + if (adreno_dev->gpudev->ctxt_draw_workaround && + adreno_is_a225(adreno_dev)) + adreno_dev->gpudev->ctxt_draw_workaround( + adreno_dev); return; + } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 8aea58c95..454b05785 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,11 +29,6 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 -/* indirect buffer dispatch. 
prefetch parser uses this packet type to determine -* whether to pre-fetch the IB -*/ -#define CP_INDIRECT_BUFFER 0x3f - /* indirect buffer dispatch. same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -117,6 +112,9 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e +/* (A2x) sets binning configuration registers */ +#define CP_SET_BIN_DATA 0x2f + /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -157,6 +155,16 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ +/* + * for a3xx + */ + +/* Conditionally load a IB based on a flag */ +#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ +#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ + +/* Load a buffer with pre-fetch enabled */ +#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -178,11 +186,20 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) +#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) + +#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) +#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) + +#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) + +#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) +#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) +#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -190,4 +207,14 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) + +/* Return 1 if the command is an indirect buffer of any kind */ +static inline int adreno_cmd_is_ib(unsigned int cmd) +{ + return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); +} + #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100755 new mode 100644 index 40dfb30cf..63f5caa91 --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER, "IND_BUF_"}, + {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(value)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; 
dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,8 +654,7 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (this_cmd == cp_type3_packet( - CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index ea2889b3a..57883fd45 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,6 +22,7 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" +#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -310,12 +311,10 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); - /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); - /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -554,6 +553,197 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } +static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, + int sizedwords); + +static bool +_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int opcode = cp_type3_opcode(*hostaddr); + switch (opcode) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFD: + return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); + case CP_NOP: + case CP_WAIT_FOR_IDLE: + case CP_WAIT_REG_MEM: + case CP_WAIT_REG_EQ: + case CP_WAT_REG_GTE: + case CP_WAIT_UNTIL_READ: + case CP_WAIT_IB_PFD_COMPLETE: + case CP_REG_RMW: + case CP_REG_TO_MEM: + case CP_MEM_WRITE: + case CP_MEM_WRITE_CNTR: + case CP_COND_EXEC: + case CP_COND_WRITE: + case CP_EVENT_WRITE: + case CP_EVENT_WRITE_SHD: + case CP_EVENT_WRITE_CFL: + case CP_EVENT_WRITE_ZPD: + case CP_DRAW_INDX: + case CP_DRAW_INDX_2: + case CP_DRAW_INDX_BIN: + case CP_DRAW_INDX_2_BIN: + case CP_VIZ_QUERY: + case CP_SET_STATE: + case CP_SET_CONSTANT: + case CP_IM_LOAD: + case CP_IM_LOAD_IMMEDIATE: + case CP_LOAD_CONSTANT_CONTEXT: + case CP_INVALIDATE_STATE: + case CP_SET_SHADER_BASES: + case CP_SET_BIN_MASK: + case CP_SET_BIN_SELECT: + case CP_SET_BIN_BASE_OFFSET: + case CP_SET_BIN_DATA: + case CP_CONTEXT_UPDATE: + case CP_INTERRUPT: + case CP_IM_STORE: + break; + /* these shouldn't come from userspace */ + case CP_ME_INIT: + case CP_SET_PROTECTED_MODE: + default: + KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); + return false; + break; + } + + return true; +} + +static bool +_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int reg = type0_pkt_offset(*hostaddr); + unsigned int cnt = type0_pkt_size(*hostaddr); + if (reg < 0x0192 || (reg + cnt) >= 0x8000) { + KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", + reg, cnt); + return false; + } + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +static bool _parse_ibs(struct kgsl_device_private *dev_priv, + uint gpuaddr, int sizedwords) +{ + static uint level; /* recursion level */ + bool ret = false; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + struct kgsl_mem_entry *entry; + + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hoststart = hostaddr; + + level++; + + KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); + + mb(); + while (dwords_left > 0) { + bool cur_ret = true; + int count = 0; /* dword count including packet header */ + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + cur_ret = _handle_type0(dev_priv, hostaddr); + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = _handle_type3(dev_priv, hostaddr); + break; + default: + KGSL_CMD_ERR(dev_priv->device, "unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + + if (!cur_ret) { + KGSL_CMD_ERR(dev_priv->device, + "bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + if (dwords_left < 0) { + KGSL_CMD_ERR(dev_priv->device, + "bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + } + + ret = true; +done: + if (!ret) + KGSL_DRV_ERR(dev_priv->device, + "parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + level--; + + return ret; +} + int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -601,9 +791,12 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); - + if (unlikely(adreno_dev->ib_check_level >= 1 && + !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, + ibdesc[i].sizedwords))) { + kfree(link); + return -EINVAL; + } *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -757,8 +950,20 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - BUG_ON((copy_rb_contents == 0) && - (value == cur_context)); + + /* + * If other context switches were already lost and + * and the current context is the one that is hanging, + * then we cannot recover. Print an error message + * and leave. + */ + + if ((copy_rb_contents == 0) && (value == cur_context)) { + KGSL_DRV_ERR(device, "GPU recovery could not " + "find the previous context\n"); + return -EINVAL; + } + /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index fb88a72bd..c45dbff48 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,11 +45,19 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* Go through the list and see that object has already been seen */ + /* + * Sometimes IBs can be reused in the same dump. Because we parse from + * oldest to newest, if we come across an IB that has already been used, + * assume that it has been reused and update the list with the newest + * size. 
+ */ + for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) - return; + objbuf[index].ptbase == ptbase) { + objbuf[index].dwords = dwords; + return; + } } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -77,6 +85,25 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } +/* + * Return a 1 if the specified object is already on the list of buffers + * to be dumped + */ + +static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) +{ + int index; + + for (index = 0; index < objbufptr; index++) { + if (objbuf[index].gpuaddr == gpuaddr && + objbuf[index].ptbase == ptbase && + objbuf[index].type == type) + return 1; + } + + return 0; +} + /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -113,6 +140,7 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; + int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -158,9 +186,53 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - index = start; + /* + * We can only reliably dump IBs from the beginning of the context, + * and it turns out that for the vast majority of the time we really + * only care about the current context when it comes to diagnosing + * a hang. So, with an eye to limiting the buffer dumping to what is + * really useful find the beginning of the context and only dump + * IBs from that point + */ + + index = rptr; + ib_parse_start = start; rbptr = rb->buffer_desc.hostptr; + while (index != start) { + index--; + + if (index < 0) { + /* + * The marker we are looking for is 2 dwords long, so + * when wrapping, go back 2 from the end so we don't + * access out of range in the if statement below + */ + index = rb->sizedwords - 2; + + /* + * Account for the possibility that start might be at + * rb->sizedwords - 1 + */ + + if (start == rb->sizedwords - 1) + break; + } + + /* + * Look for a NOP packet with the context switch identifier in + * the second dword + */ + + if (rbptr[index] == cp_nop_packet(1) && + rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { + ib_parse_start = index; + break; + } + } + + index = start; + /* * Loop through the RB, copying the data and looking for indirect * buffers and MMU pagetable changes @@ -169,15 +241,18 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) + /* Only parse IBs between the context start and the rptr */ + + if (index == ib_parse_start) + parse_ibs = 1; + + if (index == rptr) + parse_ibs = 0; + + if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); - /* - * FIXME: Handle upcoming MMU pagetable changes, but only - * between the rptr and the wptr - */ - index = index + 1; if (index == rb->sizedwords) @@ -228,10 +303,9 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(*src)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, 
*(src + 1), *(src + 2)); - } src++; dst++; @@ -288,22 +362,45 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the IBs described in the CP registers are on the - * list of objects + * Make sure that the last IB1 that was being executed is dumped. + * Since this was the last IB1 that was processed, we should have + * already added it to the list during the ringbuffer parse but we + * want to be double plus sure. */ + kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - if (ibsize) + /* + * The problem is that IB size from the register is the unprocessed size + * of the buffer not the original size, so if we didn't catch this + * buffer being directly used in the RB, then we might not be able to + * dump the whle thing. Print a warning message so we can try to + * figure how often this really happens. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " + "Dumping %x dwords of the buffer.\n", ibsize); + } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - if (ibsize) + /* + * Add the last parsed IB2 to the list. The IB2 should be found as we + * parse the objects below, but we try to add it to the list first, so + * it too can be parsed. Don't print an error message in this case - if + * the IB2 is found during parsing, the list will be updated with the + * correct size. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100755 new mode 100644 index 694c09cf3..39dad925f --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,11 +21,10 @@ #include #include #include - +#include #include #include #include -#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -194,8 +193,28 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { + struct rb_node **node; + struct rb_node *parent = NULL; + spin_lock(&process->mem_lock); - list_add(&entry->list, &process->mem_list); + + node = &process->mem_rb.rb_node; + + while (*node) { + struct kgsl_mem_entry *cur; + + parent = *node; + cur = rb_entry(parent, struct kgsl_mem_entry, node); + + if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entry->node, parent, node); + rb_insert_color(&entry->node, &process->mem_rb); + spin_unlock(&process->mem_lock); entry->priv = process; @@ -405,6 +424,10 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -514,8 +537,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - kgsl_pwrctrl_wake(device); device->pwrctrl.restore_slumber = 0; + kgsl_pwrctrl_wake(device); kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -548,8 +571,7 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - - INIT_LIST_HEAD(&private->mem_list); + private->mem_rb = RB_ROOT; if (kgsl_mmu_enabled()) { @@ -578,7 +600,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct kgsl_mem_entry *entry_tmp = NULL; + struct rb_node *node; if (!private) return; @@ -592,11 +614,13 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { - list_del(&entry->list); + for (node = rb_first(&private->mem_rb); node; ) { + entry = rb_entry(node, struct kgsl_mem_entry, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, &private->mem_rb); kgsl_mem_entry_put(entry); } - kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -722,47 +746,43 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } - /*call with private->mem_lock locked */ -static struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, size_t size) { - struct kgsl_mem_entry *entry = NULL, *result = NULL; + struct rb_node *node = private->mem_rb.rb_node; - BUG_ON(private == NULL); + while (node != NULL) { + struct kgsl_mem_entry *entry; - gpuaddr &= PAGE_MASK; + entry = rb_entry(node, struct kgsl_mem_entry, node); - list_for_each_entry(entry, 
&private->mem_list, list) { - if (entry->memdesc.gpuaddr == gpuaddr) { - result = entry; - break; - } - } - return result; -} - -/*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, - size_t size) -{ - struct kgsl_mem_entry *entry = NULL, *result = NULL; - BUG_ON(private == NULL); + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) + return entry; - list_for_each_entry(entry, &private->mem_list, list) { - if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { - result = entry; - break; + if (gpuaddr < entry->memdesc.gpuaddr) + node = node->rb_left; + else if (gpuaddr >= + (entry->memdesc.gpuaddr + entry->memdesc.size)) + node = node->rb_right; + else { + return NULL; } } - return result; + return NULL; } EXPORT_SYMBOL(kgsl_sharedmem_find_region); +/*call with private->mem_lock locked */ +static inline struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +{ + return kgsl_sharedmem_find_region(private, gpuaddr, 1); +} + /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -789,6 +809,40 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } + case KGSL_PROP_GPU_RESET_STAT: + { + /* Return reset status of given context and clear it */ + uint32_t id; + struct kgsl_context *context; + + if (param->sizebytes != sizeof(unsigned int)) { + result = -EINVAL; + break; + } + /* We expect the value passed in to contain the context id */ + if (copy_from_user(&id, param->value, + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + context = kgsl_find_context(dev_priv, id); + if (!context) { + result = -EINVAL; + break; + } + /* + * Copy the reset status to value which also serves as + * the out parameter + */ + if (copy_to_user(param->value, &(context->reset_status), + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + /* Clear reset status once its been queried */ + context->reset_status = KGSL_CTX_STAT_NO_ERROR; + break; + } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -827,40 +881,6 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } -static bool check_ibdesc(struct kgsl_device_private *dev_priv, - struct kgsl_ibdesc *ibdesc, unsigned int numibs, - bool parse) -{ - bool result = true; - unsigned int i; - for (i = 0; i < numibs; i++) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d\n", ibdesc[i].gpuaddr, - ibdesc[i].sizedwords); - result = false; - break; - } - - if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d numibs %d/%d\n", - ibdesc[i].gpuaddr, - ibdesc[i].sizedwords, i+1, numibs); - result = false; - break; - } - } - return result; -} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -930,12 +950,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); - result = -EINVAL; - goto free_ibdesc; - } - result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -945,18 +959,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); - if (result != 0) - goto free_ibdesc; - - /* this is a check to try to detect if a command buffer was freed - * during issueibcmds(). - */ - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); - result = -EINVAL; - goto free_ibdesc; - } - free_ibdesc: kfree(ibdesc); done: @@ -988,7 +990,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - list_del(&entry->list); + rb_erase(&entry->node, &entry->priv->mem_rb); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1080,7 +1082,8 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - list_del(&entry->list); + rb_erase(&entry->node, &private->mem_rb); + spin_unlock(&private->mem_lock); if (entry) { @@ -1164,7 +1167,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, + result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1172,10 +1175,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); + result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); if (result) { - KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); - goto error_free_vmalloc; + KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); + goto error_free_alloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1190,7 +1193,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_vmalloc: +error_free_alloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1313,7 +1316,8 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); + memdesc->sg = kgsl_sg_alloc(sglen); + if (memdesc->sg == NULL) return -ENOMEM; @@ -1353,7 +1357,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, sglen); memdesc->sg = NULL; return -EINVAL; @@ -1488,11 +1492,8 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (kgsl_ion_client == NULL) { - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - if (kgsl_ion_client == NULL) - return -ENODEV; - } + if (IS_ERR_OR_NULL(kgsl_ion_client)) + return -ENODEV; handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1622,10 +1623,20 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; - error_put_file_ptr: - if (entry->priv_data) - 
fput(entry->priv_data); - +error_put_file_ptr: + switch (entry->memtype) { + case KGSL_MEM_ENTRY_PMEM: + case KGSL_MEM_ENTRY_ASHMEM: + if (entry->priv_data) + fput(entry->priv_data); + break; + case KGSL_MEM_ENTRY_ION: + ion_unmap_dma(kgsl_ion_client, entry->priv_data); + ion_free(kgsl_ion_client, entry->priv_data); + break; + default: + break; + } error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2029,7 +2040,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *tmp, *entry = NULL; + struct kgsl_mem_entry *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2040,13 +2051,11 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - list_for_each_entry(tmp, &private->mem_list, list) { - if (vma_offset == tmp->memdesc.gpuaddr) { - kgsl_mem_entry_get(tmp); - entry = tmp; - break; - } - } + entry = kgsl_sharedmem_find(private, vma_offset); + + if (entry) + kgsl_mem_entry_get(entry); + spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2102,8 +2111,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - if (cpu_is_msm8x60()) - wake_lock_destroy(&device->idle_wakelock); + wake_lock_destroy(&device->idle_wakelock); + pm_qos_remove_request(&device->pm_qos_req_dma); idr_destroy(&device->context_idr); @@ -2194,9 +2203,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - if (cpu_is_msm8x60()) - wake_lock_init(&device->idle_wakelock, - WAKE_LOCK_IDLE, device->name); + wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); + pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, + PM_QOS_DEFAULT_VALUE); idr_init(&device->context_idr); @@ -2242,6 +2251,8 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2339,22 +2350,30 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); kgsl_driver.ptpool = NULL; - device_unregister(&kgsl_driver.virtdev); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + + /* + * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() + * only if kgsl_driver.virtdev has been populated. + * We check at least one member of kgsl_driver.virtdev to + * see if it is not NULL (and thus, has been populated). 
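+	 * Otherwise we would tear down a device that was never registered.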
+ */ + if (kgsl_driver.virtdev.class) { + kgsl_sharedmem_uninit_sysfs(); + device_unregister(&kgsl_driver.virtdev); + } if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); - kgsl_sharedmem_uninit_sysfs(); + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index d3ae4b9bb..f027f95c4 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,13 +21,12 @@ #include #include #include +#include #define KGSL_NAME "kgsl" -/*< DTS2012042406822 hanfeng 20120428 begin*/ -/* Timestamp window used to detect rollovers */ +/* Timestamp window used to detect rollovers (half of integer range) */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 -/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -96,6 +95,8 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; + unsigned int page_alloc; + unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -107,7 +108,15 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc_ops; +struct kgsl_memdesc; + +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); + int (*map_kernel_mem)(struct kgsl_memdesc *); +}; /* shared memory allocation */ struct kgsl_memdesc { @@ -136,7 +145,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct list_head list; + struct rb_node node; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -186,27 +195,47 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } -static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, + +static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) +{ + if (memdesc->hostptr == NULL && memdesc->ops && + memdesc->ops->map_kernel_mem) + memdesc->ops->map_kernel_mem(memdesc); + + return memdesc->hostptr; +} + +static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || - (gpuaddr < memdesc->gpuaddr || - gpuaddr >= memdesc->gpuaddr + memdesc->size)) - return NULL; + if (memdesc->gpuaddr == 0 || + gpuaddr < memdesc->gpuaddr || + gpuaddr >= (memdesc->gpuaddr + memdesc->size) || + (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && + memdesc->ops->map_kernel_mem(memdesc))) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int new, unsigned int old) +static inline int timestamp_cmp(unsigned int a, unsigned int b) { - int ts_diff = new - old; - - if (ts_diff == 0) + /* check for equal */ + if (a == b) return 0; - /*< DTS2012042406822 hanfeng 20120428 begin*/ - return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 
1 : -1; - /* DTS2012042406822 hanfeng 20120428 end > */ + /* check for greater-than for non-rollover case */ + if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) + return 1; + + /* check for greater-than for rollover case + * note that <= is required to ensure that consistent + * results are returned for values whose difference is + * equal to the window size + */ + a += KGSL_TIMESTAMP_WINDOW; + b += KGSL_TIMESTAMP_WINDOW; + return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index e9455cb82..77aef1ff0 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,190 +497,6 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); -#define ADDRESS_STACK_SIZE 256 -#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) -static unsigned int kgsl_cffdump_addr_count; - -static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, - uint *hostaddr, bool check_only) -{ - static uint addr_stack[ADDRESS_STACK_SIZE]; - static uint size_stack[ADDRESS_STACK_SIZE]; - - switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER: - { - /* traverse indirect buffers */ - int i; - uint ibaddr = hostaddr[1]; - uint ibsize = hostaddr[2]; - - /* is this address already in encountered? */ - for (i = 0; - i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; - ++i) - ; - - if (kgsl_cffdump_addr_count == i) { - addr_stack[kgsl_cffdump_addr_count] = ibaddr; - size_stack[kgsl_cffdump_addr_count++] = ibsize; - - if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { - KGSL_CORE_ERR("stack overflow\n"); - return false; - } - - return kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibaddr, ibsize, check_only); - } else if (size_stack[i] != ibsize) { - KGSL_CORE_ERR("gpuaddr: 0x%08x, " - "wc: %u, with size wc: %u already on the " - "stack\n", ibaddr, ibsize, size_stack[i]); - return false; - } - } - break; - } - - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, - const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, - bool check_only) -{ - static uint level; /* recursion level */ - bool ret = true; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - - if (level == 0) - kgsl_cffdump_addr_count = 0; - - if (memdesc == NULL) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CORE_ERR("did not find mapping " - "for gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - memdesc = &entry->memdesc; - } - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CORE_ERR("no kernel mapping for " - "gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - - hoststart = hostaddr; - - level++; - - mb(); - kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, - KGSL_CACHE_OP_INV); -#ifdef DEBUG - pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); -#endif - - while (dwords_left > 0) { - int count = 0; /* dword count including packet header */ - bool cur_ret = true; - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = kgsl_cffdump_handle_type3(dev_priv, - hostaddr, check_only); - break; - default: - pr_warn("kgsl: cffdump: parse-ib: unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - ret = ret && cur_ret; - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - cur_ret = dwords_left >= 0; - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - - print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - - ret = ret && cur_ret; - } - - if (!ret) - pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - if (!check_only) { -#ifdef DEBUG - uint offset = gpuaddr - memdesc->gpuaddr; - pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " - "physaddr:%08x, offset:%d, size:%d", hoststart, - gpuaddr, memdesc->physaddr + offset, offset, - sizedwords*4); -#endif - kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, - false); - } - - level--; - - return ret; -} - static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index 2fb1e43f4..efec2af9e 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "kgsl.h" @@ -184,6 +185,7 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; + struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -197,13 +199,18 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; + /* + * Status indicating whether a gpu reset occurred and whether this + * context was responsible for causing it + */ + unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct list_head mem_list; + struct rb_root mem_rb; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index dba2dfcfb..ba48f9c75 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,8 +295,9 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - priv->memdesc.hostptr = - vmalloc_user(obj->size * priv->bufcount); + result = kgsl_sharedmem_page_alloc(&priv->memdesc, + priv->pagetable, + obj->size * priv->bufcount, 0); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1042,17 +1043,18 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) priv->memdesc.hostptr + offset; + i = offset >> PAGE_SHIFT; + page = sg_page(&(priv->memdesc.sg[i])); - page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100755 new mode 100644 index a16b95418..ba8a719e4 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt->priv; - return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; + return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,11 +398,11 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) - pt->priv; + struct kgsl_gpummu_pt *gpummu_pt; if (pt == NULL) return 0; + gpummu_pt = pt->priv; spin_lock(&pt->lock); if (gpummu_pt->tlb_flags && (1<sg, s, memdesc->sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index e4e561cef..5646d682a 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt->priv; - return pt && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt ? pt->priv : NULL; + return domain && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, 0); + memdesc->size, (IOMMU_READ | IOMMU_WRITE)); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100755 new mode 100644 index 003afb953..429e4946d --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -25,6 +24,7 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 +#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,10 +284,7 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); -/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -337,7 +334,8 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, + int requested_state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -349,7 +347,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) if (pwr->grp_clks[i]) 
clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (device->requested_state != KGSL_STATE_NAP)) + (requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -424,8 +422,12 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) - regulator_enable(pwr->gpu_reg); + if (pwr->gpu_reg) { + int status = regulator_enable(pwr->gpu_reg); + if (status) + KGSL_DRV_ERR(device, "regulator_enable " + "failed: %d\n", status); + } } } } @@ -512,10 +514,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -638,10 +637,8 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) - /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -708,10 +705,8 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - kgsl_pwrctrl_set_state(device, device->requested_state); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); + kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -753,10 +748,11 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -783,18 +779,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + device->pwrctrl.restore_slumber = true; device->ftbl->suspend_context(device); device->ftbl->stop(device); - device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -856,16 +852,17 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - - if (device->idle_wakelock.name) - wake_lock(&device->idle_wakelock); + wake_lock(&device->idle_wakelock); + if (device->pwrctrl.restore_slumber == false) + pm_qos_update_request(&device->pm_qos_req_dma, + GPU_SWFI_LATENCY); case KGSL_STATE_ACTIVE: break; default: @@ -881,7 +878,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -890,7 +887,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100755 new mode 100644 index 0c7ec6003..caaed92c8 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,9 +47,7 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100755 new mode 100644 index c2252edcf..d0b2a412c --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,10 +89,8 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ + PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); -/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100755 new mode 100644 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100755 new mode 100644 index 389ed6d4f..ae32e81ff --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -201,6 +203,10 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; + else if (!strncmp(attr->attr.name, "page_alloc", 10)) + val = kgsl_driver.stats.page_alloc; + else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) + val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -230,6 +236,8 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -239,6 +247,8 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, + &dev_attr_page_alloc, + &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -282,7 +292,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -293,17 +303,18 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) memdesc->hostptr + offset; - page = vmalloc_to_page((void *) pg); + i = offset >> PAGE_SHIFT; + page = sg_page(&memdesc->sg[i]); if (page == NULL) return VM_FAULT_SIGBUS; @@ -313,15 +324,23 @@ static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) { - kgsl_driver.stats.vmalloc -= memdesc->size; - vfree(memdesc->hostptr); + int i = 0; + struct scatterlist *sg; + kgsl_driver.stats.page_alloc -= memdesc->size; + if (memdesc->hostptr) { 
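+		/* release the kernel vmap mapping and its vmalloc accounting */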
+ vunmap(memdesc->hostptr); + kgsl_driver.stats.vmalloc -= memdesc->size; + } + if (memdesc->sg) + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + __free_page(sg_page(sg)); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -329,6 +348,42 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } +/* + * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address + * space + * + * @memdesc - The memory descriptor which contains information about the memory + * + * Return: 0 on success else error code + */ +static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) +{ + if (!memdesc->hostptr) { + pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); + struct page **pages = NULL; + struct scatterlist *sg; + int i; + /* create a list of pages to call vmap */ + pages = vmalloc(memdesc->sglen * sizeof(struct page *)); + if (!pages) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + memdesc->sglen * sizeof(struct page *)); + return -ENOMEM; + } + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + pages[i] = sg_page(sg); + memdesc->hostptr = vmap(pages, memdesc->sglen, + VM_IOREMAP, page_prot); + KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); + vfree(pages); + } + if (!memdesc->hostptr) + return -ENOMEM; + + return 0; +} + static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -368,12 +423,13 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_vmalloc_ops = { - .free = kgsl_vmalloc_free, - .vmflags = kgsl_vmalloc_vmflags, - .vmfault = kgsl_vmalloc_vmfault, +struct kgsl_memdesc_ops kgsl_page_alloc_ops = { + .free = kgsl_page_alloc_free, + .vmflags = kgsl_page_alloc_vmflags, + .vmfault = kgsl_page_alloc_vmfault, + .map_kernel_mem = kgsl_page_alloc_map_kernel, }; -EXPORT_SYMBOL(kgsl_vmalloc_ops); +EXPORT_SYMBOL(kgsl_page_alloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -407,9 +463,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - void *ptr, size_t size, unsigned int protflags) + size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -418,36 +474,43 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_vmalloc_ops; - memdesc->hostptr = (void *) ptr; + memdesc->ops = &kgsl_page_alloc_ops; + + memdesc->sg = kgsl_sg_alloc(sglen); - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } + kmemleak_not_leak(memdesc->sg); + memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { - struct page *page = vmalloc_to_page(ptr); + for (i = 0; i < memdesc->sglen; i++) { + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | + __GFP_HIGHMEM); if (!page) { - ret = -EINVAL; + ret = -ENOMEM; + memdesc->sglen = i; goto done; } + flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0); } - - 
kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); + outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, + KGSL_CACHE_OP_FLUSH); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, + kgsl_driver.stats.page_alloc_max); order = get_order(size); @@ -462,51 +525,41 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - void *ptr; - + int ret = 0; BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); - ptr = vmalloc(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", size); - return -ENOMEM; - } - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); + if (!ret) + ret = kgsl_page_alloc_map_kernel(memdesc); + if (ret) + kgsl_sharedmem_free(memdesc); + return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); int -kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { - void *ptr; unsigned int protflags; BUG_ON(size == 0); - ptr = vmalloc_user(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", - size, kgsl_driver.stats.vmalloc); - return -ENOMEM; - } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -554,7 +607,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, memdesc->sglen); memset(memdesc, 0, sizeof(*memdesc)); } @@ -686,3 +739,33 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); + +/* + * kgsl_sharedmem_map_vma - Map a user vma to physical memory + * + * @vma - The user vma to map + * @memdesc - The memory descriptor which contains information about the + * physical memory + * + * Return: 0 on success else error code + */ +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc) +{ + unsigned long addr = vma->vm_start; + unsigned long size = vma->vm_end - vma->vm_start; + int ret, i = 0; + + if (!memdesc->sg || (size != memdesc->size) || + (memdesc->sglen != (size / PAGE_SIZE))) + return -EINVAL; + + for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { + ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100755 new mode 100644 index 67a1c2d7b..a67d9c657 --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,6 +17,8 @@ #include #include #include "kgsl_mmu.h" +#include +#include struct kgsl_device; struct kgsl_process_private; @@ -28,19 +30,12 @@ struct 
kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); -}; - -extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; +extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; -int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -76,19 +71,58 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); +static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) +{ + /* + * Try sg_dma_address first to support ion carveout + * regions which do not work with sg_phys(). + */ + unsigned int pa = sg_dma_address(sg); + if (pa == 0) + pa = sg_phys(sg); + return pa; +} + +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc); + +/* + * For relatively small sglists, it is preferable to use kzalloc + * rather than going down the vmalloc rat hole. If the size of + * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to + * vmalloc + */ + +static inline void *kgsl_sg_alloc(unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); + else + return vmalloc(sglen * sizeof(struct scatterlist)); +} + +static inline void kgsl_sg_free(void *ptr, unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); +} + static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - struct page *page = phys_to_page(physaddr); + memdesc->sg = kgsl_sg_alloc(1); - memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); - if (memdesc->sg == NULL) - return -ENOMEM; + kmemleak_not_leak(memdesc->sg); memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - sg_set_page(&memdesc->sg[0], page, size, 0); + memdesc->sg[0].length = size; + memdesc->sg[0].offset = 0; + memdesc->sg[0].dma_address = physaddr; return 0; } @@ -98,7 +132,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); + return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); } static inline int @@ -109,7 +143,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 72df148bc..394bc83bd 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,7 +10,6 @@ * GNU General Public License for more details. 
*/ -#include #include #include #include @@ -283,6 +282,12 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; + /* log buffer info to aid in ramdump recovery */ + KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", + device->snapshot, __pa(device->snapshot), + device->snapshot_size); + if (hang) + sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -432,7 +437,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); + device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); if (device->snapshot == NULL) return -ENOMEM; @@ -475,7 +480,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - vfree(device->snapshot); + kfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100755 new mode 100644 index cb3da9075..d721a577a --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,13 +157,6 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, -#ifdef CONFIG_HAS_EARLYSUSPEND - .display_off = { - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, - }, -#endif }, }; @@ -195,13 +188,6 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, - .display_off = { -#ifdef CONFIG_HAS_EARLYSUSPEND - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, -#endif - }, }, }; @@ -407,7 +393,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_memdesc tmp = {0}; + struct kgsl_mem_entry *entry = NULL; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -425,8 +411,30 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - - tmp.hostptr = (void *)*timestamp; + /* + * Get a kernel mapping to the IB for monkey patching. + * See the end of this function. + */ + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, + sizedwords); + if (entry == NULL) { + KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", + cmd, sizedwords); + result = -EINVAL; + goto error; + } + /* + * This will only map memory if it exists, otherwise it will reuse the + * mapping. And the 2d userspace reuses IBs so we likely won't create + * too many mappings. 
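+	 * The kernel mapping persists until the memory entry is freed, so
+	 * later submissions of the same IB reuse it.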
+ */ + if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { + KGSL_DRV_ERR(device, + "Cannot make kernel mapping for gpuaddr 0x%x\n", + cmd); + result = -EINVAL; + goto error; + } KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -468,12 +476,13 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - tmp.hostptr = (void *)(tmp.hostptr + - (sizedwords * sizeof(unsigned int))); - tmp.size = 12; - - kgsl_sharedmem_writel(&tmp, 4, nextaddr); - kgsl_sharedmem_writel(&tmp, 8, nextcnt); + /* monkey patch the IB so that it jumps back to the ringbuffer */ + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 1) * sizeof(unsigned int)), + nextaddr); + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 2) * sizeof(unsigned int)), + nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100755 new mode 100644 index a1d267893..7837bad21 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,6 +34,16 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 +/* + * Reset status values for context + */ +enum kgsl_ctx_reset_stat { + KGSL_CTX_STAT_NO_ERROR = 0x00000000, + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, + KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 +}; + #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -110,6 +120,7 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, + KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -146,9 +157,7 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From 127466b8f9d3e953c3b770e9012a814f3f065154 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 13 Jan 2013 17:04:43 +0200 Subject: [PATCH 05/19] updated Atmel TS for fix the touchscreen lag --- .../touchscreen/atmel_i2c_rmi_QT602240.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c index d6a9173f8..9a931c0f3 100755 --- a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c +++ b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c @@ -114,12 +114,13 @@ and the height of the key region is 8.5mm, TS_Y_MAX * 8.5 /91.5 */ #define EXTRA_MAX_TOUCH_KEY 4 #define TS_KEY_DEBOUNCE_TIMER_MS 60 -static int vibrate=30; +static int vibrate=20; module_param(vibrate, int, 00644); void msm_timed_vibrate(int); + /* to define a region of touch panel */ typedef struct { @@ -834,7 +835,7 @@ int write_power_config(int on) /* < DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 1) = 14; //0xff//Active Acquisition + *(tmp + 1) = 16; //0xff//Active Acquisition /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2011042106137 zhangtao 20110509 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -1196,7 +1197,7 @@ int write_gripfacesuppression_config(u8 instance) /* < 
DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* turn off the fripfacesuppression */ - *(tmp + 0) = 0x00; //0x05; //ctrl + *(tmp + 0) = 0x07; //0x05; //ctrl /* DTS2011042106137 zhangtao 20110509 end > */ /* < DTS2010073101113 zhangtao 20100819 begin */ *(tmp + 1) = 0; //xlogrip @@ -1206,7 +1207,7 @@ int write_gripfacesuppression_config(u8 instance) *(tmp + 5) = 0; //maxtchs *(tmp + 6) = 0; //reserved *(tmp + 7) = 80; //szthr1 - *(tmp + 8) = 40; //szthr2 + *(tmp + 8) = 20; //szthr2 *(tmp + 9) = 4; //shpthr1 *(tmp + 10) = 35; //shpthr2 *(tmp + 11) = 10; //supextto @@ -2399,6 +2400,7 @@ static void atmel_ts_work_func(struct work_struct *work) input_report_key(ts->key_input, key_tmp, 0); key_pressed1 = 0; + msm_timed_vibrate(vibrate); ATMEL_DBG_MASK("when the key is released report!\n"); } } @@ -2407,7 +2409,6 @@ static void atmel_ts_work_func(struct work_struct *work) if(0 == key_pressed1) { input_report_key(ts->key_input, key_tmp, 1); - msm_timed_vibrate(vibrate); key_pressed1 = 1; ATMEL_DBG_MASK("the key is pressed report!\n"); } @@ -2457,7 +2458,7 @@ static void atmel_ts_work_func(struct work_struct *work) if (ts->test > 0) key_pressed = KEY_BRL_DOT1; else - key_pressed = KEY_SEARCH; + key_pressed = KEY_SEARCH; touch_input_report_key(ts, key_pressed, 1); input_sync(ts->input_dev); msm_timed_vibrate(vibrate); @@ -2502,8 +2503,8 @@ static void atmel_ts_work_func(struct work_struct *work) default: break; } - - + + break; /* < DTS2010083103149 zhangtao 20100909 begin */ case PROCG_GRIPFACESUPPRESSION_T20: @@ -2515,7 +2516,7 @@ static void atmel_ts_work_func(struct work_struct *work) break; /* DTS2010083103149 zhangtao 20100909 end > */ - + default: TS_DEBUG_TS("T%d detect\n", obj); break; From 40260614bf30a451786dc462fa603f584ce41d41 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 13 Jan 2013 17:08:07 +0200 Subject: [PATCH 06/19] enable swap --- arch/arm/configs/u8800_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 1e9a7a2c9..bb92b53f4 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -45,7 +45,7 @@ CONFIG_HAVE_KERNEL_LZO=y CONFIG_KERNEL_LZMA=y # CONFIG_KERNEL_LZO is not set CONFIG_DEFAULT_HOSTNAME="(none)" -# CONFIG_SWAP is not set +CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y # CONFIG_POSIX_MQUEUE is not set From ea79d6e1b28f75b1d7fdc8f0ddd55c05d9c66799 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:30:34 +0200 Subject: [PATCH 07/19] added smartassV2 governor & change default ARCH to arm --- Makefile | 2 +- arch/arm/configs/u8800_defconfig | 1 + drivers/cpufreq/Kconfig | 15 + drivers/cpufreq/Makefile | 1 + drivers/cpufreq/cpufreq_smartass2.c | 868 ++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 + 6 files changed, 889 insertions(+), 1 deletion(-) create mode 100644 drivers/cpufreq/cpufreq_smartass2.c diff --git a/Makefile b/Makefile index 3ab214703..c51d9cdd9 100755 --- a/Makefile +++ b/Makefile @@ -192,7 +192,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ # Default value for CROSS_COMPILE is not to prefix executables # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile export KBUILD_BUILDHOST := $(SUBARCH) -ARCH ?= $(SUBARCH) +ARCH ?= arm CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) # Architecture as present in compile.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 
bb92b53f4..60c6fc0d0 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -596,6 +596,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_INTERACTIVE=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SMARTASS2=y # CONFIG_CPU_IDLE is not set CONFIG_CPU_FREQ_MSM=y diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 194708850..6f643138e 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -109,6 +109,13 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE loading your cpufreq low-level hardware driver, using the 'interactive' governor for latency-sensitive workloads. +config CPU_FREQ_DEFAULT_GOV_SMARTASS2 + bool "smartass2" + select CPU_FREQ_GOV_SMARTASS2 + select CPU_FREQ_GOV_PERFORMANCE + help + Use the CPUFreq governor 'smartassV2' as default. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -206,6 +213,14 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SMARTASS2 + tristate "'smartassV2' cpufreq governor" + depends on CPU_FREQ + help + 'smartassV2' - a "smart" governor + + If in doubt, say N. + menu "x86 CPU frequency scaling drivers" depends on X86 source "drivers/cpufreq/Kconfig.x86" diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index c044060a4..e9261b0cc 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o +obj-$(CONFIG_CPU_FREQ_GOV_SMARTASS2) += cpufreq_smartass2.o # CPUfreq cross-arch helpers obj-$(CONFIG_CPU_FREQ_TABLE) += freq_table.o diff --git a/drivers/cpufreq/cpufreq_smartass2.c b/drivers/cpufreq/cpufreq_smartass2.c new file mode 100644 index 000000000..e00524992 --- /dev/null +++ b/drivers/cpufreq/cpufreq_smartass2.c @@ -0,0 +1,868 @@ +/* + * drivers/cpufreq/cpufreq_smartass2.c + * + * Copyright (C) 2010 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Erasmux + * + * Based on the interactive governor By Mike Chan (mike@android.com) + * which was adaptated to 2.6.29 kernel by Nadlabak (pavel@doshaska.net) + * + * SMP support based on mod by faux123 + * + * For a general overview of smartassV2 see the relavent part in + * Documentation/cpu-freq/governors.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/******************** Tunable parameters: ********************/ + +/* + * The "ideal" frequency to use when awake. The governor will ramp up faster + * towards the ideal frequency and slower after it has passed it. Similarly, + * lowering the frequency towards the ideal frequency is faster than below it. + */ +#define DEFAULT_AWAKE_IDEAL_FREQ 768000 +static unsigned int awake_ideal_freq; + +/* + * The "ideal" frequency to use when suspended. 
+ * When set to 0, the governor will not track the suspended state (meaning
+ * that practically when sleep_ideal_freq==0 the awake_ideal_freq is used
+ * also when suspended).
+ */
+#define DEFAULT_SLEEP_IDEAL_FREQ 245000
+static unsigned int sleep_ideal_freq;
+
+/*
+ * Frequency delta when ramping up above the ideal frequency.
+ * Zero disables and causes to always jump straight to max frequency.
+ * When below the ideal frequency we always ramp up to the ideal freq.
+ */
+#define DEFAULT_RAMP_UP_STEP 256000
+static unsigned int ramp_up_step;
+
+/*
+ * Frequency delta when ramping down below the ideal frequency.
+ * Zero disables and will calculate ramp down according to load heuristic.
+ * When above the ideal frequency we always ramp down to the ideal freq.
+ */
+#define DEFAULT_RAMP_DOWN_STEP 256000
+static unsigned int ramp_down_step;
+
+/*
+ * CPU freq will be increased if measured load > max_cpu_load;
+ */
+#define DEFAULT_MAX_CPU_LOAD 50
+static unsigned long max_cpu_load;
+
+/*
+ * CPU freq will be decreased if measured load < min_cpu_load;
+ */
+#define DEFAULT_MIN_CPU_LOAD 25
+static unsigned long min_cpu_load;
+
+/*
+ * The minimum amount of time to spend at a frequency before we can ramp up.
+ * Notice we ignore this when we are below the ideal frequency.
+ */
+#define DEFAULT_UP_RATE_US 48000;
+static unsigned long up_rate_us;
+
+/*
+ * The minimum amount of time to spend at a frequency before we can ramp down.
+ * Notice we ignore this when we are above the ideal frequency.
+ */
+#define DEFAULT_DOWN_RATE_US 99000;
+static unsigned long down_rate_us;
+
+/*
+ * The frequency to set when waking up from sleep.
+ * When sleep_ideal_freq=0 this will have no effect.
+ */
+#define DEFAULT_SLEEP_WAKEUP_FREQ 1024000
+static unsigned int sleep_wakeup_freq;
+
+/*
+ * Sampling rate, I highly recommend to leave it at 2.
+ */
+#define DEFAULT_SAMPLE_RATE_JIFFIES 2
+static unsigned int sample_rate_jiffies;
+
+
+/*************** End of tunables ***************/
+
+
+static void (*pm_idle_old)(void);
+static atomic_t active_count = ATOMIC_INIT(0);
+
+struct smartass_info_s {
+	struct cpufreq_policy *cur_policy;
+	struct cpufreq_frequency_table *freq_table;
+	struct timer_list timer;
+	u64 time_in_idle;
+	u64 idle_exit_time;
+	u64 freq_change_time;
+	u64 freq_change_time_in_idle;
+	int cur_cpu_load;
+	int old_freq;
+	int ramp_dir;
+	unsigned int enable;
+	int ideal_speed;
+};
+static DEFINE_PER_CPU(struct smartass_info_s, smartass_info);
+
+/* Workqueues handle frequency scaling */
+static struct workqueue_struct *up_wq;
+static struct workqueue_struct *down_wq;
+static struct work_struct freq_scale_work;
+
+static cpumask_t work_cpumask;
+static spinlock_t cpumask_lock;
+
+static unsigned int suspended;
+
+#define dprintk(flag,msg...) do { \
+	if (debug_mask & flag) printk(KERN_DEBUG msg); \
+	} while (0)
+
+enum {
+	SMARTASS_DEBUG_JUMPS=1,
+	SMARTASS_DEBUG_LOAD=2,
+	SMARTASS_DEBUG_ALG=4
+};
+
+/*
+ * Combination of the above debug flags.
+ */ +static unsigned long debug_mask; + +static int cpufreq_governor_smartass(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2 +static +#endif +struct cpufreq_governor cpufreq_gov_smartass2 = { + .name = "smartassV2", + .governor = cpufreq_governor_smartass, + .max_transition_latency = 6000000, + .owner = THIS_MODULE, +}; + +inline static void smartass_update_min_max(struct smartass_info_s *this_smartass, struct cpufreq_policy *policy, int suspend) { + if (suspend) { + this_smartass->ideal_speed = // sleep_ideal_freq; but make sure it obeys the policy min/max + policy->max > sleep_ideal_freq ? + (sleep_ideal_freq > policy->min ? sleep_ideal_freq : policy->min) : policy->max; + } else { + this_smartass->ideal_speed = // awake_ideal_freq; but make sure it obeys the policy min/max + policy->min < awake_ideal_freq ? + (awake_ideal_freq < policy->max ? awake_ideal_freq : policy->max) : policy->min; + } +} + +inline static void smartass_update_min_max_allcpus(void) { + unsigned int i; + for_each_online_cpu(i) { + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, i); + if (this_smartass->enable) + smartass_update_min_max(this_smartass,this_smartass->cur_policy,suspended); + } +} + +inline static unsigned int validate_freq(struct cpufreq_policy *policy, int freq) { + if (freq > (int)policy->max) + return policy->max; + if (freq < (int)policy->min) + return policy->min; + return freq; +} + +inline static void reset_timer(unsigned long cpu, struct smartass_info_s *this_smartass) { + this_smartass->time_in_idle = get_cpu_idle_time_us(cpu, &this_smartass->idle_exit_time); + mod_timer(&this_smartass->timer, jiffies + sample_rate_jiffies); +} + +inline static void work_cpumask_set(unsigned long cpu) { + unsigned long flags; + spin_lock_irqsave(&cpumask_lock, flags); + cpumask_set_cpu(cpu, &work_cpumask); + spin_unlock_irqrestore(&cpumask_lock, flags); +} + +inline static int work_cpumask_test_and_clear(unsigned long cpu) { + unsigned long flags; + int res = 0; + spin_lock_irqsave(&cpumask_lock, flags); + res = cpumask_test_and_clear_cpu(cpu, &work_cpumask); + spin_unlock_irqrestore(&cpumask_lock, flags); + return res; +} + +inline static int target_freq(struct cpufreq_policy *policy, struct smartass_info_s *this_smartass, + int new_freq, int old_freq, int prefered_relation) { + int index, target; + struct cpufreq_frequency_table *table = this_smartass->freq_table; + + if (new_freq == old_freq) + return 0; + new_freq = validate_freq(policy,new_freq); + if (new_freq == old_freq) + return 0; + + if (table && + !cpufreq_frequency_table_target(policy,table,new_freq,prefered_relation,&index)) + { + target = table[index].frequency; + if (target == old_freq) { + // if for example we are ramping up to *at most* current + ramp_up_step + // but there is no such frequency higher than the current, try also + // to ramp up to *at least* current + ramp_up_step. 
+ if (new_freq > old_freq && prefered_relation==CPUFREQ_RELATION_H + && !cpufreq_frequency_table_target(policy,table,new_freq, + CPUFREQ_RELATION_L,&index)) + target = table[index].frequency; + // simlarly for ramping down: + else if (new_freq < old_freq && prefered_relation==CPUFREQ_RELATION_L + && !cpufreq_frequency_table_target(policy,table,new_freq, + CPUFREQ_RELATION_H,&index)) + target = table[index].frequency; + } + + if (target == old_freq) { + // We should not get here: + // If we got here we tried to change to a validated new_freq which is different + // from old_freq, so there is no reason for us to remain at same frequency. + printk(KERN_WARNING "Smartass: frequency change failed: %d to %d => %d\n", + old_freq,new_freq,target); + return 0; + } + } + else target = new_freq; + + __cpufreq_driver_target(policy, target, prefered_relation); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassQ: jumping from %d to %d => %d (%d)\n", + old_freq,new_freq,target,policy->cur); + + return target; +} + +static void cpufreq_smartass_timer(unsigned long cpu) +{ + u64 delta_idle; + u64 delta_time; + int cpu_load; + int old_freq; + u64 update_time; + u64 now_idle; + int queued_work = 0; + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, cpu); + struct cpufreq_policy *policy = this_smartass->cur_policy; + + now_idle = get_cpu_idle_time_us(cpu, &update_time); + old_freq = policy->cur; + + if (this_smartass->idle_exit_time == 0 || update_time == this_smartass->idle_exit_time) + return; + + delta_idle = cputime64_sub(now_idle, this_smartass->time_in_idle); + delta_time = cputime64_sub(update_time, this_smartass->idle_exit_time); + + // If timer ran less than 1ms after short-term sample started, retry. + if (delta_time < 1000) { + if (!timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + return; + } + + if (delta_idle > delta_time) + cpu_load = 0; + else + cpu_load = 100 * (unsigned int)(delta_time - delta_idle) / (unsigned int)delta_time; + + dprintk(SMARTASS_DEBUG_LOAD,"smartassT @ %d: load %d (delta_time %llu)\n", + old_freq,cpu_load,delta_time); + + this_smartass->cur_cpu_load = cpu_load; + this_smartass->old_freq = old_freq; + + // Scale up if load is above max or if there where no idle cycles since coming out of idle, + // additionally, if we are at or above the ideal_speed, verify we have been at this frequency + // for at least up_rate_us: + if (cpu_load > max_cpu_load || delta_idle == 0) + { + if (old_freq < policy->max && + (old_freq < this_smartass->ideal_speed || delta_idle == 0 || + cputime64_sub(update_time, this_smartass->freq_change_time) >= up_rate_us)) + { + dprintk(SMARTASS_DEBUG_ALG,"smartassT @ %d ramp up: load %d (delta_idle %llu)\n", + old_freq,cpu_load,delta_idle); + this_smartass->ramp_dir = 1; + work_cpumask_set(cpu); + queue_work(up_wq, &freq_scale_work); + queued_work = 1; + } + else this_smartass->ramp_dir = 0; + } + // Similarly for scale down: load should be below min and if we are at or below ideal + // frequency we require that we have been at this frequency for at least down_rate_us: + else if (cpu_load < min_cpu_load && old_freq > policy->min && + (old_freq > this_smartass->ideal_speed || + cputime64_sub(update_time, this_smartass->freq_change_time) >= down_rate_us)) + { + dprintk(SMARTASS_DEBUG_ALG,"smartassT @ %d ramp down: load %d (delta_idle %llu)\n", + old_freq,cpu_load,delta_idle); + this_smartass->ramp_dir = -1; + work_cpumask_set(cpu); + queue_work(down_wq, &freq_scale_work); + queued_work = 1; + } + else this_smartass->ramp_dir = 
0; + + // To avoid unnecessary load when the CPU is already at high load, we don't + // reset ourselves if we are at max speed. If and when there are idle cycles, + // the idle loop will activate the timer. + // Additionally, if we queued some work, the work task will reset the timer + // after it has done its adjustments. + if (!queued_work && old_freq < policy->max) + reset_timer(cpu,this_smartass); +} + +static void cpufreq_idle(void) +{ + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, smp_processor_id()); + struct cpufreq_policy *policy = this_smartass->cur_policy; + + if (!this_smartass->enable) { + pm_idle_old(); + return; + } + + if (policy->cur == policy->min && timer_pending(&this_smartass->timer)) + del_timer(&this_smartass->timer); + + pm_idle_old(); + + if (!timer_pending(&this_smartass->timer)) + reset_timer(smp_processor_id(), this_smartass); +} + +/* We use the same work function to sale up and down */ +static void cpufreq_smartass_freq_change_time_work(struct work_struct *work) +{ + unsigned int cpu; + int new_freq; + int old_freq; + int ramp_dir; + struct smartass_info_s *this_smartass; + struct cpufreq_policy *policy; + unsigned int relation = CPUFREQ_RELATION_L; + for_each_possible_cpu(cpu) { + this_smartass = &per_cpu(smartass_info, cpu); + if (!work_cpumask_test_and_clear(cpu)) + continue; + + ramp_dir = this_smartass->ramp_dir; + this_smartass->ramp_dir = 0; + + old_freq = this_smartass->old_freq; + policy = this_smartass->cur_policy; + + if (old_freq != policy->cur) { + // frequency was changed by someone else? + printk(KERN_WARNING "Smartass: frequency changed by 3rd party: %d to %d\n", + old_freq,policy->cur); + new_freq = old_freq; + } + else if (ramp_dir > 0 && nr_running() > 1) { + // ramp up logic: + if (old_freq < this_smartass->ideal_speed) + new_freq = this_smartass->ideal_speed; + else if (ramp_up_step) { + new_freq = old_freq + ramp_up_step; + relation = CPUFREQ_RELATION_H; + } + else { + new_freq = policy->max; + relation = CPUFREQ_RELATION_H; + } + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d ramp up: ramp_dir=%d ideal=%d\n", + old_freq,ramp_dir,this_smartass->ideal_speed); + } + else if (ramp_dir < 0) { + // ramp down logic: + if (old_freq > this_smartass->ideal_speed) { + new_freq = this_smartass->ideal_speed; + relation = CPUFREQ_RELATION_H; + } + else if (ramp_down_step) + new_freq = old_freq - ramp_down_step; + else { + // Load heuristics: Adjust new_freq such that, assuming a linear + // scaling of load vs. frequency, the load in the new frequency + // will be max_cpu_load: + new_freq = old_freq * this_smartass->cur_cpu_load / max_cpu_load; + if (new_freq > old_freq) // min_cpu_load > max_cpu_load ?! + new_freq = old_freq -1; + } + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d ramp down: ramp_dir=%d ideal=%d\n", + old_freq,ramp_dir,this_smartass->ideal_speed); + } + else { // ramp_dir==0 ?! Could the timer change its mind about a queued ramp up/down + // before the work task gets to run? 
+ // This may also happen if we refused to ramp up because the nr_running()==1 + new_freq = old_freq; + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d nothing: ramp_dir=%d nr_running=%lu\n", + old_freq,ramp_dir,nr_running()); + } + + // do actual ramp up (returns 0, if frequency change failed): + new_freq = target_freq(policy,this_smartass,new_freq,old_freq,relation); + if (new_freq) + this_smartass->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu,&this_smartass->freq_change_time); + + // reset timer: + if (new_freq < policy->max) + reset_timer(cpu,this_smartass); + // if we are maxed out, it is pointless to use the timer + // (idle cycles wake up the timer when the timer comes) + else if (timer_pending(&this_smartass->timer)) + del_timer(&this_smartass->timer); + } +} + +static ssize_t show_debug_mask(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", debug_mask); +} + +static ssize_t store_debug_mask(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0) + debug_mask = input; + return res; +} + +static ssize_t show_up_rate_us(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", up_rate_us); +} + +static ssize_t store_up_rate_us(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0 && input <= 100000000) + up_rate_us = input; + return res; +} + +static ssize_t show_down_rate_us(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", down_rate_us); +} + +static ssize_t store_down_rate_us(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0 && input <= 100000000) + down_rate_us = input; + return res; +} + +static ssize_t show_sleep_ideal_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sleep_ideal_freq); +} + +static ssize_t store_sleep_ideal_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) { + sleep_ideal_freq = input; + if (suspended) + smartass_update_min_max_allcpus(); + } + return res; +} + +static ssize_t show_sleep_wakeup_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sleep_wakeup_freq); +} + +static ssize_t store_sleep_wakeup_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + sleep_wakeup_freq = input; + return res; +} + +static ssize_t show_awake_ideal_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", awake_ideal_freq); +} + +static ssize_t store_awake_ideal_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) { + awake_ideal_freq = input; + if (!suspended) + smartass_update_min_max_allcpus(); + } + return res; +} + +static ssize_t show_sample_rate_jiffies(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return 
sprintf(buf, "%u\n", sample_rate_jiffies); +} + +static ssize_t store_sample_rate_jiffies(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input <= 1000) + sample_rate_jiffies = input; + return res; +} + +static ssize_t show_ramp_up_step(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ramp_up_step); +} + +static ssize_t store_ramp_up_step(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + ramp_up_step = input; + return res; +} + +static ssize_t show_ramp_down_step(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ramp_down_step); +} + +static ssize_t store_ramp_down_step(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + ramp_down_step = input; + return res; +} + +static ssize_t show_max_cpu_load(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", max_cpu_load); +} + +static ssize_t store_max_cpu_load(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input <= 100) + max_cpu_load = input; + return res; +} + +static ssize_t show_min_cpu_load(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", min_cpu_load); +} + +static ssize_t store_min_cpu_load(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input < 100) + min_cpu_load = input; + return res; +} + +#define define_global_rw_attr(_name) \ +static struct global_attr _name##_attr = \ + __ATTR(_name, 0644, show_##_name, store_##_name) + +define_global_rw_attr(debug_mask); +define_global_rw_attr(up_rate_us); +define_global_rw_attr(down_rate_us); +define_global_rw_attr(sleep_ideal_freq); +define_global_rw_attr(sleep_wakeup_freq); +define_global_rw_attr(awake_ideal_freq); +define_global_rw_attr(sample_rate_jiffies); +define_global_rw_attr(ramp_up_step); +define_global_rw_attr(ramp_down_step); +define_global_rw_attr(max_cpu_load); +define_global_rw_attr(min_cpu_load); + +static struct attribute * smartass_attributes[] = { + &debug_mask_attr.attr, + &up_rate_us_attr.attr, + &down_rate_us_attr.attr, + &sleep_ideal_freq_attr.attr, + &sleep_wakeup_freq_attr.attr, + &awake_ideal_freq_attr.attr, + &sample_rate_jiffies_attr.attr, + &ramp_up_step_attr.attr, + &ramp_down_step_attr.attr, + &max_cpu_load_attr.attr, + &min_cpu_load_attr.attr, + NULL, +}; + +static struct attribute_group smartass_attr_group = { + .attrs = smartass_attributes, + .name = "smartass", +}; + +static int cpufreq_governor_smartass(struct cpufreq_policy *new_policy, + unsigned int event) +{ + unsigned int cpu = new_policy->cpu; + int rc; + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, cpu); + + switch (event) { + case CPUFREQ_GOV_START: + if ((!cpu_online(cpu)) || (!new_policy->cur)) + return -EINVAL; + + this_smartass->cur_policy = new_policy; + + this_smartass->enable = 1; + + 
smartass_update_min_max(this_smartass,new_policy,suspended); + + this_smartass->freq_table = cpufreq_frequency_get_table(cpu); + if (!this_smartass->freq_table) + printk(KERN_WARNING "Smartass: no frequency table for cpu %d?!\n",cpu); + + smp_wmb(); + + // Do not register the idle hook and create sysfs + // entries if we have already done so. + if (atomic_inc_return(&active_count) <= 1) { + rc = sysfs_create_group(cpufreq_global_kobject, + &smartass_attr_group); + if (rc) + return rc; + + pm_idle_old = pm_idle; + pm_idle = cpufreq_idle; + } + + if (this_smartass->cur_policy->cur < new_policy->max && !timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + + break; + + case CPUFREQ_GOV_LIMITS: + smartass_update_min_max(this_smartass,new_policy,suspended); + + if (this_smartass->cur_policy->cur > new_policy->max) { + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassI: jumping to new max freq: %d\n",new_policy->max); + __cpufreq_driver_target(this_smartass->cur_policy, + new_policy->max, CPUFREQ_RELATION_H); + } + else if (this_smartass->cur_policy->cur < new_policy->min) { + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassI: jumping to new min freq: %d\n",new_policy->min); + __cpufreq_driver_target(this_smartass->cur_policy, + new_policy->min, CPUFREQ_RELATION_L); + } + + if (this_smartass->cur_policy->cur < new_policy->max && !timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + + break; + + case CPUFREQ_GOV_STOP: + this_smartass->enable = 0; + smp_wmb(); + del_timer(&this_smartass->timer); + flush_work(&freq_scale_work); + this_smartass->idle_exit_time = 0; + + if (atomic_dec_return(&active_count) <= 1) { + sysfs_remove_group(cpufreq_global_kobject, + &smartass_attr_group); + pm_idle = pm_idle_old; + } + break; + } + + return 0; +} + +static void smartass_suspend(int cpu, int suspend) +{ + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, smp_processor_id()); + struct cpufreq_policy *policy = this_smartass->cur_policy; + unsigned int new_freq; + + if (!this_smartass->enable) + return; + + smartass_update_min_max(this_smartass,policy,suspend); + if (!suspend) { // resume at max speed: + new_freq = validate_freq(policy,sleep_wakeup_freq); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassS: awaking at %d\n",new_freq); + + __cpufreq_driver_target(policy, new_freq, + CPUFREQ_RELATION_L); + } else { + // to avoid wakeup issues with quick sleep/wakeup don't change actual frequency when entering sleep + // to allow some time to settle down. Instead we just reset our statistics (and reset the timer). + // Eventually, the timer will adjust the frequency if necessary. 
+ + this_smartass->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu,&this_smartass->freq_change_time); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassS: suspending at %d\n",policy->cur); + } + + reset_timer(smp_processor_id(),this_smartass); +} + +static void smartass_early_suspend(struct early_suspend *handler) { + int i; + if (suspended || sleep_ideal_freq==0) // disable behavior for sleep_ideal_freq==0 + return; + suspended = 1; + for_each_online_cpu(i) + smartass_suspend(i,1); +} + +static void smartass_late_resume(struct early_suspend *handler) { + int i; + if (!suspended) // already not suspended so nothing to do + return; + suspended = 0; + for_each_online_cpu(i) + smartass_suspend(i,0); +} + +static struct early_suspend smartass_power_suspend = { + .suspend = smartass_early_suspend, + .resume = smartass_late_resume, +#ifdef CONFIG_MACH_HERO + .level = EARLY_SUSPEND_LEVEL_DISABLE_FB + 1, +#endif +}; + +static int __init cpufreq_smartass_init(void) +{ + unsigned int i; + struct smartass_info_s *this_smartass; + debug_mask = 0; + up_rate_us = DEFAULT_UP_RATE_US; + down_rate_us = DEFAULT_DOWN_RATE_US; + sleep_ideal_freq = DEFAULT_SLEEP_IDEAL_FREQ; + sleep_wakeup_freq = DEFAULT_SLEEP_WAKEUP_FREQ; + awake_ideal_freq = DEFAULT_AWAKE_IDEAL_FREQ; + sample_rate_jiffies = DEFAULT_SAMPLE_RATE_JIFFIES; + ramp_up_step = DEFAULT_RAMP_UP_STEP; + ramp_down_step = DEFAULT_RAMP_DOWN_STEP; + max_cpu_load = DEFAULT_MAX_CPU_LOAD; + min_cpu_load = DEFAULT_MIN_CPU_LOAD; + + spin_lock_init(&cpumask_lock); + + suspended = 0; + + /* Initalize per-cpu data: */ + for_each_possible_cpu(i) { + this_smartass = &per_cpu(smartass_info, i); + this_smartass->enable = 0; + this_smartass->cur_policy = 0; + this_smartass->ramp_dir = 0; + this_smartass->time_in_idle = 0; + this_smartass->idle_exit_time = 0; + this_smartass->freq_change_time = 0; + this_smartass->freq_change_time_in_idle = 0; + this_smartass->cur_cpu_load = 0; + // intialize timer: + init_timer_deferrable(&this_smartass->timer); + this_smartass->timer.function = cpufreq_smartass_timer; + this_smartass->timer.data = i; + work_cpumask_test_and_clear(i); + } + + // Scale up is high priority + up_wq = alloc_workqueue("ksmartass_up", WQ_HIGHPRI, 1); + down_wq = alloc_workqueue("ksmartass_down", 0, 1); + if (!up_wq || !down_wq) + return -ENOMEM; + + INIT_WORK(&freq_scale_work, cpufreq_smartass_freq_change_time_work); + + register_early_suspend(&smartass_power_suspend); + + return cpufreq_register_governor(&cpufreq_gov_smartass2); +} + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2 +fs_initcall(cpufreq_smartass_init); +#else +module_init(cpufreq_smartass_init); +#endif + +static void __exit cpufreq_smartass_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_smartass2); + destroy_workqueue(up_wq); + destroy_workqueue(down_wq); +} + +module_exit(cpufreq_smartass_exit); + +MODULE_AUTHOR ("Erasmux"); +MODULE_DESCRIPTION ("'cpufreq_smartass2' - A smart cpufreq governor"); +MODULE_LICENSE ("GPL"); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 957c5b414..874922d13 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -363,6 +363,9 @@ extern struct cpufreq_governor cpufreq_gov_conservative; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE) extern struct cpufreq_governor cpufreq_gov_interactive; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2) +extern struct cpufreq_governor cpufreq_gov_smartass2; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_smartass2) 
#endif From 526a44be1bd6964cb095a986b488916577243a41 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:31:36 +0200 Subject: [PATCH 08/19] change default toolchain path --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c51d9cdd9..bcede986c 100755 --- a/Makefile +++ b/Makefile @@ -193,7 +193,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile export KBUILD_BUILDHOST := $(SUBARCH) ARCH ?= arm -CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) +CROSS_COMPILE ?= /home/forumber/prebuilt/linux-x86/toolchain/arm-eabi-4.4.3/bin/arm-eabi- # Architecture as present in compile.h UTS_MACHINE := $(ARCH) From 81617215ee066368a53adb93b5b45dff30541ba2 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:34:11 +0200 Subject: [PATCH 09/19] change dzo@martin to forumber@dzo --- scripts/mkcompile_h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index d5428594f..a7937dbbd 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -75,8 +75,8 @@ UTS_TRUNCATE="cut -b -$UTS_LEN" #/* < DTS2011052606009 jiaxianghong 20110527 begin */ #/* < DTS2011030103387 niguodong 20110415 begin */ - echo \#define LINUX_COMPILE_BY \"dzo\" - echo \#define LINUX_COMPILE_HOST \"martin\" + echo \#define LINUX_COMPILE_BY \"forumber\" + echo \#define LINUX_COMPILE_HOST \"dzo\" #/* DTS2011030103387 niguodong 20110415 end > */ #/* < DTS2011052606009 jiaxianghong 20110527 end */ From 68b7409e06b30522d50e765bae67a0e2866fb3c7 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:43:46 +0200 Subject: [PATCH 10/19] 122 Mhz and 245 Mhz are disabled --- arch/arm/mach-msm/acpuclock-7x30.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index 8ffe61a4d..4b7465358 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -121,10 +121,10 @@ static struct clk *acpuclk_sources[MAX_SOURCE]; static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 0, 24576, LPXO, 0, 0, 30720000, 900, VDD_RAW(900) }, { 0, 61440, PLL_3, 5, 11, 61440000, 900, VDD_RAW(900) }, - { 1, 122880, PLL_3, 5, 5, 61440000, 900, VDD_RAW(900) }, + { 0, 122880, PLL_3, 5, 5, 61440000, 900, VDD_RAW(900) }, { 0, 184320, PLL_3, 5, 4, 61440000, 900, VDD_RAW(900) }, { 0, MAX_AXI_KHZ, AXI, 1, 0, 61440000, 900, VDD_RAW(900) }, - { 1, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, + { 0, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, { 1, 368640, PLL_3, 5, 1, 122800000, 900, VDD_RAW(900) }, { 0, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, { 0, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, From d9cf2bb683b362b6c463f4098439efbeaae79dca Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:45:57 +0200 Subject: [PATCH 11/19] 480 Mhz and 600 Mhz are enabled --- arch/arm/mach-msm/acpuclock-7x30.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index 4b7465358..fb4ab5387 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -126,8 +126,8 @@ static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 0, MAX_AXI_KHZ, AXI, 1, 0, 61440000, 900, VDD_RAW(900) }, { 0, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, { 1, 368640, PLL_3, 5, 1, 
122800000, 900, VDD_RAW(900) }, - { 0, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, - { 0, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, + { 1, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, + { 1, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, /* AXI has MSMC1 implications. See above. */ { 1, 768000, PLL_1, 2, 0, 153600000, 1050, VDD_RAW(1050) }, /* From 35acafc2e78c5b1c859bf9b2aae2df481445cc91 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:48:49 +0200 Subject: [PATCH 12/19] undervolt some freq for battery life --- arch/arm/mach-msm/acpuclock-7x30.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index fb4ab5387..7c8dddf44 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -129,12 +129,12 @@ static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 1, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, { 1, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, /* AXI has MSMC1 implications. See above. */ - { 1, 768000, PLL_1, 2, 0, 153600000, 1050, VDD_RAW(1050) }, + { 1, 768000, PLL_1, 2, 0, 153600000, 975, VDD_RAW(975) }, /* * AXI has MSMC1 implications. See above. */ - { 1, 806400, PLL_2, 3, 0, UINT_MAX, 1100, VDD_RAW(1100), &pll2_tbl[2]}, - { 1, 1024000, PLL_2, 3, 0, UINT_MAX, 1200, VDD_RAW(1200), &pll2_tbl[3]}, + { 1, 806400, PLL_2, 3, 0, UINT_MAX, 1000, VDD_RAW(1000), &pll2_tbl[2]}, + { 1, 1024000, PLL_2, 3, 0, UINT_MAX, 1100, VDD_RAW(1100), &pll2_tbl[3]}, { 1, 1200000, PLL_2, 3, 0, UINT_MAX, 1200, VDD_RAW(1200), &pll2_tbl[4]}, { 1, 1401600, PLL_2, 3, 0, UINT_MAX, 1250, VDD_RAW(1250), &pll2_tbl[5]}, { 1, 1516800, PLL_2, 3, 0, UINT_MAX, 1300, VDD_RAW(1300), &pll2_tbl[6]}, From e61703f6f440ad146a2d59d4eb50e66c0626f1d2 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:31:35 +0200 Subject: [PATCH 13/19] VDD sysfs interface (snq-) --- arch/arm/configs/u8800_defconfig | 6 ++- arch/arm/mach-msm/acpuclock-7x30.c | 41 ++++++++++++++++++++ drivers/cpufreq/Kconfig | 8 ++++ drivers/cpufreq/cpufreq.c | 62 ++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 60c6fc0d0..e252fda97 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -382,7 +382,7 @@ CONFIG_MSM_DALRPC=y CONFIG_MSM_DALRPC_TEST=m CONFIG_MSM_CPU_FREQ_SET_MIN_MAX=y CONFIG_MSM_CPU_FREQ_MAX=1024000 -CONFIG_MSM_CPU_FREQ_MIN=122880 +CONFIG_MSM_CPU_FREQ_MIN=368640 # CONFIG_MSM_AVS_HW is not set # CONFIG_MSM_HW3D is not set CONFIG_AMSS_7X25_VERSION_2009=y @@ -584,18 +584,20 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=y CONFIG_CPU_FREQ_STAT=y # CONFIG_CPU_FREQ_STAT_DETAILS is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_INTERACTIVE=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_VDD_LEVELS=y CONFIG_CPU_FREQ_GOV_SMARTASS2=y 
 # CONFIG_CPU_IDLE is not set
 CONFIG_CPU_FREQ_MSM=y
diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c
index 7c8dddf44..8f93fcfc4 100644
--- a/arch/arm/mach-msm/acpuclock-7x30.c
+++ b/arch/arm/mach-msm/acpuclock-7x30.c
@@ -52,6 +52,8 @@
 #define VDD_RAW(mv) (((MV(mv) / V_STEP) - 30) | VREG_DATA)
 
 #define MAX_AXI_KHZ 192000
+#define SEMC_ACPU_MIN_UV_MV 750U
+#define SEMC_ACPU_MAX_UV_MV 1500U
 
 struct clock_state {
 	struct clkctl_acpu_speed *current_speed;
@@ -493,3 +495,42 @@ static int __init acpuclk_7x30_init(struct acpuclk_soc_data *soc_data)
 struct acpuclk_soc_data acpuclk_7x30_soc_data __initdata = {
 	.init = acpuclk_7x30_init,
 };
+
+#ifdef CONFIG_CPU_FREQ_VDD_LEVELS
+
+ssize_t acpuclk_get_vdd_levels_str(char *buf)
+{
+	int i, len = 0;
+	if (buf)
+	{
+		mutex_lock(&drv_state.lock);
+		for (i = 0; acpu_freq_tbl[i].acpu_clk_khz; i++)
+		{
+			len += sprintf(buf + len, "%8u: %4d\n", acpu_freq_tbl[i].acpu_clk_khz, acpu_freq_tbl[i].vdd_mv);
+		}
+		mutex_unlock(&drv_state.lock);
+	}
+	return len;
+}
+
+void acpuclk_set_vdd(unsigned int khz, int vdd)
+{
+	int i;
+	unsigned int new_vdd;
+	vdd = vdd / V_STEP * V_STEP;
+	mutex_lock(&drv_state.lock);
+	for (i = 0; acpu_freq_tbl[i].acpu_clk_khz; i++)
+	{
+		if (khz == 0)
+			new_vdd = min(max((acpu_freq_tbl[i].vdd_mv + vdd), SEMC_ACPU_MIN_UV_MV), SEMC_ACPU_MAX_UV_MV);
+		else if (acpu_freq_tbl[i].acpu_clk_khz == khz)
+			new_vdd = min(max((unsigned int)vdd, SEMC_ACPU_MIN_UV_MV), SEMC_ACPU_MAX_UV_MV);
+		else continue;
+
+		acpu_freq_tbl[i].vdd_mv = new_vdd;
+		acpu_freq_tbl[i].vdd_raw = VDD_RAW(new_vdd);
+	}
+	mutex_unlock(&drv_state.lock);
+}
+
+#endif
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 6f643138e..697ce42b6 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -213,6 +213,14 @@ config CPU_FREQ_GOV_CONSERVATIVE
 	  If in doubt, say N.
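The voltage hooks added to acpuclock-7x30.c above work as follows: acpuclk_set_vdd() first rounds the requested millivolt value toward zero to a multiple of V_STEP, then either applies it as a signed offset to every acpu_freq_tbl entry (when khz == 0) or overwrites the single entry whose acpu_clk_khz matches, clamping the result into the SEMC_ACPU_MIN_UV_MV..SEMC_ACPU_MAX_UV_MV window (750 to 1500 mV). The vdd_levels sysfs handlers added to cpufreq.c further below accept either form, a signed global delta such as "-50" or a "khz mv" pair such as "1024000 1075", and forward it to this function. The stand-alone sketch below is only a model of that arithmetic so it can be compiled and checked outside the kernel; the table rows and the 25 mV step are illustrative assumptions, not values taken from this patch.

/*
 * Minimal userspace model of the clamping/rounding done by acpuclk_set_vdd().
 * V_STEP and the table contents are assumed example values; only the
 * arithmetic mirrors the patch: round toward zero to a step multiple,
 * then clamp into [ACPU_MIN_UV_MV, ACPU_MAX_UV_MV].
 */
#include <stdio.h>

#define V_STEP         25      /* assumed regulator step size, mV */
#define ACPU_MIN_UV_MV 750     /* same limits as SEMC_ACPU_MIN/MAX_UV_MV */
#define ACPU_MAX_UV_MV 1500

struct freq_row {
	unsigned int acpu_clk_khz;
	int vdd_mv;
};

static struct freq_row tbl[] = {
	{  368640,  900 },
	{  768000,  975 },
	{ 1024000, 1100 },
	{ 0, 0 }               /* zero-khz terminator, as in acpu_freq_tbl */
};

static int clamp_mv(int mv)
{
	if (mv < ACPU_MIN_UV_MV)
		return ACPU_MIN_UV_MV;
	if (mv > ACPU_MAX_UV_MV)
		return ACPU_MAX_UV_MV;
	return mv;
}

/* khz == 0: offset every row by vdd; otherwise set only the matching row. */
static void set_vdd(unsigned int khz, int vdd)
{
	int i;

	vdd = vdd / V_STEP * V_STEP;   /* round toward zero to a V_STEP multiple */
	for (i = 0; tbl[i].acpu_clk_khz; i++) {
		if (khz == 0)
			tbl[i].vdd_mv = clamp_mv(tbl[i].vdd_mv + vdd);
		else if (tbl[i].acpu_clk_khz == khz)
			tbl[i].vdd_mv = clamp_mv(vdd);
	}
}

int main(void)
{
	int i;

	set_vdd(0, -50);        /* "-50" via sysfs: drop every level by 50 mV */
	set_vdd(1024000, 1075); /* "1024000 1075": pin one frequency's voltage */
	for (i = 0; tbl[i].acpu_clk_khz; i++)
		printf("%8u kHz: %4d mV\n", tbl[i].acpu_clk_khz, tbl[i].vdd_mv);
	return 0;
}

With the interface below, the same two operations would typically be issued as writes to the per-policy cpufreq sysfs file (commonly /sys/devices/system/cpu/cpu0/cpufreq/vdd_levels on this kind of build; the exact path depends on how cpufreq is mounted on the target).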
+config CPU_FREQ_VDD_LEVELS + bool "CPU Vdd levels sysfs interface" + depends on CPU_FREQ_STAT + depends on ARCH_MSM7X30 + default n + help + CPU Vdd levels sysfs interface + config CPU_FREQ_GOV_SMARTASS2 tristate "'smartassV2' cpufreq governor" depends on CPU_FREQ diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ff15497e9..7d92421c1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -557,6 +557,62 @@ static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf) return policy->governor->show_setspeed(policy, buf); } +#ifdef CONFIG_CPU_FREQ_VDD_LEVELS + +extern ssize_t acpuclk_get_vdd_levels_str(char *buf); +static ssize_t show_vdd_levels(struct cpufreq_policy *policy, char *buf) +{ + return acpuclk_get_vdd_levels_str(buf); +} + +extern void acpuclk_set_vdd(unsigned acpu_khz, int vdd); +static ssize_t store_vdd_levels(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + int i = 0, j; + int pair[2] = { 0, 0 }; + int sign = 0; + + if (count < 1) + return 0; + + if (buf[0] == '-') { + sign = -1; + i++; + } else if (buf[0] == '+') { + sign = 1; + i++; + } + + for (j = 0; i < count; i++) { + char c = buf[i]; + if ((c >= '0') && (c <= '9')) { + pair[j] *= 10; + pair[j] += (c - '0'); + } else if ((c == ' ') || (c == '\t')) { + if (pair[j] != 0) { + j++; + if ((sign != 0) || (j > 1)) + break; + } + } + else + break; + } + + if (sign != 0) { + if (pair[0] > 0) + acpuclk_set_vdd(0, sign * pair[0]); + } else { + if ((pair[0] > 0) && (pair[1] > 0)) + acpuclk_set_vdd((unsigned)pair[0], pair[1]); + else + return -EINVAL; + } + return count; +} + +#endif + /** * show_scaling_driver - show the current cpufreq HW/BIOS limitation */ @@ -586,6 +642,9 @@ cpufreq_freq_attr_rw(scaling_min_freq); cpufreq_freq_attr_rw(scaling_max_freq); cpufreq_freq_attr_rw(scaling_governor); cpufreq_freq_attr_rw(scaling_setspeed); +#ifdef CONFIG_CPU_FREQ_VDD_LEVELS +cpufreq_freq_attr_rw(vdd_levels); +#endif static struct attribute *default_attrs[] = { &cpuinfo_min_freq.attr, @@ -599,6 +658,9 @@ static struct attribute *default_attrs[] = { &scaling_driver.attr, &scaling_available_governors.attr, &scaling_setspeed.attr, + #ifdef CONFIG_CPU_FREQ_VDD_LEVELS + &vdd_levels.attr, + #endif NULL }; From 6196e001fa3a5a1fab6d3c0cb8a3a244a6feed59 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:35:07 +0200 Subject: [PATCH 14/19] change kgsl and adreno drivers to u8800pro drivers --- drivers/gpu/msm/adreno.c | 43 +-- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 ++--- drivers/gpu/msm/adreno_debugfs.c | 2 - drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 +-- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 +------------- drivers/gpu/msm/adreno_snapshot.c | 129 +------- drivers/gpu/msm/kgsl.c | 277 ++++++++---------- drivers/gpu/msm/kgsl.h | 63 ++-- drivers/gpu/msm/kgsl_cffdump.c | 184 ++++++++++++ drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 14 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++-- drivers/gpu/msm/kgsl_pwrctrl.h | 2 + drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++-------- drivers/gpu/msm/kgsl_sharedmem.h | 68 ++--- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 ++-- .../touchscreen/atmel_i2c_rmi_QT602240.c | 0 include/linux/msm_kgsl.h | 13 +- 26 files changed, 
547 insertions(+), 944 deletions(-) mode change 100644 => 100755 drivers/gpu/msm/adreno.c mode change 100644 => 100755 drivers/gpu/msm/adreno.h mode change 100644 => 100755 drivers/gpu/msm/adreno_a2xx.c mode change 100644 => 100755 drivers/gpu/msm/adreno_postmortem.c mode change 100644 => 100755 drivers/gpu/msm/kgsl.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_gpummu.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrscale.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_sharedmem.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_sharedmem.h mode change 100644 => 100755 drivers/gpu/msm/z180.c mode change 100755 => 100644 drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c mode change 100644 => 100755 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100644 new mode 100755 index 8320003d2..7ff5a4384 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,7 +114,6 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ - .ib_check_level = 0, }; @@ -274,12 +273,6 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ - /* - * Fix target freeze issue by adding TLB flush for each submit - * on A20X based targets. - */ - if (adreno_is_a20x(adreno_dev)) - flags |= KGSL_MMUFLAGS_TLBFLUSH; /* * If possible, then set the state via the command stream to avoid * a CPU idle. Otherwise, use the default setstate which uses register @@ -645,8 +638,6 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; - struct kgsl_context *context; - int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -715,24 +706,6 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; - /* - * Set the reset status of all contexts to - * INNOCENT_CONTEXT_RESET_EXT except for the bad context - * since thats the guilty party - */ - while ((context = idr_get_next(&device->context_idr, &next))) { - if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != - context->reset_status) { - if (context->devctxt != drawctxt) - context->reset_status = - KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; - else - context->reset_status = - KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; - } - next = next + 1; - } - /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -895,13 +868,15 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); + msecs_to_jiffies(adreno_dev->wait_timeout); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - + /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -909,6 +884,8 @@ int 
adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -921,6 +898,7 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } + /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -987,7 +965,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -1014,7 +992,8 @@ struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, + sizeof(unsigned int)); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1058,7 +1037,7 @@ struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - struct kgsl_memdesc *memdesc; + const struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100644 new mode 100755 index 1259507d9..af5bf51ea --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,8 +46,6 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 -#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 - enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -76,16 +74,12 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; - unsigned int ib_check_level; }; struct adreno_gpudev { - /* keeps track of when we need to execute the draw workaround code */ - int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); - void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -105,7 +99,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100644 new mode 100755 index 62628e4a3..5ce9cf85b --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,61 +1421,11 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } -static void 
a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) -{ - struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[11]; - unsigned int *cmds = &cmd[0]; - - if (adreno_is_a225(adreno_dev)) { - adreno_dev->gpudev->ctx_switches_since_last_draw++; - /* If there have been > than - * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context - * switches w/o gmem being saved then we need to execute - * this workaround */ - if (adreno_dev->gpudev->ctx_switches_since_last_draw > - ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) - adreno_dev->gpudev->ctx_switches_since_last_draw = 0; - else - return; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. - */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - } else { - /* On Adreno 20x/220, if the events for shader space reuse - * gets dropped, the CP block would wait indefinitely. - * Sending CP_SET_SHADER_BASES packet unblocks the CP from - * this wait. - */ - *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); - *cmds++ = adreno_encode_istore_size(adreno_dev) - | adreno_dev->pix_shader_start; - } - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], cmds - cmd); -} - static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[22]; if (context == NULL) return; @@ -1520,11 +1470,33 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } - adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a2xx(adreno_dev)) - a2xx_drawctxt_workaround(adreno_dev); + } else if (adreno_is_a225(adreno_dev)) { + unsigned int *cmds = &cmd[0]; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. 
+ */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], 11); + } } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1785,7 +1757,6 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, - .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index 566efa1aa..c1b9e4ce2 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,8 +345,6 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); - debugfs_create_u32("ib_check", 0644, device->d_debugfs, - &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index f0b5741b5..206a678ee 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,13 +243,8 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) { - if (adreno_dev->gpudev->ctxt_draw_workaround && - adreno_is_a225(adreno_dev)) - adreno_dev->gpudev->ctxt_draw_workaround( - adreno_dev); + if (adreno_dev->drawctxt_active == drawctxt) return; - } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 454b05785..8aea58c95 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,6 +29,11 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 +/* indirect buffer dispatch. prefetch parser uses this packet type to determine +* whether to pre-fetch the IB +*/ +#define CP_INDIRECT_BUFFER 0x3f + /* indirect buffer dispatch. 
same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -112,9 +117,6 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e -/* (A2x) sets binning configuration registers */ -#define CP_SET_BIN_DATA 0x2f - /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -155,16 +157,6 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ -/* - * for a3xx - */ - -/* Conditionally load a IB based on a flag */ -#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ -#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ - -/* Load a buffer with pre-fetch enabled */ -#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -186,20 +178,11 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) -#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) - -#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) -#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) - -#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) - -#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) -#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) +#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -207,14 +190,4 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) - -/* Return 1 if the command is an indirect buffer of any kind */ -static inline int adreno_cmd_is_ib(unsigned int cmd) -{ - return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || - cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || - cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || - cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); -} - #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100644 new mode 100755 index 63f5caa91..40dfb30cf --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, + {CP_INDIRECT_BUFFER, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (adreno_cmd_is_ib(value)) { + if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (adreno_cmd_is_ib(this_cmd)) { + if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,7 +654,8 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = 
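As an aside on the packet-header macros above: the following is a minimal, standalone sketch of how cp_type3_packet() packs a PM4 type-3 header and how the removed type3_pkt_size()/cp_type3_opcode() helpers would unpack it again. The CP_TYPE3_PKT value of 0xC0000000 is an assumption inferred from the removed pkt_is_type3() mask; the opcode values used are the ones visible in this header. The last line also checks the dword arithmetic behind the 11-dword A225 empty-draw sequence issued in the a2xx_drawctxt_save() hunk earlier.

/*
 * Standalone sketch, not part of the patch.  CP_TYPE3_PKT = 0xC0000000 is
 * assumed from the pkt_is_type3() mask; opcodes are taken from this header.
 */
#include <stdio.h>
#include <stdint.h>

#define CP_TYPE3_PKT            0xC0000000u    /* assumed "type 3" encoding */
#define CP_INDIRECT_BUFFER_PFD  0x37

#define cp_type3_packet(opcode, cnt) \
	(CP_TYPE3_PKT | (((cnt) - 1) << 16) | ((opcode) << 8))

/* mirrors of the decode helpers removed by this hunk */
#define cp_type3_opcode(pkt)    (((pkt) >> 8) & 0xFF)
#define type3_pkt_size(pkt)     ((((pkt) >> 16) & 0x3FFF) + 1)

int main(void)
{
	uint32_t hdr = cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2);

	printf("header 0x%08x: opcode 0x%02x, %u payload dwords\n",
	       hdr, cp_type3_opcode(hdr), type3_pkt_size(hdr));

	/*
	 * The A225 empty-draw workaround issues SET_CONSTANT (2 payload
	 * dwords), DRAW_INDX (5) and WAIT_FOR_IDLE (1); with one header
	 * dword each that is 3 + 6 + 2 = 11 dwords, matching the count
	 * passed to adreno_ringbuffer_issuecmds().
	 */
	printf("workaround size: %d dwords\n", (2 + 1) + (5 + 1) + (1 + 1));
	return 0;
}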
NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (adreno_cmd_is_ib(this_cmd)) { + if (this_cmd == cp_type3_packet( + CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index 57883fd45..ea2889b3a 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,7 +22,6 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" -#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -311,10 +310,12 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); + /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); + /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -553,197 +554,6 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } -static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, - int sizedwords); - -static bool -_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) -{ - unsigned int opcode = cp_type3_opcode(*hostaddr); - switch (opcode) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER_PFE: - case CP_COND_INDIRECT_BUFFER_PFE: - case CP_COND_INDIRECT_BUFFER_PFD: - return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); - case CP_NOP: - case CP_WAIT_FOR_IDLE: - case CP_WAIT_REG_MEM: - case CP_WAIT_REG_EQ: - case CP_WAT_REG_GTE: - case CP_WAIT_UNTIL_READ: - case CP_WAIT_IB_PFD_COMPLETE: - case CP_REG_RMW: - case CP_REG_TO_MEM: - case CP_MEM_WRITE: - case CP_MEM_WRITE_CNTR: - case CP_COND_EXEC: - case CP_COND_WRITE: - case CP_EVENT_WRITE: - case CP_EVENT_WRITE_SHD: - case CP_EVENT_WRITE_CFL: - case CP_EVENT_WRITE_ZPD: - case CP_DRAW_INDX: - case CP_DRAW_INDX_2: - case CP_DRAW_INDX_BIN: - case CP_DRAW_INDX_2_BIN: - case CP_VIZ_QUERY: - case CP_SET_STATE: - case CP_SET_CONSTANT: - case CP_IM_LOAD: - case CP_IM_LOAD_IMMEDIATE: - case CP_LOAD_CONSTANT_CONTEXT: - case CP_INVALIDATE_STATE: - case CP_SET_SHADER_BASES: - case CP_SET_BIN_MASK: - case CP_SET_BIN_SELECT: - case CP_SET_BIN_BASE_OFFSET: - case CP_SET_BIN_DATA: - case CP_CONTEXT_UPDATE: - case CP_INTERRUPT: - case CP_IM_STORE: - break; - /* these shouldn't come from userspace */ - case CP_ME_INIT: - case CP_SET_PROTECTED_MODE: - default: - KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); - return false; - break; - } - - return true; -} - -static bool -_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) -{ - unsigned int reg = type0_pkt_offset(*hostaddr); - unsigned int cnt = type0_pkt_size(*hostaddr); - if (reg < 0x0192 || (reg + cnt) >= 0x8000) { - KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", - reg, cnt); - return false; - } - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -static bool _parse_ibs(struct kgsl_device_private *dev_priv, - uint gpuaddr, int sizedwords) -{ - static uint level; /* recursion level */ - bool ret = false; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - struct kgsl_mem_entry *entry; - - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CMD_ERR(dev_priv->device, - "no mapping for gpuaddr: 0x%08x\n", gpuaddr); - return false; - } - - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CMD_ERR(dev_priv->device, - "no mapping for gpuaddr: 0x%08x\n", gpuaddr); - return false; - } - - hoststart = hostaddr; - - level++; - - KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); - - mb(); - while (dwords_left > 0) { - bool cur_ret = true; - int count = 0; /* dword count including packet header */ - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - cur_ret = _handle_type0(dev_priv, hostaddr); - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = _handle_type3(dev_priv, hostaddr); - break; - default: - KGSL_CMD_ERR(dev_priv->device, "unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - - if (!cur_ret) { - KGSL_CMD_ERR(dev_priv->device, - "bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - if (ADRENO_DEVICE(dev_priv->device)->ib_check_level - >= 2) - print_hex_dump(KERN_ERR, - level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - goto done; - } - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - if (dwords_left < 0) { - KGSL_CMD_ERR(dev_priv->device, - "bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - if (ADRENO_DEVICE(dev_priv->device)->ib_check_level - >= 2) - print_hex_dump(KERN_ERR, - level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - goto done; - } - } - - ret = true; -done: - if (!ret) - KGSL_DRV_ERR(dev_priv->device, - "parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - level--; - - return ret; -} - int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -791,12 +601,9 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - if (unlikely(adreno_dev->ib_check_level >= 1 && - !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, - ibdesc[i].sizedwords))) { - kfree(link); - return -EINVAL; - } + (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); + *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -950,20 +757,8 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - - /* - * If other context switches were already lost and - * and the current context is the one that is hanging, - * then we cannot recover. Print an error message - * and leave. - */ - - if ((copy_rb_contents == 0) && (value == cur_context)) { - KGSL_DRV_ERR(device, "GPU recovery could not " - "find the previous context\n"); - return -EINVAL; - } - + BUG_ON((copy_rb_contents == 0) && + (value == cur_context)); /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index c45dbff48..fb88a72bd 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,19 +45,11 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* - * Sometimes IBs can be reused in the same dump. Because we parse from - * oldest to newest, if we come across an IB that has already been used, - * assume that it has been reused and update the list with the newest - * size. 
- */ - + /* Go through the list and see that object has already been seen */ for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) { - objbuf[index].dwords = dwords; - return; - } + objbuf[index].ptbase == ptbase) + return; } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -85,25 +77,6 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } -/* - * Return a 1 if the specified object is already on the list of buffers - * to be dumped - */ - -static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) -{ - int index; - - for (index = 0; index < objbufptr; index++) { - if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase && - objbuf[index].type == type) - return 1; - } - - return 0; -} - /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -140,7 +113,6 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; - int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -186,52 +158,8 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - /* - * We can only reliably dump IBs from the beginning of the context, - * and it turns out that for the vast majority of the time we really - * only care about the current context when it comes to diagnosing - * a hang. So, with an eye to limiting the buffer dumping to what is - * really useful find the beginning of the context and only dump - * IBs from that point - */ - - index = rptr; - ib_parse_start = start; - rbptr = rb->buffer_desc.hostptr; - - while (index != start) { - index--; - - if (index < 0) { - /* - * The marker we are looking for is 2 dwords long, so - * when wrapping, go back 2 from the end so we don't - * access out of range in the if statement below - */ - index = rb->sizedwords - 2; - - /* - * Account for the possibility that start might be at - * rb->sizedwords - 1 - */ - - if (start == rb->sizedwords - 1) - break; - } - - /* - * Look for a NOP packet with the context switch identifier in - * the second dword - */ - - if (rbptr[index] == cp_nop_packet(1) && - rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { - ib_parse_start = index; - break; - } - } - index = start; + rbptr = rb->buffer_desc.hostptr; /* * Loop through the RB, copying the data and looking for indirect @@ -241,18 +169,15 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - /* Only parse IBs between the context start and the rptr */ - - if (index == ib_parse_start) - parse_ibs = 1; - - if (index == rptr) - parse_ibs = 0; - - if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) + if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); + /* + * FIXME: Handle upcoming MMU pagetable changes, but only + * between the rptr and the wptr + */ + index = index + 1; if (index == rb->sizedwords) @@ -303,9 +228,10 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (adreno_cmd_is_ib(*src)) + if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { 
push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, *(src + 1), *(src + 2)); + } src++; dst++; @@ -362,45 +288,22 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the last IB1 that was being executed is dumped. - * Since this was the last IB1 that was processed, we should have - * already added it to the list during the ringbuffer parse but we - * want to be double plus sure. + * Make sure that the IBs described in the CP registers are on the + * list of objects */ - kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - /* - * The problem is that IB size from the register is the unprocessed size - * of the buffer not the original size, so if we didn't catch this - * buffer being directly used in the RB, then we might not be able to - * dump the whle thing. Print a warning message so we can try to - * figure how often this really happens. - */ - - if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { + if (ibsize) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); - KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " - "Dumping %x dwords of the buffer.\n", ibsize); - } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - /* - * Add the last parsed IB2 to the list. The IB2 should be found as we - * parse the objects below, but we try to add it to the list first, so - * it too can be parsed. Don't print an error message in this case - if - * the IB2 is found during parsing, the list will be updated with the - * correct size. - */ - - if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { + if (ibsize) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); - } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100644 new mode 100755 index 39dad925f..d51128979 --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,10 +21,11 @@ #include #include #include -#include + #include #include #include +#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -193,28 +194,8 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { - struct rb_node **node; - struct rb_node *parent = NULL; - spin_lock(&process->mem_lock); - - node = &process->mem_rb.rb_node; - - while (*node) { - struct kgsl_mem_entry *cur; - - parent = *node; - cur = rb_entry(parent, struct kgsl_mem_entry, node); - - if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entry->node, parent, node); - rb_insert_color(&entry->node, &process->mem_rb); - + list_add(&entry->list, &process->mem_list); spin_unlock(&process->mem_lock); entry->priv = process; @@ -424,10 +405,6 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -537,8 +514,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - device->pwrctrl.restore_slumber = 0; kgsl_pwrctrl_wake(device); + device->pwrctrl.restore_slumber = 0; kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -571,7 +548,8 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - private->mem_rb = RB_ROOT; + + INIT_LIST_HEAD(&private->mem_list); if (kgsl_mmu_enabled()) { @@ -600,7 +578,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct rb_node *node; + struct kgsl_mem_entry *entry_tmp = NULL; if (!private) return; @@ -614,13 +592,11 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - for (node = rb_first(&private->mem_rb); node; ) { - entry = rb_entry(node, struct kgsl_mem_entry, node); - node = rb_next(&entry->node); - - rb_erase(&entry->node, &private->mem_rb); + list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { + list_del(&entry->list); kgsl_mem_entry_put(entry); } + kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -746,42 +722,46 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } + /*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, size_t size) +static struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) { - struct rb_node *node = private->mem_rb.rb_node; - - while (node != NULL) { - struct kgsl_mem_entry *entry; - - entry = rb_entry(node, struct kgsl_mem_entry, node); + struct kgsl_mem_entry *entry = NULL, *result = NULL; + BUG_ON(private == NULL); - if 
(kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) - return entry; + gpuaddr &= PAGE_MASK; - if (gpuaddr < entry->memdesc.gpuaddr) - node = node->rb_left; - else if (gpuaddr >= - (entry->memdesc.gpuaddr + entry->memdesc.size)) - node = node->rb_right; - else { - return NULL; + list_for_each_entry(entry, &private->mem_list, list) { + if (entry->memdesc.gpuaddr == gpuaddr) { + result = entry; + break; } } - - return NULL; + return result; } -EXPORT_SYMBOL(kgsl_sharedmem_find_region); /*call with private->mem_lock locked */ -static inline struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, + size_t size) { - return kgsl_sharedmem_find_region(private, gpuaddr, 1); + struct kgsl_mem_entry *entry = NULL, *result = NULL; + + BUG_ON(private == NULL); + + list_for_each_entry(entry, &private->mem_list, list) { + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { + result = entry; + break; + } + } + + return result; } +EXPORT_SYMBOL(kgsl_sharedmem_find_region); /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, @@ -809,40 +789,6 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } - case KGSL_PROP_GPU_RESET_STAT: - { - /* Return reset status of given context and clear it */ - uint32_t id; - struct kgsl_context *context; - - if (param->sizebytes != sizeof(unsigned int)) { - result = -EINVAL; - break; - } - /* We expect the value passed in to contain the context id */ - if (copy_from_user(&id, param->value, - sizeof(unsigned int))) { - result = -EFAULT; - break; - } - context = kgsl_find_context(dev_priv, id); - if (!context) { - result = -EINVAL; - break; - } - /* - * Copy the reset status to value which also serves as - * the out parameter - */ - if (copy_to_user(param->value, &(context->reset_status), - sizeof(unsigned int))) { - result = -EFAULT; - break; - } - /* Clear reset status once its been queried */ - context->reset_status = KGSL_CTX_STAT_NO_ERROR; - break; - } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -881,6 +827,40 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } +static bool check_ibdesc(struct kgsl_device_private *dev_priv, + struct kgsl_ibdesc *ibdesc, unsigned int numibs, + bool parse) +{ + bool result = true; + unsigned int i; + for (i = 0; i < numibs; i++) { + struct kgsl_mem_entry *entry; + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_DRV_ERR(dev_priv->device, + "invalid cmd buffer gpuaddr %08x " \ + "sizedwords %d\n", ibdesc[i].gpuaddr, + ibdesc[i].sizedwords); + result = false; + break; + } + + if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { + KGSL_DRV_ERR(dev_priv->device, + "invalid cmd buffer gpuaddr %08x " \ + "sizedwords %d numibs %d/%d\n", + ibdesc[i].gpuaddr, + ibdesc[i].sizedwords, i+1, numibs); + result = false; + break; + } + } + return result; +} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -950,6 +930,12 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } + if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { + KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); + result = -EINVAL; + goto free_ibdesc; + } + result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -959,6 +945,18 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); + if (result != 0) + goto free_ibdesc; + + /* this is a check to try to detect if a command buffer was freed + * during issueibcmds(). + */ + if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { + KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); + result = -EINVAL; + goto free_ibdesc; + } + free_ibdesc: kfree(ibdesc); done: @@ -990,7 +988,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - rb_erase(&entry->node, &entry->priv->mem_rb); + list_del(&entry->list); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1082,8 +1080,7 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - rb_erase(&entry->node, &private->mem_rb); - + list_del(&entry->list); spin_unlock(&private->mem_lock); if (entry) { @@ -1167,7 +1164,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, + result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1175,10 +1172,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); + result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); if (result) { - KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); - goto error_free_alloc; + KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); + goto error_free_vmalloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1193,7 +1190,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_alloc: +error_free_vmalloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1316,8 +1313,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = kgsl_sg_alloc(sglen); - + memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) return -ENOMEM; @@ -1357,7 +1353,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - kgsl_sg_free(memdesc->sg, sglen); + vfree(memdesc->sg); memdesc->sg = NULL; return -EINVAL; @@ -1492,8 +1488,11 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (IS_ERR_OR_NULL(kgsl_ion_client)) - return -ENODEV; + if (kgsl_ion_client == NULL) { + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + if (kgsl_ion_client == NULL) + return -ENODEV; + } handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1623,20 +1622,10 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; -error_put_file_ptr: - switch (entry->memtype) { - case 
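The check_ibdesc() helper above accepts an IB only if kgsl_sharedmem_find_region() can map (gpuaddr, sizedwords * sizeof(uint)) onto a single known allocation. Below is a hedged, userspace-only sketch of that containment test; struct range is a stand-in rather than the real struct kgsl_memdesc, and the exact bounds logic of kgsl_gpuaddr_in_memdesc() is assumed, not copied from the driver.

/*
 * Sketch of the containment test check_ibdesc() depends on: an IB is only
 * valid if it lies entirely inside one known allocation.  The wrap-safe
 * formulation below is an assumption about what the real helper must do.
 */
#include <stdio.h>
#include <stdint.h>

struct range {
	uint32_t gpuaddr;
	uint32_t size;          /* bytes */
};

static int range_contains(const struct range *r, uint32_t gpuaddr,
			  uint32_t sizebytes)
{
	return gpuaddr >= r->gpuaddr &&
	       sizebytes <= r->size &&
	       gpuaddr - r->gpuaddr <= r->size - sizebytes;
}

int main(void)
{
	struct range alloc = { .gpuaddr = 0x40000000u, .size = 0x1000 };

	/* 512 dwords starting 0x800 into the allocation: fits exactly */
	printf("%d\n", range_contains(&alloc, 0x40000800u, 512 * 4));
	/* one dword more runs past the end of the allocation */
	printf("%d\n", range_contains(&alloc, 0x40000800u, 513 * 4));
	return 0;
}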
KGSL_MEM_ENTRY_PMEM: - case KGSL_MEM_ENTRY_ASHMEM: - if (entry->priv_data) - fput(entry->priv_data); - break; - case KGSL_MEM_ENTRY_ION: - ion_unmap_dma(kgsl_ion_client, entry->priv_data); - ion_free(kgsl_ion_client, entry->priv_data); - break; - default: - break; - } + error_put_file_ptr: + if (entry->priv_data) + fput(entry->priv_data); + error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2040,7 +2029,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *entry = NULL; + struct kgsl_mem_entry *tmp, *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2051,11 +2040,13 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - entry = kgsl_sharedmem_find(private, vma_offset); - - if (entry) - kgsl_mem_entry_get(entry); - + list_for_each_entry(tmp, &private->mem_list, list) { + if (vma_offset == tmp->memdesc.gpuaddr) { + kgsl_mem_entry_get(tmp); + entry = tmp; + break; + } + } spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2111,8 +2102,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - wake_lock_destroy(&device->idle_wakelock); - pm_qos_remove_request(&device->pm_qos_req_dma); + if (cpu_is_msm8x60()) + wake_lock_destroy(&device->idle_wakelock); idr_destroy(&device->context_idr); @@ -2203,9 +2194,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); - pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, - PM_QOS_DEFAULT_VALUE); + if (cpu_is_msm8x60()) + wake_lock_init(&device->idle_wakelock, + WAKE_LOCK_IDLE, device->name); idr_init(&device->context_idr); @@ -2251,8 +2242,6 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2350,30 +2339,22 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); - kgsl_driver.ptpool = NULL; + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); + kgsl_mmu_ptpool_destroy(&kgsl_driver.ptpool); + kgsl_driver.ptpool = NULL; - /* - * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() - * only if kgsl_driver.virtdev has been populated. - * We check at least one member of kgsl_driver.virtdev to - * see if it is not NULL (and thus, has been populated). 
- */ - if (kgsl_driver.virtdev.class) { - kgsl_sharedmem_uninit_sysfs(); - device_unregister(&kgsl_driver.virtdev); - } + device_unregister(&kgsl_driver.virtdev); if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + kgsl_sharedmem_uninit_sysfs(); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index f027f95c4..d3ae4b9bb 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,12 +21,13 @@ #include #include #include -#include #define KGSL_NAME "kgsl" -/* Timestamp window used to detect rollovers (half of integer range) */ +/*< DTS2012042406822 hanfeng 20120428 begin*/ +/* Timestamp window used to detect rollovers */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 +/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -95,8 +96,6 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; - unsigned int page_alloc; - unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -108,15 +107,7 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc; - -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); - int (*map_kernel_mem)(struct kgsl_memdesc *); -}; +struct kgsl_memdesc_ops; /* shared memory allocation */ struct kgsl_memdesc { @@ -145,7 +136,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct rb_node node; + struct list_head list; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -195,47 +186,27 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } - -static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) -{ - if (memdesc->hostptr == NULL && memdesc->ops && - memdesc->ops->map_kernel_mem) - memdesc->ops->map_kernel_mem(memdesc); - - return memdesc->hostptr; -} - -static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, +static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->gpuaddr == 0 || - gpuaddr < memdesc->gpuaddr || - gpuaddr >= (memdesc->gpuaddr + memdesc->size) || - (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && - memdesc->ops->map_kernel_mem(memdesc))) - return NULL; + if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || + (gpuaddr < memdesc->gpuaddr || + gpuaddr >= memdesc->gpuaddr + memdesc->size)) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int a, unsigned int b) +static inline int timestamp_cmp(unsigned int new, unsigned int old) { - /* check for equal */ - if (a == b) - return 0; + int ts_diff = new - old; - /* check for greater-than for non-rollover case */ - if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) - return 1; + if (ts_diff == 0) + return 0; - /* check for greater-than for rollover case - * note that <= is required to ensure that consistent - * results are returned for values whose difference is - * equal to the window size - */ - a += KGSL_TIMESTAMP_WINDOW; - b += KGSL_TIMESTAMP_WINDOW; 
- return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; + /*< DTS2012042406822 hanfeng 20120428 begin*/ + return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; + /* DTS2012042406822 hanfeng 20120428 end > */ } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index 77aef1ff0..e9455cb82 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,6 +497,190 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); +#define ADDRESS_STACK_SIZE 256 +#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) +static unsigned int kgsl_cffdump_addr_count; + +static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, + uint *hostaddr, bool check_only) +{ + static uint addr_stack[ADDRESS_STACK_SIZE]; + static uint size_stack[ADDRESS_STACK_SIZE]; + + switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER: + { + /* traverse indirect buffers */ + int i; + uint ibaddr = hostaddr[1]; + uint ibsize = hostaddr[2]; + + /* is this address already in encountered? */ + for (i = 0; + i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; + ++i) + ; + + if (kgsl_cffdump_addr_count == i) { + addr_stack[kgsl_cffdump_addr_count] = ibaddr; + size_stack[kgsl_cffdump_addr_count++] = ibsize; + + if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { + KGSL_CORE_ERR("stack overflow\n"); + return false; + } + + return kgsl_cffdump_parse_ibs(dev_priv, NULL, + ibaddr, ibsize, check_only); + } else if (size_stack[i] != ibsize) { + KGSL_CORE_ERR("gpuaddr: 0x%08x, " + "wc: %u, with size wc: %u already on the " + "stack\n", ibaddr, ibsize, size_stack[i]); + return false; + } + } + break; + } + + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
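The reworked timestamp_cmp() in the kgsl.h hunk above replaces the old window-shifting comparison with an unsigned subtraction followed by a sign test against KGSL_TIMESTAMP_WINDOW. A minimal sketch of that rollover handling, collapsing the kernel helper's three-way result into a boolean:

/*
 * Sketch of the rollover-safe comparison idea: two 32-bit timestamps are
 * ordered by their forward distance modulo 2^32.  KGSL_TIMESTAMP_WINDOW
 * (half the counter range) marks where "ahead" turns into "behind".
 */
#include <stdio.h>
#include <stdint.h>

#define KGSL_TIMESTAMP_WINDOW 0x80000000u

static int ts_newer(uint32_t new_ts, uint32_t old_ts)
{
	uint32_t d = new_ts - old_ts;   /* wraps modulo 2^32 */

	return d != 0 && d < KGSL_TIMESTAMP_WINDOW;
}

int main(void)
{
	printf("%d\n", ts_newer(200, 100));             /* 1: plainly newer */
	printf("%d\n", ts_newer(0x10u, 0xFFFFFFF0u));   /* 1: newer across the wrap */
	printf("%d\n", ts_newer(0xFFFFFFF0u, 0x10u));   /* 0: older despite being larger */
	return 0;
}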
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, + const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, + bool check_only) +{ + static uint level; /* recursion level */ + bool ret = true; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + + if (level == 0) + kgsl_cffdump_addr_count = 0; + + if (memdesc == NULL) { + struct kgsl_mem_entry *entry; + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CORE_ERR("did not find mapping " + "for gpuaddr: 0x%08x\n", gpuaddr); + return true; + } + memdesc = &entry->memdesc; + } + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CORE_ERR("no kernel mapping for " + "gpuaddr: 0x%08x\n", gpuaddr); + return true; + } + + hoststart = hostaddr; + + level++; + + mb(); + kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, + KGSL_CACHE_OP_INV); +#ifdef DEBUG + pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); +#endif + + while (dwords_left > 0) { + int count = 0; /* dword count including packet header */ + bool cur_ret = true; + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = kgsl_cffdump_handle_type3(dev_priv, + hostaddr, check_only); + break; + default: + pr_warn("kgsl: cffdump: parse-ib: unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + +#ifdef DEBUG + if (!cur_ret) { + pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + } +#endif + ret = ret && cur_ret; + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + cur_ret = dwords_left >= 0; + +#ifdef DEBUG + if (!cur_ret) { + pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + + print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + } +#endif + + ret = ret && cur_ret; + } + + if (!ret) + pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + if (!check_only) { +#ifdef DEBUG + uint offset = gpuaddr - memdesc->gpuaddr; + pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " + "physaddr:%08x, offset:%d, size:%d", hoststart, + gpuaddr, memdesc->physaddr + offset, offset, + sizedwords*4); +#endif + kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, + false); + } + + level--; + + return ret; +} + static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index efec2af9e..2fb1e43f4 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,7 +15,6 @@ #include #include -#include #include #include "kgsl.h" @@ -185,7 +184,6 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; - struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -199,18 +197,13 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; - /* - * Status indicating whether a gpu reset occurred and whether this - * context was responsible for causing it - */ - unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct rb_root mem_rb; + struct list_head mem_list; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index ba48f9c75..dba2dfcfb 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,9 +295,8 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - result = kgsl_sharedmem_page_alloc(&priv->memdesc, - priv->pagetable, - obj->size * priv->bufcount, 0); + priv->memdesc.hostptr = + vmalloc_user(obj->size * priv->bufcount); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1043,18 +1042,17 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset; + unsigned long offset, pg; struct page *page; - int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - i = offset >> PAGE_SHIFT; - page = sg_page(&(priv->memdesc.sg[i])); + pg = (unsigned long) priv->memdesc.hostptr + offset; + page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100644 new mode 100755 index f038f0491..a16b95418 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; - return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt->priv; + return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,14 +398,14 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt; + struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) + pt->priv; if (pt == NULL) return 0; - gpummu_pt = pt->priv; spin_lock(&pt->lock); - if (gpummu_pt->tlb_flags & (1<tlb_flags && (1<tlb_flags &= ~(1<sg, s, memdesc->sglen, i) { - unsigned int paddr = kgsl_get_sg_pa(s); + unsigned int paddr = sg_phys(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index 5646d682a..e4e561cef 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt ? pt->priv : NULL; - return domain && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt->priv; + return pt && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, (IOMMU_READ | IOMMU_WRITE)); + memdesc->size, 0); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100644 new mode 100755 index 429e4946d..003afb953 --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -24,7 +25,6 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 -#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,7 +284,10 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); +/*< DTS2011123005723 hanfeng 20111230 begin*/ +/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); +/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -334,8 +337,7 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, - int requested_state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -347,7 +349,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, if 
(pwr->grp_clks[i]) clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (requested_state != KGSL_STATE_NAP)) + (device->requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -422,12 +424,8 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) { - int status = regulator_enable(pwr->gpu_reg); - if (status) - KGSL_DRV_ERR(device, "regulator_enable " - "failed: %d\n", status); - } + if (pwr->gpu_reg) + regulator_enable(pwr->gpu_reg); } } } @@ -514,7 +512,10 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -637,8 +638,10 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) + /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -705,8 +708,10 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); - kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + kgsl_pwrctrl_set_state(device, device->requested_state); + /* DTS2012041906630 zhangxiangdang 20120423 end > */ if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -748,11 +753,10 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -779,18 +783,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - device->pwrctrl.restore_slumber = true; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ device->ftbl->suspend_context(device); device->ftbl->stop(device); + device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -852,17 +856,16 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - wake_lock(&device->idle_wakelock); - if (device->pwrctrl.restore_slumber == false) - pm_qos_update_request(&device->pm_qos_req_dma, - GPU_SWFI_LATENCY); + + if (device->idle_wakelock.name) + wake_lock(&device->idle_wakelock); case KGSL_STATE_ACTIVE: break; default: @@ -878,7 +881,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -887,7 +890,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100644 new mode 100755 index caaed92c8..0c7ec6003 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,7 +47,9 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100644 new mode 100755 index d0b2a412c..c2252edcf --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,8 +89,10 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } - +/*< DTS2011123005723 hanfeng 20111230 begin*/ +/*modify the file permission */ PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); +/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100644 new mode 100755 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100644 new mode 100755 index ae32e81ff..389ed6d4f --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -203,10 +201,6 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; - else if (!strncmp(attr->attr.name, "page_alloc", 10)) - val = kgsl_driver.stats.page_alloc; - else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) - val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -236,8 +230,6 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); -DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); -DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -247,8 +239,6 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, - &dev_attr_page_alloc, - &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -292,7 +282,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = kgsl_get_sg_pa(s); + unsigned int paddr = sg_phys(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -303,18 +293,17 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset; + unsigned long offset, pg; struct page *page; - int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; + pg = (unsigned long) memdesc->hostptr + offset; - i = offset >> PAGE_SHIFT; - page = sg_page(&memdesc->sg[i]); + page = vmalloc_to_page((void *) pg); if (page == NULL) return VM_FAULT_SIGBUS; @@ -324,23 +313,15 @@ static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) { - int i = 0; - struct scatterlist *sg; - kgsl_driver.stats.page_alloc -= memdesc->size; - if (memdesc->hostptr) { - vunmap(memdesc->hostptr); - kgsl_driver.stats.vmalloc -= 
memdesc->size; - } - if (memdesc->sg) - for_each_sg(memdesc->sg, sg, memdesc->sglen, i) - __free_page(sg_page(sg)); + kgsl_driver.stats.vmalloc -= memdesc->size; + vfree(memdesc->hostptr); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -348,42 +329,6 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } -/* - * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address - * space - * - * @memdesc - The memory descriptor which contains information about the memory - * - * Return: 0 on success else error code - */ -static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) -{ - if (!memdesc->hostptr) { - pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); - struct page **pages = NULL; - struct scatterlist *sg; - int i; - /* create a list of pages to call vmap */ - pages = vmalloc(memdesc->sglen * sizeof(struct page *)); - if (!pages) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", - memdesc->sglen * sizeof(struct page *)); - return -ENOMEM; - } - for_each_sg(memdesc->sg, sg, memdesc->sglen, i) - pages[i] = sg_page(sg); - memdesc->hostptr = vmap(pages, memdesc->sglen, - VM_IOREMAP, page_prot); - KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); - vfree(pages); - } - if (!memdesc->hostptr) - return -ENOMEM; - - return 0; -} - static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -423,13 +368,12 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_page_alloc_ops = { - .free = kgsl_page_alloc_free, - .vmflags = kgsl_page_alloc_vmflags, - .vmfault = kgsl_page_alloc_vmfault, - .map_kernel_mem = kgsl_page_alloc_map_kernel, +struct kgsl_memdesc_ops kgsl_vmalloc_ops = { + .free = kgsl_vmalloc_free, + .vmflags = kgsl_vmalloc_vmflags, + .vmfault = kgsl_vmalloc_vmfault, }; -EXPORT_SYMBOL(kgsl_page_alloc_ops); +EXPORT_SYMBOL(kgsl_vmalloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -463,9 +407,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - size_t size, unsigned int protflags) + void *ptr, size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -474,43 +418,36 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_page_alloc_ops; - - memdesc->sg = kgsl_sg_alloc(sglen); + memdesc->ops = &kgsl_vmalloc_ops; + memdesc->hostptr = (void *) ptr; + memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", - sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } - kmemleak_not_leak(memdesc->sg); - memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | - __GFP_HIGHMEM); + for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { + struct page *page = vmalloc_to_page(ptr); if (!page) { - ret = -ENOMEM; - memdesc->sglen = i; + ret = -EINVAL; goto done; } - flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 
0); } - outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, - KGSL_CACHE_OP_FLUSH); + + kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, - kgsl_driver.stats.page_alloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); order = get_order(size); @@ -525,41 +462,51 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - int ret = 0; + void *ptr; + BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); + ptr = vmalloc(size); + + if (ptr == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", size); + return -ENOMEM; + } - ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, + return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); - if (!ret) - ret = kgsl_page_alloc_map_kernel(memdesc); - if (ret) - kgsl_sharedmem_free(memdesc); - return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); +EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); int -kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { + void *ptr; unsigned int protflags; BUG_ON(size == 0); + ptr = vmalloc_user(size); + + if (ptr == NULL) { + KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", + size, kgsl_driver.stats.vmalloc); + return -ENOMEM; + } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, + return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -607,7 +554,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - kgsl_sg_free(memdesc->sg, memdesc->sglen); + vfree(memdesc->sg); memset(memdesc, 0, sizeof(*memdesc)); } @@ -739,33 +686,3 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); - -/* - * kgsl_sharedmem_map_vma - Map a user vma to physical memory - * - * @vma - The user vma to map - * @memdesc - The memory descriptor which contains information about the - * physical memory - * - * Return: 0 on success else error code - */ -int -kgsl_sharedmem_map_vma(struct vm_area_struct *vma, - const struct kgsl_memdesc *memdesc) -{ - unsigned long addr = vma->vm_start; - unsigned long size = vma->vm_end - vma->vm_start; - int ret, i = 0; - - if (!memdesc->sg || (size != memdesc->size) || - (memdesc->sglen != (size / PAGE_SIZE))) - return -EINVAL; - - for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { - ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); - if (ret) - return ret; - } - return 0; -} -EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100644 new mode 100755 index a67d9c657..67a1c2d7b --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,8 +17,6 @@ #include #include #include "kgsl_mmu.h" -#include -#include struct kgsl_device; struct kgsl_process_private; @@ -30,12 
+28,19 @@ struct kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); +}; + +extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; -int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -71,58 +76,19 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); -static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) -{ - /* - * Try sg_dma_address first to support ion carveout - * regions which do not work with sg_phys(). - */ - unsigned int pa = sg_dma_address(sg); - if (pa == 0) - pa = sg_phys(sg); - return pa; -} - -int -kgsl_sharedmem_map_vma(struct vm_area_struct *vma, - const struct kgsl_memdesc *memdesc); - -/* - * For relatively small sglists, it is preferable to use kzalloc - * rather than going down the vmalloc rat hole. If the size of - * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to - * vmalloc - */ - -static inline void *kgsl_sg_alloc(unsigned int sglen) -{ - if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) - return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); - else - return vmalloc(sglen * sizeof(struct scatterlist)); -} - -static inline void kgsl_sg_free(void *ptr, unsigned int sglen) -{ - if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); -} - static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - memdesc->sg = kgsl_sg_alloc(1); + struct page *page = phys_to_page(physaddr); - kmemleak_not_leak(memdesc->sg); + memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); + if (memdesc->sg == NULL) + return -ENOMEM; memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - memdesc->sg[0].length = size; - memdesc->sg[0].offset = 0; - memdesc->sg[0].dma_address = physaddr; + sg_set_page(&memdesc->sg[0], page, size, 0); return 0; } @@ -132,7 +98,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); + return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); } static inline int @@ -143,7 +109,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 394bc83bd..72df148bc 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,6 +10,7 @@ * GNU General Public License for more details. 
*/ +#include #include #include #include @@ -282,12 +283,6 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; - /* log buffer info to aid in ramdump recovery */ - KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", - device->snapshot, __pa(device->snapshot), - device->snapshot_size); - if (hang) - sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -437,7 +432,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); + device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); if (device->snapshot == NULL) return -ENOMEM; @@ -480,7 +475,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - kfree(device->snapshot); + vfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100644 new mode 100755 index d721a577a..cb3da9075 --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,6 +157,13 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, +#ifdef CONFIG_HAS_EARLYSUSPEND + .display_off = { + .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, + .suspend = kgsl_early_suspend_driver, + .resume = kgsl_late_resume_driver, + }, +#endif }, }; @@ -188,6 +195,13 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, + .display_off = { +#ifdef CONFIG_HAS_EARLYSUSPEND + .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, + .suspend = kgsl_early_suspend_driver, + .resume = kgsl_late_resume_driver, +#endif + }, }, }; @@ -393,7 +407,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_mem_entry *entry = NULL; + struct kgsl_memdesc tmp = {0}; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -411,30 +425,8 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - /* - * Get a kernel mapping to the IB for monkey patching. - * See the end of this function. - */ - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, - sizedwords); - if (entry == NULL) { - KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", - cmd, sizedwords); - result = -EINVAL; - goto error; - } - /* - * This will only map memory if it exists, otherwise it will reuse the - * mapping. And the 2d userspace reuses IBs so we likely won't create - * too many mappings. 
- */ - if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { - KGSL_DRV_ERR(device, - "Cannot make kernel mapping for gpuaddr 0x%x\n", - cmd); - result = -EINVAL; - goto error; - } + + tmp.hostptr = (void *)*timestamp; KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -476,13 +468,12 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - /* monkey patch the IB so that it jumps back to the ringbuffer */ - kgsl_sharedmem_writel(&entry->memdesc, - ((sizedwords + 1) * sizeof(unsigned int)), - nextaddr); - kgsl_sharedmem_writel(&entry->memdesc, - ((sizedwords + 2) * sizeof(unsigned int)), - nextcnt); + tmp.hostptr = (void *)(tmp.hostptr + + (sizedwords * sizeof(unsigned int))); + tmp.size = 12; + + kgsl_sharedmem_writel(&tmp, 4, nextaddr); + kgsl_sharedmem_writel(&tmp, 8, nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c old mode 100755 new mode 100644 diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100644 new mode 100755 index 7837bad21..a1d267893 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,16 +34,6 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 -/* - * Reset status values for context - */ -enum kgsl_ctx_reset_stat { - KGSL_CTX_STAT_NO_ERROR = 0x00000000, - KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, - KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, - KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 -}; - #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -120,7 +110,6 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, - KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -157,7 +146,9 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From af9707f1751b077c4f87e7286da00e1fe60aed6a Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:38:10 +0200 Subject: [PATCH 15/19] add gitignore --- .gitignore | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..8faa6c02b --- /dev/null +++ b/.gitignore @@ -0,0 +1,79 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. 
+# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +modules.builtin +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.lzo +*.patch +*.gcno + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +/Module.symvers + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +# +# Generated include files +# +include/config +include/linux/version.h +include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# From a2dec0b448e716236a9d81022d5b029a3d5bd60f Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 17:23:23 +0200 Subject: [PATCH 16/19] updated and fixed lag on Atmel TS --- .../touchscreen/atmel_i2c_rmi_QT602240.c | 167 ++++++------------ 1 file changed, 50 insertions(+), 117 deletions(-) diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c index 9a931c0f3..bbb8a64fc 100644 --- a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c +++ b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c @@ -323,8 +323,7 @@ static u8 atmel_timer = 0; #define DISABLE 0 /* < DTS2011062404739 cuiyu 20110624 begin */ -static uint32_t resume_time = 0; -static u8 cal_check_flag = 1; +static u8 cal_check_flag = 0; /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -835,7 +834,7 @@ int write_power_config(int on) /* < DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 1) = 16; //0xff//Active Acquisition + *(tmp + 1) = 14; //0xff//Active Acquisition /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2011042106137 zhangtao 20110509 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -1062,7 +1061,7 @@ int write_multitouchscreen_config(u8 instance,int flag) *(tmp + 11) = 3; //movhysti /* < DTS2011042106137 zhangtao 20110509 begin */ /* make the point report every pix */ - *(tmp + 12) = 1; //movhystn + *(tmp + 12) = 3; //movhystn /* DTS2011042106137 zhangtao 20110509 end > */ *(tmp + 13) = 0;//0x2e; //movfilter *(tmp + 14) = 2; //numtouch @@ -1207,7 +1206,7 @@ int write_gripfacesuppression_config(u8 instance) *(tmp + 5) = 0; //maxtchs *(tmp + 6) = 0; //reserved *(tmp + 7) = 80; //szthr1 - *(tmp + 8) = 20; //szthr2 + *(tmp + 8) = 40; //szthr2 *(tmp + 9) = 4; //shpthr1 *(tmp + 10) = 35; //shpthr2 *(tmp + 11) = 10; //supextto @@ -1273,7 +1272,7 @@ int write_noisesuppression_config(u8 instance) *(tmp + 6) = 0xff; //GCAFLL *(tmp + 7) = 4; //actvgcafvalid /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 8) = 30; //noisethr + *(tmp + 8) = 20; //noisethr /* DTS2011062404739 cuiyu 20110624 end > */ *(tmp + 9) = 0; //reserved *(tmp + 10) = 0; //freqhopscale @@ -1890,7 +1889,15 @@ void check_chip_calibration(void) /* Process counters and decide if cal was good or if we must re-calibrate. */ /* < DTS2011062404739 cuiyu 20110624 begin */ /* check error */ - if(atch_ch > 0) + if((tch_ch) && (atch_ch == 0)) + { + /* Calibration may be good */ + cal_maybe_good(); + TS_DEBUG_TS("the func cal_maybe_good is used!\n"); + } + /* CAL_THR is configurable. A starting value of 10 to 20 is suggested. + * * This can then be tuned for the particular design. 
*/ + else if((tch_ch - 25) <= atch_ch && (tch_ch || atch_ch)) /* DTS2011062404739 cuiyu 20110624 end > */ { /* Calibration was bad - must recalibrate and check afterwards. */ @@ -1915,46 +1922,48 @@ void check_chip_calibration(void) } /* < DTS2011062404739 cuiyu 20110624 begin */ /* check point */ -static int check_too_many_point(int num_i, int *x_record) -{ - - while(num_i > 0) - { - if((x_record[num_i] >= x_record[0] - 2) && (x_record[num_i] <= x_record[0] + 2)) - { - num_i--; - continue; - } - else - { - return 1; // no too many point - } - } - return -1; -} /* DTS2011062404739 cuiyu 20110624 end > */ void cal_maybe_good(void) { int ret; - /* < DTS2011062404739 cuiyu 20110624 begin */ - uint8_t data = 1u; - /* shut down */ - if(cal_check_flag == 0) + /* Check if the timer is enabled */ + if(atmel_timer == ENABLE) { - /* shut down calibration */ - if(1 == write_acquisition_config(0, 0)) + TS_DEBUG_TS("cal_maybe_good: the current time is %lu\n", jiffies); + if((jiffies - timer_tick) /10 > 5) /* Check if the timer timedout of 0.5seconds */ { - /* Acquisition config write failed!\n */ - TS_DEBUG_TS("\n[ERROR] line : %d\n", __LINE__); + /* Cal was good - don't need to check any more */ + cal_check_flag = 0; + /* Disable the timer */ + atmel_timer = DISABLE; + timer_tick = 0; + /* Write back the normal acquisition config to chip. */ + if (1 == write_acquisition_config(0,0)) + { + /* "Acquisition config write failed!\n" */ + + printk("\n[ERROR] line : %d\n", __LINE__); + } + + ret = write_multitouchscreen_config(0,1); + + printk("the cal_maybe_good is ok! the ret is %d\n",ret); + } + else + { + cal_check_flag = 1u; + TS_DEBUG_TS("the time is not yet!\n"); } - ret = write_multitouchscreen_config(0, 1); - msleep(50); - ret = write_mem(command_processor_address + CALIBRATE_OFFSET, 1, &data); - TS_DEBUG_TS("the cal_maybe_good is ok! 
the ret is %d\n", ret); } - /* DTS2011062404739 cuiyu 20110624 end > */ + else + { + /* Timer not enabled, so enable it */ + atmel_timer = ENABLE; // enable for 100ms timer + timer_tick = jiffies; + cal_check_flag = 1u; + TS_DEBUG_TS("the cal_maybe_good is enable time!\n"); + } } -/* DTS2010083103149 zhangtao 20100909 end > */ /* < DTS2010062400225 zhangtao 20100624 begin */ static int atmel_ts_initchip(void) @@ -2077,15 +2086,6 @@ static void atmel_ts_work_func(struct work_struct *work) static char first_point_id = 1; static int point_1_x; static int point_1_y; - /* < DTS2011062404739 cuiyu 20110624 begin */ - static int first_in_point = 0; - static int point_1_x_first_down; - static int point_1_y_first_down; - static int num_1; - static int num_2; - static int x_record1[10]; - static int x_record2[5]; - /* DTS2011062404739 cuiyu 20110624 end > */ static int point_1_amplitude; static int point_1_width; static int point_2_x; @@ -2194,35 +2194,6 @@ static void atmel_ts_work_func(struct work_struct *work) point_1_y = ts->touch_y; point_1_amplitude = ts->touchamplitude; point_1_width = ts->sizeoftouch; - /* < DTS2011062404739 cuiyu 20110624 begin */ - /* record point */ - if((cal_check_flag != 0) && !(first_in_point)) - { - first_in_point = 1; - num_1 = 0; - point_1_x_first_down = point_1_x; - point_1_y_first_down = point_1_y; - } - - /* timeout or not */ - if(jiffies - resume_time < 6000) - { - x_record1[num_1] = point_1_x; - if(num_1 >= 9) - { - /* check point */ - if(check_too_many_point(num_1, x_record1) == -1) - { - cal_check_flag = 1; - } - num_1 = 0; - } - else - { - num_1++; - } - } - /* DTS2011062404739 cuiyu 20110624 end > */ } else { @@ -2231,46 +2202,12 @@ static void atmel_ts_work_func(struct work_struct *work) point_2_y = ts->touch_y; point_2_amplitude = ts->touchamplitude; point_2_width = ts->sizeoftouch; - /* < DTS2011062404739 cuiyu 20110624 begin */ - /* timeout or not */ - if(jiffies - resume_time < 6000) - { - x_record2[num_2] = point_2_x; - if(num_2 >= 4) - { - /* check point */ - if(check_too_many_point(num_2, x_record2) == -1) - { - cal_check_flag = 1; - } - num_2 = 0; - } - else - { - num_2++; - } - } - /* DTS2011062404739 cuiyu 20110624 end > */ } } else { if(1 == point_index) { - /* < DTS2011062404739 cuiyu 20110624 begin */ - if(cal_check_flag == 1 && (second_point_pressed == FALSE)) - { - if(((abs(ts->touch_x - point_1_x_first_down) > 100 || abs(ts->touch_y - point_1_y_first_down) > 100) - || jiffies - resume_time > 6000)) - { - /* it is all good */ - cal_maybe_good(); - cal_check_flag = 0; - } - first_in_point = 0; - } - /* DTS2011062404739 cuiyu 20110624 end > */ - /*if index-1 released, index-2 point remains working*/ first_point_id = 2; } @@ -2577,7 +2514,7 @@ static int atmel_ts_probe( goto err_power_on_failed; /* */ if (ret) goto err_power_on_failed; @@ -2949,7 +2886,8 @@ goto succeed_find_device; { /* < DTS2011052101089 shenjinming 20110521 begin */ /* can't use the flag ret here, it will change the return value of probe function */ - vreg_disable(v_gp4); + ret = vreg_disable(v_gp4); + printk(KERN_ERR "the atmel's power is off: gp4 = %d \n ", ret); /* delete a line */ /* DTS2011052101089 shenjinming 20110521 end > */ } @@ -3021,11 +2959,6 @@ static int atmel_ts_resume(struct i2c_client *client) write_power_config(1); /* < DTS2010083103149 zhangtao 20100909 begin */ calibrate_chip_error(); -/* DTS2010083103149 zhangtao 20100909 end > */ - /* < DTS2011062404739 cuiyu 20110624 begin */ - cal_check_flag = 1; - resume_time = jiffies; - /* DTS2011062404739 cuiyu 
20110624 end > */ if (ts->use_irq) { enable_irq(client->irq); From 89abf031a76bb8afe836b088a1ebe50fd846ec79 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 18:38:20 +0200 Subject: [PATCH 17/19] Add BFQv5 --- arch/arm/configs/u8800_defconfig | 6 +- block/Kconfig.iosched | 26 + block/Makefile | 1 + block/bfq-cgroup.c | 831 ++++++++ block/bfq-ioc.c | 380 ++++ block/bfq-iosched.c | 3047 ++++++++++++++++++++++++++++++ block/bfq-sched.c | 1066 +++++++++++ block/bfq.h | 595 ++++++ block/blk-ioc.c | 29 +- block/cfq-iosched.c | 10 +- fs/ioprio.c | 7 +- include/linux/cgroup_subsys.h | 6 + include/linux/iocontext.h | 18 +- 13 files changed, 6000 insertions(+), 22 deletions(-) create mode 100644 block/bfq-cgroup.c create mode 100644 block/bfq-ioc.c create mode 100644 block/bfq-iosched.c create mode 100644 block/bfq-sched.c create mode 100644 block/bfq.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index e252fda97..fbf77b32f 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -172,11 +172,13 @@ CONFIG_LBDAF=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y +# CONFIG_DEFAULT_NOOP is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_NOOP=y -CONFIG_DEFAULT_IOSCHED="noop" +CONFIG_IOSCHED_BFQ=y +CONFIG_DEFAULT_BFQ=y +CONFIG_DEFAULT_IOSCHED="bfq" # CONFIG_INLINE_SPIN_TRYLOCK is not set # CONFIG_INLINE_SPIN_TRYLOCK_BH is not set # CONFIG_INLINE_SPIN_LOCK is not set diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f7..bceaaecef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,6 +43,28 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + depends on EXPERIMENTAL + default n + ---help--- + The BFQ I/O scheduler tries to distribute bandwidth among + all processes according to their weights. + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +config CGROUP_BFQIO + bool "BFQ hierarchical scheduling support" + depends on CGROUPS && IOSCHED_BFQ=y + default n + ---help--- + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. 
+ choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -53,6 +75,9 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y + config DEFAULT_BFQ + bool "BFQ" if IOSCHED_BFQ=y + config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y @@ -65,6 +90,7 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ + default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff --git a/block/Makefile b/block/Makefile index 0fec4b3fa..a3cf79cc0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c new file mode 100644 index 000000000..74ae73b91 --- /dev/null +++ b/block/bfq-cgroup.c @@ -0,0 +1,831 @@ +/* + * BFQ: CGROUPS support. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + */ + +#ifdef CONFIG_CGROUP_BFQIO +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; + +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; +} + +static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), + struct bfqio_cgroup, css); +} + +/* + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) +{ + struct bfq_group *bfqg; + struct hlist_node *n; + void *key; + + hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } + + return NULL; +} + +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqg->entity; + + entity->weight = entity->new_weight = bgrp->weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio = bgrp->ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->ioprio_changed = 1; + entity->my_sched_data = &bfqg->sched_data; +} + +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(parent == NULL); + BUG_ON(bfqg == NULL); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @cgroup: the leaf cgroup this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. 
Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. + */ +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; + + for (; cgroup != NULL; cgroup = cgroup->parent) { + bgrp = cgroup_to_bfqio(cgroup); + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } + + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; + + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; + + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be initialized + * only after the node will be connected. + */ + prev->bfqd = bfqg; + prev = bfqg; + } + } + + return leaf; + +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } + + return NULL; +} + +/** + * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. + * @bfqd: the queue descriptor. + * @cgroup: the leaf cgroup to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. + */ +static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, + struct bfq_group *leaf) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(bfqd->queue->queue_lock); + + for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { + bgrp = cgroup_to_bfqio(cgroup); + next = leaf->bfqd; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); + + spin_lock_irqsave(&bgrp->lock, flags); + + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); + + spin_unlock_irqrestore(&bgrp->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(cgroup == NULL && leaf != NULL); + if (cgroup != NULL && prev != NULL) { + bgrp = cgroup_to_bfqio(cgroup); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } +} + +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallbak. If this loss becames a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. 
+ * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); + struct bfq_group *bfqg; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; + + bfqg = bfq_group_chain_alloc(bfqd, cgroup); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, cgroup, bfqg); + else + bfqg = bfqd->root_group; + + return bfqg; +} + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @entity: @bfqq's entity. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_entity *entity, struct bfq_group *bfqg) +{ + int busy, resume; + + busy = bfq_bfqq_busy(bfqq); + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + + BUG_ON(resume && !entity->on_st); + BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); + + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); + + if (!resume) + bfq_del_bfqq_busy(bfqd, bfqq, 0); + else + bfq_deactivate_bfqq(bfqd, bfqq, 0); + } else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. + */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); +} + +/** + * __bfq_cic_change_cgroup - move @cic to @cgroup. + * @bfqd: the queue descriptor. + * @cic: the cic to move. + * @cgroup: the cgroup to move to. + * + * Move cic to cgroup, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd, + struct cfq_io_context *cic, + struct cgroup *cgroup) +{ + struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0); + struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1); + struct bfq_entity *entity; + struct bfq_group *bfqg; + + bfqg = bfq_find_alloc_group(bfqd, cgroup); + if (async_bfqq != NULL) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + cic_set_bfqq(cic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "cic_change_group: %p %d", + async_bfqq, atomic_read(&async_bfqq->ref)); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq != NULL) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + } + + return bfqg; +} + +/** + * bfq_cic_change_cgroup - move @cic to @cgroup. + * @cic: the cic being migrated. + * @cgroup: the destination cgroup. 
+ * + * When the task owning @cic is moved to @cgroup, @cic is immediately + * moved into its new parent group. + */ +static void bfq_cic_change_cgroup(struct cfq_io_context *cic, + struct cgroup *cgroup) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (bfqd != NULL && + !strncmp(bfqd->queue->elevator->elevator_type->elevator_name, + "bfq", ELV_NAME_MAX)) { + __bfq_cic_change_cgroup(bfqd, cic, cgroup); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_cic_update_cgroup - update the cgroup of @cic. + * @cic: the @cic to update. + * + * Make sure that @cic is enqueued in the cgroup of the current task. + * We need this in addition to moving cics during the cgroup attach + * phase because the task owning @cic could be at its first disk + * access or we may end up in the root cgroup as the result of a + * memory allocation failure and here we try to move to the right + * group. + * + * Must be called under the queue lock. It is safe to use the returned + * value even after the rcu_read_unlock() as the migration/destruction + * paths act under the queue lock too. IOW it is impossible to race with + * group migration/destruction and end up with an invalid group as: + * a) here cgroup has not yet been destroyed, nor its destroy callback + * has started execution, as current holds a reference to it, + * b) if it is destroyed after rcu_read_unlock() [after current is + * migrated to a different cgroup] its attach() callback will have + * taken care of remove all the references to the old cgroup data. + */ +static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic) +{ + struct bfq_data *bfqd = cic->key; + struct bfq_group *bfqg; + struct cgroup *cgroup; + + BUG_ON(bfqd == NULL); + + rcu_read_lock(); + cgroup = task_cgroup(current, bfqio_subsys_id); + bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup); + rcu_read_unlock(); + + return bfqg; +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity != NULL; entity = st->first_idle) + __bfq_deactivate_entity(entity, 0); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(bfqq == NULL); + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + return; +} + +/** + * bfq_reparent_active_entities - move to the root group all active entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. + * + * Needs queue_lock to be taken and reference to be valid over the call. 
+ */ +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; + + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + + for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + + if (bfqg->sched_data.active_entity != NULL) + bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); + + return; +} + +/** + * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. + * @bfqg: the group being destroyed. + * + * Destroy @bfqg, making sure that it is not referenced from its parent. + */ +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) +{ + struct bfq_data *bfqd; + struct bfq_service_tree *st; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); + int i; + + hlist_del(&bfqg->group_node); + + /* + * Empty all service_trees belonging to this group before deactivating + * the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. Noone else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * under service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } + BUG_ON(bfqg->sched_data.next_active != NULL); + BUG_ON(bfqg->sched_data.active_entity != NULL); + + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); + + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that noone is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); +} + +/** + * bfq_disconnect_groups - diconnect @bfqd from all its groups. + * @bfqd: the device descriptor being exited. + * + * When the device exits we just make sure that no lookup can return + * the now unused group structures. They will be deallocated on cgroup + * destruction. 
+ */ +static void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + struct hlist_node *pos, *n; + struct bfq_group *bfqg; + + bfq_log(bfqd, "disconnect_groups beginning") ; + hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { + hlist_del(&bfqg->bfqd_node); + + __bfq_deactivate_entity(bfqg->my_entity, 0); + + /* + * Don't remove from the group hash, just set an + * invalid key. No lookups can race with the + * assignment as bfqd is being destroyed; this + * implies also that new elements cannot be added + * to the list. + */ + rcu_assign_pointer(bfqg->bfqd, NULL); + + bfq_log(bfqd, "disconnect_groups: put async for group %p", + bfqg) ; + bfq_put_async_queues(bfqd, bfqg); + } +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); + + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); + + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. + */ + kfree(bfqg); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp; \ + u64 ret; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + bgrp = cgroup_to_bfqio(cgroup); \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ + cgroup_unlock(); \ + \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp; \ + struct bfq_group *bfqg; \ + struct hlist_node *n; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return -EINVAL; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + bgrp = cgroup_to_bfqio(cgroup); \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ + cgroup_unlock(); \ + \ + return 0; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = 
bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, +}; + +static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, bfqio_files, + ARRAY_SIZE(bfqio_files)); +} + +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp; + + if (cgroup->parent != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; + + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too young or + * exiting: if it has still no ioc the ioc can't be shared, + * if the task is exiting the attach will fail anyway, no + * matter what we return here. + */ + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk) +{ + struct io_context *ioc; + struct cfq_io_context *cic; + struct hlist_node *n; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL) { + BUG_ON(atomic_long_read(&ioc->refcount) == 0); + atomic_long_inc(&ioc->refcount); + } + task_unlock(tsk); + + if (ioc == NULL) + return; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) + bfq_cic_change_cgroup(cic, cgroup); + rcu_read_unlock(); + + put_io_context(ioc); +} + +static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); + struct hlist_node *n, *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. 
+ */ + hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); +} + +struct cgroup_subsys bfqio_subsys = { + .name = "bfqio", + .create = bfqio_create, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .destroy = bfqio_destroy, + .populate = bfqio_populate, + .subsys_id = bfqio_subsys_id, +}; +#else +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->sched_data = &bfqg->sched_data; +} + +static inline struct bfq_group * +bfq_cic_update_cgroup(struct cfq_io_context *cic) +{ + struct bfq_data *bfqd = cic->key; + return bfqd->root_group; +} + +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) +{ +} + +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + bfq_put_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + kfree(bfqd->root_group); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c new file mode 100644 index 000000000..01f831332 --- /dev/null +++ b/block/bfq-ioc.c @@ -0,0 +1,380 @@ +/* + * BFQ: I/O context handling. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +/** + * bfq_cic_free_rcu - deferred cic freeing. + * @head: RCU head of the cic to free. + * + * Free the cic containing @head and, if it was the last one and + * the module is exiting wake up anyone waiting for its deallocation + * (see bfq_exit()). + */ +static void bfq_cic_free_rcu(struct rcu_head *head) +{ + struct cfq_io_context *cic; + + cic = container_of(head, struct cfq_io_context, rcu_head); + + kmem_cache_free(bfq_ioc_pool, cic); + elv_ioc_count_dec(bfq_ioc_count); + + if (bfq_ioc_gone != NULL) { + spin_lock(&bfq_ioc_gone_lock); + if (bfq_ioc_gone != NULL && + !elv_ioc_count_read(bfq_ioc_count)) { + complete(bfq_ioc_gone); + bfq_ioc_gone = NULL; + } + spin_unlock(&bfq_ioc_gone_lock); + } +} + +static void bfq_cic_free(struct cfq_io_context *cic) +{ + call_rcu(&cic->rcu_head, bfq_cic_free_rcu); +} + +/** + * cic_free_func - disconnect a cic ready to be freed. + * @ioc: the io_context @cic belongs to. + * @cic: the cic to be freed. + * + * Remove @cic from the @ioc radix tree hash and from its cic list, + * deferring the deallocation of @cic to the end of the current RCU + * grace period. This assumes that __bfq_exit_single_io_context() + * has already been called for @cic. 
+ */ +static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) +{ + unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; + + BUG_ON(!(dead_key & CIC_DEAD_KEY)); + + spin_lock_irqsave(&ioc->lock, flags); + radix_tree_delete(&ioc->bfq_radix_root, + dead_key >> CIC_DEAD_INDEX_SHIFT); + hlist_del_init_rcu(&cic->cic_list); + spin_unlock_irqrestore(&ioc->lock, flags); + + bfq_cic_free(cic); +} + +static void bfq_free_io_context(struct io_context *ioc) +{ + /* + * ioc->refcount is zero here, or we are called from elv_unregister(), + * so no more cic's are allowed to be linked into this ioc. So it + * should be ok to iterate over the known list, we will see all cic's + * since no new ones are added. + */ + call_for_each_cic(ioc, cic_free_func); +} + +/** + * __bfq_exit_single_io_context - deassociate @cic from any running task. + * @bfqd: bfq_data on which @cic is valid. + * @cic: the cic being exited. + * + * Whenever no more tasks are using @cic or @bfqd is deallocated we + * need to invalidate its entry in the radix tree hash table and to + * release the queues it refers to. + * + * Called under the queue lock. + */ +static void __bfq_exit_single_io_context(struct bfq_data *bfqd, + struct cfq_io_context *cic) +{ + struct io_context *ioc = cic->ioc; + + list_del_init(&cic->queue_list); + + /* + * Make sure dead mark is seen for dead queues + */ + smp_wmb(); + rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd)); + + /* + * No write-side locking as no task is using @ioc (they're exited + * or bfqd is being deallocated. + */ + rcu_read_lock(); + if (rcu_dereference(ioc->ioc_data) == cic) { + rcu_read_unlock(); + spin_lock(&ioc->lock); + rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } else + rcu_read_unlock(); + + if (cic->cfqq[BLK_RW_ASYNC] != NULL) { + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]); + cic->cfqq[BLK_RW_ASYNC] = NULL; + } + + if (cic->cfqq[BLK_RW_SYNC] != NULL) { + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]); + cic->cfqq[BLK_RW_SYNC] = NULL; + } +} + +/** + * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version). + * @ioc: the io_context @cic belongs to. + * @cic: the cic being exited. + * + * Take the queue lock and call __bfq_exit_single_io_context() to do the + * rest of the work. We take care of possible races with bfq_exit_queue() + * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism). + */ +static void bfq_exit_single_io_context(struct io_context *ioc, + struct cfq_io_context *cic) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (bfqd != NULL) { + __bfq_exit_single_io_context(bfqd, cic); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_exit_io_context - deassociate @ioc from all cics it owns. + * @ioc: the @ioc being exited. + * + * No more processes are using @ioc we need to clean up and put the + * internal structures we have that belongs to that process. Loop + * through all its cics, locking their queues and exiting them. 
+ */ +static void bfq_exit_io_context(struct io_context *ioc) +{ + call_for_each_cic(ioc, bfq_exit_single_io_context); +} + +static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, + gfp_t gfp_mask) +{ + struct cfq_io_context *cic; + + cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, + bfqd->queue->node); + if (cic != NULL) { + cic->last_end_request = jiffies; + INIT_LIST_HEAD(&cic->queue_list); + INIT_HLIST_NODE(&cic->cic_list); + cic->dtor = bfq_free_io_context; + cic->exit = bfq_exit_io_context; + elv_ioc_count_inc(bfq_ioc_count); + } + + return cic; +} + +/** + * bfq_drop_dead_cic - free an exited cic. + * @bfqd: bfq data for the device in use. + * @ioc: io_context owning @cic. + * @cic: the @cic to free. + * + * We drop cfq io contexts lazily, so we may find a dead one. + */ +static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, + struct cfq_io_context *cic) +{ + unsigned long flags; + + WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != bfqd_dead_key(bfqd)); + + spin_lock_irqsave(&ioc->lock, flags); + + BUG_ON(ioc->ioc_data == cic); + + /* + * With shared I/O contexts two lookups may race and drop the + * same cic more than one time: RCU guarantees that the storage + * will not be freed too early, here we make sure that we do + * not try to remove the cic from the hashing structures multiple + * times. + */ + if (!hlist_unhashed(&cic->cic_list)) { + radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index); + hlist_del_init_rcu(&cic->cic_list); + bfq_cic_free(cic); + } + + spin_unlock_irqrestore(&ioc->lock, flags); +} + +/** + * bfq_cic_lookup - search into @ioc a cic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * + * If @ioc already has a cic associated to @bfqd return it, return %NULL + * otherwise. + */ +static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ + struct cfq_io_context *cic; + unsigned long flags; + void *k; + + if (unlikely(ioc == NULL)) + return NULL; + + rcu_read_lock(); + + /* We maintain a last-hit cache, to avoid browsing over the tree. */ + cic = rcu_dereference(ioc->ioc_data); + if (cic != NULL) { + k = rcu_dereference(cic->key); + if (k == bfqd) + goto out; + } + + do { + cic = radix_tree_lookup(&ioc->bfq_radix_root, + bfqd->cic_index); + if (cic == NULL) + goto out; + + k = rcu_dereference(cic->key); + if (unlikely(k != bfqd)) { + rcu_read_unlock(); + bfq_drop_dead_cic(bfqd, ioc, cic); + rcu_read_lock(); + continue; + } + + spin_lock_irqsave(&ioc->lock, flags); + rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); + break; + } while (1); + +out: + rcu_read_unlock(); + + return cic; +} + +/** + * bfq_cic_link - add @cic to @ioc. + * @bfqd: bfq_data @cic refers to. + * @ioc: io_context @cic belongs to. + * @cic: the cic to link. + * @gfp_mask: the mask to use for radix tree preallocations. + * + * Add @cic to @ioc, using @bfqd as the search key. This enables us to + * lookup the process specific cfq io context when entered from the block + * layer. Also adds @cic to a per-bfqd list, used when this queue is + * removed. + */ +static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, + struct cfq_io_context *cic, gfp_t gfp_mask) +{ + unsigned long flags; + int ret; + + ret = radix_tree_preload(gfp_mask); + if (ret == 0) { + cic->ioc = ioc; + + /* No write-side locking, cic is not published yet. 
*/ + rcu_assign_pointer(cic->key, bfqd); + + spin_lock_irqsave(&ioc->lock, flags); + ret = radix_tree_insert(&ioc->bfq_radix_root, + bfqd->cic_index, cic); + if (ret == 0) + hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); + spin_unlock_irqrestore(&ioc->lock, flags); + + radix_tree_preload_end(); + + if (ret == 0) { + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + list_add(&cic->queue_list, &bfqd->cic_list); + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + } + } + + if (ret != 0) + printk(KERN_ERR "bfq: cic link failed!\n"); + + return ret; +} + +/** + * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc. + * @ioc: the io_context changing its priority. + */ +static inline void bfq_ioc_set_ioprio(struct io_context *ioc) +{ + call_for_each_cic(ioc, bfq_changed_ioprio); +} + +/** + * bfq_get_io_context - return the @cic associated to @bfqd in @ioc. + * @bfqd: the search key. + * @gfp_mask: the mask to use for cic allocation. + * + * Setup general io context and cfq io context. There can be several cfq + * io contexts per general io context, if this process is doing io to more + * than one device managed by cfq. + */ +static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd, + gfp_t gfp_mask) +{ + struct io_context *ioc = NULL; + struct cfq_io_context *cic; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + ioc = get_io_context(gfp_mask, bfqd->queue->node); + if (ioc == NULL) + return NULL; + + /* Lookup for an existing cic. */ + cic = bfq_cic_lookup(bfqd, ioc); + if (cic != NULL) + goto out; + + /* Alloc one if needed. */ + cic = bfq_alloc_io_context(bfqd, gfp_mask); + if (cic == NULL) + goto err; + + /* Link it into the ioc's radix tree and cic list. */ + if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) + goto err_free; + +out: + /* + * test_and_clear_bit() implies a memory barrier, paired with + * the wmb() in fs/ioprio.c, so the value seen for ioprio is the + * new one. + */ + if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED, + ioc->ioprio_changed))) + bfq_ioc_set_ioprio(ioc); + + return cic; +err_free: + bfq_cic_free(cic); +err: + put_io_context(ioc); + return NULL; +} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000..9f261ee60 --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,3047 @@ +/* + * BFQ, or Budget Fair Queueing, disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + * + * BFQ is a proportional share disk scheduling algorithm based on the + * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to tasks instead of time slices. + * The disk is not granted to the active task for a given time slice, + * but until it has exahusted its assigned budget. This change from + * the time to the service domain allows BFQ to distribute the disk + * bandwidth among tasks as desired, without any distortion due to + * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc + * internal scheduler, called B-WF2Q+, to schedule tasks according to + * their budgets. Thanks to this accurate scheduler, BFQ can afford + * to assign high budgets to disk-bound non-seeky tasks (to boost the + * throughput), and yet guarantee low latencies to interactive and + * soft real-time applications. 
+ *
+ * BFQ has been introduced in [1], where the interested reader can
+ * find an accurate description of the algorithm, the bandwidth
+ * distribution and latency guarantees it provides, plus formal proofs
+ * of all the properties. With respect to the algorithm presented in
+ * the paper, this implementation adds several little heuristics, and
+ * a hierarchical extension, based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
+ * with Deterministic Guarantees on Bandwidth Distribution,''
+ * IEEE Transactions on Computers, May 2010.
+ *
+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+
+/* Max number of dispatches in one round of service. */
+static const int bfq_quantum = 4;
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 1;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = 0;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below.
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+struct kmem_cache *bfq_ioc_pool;
+
+static DEFINE_PER_CPU(unsigned long, bfq_ioc_count);
+static struct completion *bfq_ioc_gone;
+static DEFINE_SPINLOCK(bfq_ioc_gone_lock);
+
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT 2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES 32
+
+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES 32
+
+/* Shift used for peak rate fixed precision calculations.
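+ * For example, with BFQ_RATE_SHIFT == 16 a stored value of 1 << 16
+ * (65536) represents a rate of one sector per usec; the reference
+ * rates R_rot and R_nonrot below are expressed in the same fixed
+ * point format (e.g. 17415 / 65536 is roughly 0.27 sectors/usec).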
*/ +#define BFQ_RATE_SHIFT 16 + +/* + * The duration of the weight raising for interactive applications is + * computed automatically (as default behaviour), using the following + * formula: duration = (R / r) * T, where r is the peak rate of the + * disk, and R and T are two reference parameters. In particular, R is + * the peak rate of a reference disk, and T is about the maximum time + * for starting popular large applications on that disk, under BFQ and + * while reading two files in parallel. Finally, BFQ uses two + * different pairs (R, T) depending on whether the disk is rotational + * or non-rotational. + */ +#define T_rot (msecs_to_jiffies(5500)) +#define T_nonrot (msecs_to_jiffies(2000)) +/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ +#define R_rot 17415 +#define R_nonrot 34791 + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_CIC(rq) \ + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_BFQQ(rq) ((rq)->elevator_private[1]) + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup.c" + +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * We regard a request as SYNC, if either it's a read or has the SYNC bit + * set (in which case it could also be a direct WRITE). + */ +static inline int bfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ + + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
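+ * As a sketch of the computation below: with the head at sector
+ * last, a request N sectors ahead gets distance d = N, a request
+ * N sectors behind (but within back_max) gets
+ * d = N * bfqd->bfq_back_penalty, and anything farther behind is
+ * treated as a wrapped request.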
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? 
bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev != NULL) + prev = rb_entry_rq(rbprev); + + if (rbnext != NULL) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +static void bfq_del_rq_rb(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } +} + +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * + bfq_async_charge_factor)); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->active_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. 
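+ * (The budget determines the entity's virtual finish time in
+ * B-WF2Q+, so changing it for an in-service entity would
+ * retroactively alter the timestamps it was selected with; we
+ * simply leave the budget alone in this case.)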
+ */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->active_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); + bfq_activate_bfqq(bfqd, bfqq); +} + +static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_raising_max_time > 0) + return bfqd->bfq_raising_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static void bfq_add_rq_rb(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_raising_coeff = bfqq->raising_coeff; + int idle_for_long_time = bfqq->budget_timeout + + bfqd->bfq_raising_min_idle_time < jiffies; + + bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && + bfqq->soft_rt_next_start < jiffies; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (! bfqd->low_latency) + goto add_bfqq_busy; + + /* + * If the queue is not being boosted and has been idle + * for enough time, start a weight-raising period + */ + if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { + bfqq->raising_coeff = bfqd->bfq_raising_coeff; + if (idle_for_long_time) + bfqq->raising_cur_max_time = + bfq_wrais_duration(bfqd); + else + bfqq->raising_cur_max_time = + bfqd->bfq_raising_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } else if (old_raising_coeff > 1) { + if (idle_for_long_time) + bfqq->raising_cur_max_time = + bfq_wrais_duration(bfqd); + else if (bfqq->raising_cur_max_time == + bfqd->bfq_raising_rt_max_time && + !soft_rt) { + bfqq->raising_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } + } + if (old_raising_coeff != bfqq->raising_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if(bfqd->low_latency && old_raising_coeff == 1 && + !rq_is_sync(rq) && + bfqq->last_rais_start_finish + + bfqd->bfq_raising_min_inter_arr_async < jiffies) { + bfqq->raising_coeff = bfqd->bfq_raising_coeff; + bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); + + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } + bfq_updated_next_req(bfqd, bfqq); + } + + if(bfqd->low_latency && + (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || + idle_for_long_time)) + bfqq->last_rais_start_finish = jiffies; +} + +static void bfq_reposition_rq_rb(struct 
bfq_queue *bfqq, struct request *rq) +{ + elv_rb_del(&bfqq->sort_list, rq); + bfqq->queued[rq_is_sync(rq)]--; + bfqq->bfqd->queued--; + bfq_add_rq_rb(rq); +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + cic = bfq_cic_lookup(bfqd, tsk->io_context); + if (cic == NULL) + return NULL; + + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); + if (bfqq != NULL) { + sector_t sector = bio->bi_sector + bio_sectors(bio); + + return elv_rb_find(&bfqq->sort_list, sector); + } + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + WARN_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + list_del_init(&rq->queuelist); + bfq_del_rq_rb(rq); + + if (rq->cmd_flags & REQ_META) { + WARN_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + + bfq_reposition_rq_rb(bfqq, req); + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * Reposition in fifo if next is older than rq. + */ + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(rq_fifo_time(next), rq_fifo_time(rq))) { + list_move(&rq->queuelist, &next->queuelist); + rq_set_fifo_time(rq, rq_fifo_time(next)); + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + /* Disallow merge of a sync bio into an async request. */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. 
+ */ + cic = bfq_cic_lookup(bfqd, current->io_context); + if (cic == NULL) + return 0; + + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_active_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->active_queue = bfqq; +} + +/* + * Get and set a new active queue for service. + */ +static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (!bfqq) + bfqq = bfq_get_next_queue(bfqd); + else + bfq_get_next_queue_forced(bfqd, bfqq); + + __bfq_set_active_queue(bfqd, bfqq); + return bfqq; +} + +static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, + struct request *rq) +{ + if (blk_rq_pos(rq) >= bfqd->last_position) + return blk_rq_pos(rq) - bfqd->last_position; + else + return bfqd->last_position - blk_rq_pos(rq); +} + +/* + * Return true if bfqq has no request pending and rq is close enough to + * bfqd->last_position, or if rq is closer to bfqd->last_position than + * bfqq->next_rq + */ +static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) +{ + return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + sector_t sector = bfqd->last_position; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by next_request + * position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close(bfqd, __bfqq->next_rq)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close(bfqd, __bfqq->next_rq)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself. + * + * We are assuming that cur_bfqq has dispatched at least one request, + * and that bfqd->last_position reflects a position on the disk associated + * with the I/O issued by cur_bfqq. + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. 
+ */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget / 32; +} + +/* + * Decides whether idling should be done for given device and + * given active queue. + */ +static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, + struct bfq_queue *active_bfqq) +{ + if (active_bfqq == NULL) + return false; + /* + * If device is SSD it has no seek penalty, disable idling; but + * do so only if: + * - device does not support queuing, otherwise we still have + * a problem with sync vs async workloads; + * - the queue is not weight-raised, to preserve guarantees. + */ + return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && + active_bfqq->raising_coeff == 1); +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->active_queue; + struct cfq_io_context *cic; + unsigned long sl; + + WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (bfq_queue_nonrot_noidle(bfqd, bfqq)) + return; + + /* Idling is disabled, either manually or by past process history. */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) + return; + + /* Tasks have exited, don't wait. */ + cic = bfqd->active_cic; + if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && + bfqq->entity.service > bfq_max_budget(bfqd) / 8 && + bfqq->raising_coeff == 1) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->raising_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the active queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). 
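+ *
+ * For example, with the weight-based scaling below, a queue whose
+ * weight is currently raised to three times its original weight gets
+ * timeout_coeff = 3 and hence a budget timeout three times as long
+ * as the base one (soft real-time raising is excluded from this
+ * scaling).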
+ */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->active_queue; + unsigned int timeout_coeff; + if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); +} + +/* + * Move request from internal lists to the request queue dispatch list. + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + bfq_remove_request(rq); + bfqq->dispatched++; + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (time_before(jiffies, rq_fifo_time(rq))) + return NULL; + + return rq; +} + +/* + * Must be called with the queue_lock held. + */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return; + + /* + * Merge in the direction of the lesser amount of work. 
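+ * E.g., if bfqq has 2 process references and new_bfqq has 5, bfqq
+ * is scheduled to be merged into new_bfqq and new_bfqq inherits
+ * bfqq's 2 references; in the opposite case the chain is set up in
+ * the other direction.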
+ */ + if (new_process_refs >= process_refs) { + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + } else { + new_bfqq->new_bfqq = bfqq; + atomic_add(new_process_refs, &bfqq->ref); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); +} + +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->active_queue); + + __bfq_bfqd_reset_active(bfqd); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * overloading budget_timeout field to store when + * the queue remains with no backlog, used by + * the weight-raising mechanism + */ + bfqq->budget_timeout = jiffies ; + } + else { + bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_rq_pos_tree_add(bfqd, bfqq); + } + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->active_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no requets of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still oustanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. 
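+ *
+ * Concretely, the code below shrinks the budget by
+ * 4 * min_budget (never below min_budget) when no request is
+ * outstanding, and doubles it (capped at bfqd->bfq_max_budget)
+ * when some still are.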
+ */ + if (bfqq->dispatched > 0) /* still oustanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. + */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. + */ + next_rq = bfqq->next_rq; + if (next_rq != NULL) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + else + bfqq->entity.budget = bfqq->max_budget; + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return 0; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return 0; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. 
We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. + */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update && bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, timeout); + bfq_log(bfqd, "new max_budget=%lu", + bfqd->bfq_max_budget); + } + } + + /* + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. + */ + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return 0; + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. 
Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int compensate, + enum bfqq_expiration reason) +{ + int slow; + BUG_ON(bfqq != bfqd->active_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * + * Processes doing IO in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. + */ + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + + if (bfqd->low_latency && bfqq->raising_coeff == 1) + bfqq->last_rais_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { + if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) + bfqq->soft_rt_next_start = + jiffies + + HZ * bfqq->entity.service / + bfqd->bfq_raising_max_softrt_rate; + else + bfqq->soft_rt_next_start = -1; /* infinity */ + } + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, + bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* Increase, decrease or leave budget unchanged according to reason */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq)) + return 0; + + if (time_before(jiffies, bfqq->budget_timeout)) + return 0; + + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp backshifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wr %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Select a queue for service. If we have a current active queue, + * check whether to continue servicing it, or retrieve and set a new one. 
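+ *
+ * In rough order, the checks below are: possibly merge with a close
+ * cooperator, expire on budget timeout, expire on budget exhaustion
+ * if the next request does not fit in the budget left, and finally
+ * expire for lack of requests unless the queue is still idling or
+ * has requests in flight.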
+ */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->active_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); + + /* + * If another queue has a request waiting within our mean seek + * distance, let it run. The expire code will check for close + * cooperators and put the close queue at the front of the + * service tree. If possible, merge the expiring queue with the + * new bfqq. + */ + new_bfqq = bfq_close_cooperator(bfqd, bfqq); + if (new_bfqq != NULL && bfqq->new_bfqq == NULL) + bfq_setup_merge(bfqq, new_bfqq); + + if (bfq_may_expire_for_budg_timeout(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may not + * disable disk idling even when a new request arrives + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged the + * device, causing the dispatch to be invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + if (new_bfqq == NULL) + goto keep_queue; + else + goto expire; + } + } + + /* + * No requests pending. If there is no cooperator, and the active + * queue still has requests in flight or is idling for a new request, + * then keep it. + */ + if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && + !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { + bfqq = NULL; + goto keep_queue; + } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { + /* + * Expiring the queue because there is a close cooperator, + * cancel timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_active_queue(bfqd, new_bfqq); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? 
bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq->raising_coeff > 1) { /* queue is being boosted */ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, " + "old raising coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time), + bfqq->raising_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->active_queue && entity->weight != + entity->orig_weight * bfqq->raising_coeff); + if(entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, + "WARN: pending prio change"); + /* + * If too much time has elapsed from the beginning + * of this weight-raising period and process is not soft + * real-time, stop it + */ + if (jiffies - bfqq->last_rais_start_finish > + bfqq->raising_cur_max_time) { + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && + bfqq->soft_rt_next_start < jiffies; + + bfqq->last_rais_start_finish = jiffies; + if (soft_rt) + bfqq->raising_cur_max_time = + bfqd->bfq_raising_rt_max_time; + else { + bfqq->raising_coeff = 1; + entity->ioprio_changed = 1; + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); + } + } + } +} + + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen + * in fifo order instead of sector order. + * The budget is properly dimensioned + * to be always sufficient to serve the next request + * only if it is chosen in sector order. The reason is + * that it would be quite inefficient and little useful + * to always make sure that the budget is large enough + * to serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and + * make sure that the next act_budget is enough + * to serve the next request, even if it comes + * from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. 
*/ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + update_raising_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " + "budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->active_cic == NULL) { + atomic_long_inc(&RQ_CIC(rq)->ioc->refcount); + bfqd->active_cic = RQ_CIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. Used for barriers and when switching + * io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->active_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + if((bfqq = bfq_select_queue(bfqd)) == NULL) + return 0; + + max_dispatch = bfqd->bfq_quantum; + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (! bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" + "(max_disp %d)", bfqq->pid, max_dispatch); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. 
+ */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->active_queue == bfqq); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) { + WARN(1, "bfqq->new_bfqq loop detected.\n"); + break; + } + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->active_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) +{ + struct task_struct *tsk = current; + int ioprio_class; + + if (!bfq_bfqq_prio_changed(bfqq)) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + switch (ioprio_class) { + default: + printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + bfqq->entity.ioprio_changed = 1; + + /* + * Keep track of original prio settings in case we have to temporarily + * elevate the priority of this queue. 
+ */ + bfqq->org_ioprio = bfqq->entity.new_ioprio; + bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class; + bfq_clear_bfqq_prio_changed(bfqq); +} + +static void bfq_changed_ioprio(struct io_context *ioc, + struct cfq_io_context *cic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (unlikely(bfqd == NULL)) + return; + + bfqq = cic->cfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc, + GFP_ATOMIC); + if (new_bfqq != NULL) { + cic->cfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "changed_ioprio: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = cic->cfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_mark_bfqq_prio_changed(bfqq); + + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + bfq_mark_bfqq_prio_changed(bfqq); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->raising_coeff = 1; + bfqq->last_rais_start_finish = 0; + bfqq->soft_rt_next_start = -1; +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct io_context *ioc, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct cfq_io_context *cic; + +retry: + cic = bfq_cic_lookup(bfqd, ioc); + /* cic always exists here */ + bfqq = cic_to_bfqq(cic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. 
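+ * Note the allocation pattern below: with __GFP_WAIT we drop the
+ * queue lock, allocate, retake the lock and then retry the cic
+ * lookup, since the situation may have changed while sleeping.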
+ */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + + bfq_init_prio_data(bfqq, ioc); + bfq_init_entity(&bfqq->entity, bfqg); + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) +{ + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will prune it. + */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct cfq_io_context *cic) +{ + unsigned long elapsed = jiffies - cic->last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; + cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; + cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. 
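bfq_update_io_thinktime() above keeps a decaying average of the gap between a request completion and the next submission, using a 7/8 decay factor and a 256 fixed-point scale. A small runnable sketch of the same arithmetic (user-space, names hypothetical):

#include <stdio.h>

struct thinktime {
	unsigned long samples;	/* scaled by 256, converges towards 256 */
	unsigned long total;	/* scaled by 256 */
	unsigned long mean;
};

static void update_thinktime(struct thinktime *tt, unsigned long ttime)
{
	tt->samples = (7 * tt->samples + 256) / 8;
	tt->total   = (7 * tt->total + 256 * ttime) / 8;
	tt->mean    = (tt->total + 128) / tt->samples;
}

int main(void)
{
	struct thinktime tt = { 0, 0, 0 };
	unsigned long s[] = { 4, 4, 4, 40, 4, 4 };	/* think times in jiffies */
	unsigned int i;

	for (i = 0; i < sizeof(s) / sizeof(s[0]); i++) {
		update_thinktime(&tt, s[i]);
		printf("sample %lu -> mean %lu\n", s[i], tt.mean);
	}
	return 0;
}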
+ */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + if (bfq_bfqq_coop(bfqq)) { + /* + * If the mean seektime increases for a (non-seeky) shared + * queue, some cooperator is likely to be idling too much. + * On the contrary, if it decreases, some cooperator has + * probably waked up. + * + */ + if ((sector_t)total < bfqq->seek_mean) + bfq_mark_bfqq_some_coop_idle(bfqq) ; + else if ((sector_t)total > bfqq->seek_mean) + bfq_clear_bfqq_some_coop_idle(bfqq) ; + } + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct cfq_io_context *cic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&cic->ioc->nr_tasks) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->raising_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(cic->ttime_samples)) { + if (cic->ttime_mean > bfqd->bfq_slice_idle && + bfqq->raising_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct cfq_io_context *cic = RQ_CIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, cic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, cic); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->active_queue) { + /* + * If there is just this request queued and the request + * is small, just exit. + * In this way, if the disk is being idled to wait for a new + * request from the active queue, we avoid unplugging the + * device now. + * + * By doing so, we spare the disk to be committed + * to serve just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this + * one quickly, then the device will be unplugged + * and larger requests will be dispatched. 
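The idle-window update that follows this clamping boils down to a predicate: idle only for sync, non-idle-class queues, and stop idling when the device queues internally and the queue is seeky, or when the measured think time exceeds slice_idle, unless the queue is being weight-raised. A loose, hypothetical distillation (the io_context task-count check is omitted and the sample-validity threshold of 80 is an assumption):

#include <stdbool.h>

struct queue_hint {
	bool sync;
	bool class_idle;
	bool seeky;
	bool weight_raised;		/* raising_coeff > 1 */
	bool hw_tag;			/* device has internal queueing */
	unsigned int ttime_samples;
	unsigned long ttime_mean;
	unsigned long slice_idle;
};

static bool update_idle_window(const struct queue_hint *q, bool enabled)
{
	if (!q->sync || q->class_idle)
		return enabled;			/* never changed for these */
	if (q->slice_idle == 0 ||
	    (q->hw_tag && q->seeky && !q->weight_raised))
		return false;
	if (q->ttime_samples > 80)		/* enough samples to trust the mean */
		return !(q->ttime_mean > q->slice_idle && !q->weight_raised);
	return enabled;
}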
+ */ + if (bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32) { + return; + } + if (bfq_bfqq_wait_request(bfqq)) { + /* + * If we are waiting for a request for this queue, let + * it rip immediately and flag that we must not expire + * this queue just now. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + /* + * Here we can safely expire the queue, in + * case of budget timeout, without wasting + * guarantees + */ + if (bfq_bfqq_budget_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_BUDGET_TIMEOUT); + __blk_run_queue(bfqd->queue); + } + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + assert_spin_locked(bfqd->queue->queue_lock); + bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); + + bfq_add_rq_rb(rq); + + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + WARN_ON(!bfqd->rq_in_driver); + WARN_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight--; + + if (sync) + RQ_CIC(rq)->last_end_request = jiffies; + + /* + * If this is the active queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->active_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + /* Idling is disabled also for cooperation issues: + * 1) there is a close cooperator for the queue, or + * 2) the queue is shared and some cooperator is likely + * to be idle (in this case, by not arming the idle timer, + * we try to slow down the queue, to prevent the zones + * of the disk accessed by the active cooperators to become + * too distant from the zone that will be accessed by the + * currently idle cooperators) + */ + if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (sync && + (bfqd->rq_in_driver == 0 || + bfqq->raising_coeff > 1) + && RB_EMPTY_ROOT(&bfqq->sort_list) + && !bfq_close_cooperator(bfqd, bfqq) + && (!bfq_bfqq_coop(bfqq) || + !bfq_bfqq_some_coop_idle(bfqq))) + bfq_arm_slice_timer(bfqd); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +/* + * We temporarily boost lower priority queues if they are holding fs exclusive + * resources. They are boosted to normal prio (CLASS_BE/4). 
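bfq_update_hw_tag() above infers whether the device does internal queueing by sampling how many requests are in flight; only samples taken while enough requests were outstanding are counted. A hypothetical stand-alone sketch of that probe (the threshold and sample-count values are assumed):

#define HW_QUEUE_THRESHOLD	4	/* assumed stand-in for BFQ_HW_QUEUE_THRESHOLD */
#define HW_QUEUE_SAMPLES	32	/* assumed stand-in for BFQ_HW_QUEUE_SAMPLES */

struct depth_probe {
	int hw_tag;		/* -1 unknown, 0 no internal queueing, 1 yes */
	int samples;
	int max_in_driver;
};

static void probe_depth(struct depth_probe *p, int in_driver, int queued)
{
	if (p->max_in_driver < in_driver)
		p->max_in_driver = in_driver;
	if (p->hw_tag == 1)
		return;
	/* Only count samples taken while enough requests were outstanding. */
	if (in_driver + queued < HW_QUEUE_THRESHOLD)
		return;
	if (p->samples++ < HW_QUEUE_SAMPLES)
		return;
	p->hw_tag = p->max_in_driver > HW_QUEUE_THRESHOLD;
	p->samples = 0;
	p->max_in_driver = 0;
}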
+ */ +static void bfq_prio_boost(struct bfq_queue *bfqq) +{ + if (has_fs_excl()) { + /* + * Boost idle prio on transactions that would lock out other + * users of the filesystem + */ + if (bfq_class_idle(bfqq)) + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + if (bfqq->entity.new_ioprio > IOPRIO_NORM) + bfqq->entity.new_ioprio = IOPRIO_NORM; + } else { + /* + * Unboost the queue (if needed) + */ + bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class; + bfqq->entity.new_ioprio = bfqq->org_ioprio; + } +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be queued. + * So just lookup a possibly existing queue, or return 'may queue' + * if that fails. + */ + cic = bfq_cic_lookup(bfqd, tsk->io_context); + if (cic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = cic_to_bfqq(cic, rw_is_sync(rw)); + if (bfqq != NULL) { + bfq_init_prio_data(bfqq, cic->ioc); + bfq_prio_boost(bfqq); + + return __bfq_may_queue(bfqq); + } + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + put_io_context(RQ_CIC(rq)->ioc); + + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +static struct bfq_queue * +bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic, + struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)bfqq->new_bfqq->pid); + cic_set_bfqq(cic, bfqq->new_bfqq, 1); + bfq_mark_bfqq_coop(bfqq->new_bfqq); + bfq_put_queue(bfqq); + return cic_to_bfqq(cic, 1); +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_some_coop_idle(bfqq); + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + cic_set_bfqq(cic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. 
+ */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + cic = bfq_get_io_context(bfqd, gfp_mask); + + spin_lock_irqsave(q->queue_lock, flags); + + if (cic == NULL) + goto queue_fail; + + bfqg = bfq_cic_update_cgroup(cic); + +new_queue: + bfqq = cic_to_bfqq(cic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask); + cic_set_bfqq(cic, bfqq, is_sync); + } else { + /* + * If the queue was seeky for too long, break it apart. + */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(cic, bfqq); + if (!bfqq) + goto new_queue; + } + + /* + * Check to see if this queue is scheduled to merge with + * another closely cooperating queue. The merging of queues + * happens here as it must be done in process context. + * The reference on new_bfqq was taken in merge_bfqqs. + */ + if (bfqq->new_bfqq != NULL) + bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq); + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + spin_unlock_irqrestore(q->queue_lock, flags); + + rq->elevator_private[0] = cic; + rq->elevator_private[1] = bfqq; + + return 0; + +queue_fail: + if (cic != NULL) + put_io_context(cic->ioc); + + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the active_queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->active_queue; + /* + * Theoretical race here: active_queue can be NULL or different + * from the queue that was idling if the timer handler spins on + * the queue_lock and a new request arrives for the current + * queue and there is a full dispatch cycle that changes the + * active_queue. This can hardly happen, but in the worst case + * we just expire a queue too early. 
+ */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the first + * request of the active queue arrives during + * disk idling + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure untill all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + struct cfq_io_context *cic; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + while (!list_empty(&bfqd->cic_list)) { + cic = list_entry(bfqd->cic_list.next, struct cfq_io_context, + queue_list); + __bfq_exit_single_io_context(bfqd, cic); + } + + BUG_ON(bfqd->active_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, bfqd->cic_index); + spin_unlock(&cic_index_lock); + + /* Wait for cic->key accessors to exit their grace periods. 
*/ + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static int bfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + +static void *bfq_init_queue(struct request_queue *q) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + int i; + + i = bfq_alloc_cic_index(); + if (i < 0) + return NULL; + + bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); + if (bfqd == NULL) + return NULL; + + bfqd->cic_index = i; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + + INIT_LIST_HEAD(&bfqd->cic_list); + + bfqd->queue = q; + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + return NULL; + } + + bfqd->root_group = bfqg; + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_quantum = bfq_quantum; + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->low_latency = true; + + bfqd->bfq_raising_coeff = 20; + bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_raising_max_time = 0; + bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_raising_max_softrt_rate = 7000; + + /* Initially estimate the device's peak rate as the reference rate */ + if (blk_queue_nonrot(bfqd->queue)) { + bfqd->RT_prod = R_nonrot * T_nonrot; + bfqd->peak_rate = R_nonrot; + } else { + bfqd->RT_prod = R_rot * T_rot; + bfqd->peak_rate = R_rot; + } + + return bfqd; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); + if (bfq_ioc_pool != NULL) + kmem_cache_destroy(bfq_ioc_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + goto fail; + + bfq_ioc_pool = kmem_cache_create("bfq_io_context", + sizeof(struct cfq_io_context), + __alignof__(struct cfq_io_context), + 0, NULL); + if (bfq_ioc_pool == NULL) + goto fail; + + return 0; +fail: + bfq_slab_kill(); + return -ENOMEM; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) +{ + unsigned long new_val; + int ret = strict_strtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t 
bfq_raising_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? + bfqd->bfq_raising_max_time : + bfq_wrais_duration(bfqd)); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time)); + } + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time)); + } + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); +SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); +SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, + 1); +SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, + bfqd->bfq_raising_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, + bfqd->bfq_raising_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long __data; \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, 
&bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_min_idle_time_store, + &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, + &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_raising_max_softrt_rate_store, + &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(quantum), + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(raising_coeff), + BFQ_ATTR(raising_max_time), + BFQ_ATTR(raising_rt_max_time), + BFQ_ATTR(raising_min_idle_time), + BFQ_ATTR(raising_min_inter_arr_async), + BFQ_ATTR(raising_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + 
.elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + .trim = bfq_free_io_context, + }, + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + //if (bfq_slice_idle == 0) + // bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + elv_register(&iosched_bfq); + + return 0; +} + +static void __exit bfq_exit(void) +{ + DECLARE_COMPLETION_ONSTACK(all_gone); + elv_unregister(&iosched_bfq); + bfq_ioc_gone = &all_gone; + /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */ + smp_wmb(); + if (elv_ioc_count_read(bfq_ioc_count) != 0) + wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 index 000000000..fd50b7fd1 --- /dev/null +++ b/block/bfq-sched.c @@ -0,0 +1,1066 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_active) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_active == NULL); + + group_sd = next_active->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an active entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_active->budget; +} + +static int bfq_update_next_active(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_active; + + if (sd->active_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... 
+ */ + next_active = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_active = next_active; + + if (next_active != NULL) + bfq_update_budget(next_active); + + return 1; +} + +static inline void bfq_check_next_active(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_active != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_active(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_active(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_active) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. 
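The two helpers just introduced carry most of the timestamp machinery: bfq_gt() compares possibly wrapped 64-bit timestamps via a signed subtraction, and bfq_delta() maps service into virtual time with a fixed-point shift. A runnable user-space sketch of both (names hypothetical, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

/* Wraparound-safe "a > b" on 64-bit timestamps, as in bfq_gt(). */
static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

/* Service mapped into the virtual time domain, as in bfq_delta(). */
static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	uint64_t start = UINT64_MAX - 5;		/* about to wrap */
	uint64_t finish = start + vt_delta(4096, 8);

	/* finish wrapped past zero, yet still compares as "later". */
	printf("finish > start: %d\n", ts_gt(finish, start));
	/* The same service charges a lighter entity more virtual time. */
	printf("w=8 -> %llu, w=1 -> %llu\n",
	       (unsigned long long)vt_delta(4096, 8),
	       (unsigned long long)vt_delta(4096, 1));
	return 0;
}

With WFQ_SERVICE_SHIFT = 22, the same amount of service pushes a weight-1 entity's finish time eight times further than a weight-8 entity's, which is how heavier entities end up being selected more often.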
+ */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. 
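The functions that follow maintain the extra min_start key described above: every active node caches the smallest start time in its subtree, so the eligible-entity search can prune whole branches. A simplified, hypothetical stand-in using plain pointers and a plain "<" instead of the rbtree helpers and the wraparound-safe bfq_gt():

#include <stddef.h>

struct toy_node {
	unsigned long long start;	/* entity start time (the tree key is finish) */
	unsigned long long min_start;	/* minimum start over this subtree */
	struct toy_node *left, *right;
};

/* Fold one child's cached minimum into the parent (cf. bfq_update_min()). */
static void toy_update_min(struct toy_node *n, struct toy_node *child)
{
	if (child != NULL && child->min_start < n->min_start)
		n->min_start = child->min_start;
}

/* Recompute a node whose children changed (cf. bfq_update_active_node()). */
static void toy_update_node(struct toy_node *n)
{
	n->min_start = n->start;
	toy_update_min(n, n->left);
	toy_update_min(n, n->right);
}

bfq_update_active_tree() below repeats this recomputation from the deepest modified node up to the root, so every ancestor's cached minimum stays valid.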
+ */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +/** + * bfq_active_insert - insert an entity in the active tree of its group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static unsigned short bfq_ioprio_to_weight(int ioprio) +{ + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static unsigned short bfq_weight_to_ioprio(int weight) +{ + WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. 
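bfq_ioprio_to_weight() and bfq_weight_to_ioprio() above define the linear mapping between the legacy ioprio levels and B-WF2Q+ weights. A tiny runnable round-trip sketch (IOPRIO_BE_NR mirrored here as 8, matching the kernel's eight best-effort levels):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* BE levels 0 (highest) .. 7 (lowest) */

/* cf. bfq_ioprio_to_weight(): a lower ioprio number means a larger weight. */
static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* cf. bfq_weight_to_ioprio(): weights beyond the ioprio range map to 0. */
static int weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int p;

	for (p = 0; p < IOPRIO_BE_NR; p++)
		printf("ioprio %d -> weight %d\n", p, ioprio_to_weight(p));
	printf("weight 100 -> ioprio %d (escape value)\n", weight_to_ioprio(100));
	return 0;
}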
+ */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. 
+ */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } else if (entity->new_ioprio != entity->ioprio) { + entity->ioprio = entity->new_ioprio; + entity->orig_weight = + bfq_ioprio_to_weight(entity->ioprio); + } else + entity->new_weight = entity->orig_weight = + bfq_ioprio_to_weight(entity->ioprio); + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + entity->weight = entity->orig_weight * + (bfqq != NULL ? bfqq->raising_coeff : 1); + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. 
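bfq_calc_finish() and bfq_bfqq_served() above are the accounting core: an entity's finish time is start + budget/weight, and serving a queue advances the tree's virtual time by served/wsum. A small worked example (user-space sketch, shift constant as in the scheduler, names hypothetical):

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Two backlogged queues with weights 1 and 3 -> wsum = 4. */
	unsigned long wsum = 4;
	uint64_t vtime = 0, finish_w1, finish_w3;

	/* Both get a 64 KiB budget; finish = start + budget/weight. */
	finish_w1 = 0 + vt_delta(64 * 1024, 1);
	finish_w3 = 0 + vt_delta(64 * 1024, 3);
	printf("finish(w=1)=%llu finish(w=3)=%llu\n",
	       (unsigned long long)finish_w1, (unsigned long long)finish_w3);

	/* Serving 64 KiB advances the tree's vtime by served/wsum. */
	vtime += vt_delta(64 * 1024, wsum);
	printf("vtime after 64 KiB of service: %llu\n",
	       (unsigned long long)vtime);
	return 0;
}

The weight-3 queue's finish time grows a third as fast as the weight-1 queue's, so it is eligible for selection again sooner and receives a proportionally larger share of the device.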
+ */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->active_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->active_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_active entity below it. We reuse the old + * start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the active entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was under service or if it was the next_active for + * its sched_data; return %0 otherwise. 
+ */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_active = entity == sd->active_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_active && entity->tree != NULL); + + if (was_active) { + bfq_calc_finish(entity, entity->service); + sd->active_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_active || sd->next_active == entity) + ret = bfq_update_next_active(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->active_entity == entity); + BUG_ON(sd->next_active == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * under service. + */ + break; + + if (sd->next_active != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. + */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated tasks getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active - find the eligible entity with the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path + * on the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. 
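The search described above can be stated compactly: among the entities whose start time is not after the current vtime, pick the one with the smallest finish time. The sketch below does it with a linear scan (hypothetical, user-space); the real tree gets the same answer in O(log N) by consulting min_start to decide which subtrees can still contain eligible entities.

#include <stdio.h>
#include <stdint.h>

struct entity {
	uint64_t start, finish;
};

/* Pick the eligible (start <= vtime) entity with the minimum finish time. */
static const struct entity *first_eligible(const struct entity *e, int n,
					    uint64_t vtime)
{
	const struct entity *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].start > vtime)
			continue;		/* not eligible yet */
		if (best == NULL || e[i].finish < best->finish)
			best = &e[i];
	}
	return best;
}

int main(void)
{
	struct entity e[] = { { 10, 50 }, { 0, 40 }, { 30, 35 } };
	const struct entity *s = first_eligible(e, 3, 20);

	/* {30,35} has the smallest finish but is not yet eligible at vtime 20. */
	printf("selected: start=%llu finish=%llu\n",
	       (unsigned long long)s->start, (unsigned long long)s->finish);
	return 0;
}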
+ */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_active = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_active and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_active)) { + new_next_active = entity; + for_each_entity(new_next_active) + bfq_update_budget(new_next_active); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_active entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_active value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i=0; + + BUG_ON(sd->active_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_active = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_active(sd, entity); + bfq_active_extract(st + i, entity); + sd->active_entity = entity; + sd->next_active = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->active_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +/* + * Forced extraction of the given queue. 
+ */ +static void bfq_get_next_queue_forced(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity; + struct bfq_sched_data *sd; + + BUG_ON(bfqd->active_queue != NULL); + + entity = &bfqq->entity; + /* + * Bubble up extraction/update from the leaf to the root. + */ + for_each_entity(entity) { + sd = entity->sched_data; + bfq_update_budget(entity); + bfq_update_vtime(bfq_entity_service_tree(entity)); + bfq_active_extract(bfq_entity_service_tree(entity), entity); + sd->active_entity = entity; + sd->next_active = NULL; + entity->service = 0; + } + + return; +} + +static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) +{ + if (bfqd->active_cic != NULL) { + put_io_context(bfqd->active_cic->ioc); + bfqd->active_cic = NULL; + } + + bfqd->active_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->active_queue) + __bfq_bfqd_reset_active(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->active_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 index 000000000..e2ce5052a --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,595 @@ +/* + * BFQ-v5 for 3.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT HZ/5 + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. 
+ */ +struct bfq_service_tree { + struct rb_root active; + struct rb_root idle; + + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; + + u64 vtime; + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * @active_entity: entity under service. + * @next_active: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_active points to the active entity of the sched_data service + * trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *active_entity; + struct bfq_entity *next_active; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. 
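/*
 * Illustrative aside, not part of the patch: the finish timestamp documented
 * above follows F_i = S_i + budget / weight. The sketch below computes it on
 * plain C types; the 16-bit fixed-point shift is an arbitrary choice for
 * readability, not a constant taken from this patch.
 */
#include <stdint.h>

static uint64_t sketch_calc_finish(uint64_t start, unsigned long budget,
				   unsigned short weight)
{
	return start + (((uint64_t)budget << 16) / weight);
}

/*
 * Example: two entities activated at S = 0, each with a 512 KiB budget.
 *   weight 100: F = (524288 << 16) / 100 ~= 343.6e6
 *   weight 300: F = (524288 << 16) / 300 ~= 114.5e6
 * The weight-300 entity gets the earlier finish time, so B-WF2Q+ serves it
 * first; over time each backlogged entity receives service roughly in
 * proportion to its weight.
 */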
+ * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; + + struct bfq_entity *parent; + + struct bfq_sched_data *my_sched_data; + struct bfq_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @org_ioprio: saved ioprio during boosted periods. + * @org_ioprio_class: saved ioprio_class during boosted periods. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @pid: pid of the process owning the queue, used for logging purposes. + * @last_rais_start_time: last (idle -> weight-raised) transition attempt + * @raising_cur_max_time: current max raising time for this queue + * + * A bfq_queue is a leaf request queue; it can be associated to an io_context + * or more (if it is an async one). @cgroup holds a reference to the + * cgroup, to be sure that it does not disappear while a bfqq still + * references it (mostly to avoid races between request issuing and task + * migration followed by cgroup distruction). + * All the fields are protected by the queue lock of the containing bfqd. 
+ */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned short org_ioprio; + unsigned short org_ioprio_class; + + unsigned int flags; + + struct list_head bfqq_list; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + pid_t pid; + + /* weight-raising fields */ + unsigned int raising_cur_max_time; + u64 last_rais_start_finish, soft_rt_next_start; + unsigned int raising_coeff; +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, + * used when determining if two or more queues + * have interleaving requests (see bfq_close_cooperator). + * @busy_queues: number of bfq_queues containing requests (including the + * queue under service, even if it is idling). + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples + * completed requests . + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue under service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @active_queue: bfq_queue under service. + * @active_cic: cfq_io_context (cic) associated with the @active_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. + * @cic_index: use small consequent indexes as radix tree keys to reduce depth + * @cic_list: list of all the cics active on the bfq_data device. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_quantum: max number of requests dispatched per dispatch round. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. + * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. 
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to + * to prevent seeky queues to impose long latencies to well + * behaved ones (this also implies that seeky queues cannot + * receive guarantees in the service domain; after a timeout + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). + * @bfq_raising_coeff: Maximum factor by which the weight of a boosted + * queue is multiplied + * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) + * @bfq_raising_rt_max_time: maximum duration for soft real-time processes + * @bfq_raising_min_idle_time: minimum idle period after which weight-raising + * may be reactivated for a queue (in jiffies) + * @bfq_raising_min_inter_arr_async: minimum period between request arrivals + * after which weight-raising may be + * reactivated for an already busy queue + * (in jiffies) + * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, + * sectors per seconds + * @RT_prod: cached value of the product R*T used for computing the maximum + * duration of the weight raising automatically + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions + * + * All the fields are protected by the @queue lock. + */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + + struct rb_root rq_pos_tree; + + int busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *active_queue; + struct cfq_io_context *active_cic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + unsigned int cic_index; + struct list_head cic_list; + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_quantum; + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_raising_coeff; + unsigned int bfq_raising_max_time; + unsigned int bfq_raising_rt_max_time; + unsigned int bfq_raising_min_idle_time; + unsigned int bfq_raising_min_inter_arr_async; + unsigned int bfq_raising_max_softrt_rate; + u64 RT_prod; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << 
BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(prio_changed); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(some_coop_idle); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/migration. + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. 
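/*
 * Illustrative aside, not part of the patch: the locking rule stated above,
 * i.e. writers take @lock while readers walk @group_data under RCU. Both
 * helpers are hypothetical; only the fields of struct bfq_group above and
 * struct bfqio_cgroup below come from this patch.
 */
static void sketch_add_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
{
	spin_lock(&bgrp->lock);
	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
	spin_unlock(&bgrp->lock);
}

/* Caller must hold rcu_read_lock() across any use of the returned group. */
static struct bfq_group *sketch_lookup_group(struct bfqio_cgroup *bgrp,
					     void *key)
{
	struct bfq_group *bfqg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) {
		if (bfqg->bfqd == key)
			return bfqg;
	}
	return NULL;
}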
+ */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, + int is_sync) +{ + return cic->cfqq[!!is_sync]; +} + +static inline void cic_set_bfqq(struct cfq_io_context *cic, + struct bfq_queue *bfqq, int is_sync) +{ + cic->cfqq[!!is_sync] = bfqq; +} + +static inline void call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, + struct cfq_io_context *)) +{ + struct cfq_io_context *cic; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) + func(ioc, cic); + rcu_read_unlock(); +} + +#define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 + +static inline void *bfqd_dead_key(struct bfq_data *bfqd) +{ + return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows cic->key and bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. 
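/*
 * Hypothetical caller, not part of the patch, illustrating the contract
 * documented above: a non-NULL return means bfqd->queue->queue_lock is held
 * and must be dropped with bfq_put_bfqd_unlock(); NULL means the bfqd went
 * away and no lock is held.
 */
static void sketch_with_bfqd_of(struct cfq_io_context *cic)
{
	struct bfq_data *bfqd;
	unsigned long flags;

	bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
	if (bfqd == NULL)
		return;			/* raced with device/cic teardown */

	/* ... work on bfqd under its queue lock ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}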
+ */ +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + +static void bfq_changed_ioprio(struct io_context *ioc, + struct cfq_io_context *cic); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); +#endif diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 342eae9b0..0504f530f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include /* for max_pfn/max_low_pfn */ @@ -16,13 +17,12 @@ */ static struct kmem_cache *iocontext_cachep; -static void cfq_dtor(struct io_context *ioc) +static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) { - if (!hlist_empty(&ioc->cic_list)) { + if (!hlist_empty(list)) { struct cfq_io_context *cic; - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); + cic = hlist_entry(list->first, struct cfq_io_context, cic_list); cic->dtor(ioc); } } @@ -40,7 +40,9 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - cfq_dtor(ioc); + + hlist_sched_dtor(ioc, &ioc->cic_list); + hlist_sched_dtor(ioc, &ioc->bfq_cic_list); rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); @@ -50,15 +52,14 @@ int put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -static void cfq_exit(struct io_context *ioc) +static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) { rcu_read_lock(); - if (!hlist_empty(&ioc->cic_list)) { + if (!hlist_empty(list)) { struct cfq_io_context *cic; - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); + cic = hlist_entry(list->first, struct cfq_io_context, cic_list); cic->exit(ioc); } rcu_read_unlock(); @@ -74,8 +75,10 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); +if (atomic_dec_and_test(&ioc->nr_tasks)) { + hlist_sched_exit(ioc, &ioc->cic_list); + hlist_sched_exit(ioc, &ioc->bfq_cic_list); + } put_io_context(ioc); } @@ -89,12 +92,14 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ret->refcount, 1); atomic_set(&ret->nr_tasks, 1); spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; + bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS); ret->ioprio = 0; ret->last_waited = 0; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); + INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ret->bfq_cic_list); ret->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) ret->cgroup_changed = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ae21919f1..b581793ec 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2919,7 +2919,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) static void cfq_ioc_set_ioprio(struct io_context *ioc) { call_for_each_cic(ioc, changed_ioprio); - ioc->ioprio_changed = 0; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -3204,8 +3203,13 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err_free; out: - smp_read_barrier_depends(); - if (unlikely(ioc->ioprio_changed)) + /* + * test_and_clear_bit() implies a memory barrier, paired with + * the wmb() in fs/ioprio.c, so the value seen for ioprio is the + * new one. + */ + if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED, + ioc->ioprio_changed))) cfq_ioc_set_ioprio(ioc); #ifdef CONFIG_CFQ_GROUP_IOSCHED diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a0650..2fdc2a310 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -30,7 +30,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) { - int err; + int err, i; struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; @@ -60,12 +60,15 @@ int set_task_ioprio(struct task_struct *task, int ioprio) err = -ENOMEM; break; } + smp_wmb(); task->io_context = ioc; } while (1); if (!err) { ioc->ioprio = ioprio; - ioc->ioprio_changed = 1; + wmb(); + for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++) + set_bit(i, ioc->ioprio_changed); } task_unlock(task); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ac663c187..c96663839 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -64,3 +64,9 @@ SUBSYS(perf) #endif /* */ + +#ifdef CONFIG_CGROUP_BFQIO +SUBSYS(bfqio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee896d..5f5357748 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,12 +3,12 @@ #include #include +#include -struct cfq_queue; struct cfq_io_context { void *key; - struct cfq_queue *cfqq[2]; + void *cfqq[2]; struct io_context *ioc; @@ -27,6 +27,16 @@ struct cfq_io_context { struct rcu_head rcu_head; }; +/* + * Indexes into the ioprio_changed bitmap. A bit set indicates that + * the corresponding I/O scheduler needs to see a ioprio update. + */ +enum { + IOC_CFQ_IOPRIO_CHANGED, + IOC_BFQ_IOPRIO_CHANGED, + IOC_IOPRIO_CHANGED_BITS +}; + /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. 
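/*
 * Illustrative aside, not part of the patch: how a second scheduler consumes
 * its own bit of the new ioprio_changed bitmap, mirroring the cfq-iosched.c
 * hunk above. bfq_ioc_set_ioprio() is a hypothetical helper; the bit names
 * and the test_and_clear_bit() pairing with the wmb() in fs/ioprio.c come
 * from this patch.
 */
static void sketch_check_ioprio_changed(struct io_context *ioc)
{
	if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED,
					ioc->ioprio_changed)))
		bfq_ioc_set_ioprio(ioc);	/* propagate to this scheduler's cics */
}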
@@ -39,7 +49,7 @@ struct io_context { spinlock_t lock; unsigned short ioprio; - unsigned short ioprio_changed; + DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS); #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) unsigned short cgroup_changed; @@ -53,6 +63,8 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; + struct radix_tree_root bfq_radix_root; + struct hlist_head bfq_cic_list; void __rcu *ioc_data; }; From 6f3cf5b0572fdfe57984bb0d56652d81eafbb4d5 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 3 Feb 2013 23:05:33 +0200 Subject: [PATCH 18/19] Revert "change kgsl and adreno drivers to u8800pro drivers" This reverts commit 6196e001fa3a5a1fab6d3c0cb8a3a244a6feed59. --- drivers/gpu/msm/adreno.c | 43 ++- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 +++-- drivers/gpu/msm/adreno_debugfs.c | 2 + drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 ++- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 +++++++++++++- drivers/gpu/msm/adreno_snapshot.c | 129 +++++++- drivers/gpu/msm/kgsl.c | 277 ++++++++++-------- drivers/gpu/msm/kgsl.h | 63 ++-- drivers/gpu/msm/kgsl_cffdump.c | 184 ------------ drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 14 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++-- drivers/gpu/msm/kgsl_pwrctrl.h | 2 - drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++++++---- drivers/gpu/msm/kgsl_sharedmem.h | 68 +++-- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 ++-- .../touchscreen/atmel_i2c_rmi_QT602240.c | 0 include/linux/msm_kgsl.h | 13 +- 26 files changed, 944 insertions(+), 547 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.c mode change 100755 => 100644 drivers/gpu/msm/adreno.h mode change 100755 => 100644 drivers/gpu/msm/adreno_a2xx.c mode change 100755 => 100644 drivers/gpu/msm/adreno_postmortem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_gpummu.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.h mode change 100755 => 100644 drivers/gpu/msm/z180.c mode change 100644 => 100755 drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c mode change 100755 => 100644 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100755 new mode 100644 index 7ff5a4384..8320003d2 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,6 +114,7 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ + .ib_check_level = 0, }; @@ -273,6 +274,12 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ + /* + * Fix target freeze issue by adding TLB flush for each submit + * on A20X based targets. + */ + if (adreno_is_a20x(adreno_dev)) + flags |= KGSL_MMUFLAGS_TLBFLUSH; /* * If possible, then set the state via the command stream to avoid * a CPU idle. 
Otherwise, use the default setstate which uses register @@ -638,6 +645,8 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; + struct kgsl_context *context; + int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -706,6 +715,24 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; + /* + * Set the reset status of all contexts to + * INNOCENT_CONTEXT_RESET_EXT except for the bad context + * since thats the guilty party + */ + while ((context = idr_get_next(&device->context_idr, &next))) { + if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != + context->reset_status) { + if (context->devctxt != drawctxt) + context->reset_status = + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; + else + context->reset_status = + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; + } + next = next + 1; + } + /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -868,15 +895,13 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ + msecs_to_jiffies(adreno_dev->wait_timeout); unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -884,8 +909,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? 
((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -898,7 +921,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } - /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -965,7 +987,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -992,8 +1014,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, - sizeof(unsigned int)); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1037,7 +1058,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - const struct kgsl_memdesc *memdesc; + struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100755 new mode 100644 index af5bf51ea..1259507d9 --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,6 +46,8 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 +#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 + enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -74,12 +76,16 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; + unsigned int ib_check_level; }; struct adreno_gpudev { + /* keeps track of when we need to execute the draw workaround code */ + int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); + void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -99,7 +105,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100755 new mode 100644 index 5ce9cf85b..62628e4a3 --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,11 +1421,61 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } +static void a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) +{ + struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[11]; + unsigned int *cmds = &cmd[0]; + + if (adreno_is_a225(adreno_dev)) { + adreno_dev->gpudev->ctx_switches_since_last_draw++; + /* If there have been > than 
+ * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context + * switches w/o gmem being saved then we need to execute + * this workaround */ + if (adreno_dev->gpudev->ctx_switches_since_last_draw > + ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; + else + return; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. + */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + } else { + /* On Adreno 20x/220, if the events for shader space reuse + * gets dropped, the CP block would wait indefinitely. + * Sending CP_SET_SHADER_BASES packet unblocks the CP from + * this wait. + */ + *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); + *cmds++ = adreno_encode_istore_size(adreno_dev) + | adreno_dev->pix_shader_start; + } + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], cmds - cmd); +} + static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[22]; if (context == NULL) return; @@ -1470,33 +1520,11 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a225(adreno_dev)) { - unsigned int *cmds = &cmd[0]; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. 
- */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], 11); - } + } else if (adreno_is_a2xx(adreno_dev)) + a2xx_drawctxt_workaround(adreno_dev); } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1757,6 +1785,7 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, + .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index c1b9e4ce2..566efa1aa 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,6 +345,8 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); + debugfs_create_u32("ib_check", 0644, device->d_debugfs, + &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index 206a678ee..f0b5741b5 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,8 +243,13 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) + if (adreno_dev->drawctxt_active == drawctxt) { + if (adreno_dev->gpudev->ctxt_draw_workaround && + adreno_is_a225(adreno_dev)) + adreno_dev->gpudev->ctxt_draw_workaround( + adreno_dev); return; + } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 8aea58c95..454b05785 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,11 +29,6 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 -/* indirect buffer dispatch. prefetch parser uses this packet type to determine -* whether to pre-fetch the IB -*/ -#define CP_INDIRECT_BUFFER 0x3f - /* indirect buffer dispatch. 
same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -117,6 +112,9 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e +/* (A2x) sets binning configuration registers */ +#define CP_SET_BIN_DATA 0x2f + /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -157,6 +155,16 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ +/* + * for a3xx + */ + +/* Conditionally load a IB based on a flag */ +#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ +#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ + +/* Load a buffer with pre-fetch enabled */ +#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -178,11 +186,20 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) +#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) + +#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) +#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) + +#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) + +#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) +#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) +#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -190,4 +207,14 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) + +/* Return 1 if the command is an indirect buffer of any kind */ +static inline int adreno_cmd_is_ib(unsigned int cmd) +{ + return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); +} + #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100755 new mode 100644 index 40dfb30cf..63f5caa91 --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER, "IND_BUF_"}, + {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(value)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,8 +654,7 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = 
NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (this_cmd == cp_type3_packet( - CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index ea2889b3a..57883fd45 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,6 +22,7 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" +#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -310,12 +311,10 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); - /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); - /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -554,6 +553,197 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } +static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, + int sizedwords); + +static bool +_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int opcode = cp_type3_opcode(*hostaddr); + switch (opcode) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFD: + return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); + case CP_NOP: + case CP_WAIT_FOR_IDLE: + case CP_WAIT_REG_MEM: + case CP_WAIT_REG_EQ: + case CP_WAT_REG_GTE: + case CP_WAIT_UNTIL_READ: + case CP_WAIT_IB_PFD_COMPLETE: + case CP_REG_RMW: + case CP_REG_TO_MEM: + case CP_MEM_WRITE: + case CP_MEM_WRITE_CNTR: + case CP_COND_EXEC: + case CP_COND_WRITE: + case CP_EVENT_WRITE: + case CP_EVENT_WRITE_SHD: + case CP_EVENT_WRITE_CFL: + case CP_EVENT_WRITE_ZPD: + case CP_DRAW_INDX: + case CP_DRAW_INDX_2: + case CP_DRAW_INDX_BIN: + case CP_DRAW_INDX_2_BIN: + case CP_VIZ_QUERY: + case CP_SET_STATE: + case CP_SET_CONSTANT: + case CP_IM_LOAD: + case CP_IM_LOAD_IMMEDIATE: + case CP_LOAD_CONSTANT_CONTEXT: + case CP_INVALIDATE_STATE: + case CP_SET_SHADER_BASES: + case CP_SET_BIN_MASK: + case CP_SET_BIN_SELECT: + case CP_SET_BIN_BASE_OFFSET: + case CP_SET_BIN_DATA: + case CP_CONTEXT_UPDATE: + case CP_INTERRUPT: + case CP_IM_STORE: + break; + /* these shouldn't come from userspace */ + case CP_ME_INIT: + case CP_SET_PROTECTED_MODE: + default: + KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); + return false; + break; + } + + return true; +} + +static bool +_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int reg = type0_pkt_offset(*hostaddr); + unsigned int cnt = type0_pkt_size(*hostaddr); + if (reg < 0x0192 || (reg + cnt) >= 0x8000) { + KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", + reg, cnt); + return false; + } + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
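/*
 * Illustrative aside, not part of the patch: the IB checker added in this
 * hunk walks a command stream one packet at a time by decoding each packet
 * header. A standalone sketch of that decode, using only the macros added
 * to adreno_pm4types.h in this patch:
 */
static void sketch_decode_pm4_header(unsigned int header)
{
	if (pkt_is_type0(header))
		printk(KERN_DEBUG "type0: %d dwords for regs from 0x%x\n",
		       type0_pkt_size(header), type0_pkt_offset(header));
	else if (pkt_is_type3(header))
		printk(KERN_DEBUG "type3: opcode 0x%x, %d payload dwords\n",
		       cp_type3_opcode(header), type3_pkt_size(header));
	else
		printk(KERN_DEBUG "type1 or unrecognized packet\n");
	/*
	 * e.g. the header built by cp_type3_packet(CP_WAIT_FOR_IDLE, 1)
	 * decodes back to opcode CP_WAIT_FOR_IDLE with one payload dword;
	 * adreno_cmd_is_ib() is simply this test specialised to the four
	 * indirect-buffer opcodes.
	 */
}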
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +static bool _parse_ibs(struct kgsl_device_private *dev_priv, + uint gpuaddr, int sizedwords) +{ + static uint level; /* recursion level */ + bool ret = false; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + struct kgsl_mem_entry *entry; + + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hoststart = hostaddr; + + level++; + + KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); + + mb(); + while (dwords_left > 0) { + bool cur_ret = true; + int count = 0; /* dword count including packet header */ + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + cur_ret = _handle_type0(dev_priv, hostaddr); + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = _handle_type3(dev_priv, hostaddr); + break; + default: + KGSL_CMD_ERR(dev_priv->device, "unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + + if (!cur_ret) { + KGSL_CMD_ERR(dev_priv->device, + "bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + if (dwords_left < 0) { + KGSL_CMD_ERR(dev_priv->device, + "bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + } + + ret = true; +done: + if (!ret) + KGSL_DRV_ERR(dev_priv->device, + "parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + level--; + + return ret; +} + int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -601,9 +791,12 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); - + if (unlikely(adreno_dev->ib_check_level >= 1 && + !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, + ibdesc[i].sizedwords))) { + kfree(link); + return -EINVAL; + } *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -757,8 +950,20 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - BUG_ON((copy_rb_contents == 0) && - (value == cur_context)); + + /* + * If other context switches were already lost and + * and the current context is the one that is hanging, + * then we cannot recover. Print an error message + * and leave. + */ + + if ((copy_rb_contents == 0) && (value == cur_context)) { + KGSL_DRV_ERR(device, "GPU recovery could not " + "find the previous context\n"); + return -EINVAL; + } + /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index fb88a72bd..c45dbff48 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,11 +45,19 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* Go through the list and see that object has already been seen */ + /* + * Sometimes IBs can be reused in the same dump. Because we parse from + * oldest to newest, if we come across an IB that has already been used, + * assume that it has been reused and update the list with the newest + * size. 
+ */ + for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) - return; + objbuf[index].ptbase == ptbase) { + objbuf[index].dwords = dwords; + return; + } } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -77,6 +85,25 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } +/* + * Return a 1 if the specified object is already on the list of buffers + * to be dumped + */ + +static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) +{ + int index; + + for (index = 0; index < objbufptr; index++) { + if (objbuf[index].gpuaddr == gpuaddr && + objbuf[index].ptbase == ptbase && + objbuf[index].type == type) + return 1; + } + + return 0; +} + /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -113,6 +140,7 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; + int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -158,9 +186,53 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - index = start; + /* + * We can only reliably dump IBs from the beginning of the context, + * and it turns out that for the vast majority of the time we really + * only care about the current context when it comes to diagnosing + * a hang. So, with an eye to limiting the buffer dumping to what is + * really useful find the beginning of the context and only dump + * IBs from that point + */ + + index = rptr; + ib_parse_start = start; rbptr = rb->buffer_desc.hostptr; + while (index != start) { + index--; + + if (index < 0) { + /* + * The marker we are looking for is 2 dwords long, so + * when wrapping, go back 2 from the end so we don't + * access out of range in the if statement below + */ + index = rb->sizedwords - 2; + + /* + * Account for the possibility that start might be at + * rb->sizedwords - 1 + */ + + if (start == rb->sizedwords - 1) + break; + } + + /* + * Look for a NOP packet with the context switch identifier in + * the second dword + */ + + if (rbptr[index] == cp_nop_packet(1) && + rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { + ib_parse_start = index; + break; + } + } + + index = start; + /* * Loop through the RB, copying the data and looking for indirect * buffers and MMU pagetable changes @@ -169,15 +241,18 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) + /* Only parse IBs between the context start and the rptr */ + + if (index == ib_parse_start) + parse_ibs = 1; + + if (index == rptr) + parse_ibs = 0; + + if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); - /* - * FIXME: Handle upcoming MMU pagetable changes, but only - * between the rptr and the wptr - */ - index = index + 1; if (index == rb->sizedwords) @@ -228,10 +303,9 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(*src)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, 
*(src + 1), *(src + 2)); - } src++; dst++; @@ -288,22 +362,45 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the IBs described in the CP registers are on the - * list of objects + * Make sure that the last IB1 that was being executed is dumped. + * Since this was the last IB1 that was processed, we should have + * already added it to the list during the ringbuffer parse but we + * want to be double plus sure. */ + kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - if (ibsize) + /* + * The problem is that IB size from the register is the unprocessed size + * of the buffer not the original size, so if we didn't catch this + * buffer being directly used in the RB, then we might not be able to + * dump the whle thing. Print a warning message so we can try to + * figure how often this really happens. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " + "Dumping %x dwords of the buffer.\n", ibsize); + } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - if (ibsize) + /* + * Add the last parsed IB2 to the list. The IB2 should be found as we + * parse the objects below, but we try to add it to the list first, so + * it too can be parsed. Don't print an error message in this case - if + * the IB2 is found during parsing, the list will be updated with the + * correct size. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100755 new mode 100644 index d51128979..39dad925f --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,11 +21,10 @@ #include #include #include - +#include #include #include #include -#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -194,8 +193,28 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { + struct rb_node **node; + struct rb_node *parent = NULL; + spin_lock(&process->mem_lock); - list_add(&entry->list, &process->mem_list); + + node = &process->mem_rb.rb_node; + + while (*node) { + struct kgsl_mem_entry *cur; + + parent = *node; + cur = rb_entry(parent, struct kgsl_mem_entry, node); + + if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entry->node, parent, node); + rb_insert_color(&entry->node, &process->mem_rb); + spin_unlock(&process->mem_lock); entry->priv = process; @@ -405,6 +424,10 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -514,8 +537,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - kgsl_pwrctrl_wake(device); device->pwrctrl.restore_slumber = 0; + kgsl_pwrctrl_wake(device); kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -548,8 +571,7 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - - INIT_LIST_HEAD(&private->mem_list); + private->mem_rb = RB_ROOT; if (kgsl_mmu_enabled()) { @@ -578,7 +600,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct kgsl_mem_entry *entry_tmp = NULL; + struct rb_node *node; if (!private) return; @@ -592,11 +614,13 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { - list_del(&entry->list); + for (node = rb_first(&private->mem_rb); node; ) { + entry = rb_entry(node, struct kgsl_mem_entry, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, &private->mem_rb); kgsl_mem_entry_put(entry); } - kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -722,47 +746,43 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } - /*call with private->mem_lock locked */ -static struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, size_t size) { - struct kgsl_mem_entry *entry = NULL, *result = NULL; + struct rb_node *node = private->mem_rb.rb_node; - BUG_ON(private == NULL); + while (node != NULL) { + struct kgsl_mem_entry *entry; - gpuaddr &= PAGE_MASK; + entry = rb_entry(node, struct kgsl_mem_entry, node); - list_for_each_entry(entry, 
&private->mem_list, list) { - if (entry->memdesc.gpuaddr == gpuaddr) { - result = entry; - break; - } - } - return result; -} - -/*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, - size_t size) -{ - struct kgsl_mem_entry *entry = NULL, *result = NULL; - BUG_ON(private == NULL); + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) + return entry; - list_for_each_entry(entry, &private->mem_list, list) { - if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { - result = entry; - break; + if (gpuaddr < entry->memdesc.gpuaddr) + node = node->rb_left; + else if (gpuaddr >= + (entry->memdesc.gpuaddr + entry->memdesc.size)) + node = node->rb_right; + else { + return NULL; } } - return result; + return NULL; } EXPORT_SYMBOL(kgsl_sharedmem_find_region); +/*call with private->mem_lock locked */ +static inline struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +{ + return kgsl_sharedmem_find_region(private, gpuaddr, 1); +} + /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -789,6 +809,40 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } + case KGSL_PROP_GPU_RESET_STAT: + { + /* Return reset status of given context and clear it */ + uint32_t id; + struct kgsl_context *context; + + if (param->sizebytes != sizeof(unsigned int)) { + result = -EINVAL; + break; + } + /* We expect the value passed in to contain the context id */ + if (copy_from_user(&id, param->value, + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + context = kgsl_find_context(dev_priv, id); + if (!context) { + result = -EINVAL; + break; + } + /* + * Copy the reset status to value which also serves as + * the out parameter + */ + if (copy_to_user(param->value, &(context->reset_status), + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + /* Clear reset status once its been queried */ + context->reset_status = KGSL_CTX_STAT_NO_ERROR; + break; + } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -827,40 +881,6 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } -static bool check_ibdesc(struct kgsl_device_private *dev_priv, - struct kgsl_ibdesc *ibdesc, unsigned int numibs, - bool parse) -{ - bool result = true; - unsigned int i; - for (i = 0; i < numibs; i++) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d\n", ibdesc[i].gpuaddr, - ibdesc[i].sizedwords); - result = false; - break; - } - - if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d numibs %d/%d\n", - ibdesc[i].gpuaddr, - ibdesc[i].sizedwords, i+1, numibs); - result = false; - break; - } - } - return result; -} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -930,12 +950,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); - result = -EINVAL; - goto free_ibdesc; - } - result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -945,18 +959,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); - if (result != 0) - goto free_ibdesc; - - /* this is a check to try to detect if a command buffer was freed - * during issueibcmds(). - */ - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); - result = -EINVAL; - goto free_ibdesc; - } - free_ibdesc: kfree(ibdesc); done: @@ -988,7 +990,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - list_del(&entry->list); + rb_erase(&entry->node, &entry->priv->mem_rb); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1080,7 +1082,8 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - list_del(&entry->list); + rb_erase(&entry->node, &private->mem_rb); + spin_unlock(&private->mem_lock); if (entry) { @@ -1164,7 +1167,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, + result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1172,10 +1175,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); + result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); if (result) { - KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); - goto error_free_vmalloc; + KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); + goto error_free_alloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1190,7 +1193,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_vmalloc: +error_free_alloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1313,7 +1316,8 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); + memdesc->sg = kgsl_sg_alloc(sglen); + if (memdesc->sg == NULL) return -ENOMEM; @@ -1353,7 +1357,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, sglen); memdesc->sg = NULL; return -EINVAL; @@ -1488,11 +1492,8 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (kgsl_ion_client == NULL) { - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - if (kgsl_ion_client == NULL) - return -ENODEV; - } + if (IS_ERR_OR_NULL(kgsl_ion_client)) + return -ENODEV; handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1622,10 +1623,20 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; - error_put_file_ptr: - if (entry->priv_data) - 
fput(entry->priv_data); - +error_put_file_ptr: + switch (entry->memtype) { + case KGSL_MEM_ENTRY_PMEM: + case KGSL_MEM_ENTRY_ASHMEM: + if (entry->priv_data) + fput(entry->priv_data); + break; + case KGSL_MEM_ENTRY_ION: + ion_unmap_dma(kgsl_ion_client, entry->priv_data); + ion_free(kgsl_ion_client, entry->priv_data); + break; + default: + break; + } error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2029,7 +2040,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *tmp, *entry = NULL; + struct kgsl_mem_entry *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2040,13 +2051,11 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - list_for_each_entry(tmp, &private->mem_list, list) { - if (vma_offset == tmp->memdesc.gpuaddr) { - kgsl_mem_entry_get(tmp); - entry = tmp; - break; - } - } + entry = kgsl_sharedmem_find(private, vma_offset); + + if (entry) + kgsl_mem_entry_get(entry); + spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2102,8 +2111,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - if (cpu_is_msm8x60()) - wake_lock_destroy(&device->idle_wakelock); + wake_lock_destroy(&device->idle_wakelock); + pm_qos_remove_request(&device->pm_qos_req_dma); idr_destroy(&device->context_idr); @@ -2194,9 +2203,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - if (cpu_is_msm8x60()) - wake_lock_init(&device->idle_wakelock, - WAKE_LOCK_IDLE, device->name); + wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); + pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, + PM_QOS_DEFAULT_VALUE); idr_init(&device->context_idr); @@ -2242,6 +2251,8 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2339,22 +2350,30 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - - kgsl_mmu_ptpool_destroy(&kgsl_driver.ptpool); + kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); kgsl_driver.ptpool = NULL; - device_unregister(&kgsl_driver.virtdev); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + + /* + * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() + * only if kgsl_driver.virtdev has been populated. + * We check at least one member of kgsl_driver.virtdev to + * see if it is not NULL (and thus, has been populated). 
+ */ + if (kgsl_driver.virtdev.class) { + kgsl_sharedmem_uninit_sysfs(); + device_unregister(&kgsl_driver.virtdev); + } if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); - kgsl_sharedmem_uninit_sysfs(); + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index d3ae4b9bb..f027f95c4 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,13 +21,12 @@ #include #include #include +#include #define KGSL_NAME "kgsl" -/*< DTS2012042406822 hanfeng 20120428 begin*/ -/* Timestamp window used to detect rollovers */ +/* Timestamp window used to detect rollovers (half of integer range) */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 -/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -96,6 +95,8 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; + unsigned int page_alloc; + unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -107,7 +108,15 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc_ops; +struct kgsl_memdesc; + +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); + int (*map_kernel_mem)(struct kgsl_memdesc *); +}; /* shared memory allocation */ struct kgsl_memdesc { @@ -136,7 +145,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct list_head list; + struct rb_node node; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -186,27 +195,47 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } -static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, + +static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) +{ + if (memdesc->hostptr == NULL && memdesc->ops && + memdesc->ops->map_kernel_mem) + memdesc->ops->map_kernel_mem(memdesc); + + return memdesc->hostptr; +} + +static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || - (gpuaddr < memdesc->gpuaddr || - gpuaddr >= memdesc->gpuaddr + memdesc->size)) - return NULL; + if (memdesc->gpuaddr == 0 || + gpuaddr < memdesc->gpuaddr || + gpuaddr >= (memdesc->gpuaddr + memdesc->size) || + (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && + memdesc->ops->map_kernel_mem(memdesc))) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int new, unsigned int old) +static inline int timestamp_cmp(unsigned int a, unsigned int b) { - int ts_diff = new - old; - - if (ts_diff == 0) + /* check for equal */ + if (a == b) return 0; - /*< DTS2012042406822 hanfeng 20120428 begin*/ - return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 
1 : -1; - /* DTS2012042406822 hanfeng 20120428 end > */ + /* check for greater-than for non-rollover case */ + if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) + return 1; + + /* check for greater-than for rollover case + * note that <= is required to ensure that consistent + * results are returned for values whose difference is + * equal to the window size + */ + a += KGSL_TIMESTAMP_WINDOW; + b += KGSL_TIMESTAMP_WINDOW; + return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index e9455cb82..77aef1ff0 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,190 +497,6 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); -#define ADDRESS_STACK_SIZE 256 -#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) -static unsigned int kgsl_cffdump_addr_count; - -static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, - uint *hostaddr, bool check_only) -{ - static uint addr_stack[ADDRESS_STACK_SIZE]; - static uint size_stack[ADDRESS_STACK_SIZE]; - - switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER: - { - /* traverse indirect buffers */ - int i; - uint ibaddr = hostaddr[1]; - uint ibsize = hostaddr[2]; - - /* is this address already in encountered? */ - for (i = 0; - i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; - ++i) - ; - - if (kgsl_cffdump_addr_count == i) { - addr_stack[kgsl_cffdump_addr_count] = ibaddr; - size_stack[kgsl_cffdump_addr_count++] = ibsize; - - if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { - KGSL_CORE_ERR("stack overflow\n"); - return false; - } - - return kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibaddr, ibsize, check_only); - } else if (size_stack[i] != ibsize) { - KGSL_CORE_ERR("gpuaddr: 0x%08x, " - "wc: %u, with size wc: %u already on the " - "stack\n", ibaddr, ibsize, size_stack[i]); - return false; - } - } - break; - } - - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, - const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, - bool check_only) -{ - static uint level; /* recursion level */ - bool ret = true; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - - if (level == 0) - kgsl_cffdump_addr_count = 0; - - if (memdesc == NULL) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CORE_ERR("did not find mapping " - "for gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - memdesc = &entry->memdesc; - } - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CORE_ERR("no kernel mapping for " - "gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - - hoststart = hostaddr; - - level++; - - mb(); - kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, - KGSL_CACHE_OP_INV); -#ifdef DEBUG - pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); -#endif - - while (dwords_left > 0) { - int count = 0; /* dword count including packet header */ - bool cur_ret = true; - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = kgsl_cffdump_handle_type3(dev_priv, - hostaddr, check_only); - break; - default: - pr_warn("kgsl: cffdump: parse-ib: unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - ret = ret && cur_ret; - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - cur_ret = dwords_left >= 0; - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - - print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - - ret = ret && cur_ret; - } - - if (!ret) - pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - if (!check_only) { -#ifdef DEBUG - uint offset = gpuaddr - memdesc->gpuaddr; - pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " - "physaddr:%08x, offset:%d, size:%d", hoststart, - gpuaddr, memdesc->physaddr + offset, offset, - sizedwords*4); -#endif - kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, - false); - } - - level--; - - return ret; -} - static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index 2fb1e43f4..efec2af9e 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "kgsl.h" @@ -184,6 +185,7 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; + struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -197,13 +199,18 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; + /* + * Status indicating whether a gpu reset occurred and whether this + * context was responsible for causing it + */ + unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct list_head mem_list; + struct rb_root mem_rb; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index dba2dfcfb..ba48f9c75 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,8 +295,9 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - priv->memdesc.hostptr = - vmalloc_user(obj->size * priv->bufcount); + result = kgsl_sharedmem_page_alloc(&priv->memdesc, + priv->pagetable, + obj->size * priv->bufcount, 0); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1042,17 +1043,18 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) priv->memdesc.hostptr + offset; + i = offset >> PAGE_SHIFT; + page = sg_page(&(priv->memdesc.sg[i])); - page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100755 new mode 100644 index a16b95418..f038f0491 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt->priv; - return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; + return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,14 +398,14 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) - pt->priv; + struct kgsl_gpummu_pt *gpummu_pt; if (pt == NULL) return 0; + gpummu_pt = pt->priv; spin_lock(&pt->lock); - if (gpummu_pt->tlb_flags && (1<tlb_flags & (1<tlb_flags &= ~(1<sg, s, memdesc->sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index e4e561cef..5646d682a 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt->priv; - return pt && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt ? pt->priv : NULL; + return domain && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, 0); + memdesc->size, (IOMMU_READ | IOMMU_WRITE)); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100755 new mode 100644 index 003afb953..429e4946d --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -25,6 +24,7 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 +#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,10 +284,7 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); -/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -337,7 +334,8 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, + int requested_state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -349,7 +347,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) if 
(pwr->grp_clks[i]) clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (device->requested_state != KGSL_STATE_NAP)) + (requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -424,8 +422,12 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) - regulator_enable(pwr->gpu_reg); + if (pwr->gpu_reg) { + int status = regulator_enable(pwr->gpu_reg); + if (status) + KGSL_DRV_ERR(device, "regulator_enable " + "failed: %d\n", status); + } } } } @@ -512,10 +514,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -638,10 +637,8 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) - /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -708,10 +705,8 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - kgsl_pwrctrl_set_state(device, device->requested_state); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); + kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -753,10 +748,11 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -783,18 +779,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + device->pwrctrl.restore_slumber = true; device->ftbl->suspend_context(device); device->ftbl->stop(device); - device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -856,16 +852,17 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - - if (device->idle_wakelock.name) - wake_lock(&device->idle_wakelock); + wake_lock(&device->idle_wakelock); + if (device->pwrctrl.restore_slumber == false) + pm_qos_update_request(&device->pm_qos_req_dma, + GPU_SWFI_LATENCY); case KGSL_STATE_ACTIVE: break; default: @@ -881,7 +878,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -890,7 +887,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100755 new mode 100644 index 0c7ec6003..caaed92c8 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,9 +47,7 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100755 new mode 100644 index c2252edcf..d0b2a412c --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,10 +89,8 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ + PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); -/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100755 new mode 100644 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100755 new mode 100644 index 389ed6d4f..ae32e81ff --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -201,6 +203,10 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; + else if (!strncmp(attr->attr.name, "page_alloc", 10)) + val = kgsl_driver.stats.page_alloc; + else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) + val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -230,6 +236,8 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -239,6 +247,8 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, + &dev_attr_page_alloc, + &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -282,7 +292,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -293,17 +303,18 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) memdesc->hostptr + offset; - page = vmalloc_to_page((void *) pg); + i = offset >> PAGE_SHIFT; + page = sg_page(&memdesc->sg[i]); if (page == NULL) return VM_FAULT_SIGBUS; @@ -313,15 +324,23 @@ static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) { - kgsl_driver.stats.vmalloc -= memdesc->size; - vfree(memdesc->hostptr); + int i = 0; + struct scatterlist *sg; + kgsl_driver.stats.page_alloc -= memdesc->size; + if (memdesc->hostptr) { 
+ vunmap(memdesc->hostptr); + kgsl_driver.stats.vmalloc -= memdesc->size; + } + if (memdesc->sg) + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + __free_page(sg_page(sg)); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -329,6 +348,42 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } +/* + * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address + * space + * + * @memdesc - The memory descriptor which contains information about the memory + * + * Return: 0 on success else error code + */ +static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) +{ + if (!memdesc->hostptr) { + pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); + struct page **pages = NULL; + struct scatterlist *sg; + int i; + /* create a list of pages to call vmap */ + pages = vmalloc(memdesc->sglen * sizeof(struct page *)); + if (!pages) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + memdesc->sglen * sizeof(struct page *)); + return -ENOMEM; + } + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + pages[i] = sg_page(sg); + memdesc->hostptr = vmap(pages, memdesc->sglen, + VM_IOREMAP, page_prot); + KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); + vfree(pages); + } + if (!memdesc->hostptr) + return -ENOMEM; + + return 0; +} + static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -368,12 +423,13 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_vmalloc_ops = { - .free = kgsl_vmalloc_free, - .vmflags = kgsl_vmalloc_vmflags, - .vmfault = kgsl_vmalloc_vmfault, +struct kgsl_memdesc_ops kgsl_page_alloc_ops = { + .free = kgsl_page_alloc_free, + .vmflags = kgsl_page_alloc_vmflags, + .vmfault = kgsl_page_alloc_vmfault, + .map_kernel_mem = kgsl_page_alloc_map_kernel, }; -EXPORT_SYMBOL(kgsl_vmalloc_ops); +EXPORT_SYMBOL(kgsl_page_alloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -407,9 +463,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - void *ptr, size_t size, unsigned int protflags) + size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -418,36 +474,43 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_vmalloc_ops; - memdesc->hostptr = (void *) ptr; + memdesc->ops = &kgsl_page_alloc_ops; + + memdesc->sg = kgsl_sg_alloc(sglen); - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } + kmemleak_not_leak(memdesc->sg); + memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { - struct page *page = vmalloc_to_page(ptr); + for (i = 0; i < memdesc->sglen; i++) { + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | + __GFP_HIGHMEM); if (!page) { - ret = -EINVAL; + ret = -ENOMEM; + memdesc->sglen = i; goto done; } + flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0); } - - 
kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); + outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, + KGSL_CACHE_OP_FLUSH); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, + kgsl_driver.stats.page_alloc_max); order = get_order(size); @@ -462,51 +525,41 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - void *ptr; - + int ret = 0; BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); - ptr = vmalloc(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", size); - return -ENOMEM; - } - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); + if (!ret) + ret = kgsl_page_alloc_map_kernel(memdesc); + if (ret) + kgsl_sharedmem_free(memdesc); + return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); int -kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { - void *ptr; unsigned int protflags; BUG_ON(size == 0); - ptr = vmalloc_user(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", - size, kgsl_driver.stats.vmalloc); - return -ENOMEM; - } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -554,7 +607,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, memdesc->sglen); memset(memdesc, 0, sizeof(*memdesc)); } @@ -686,3 +739,33 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); + +/* + * kgsl_sharedmem_map_vma - Map a user vma to physical memory + * + * @vma - The user vma to map + * @memdesc - The memory descriptor which contains information about the + * physical memory + * + * Return: 0 on success else error code + */ +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc) +{ + unsigned long addr = vma->vm_start; + unsigned long size = vma->vm_end - vma->vm_start; + int ret, i = 0; + + if (!memdesc->sg || (size != memdesc->size) || + (memdesc->sglen != (size / PAGE_SIZE))) + return -EINVAL; + + for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { + ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100755 new mode 100644 index 67a1c2d7b..a67d9c657 --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,6 +17,8 @@ #include #include #include "kgsl_mmu.h" +#include +#include struct kgsl_device; struct kgsl_process_private; @@ -28,19 +30,12 @@ struct 
kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); -}; - -extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; +extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; -int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -76,19 +71,58 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); +static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) +{ + /* + * Try sg_dma_address first to support ion carveout + * regions which do not work with sg_phys(). + */ + unsigned int pa = sg_dma_address(sg); + if (pa == 0) + pa = sg_phys(sg); + return pa; +} + +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc); + +/* + * For relatively small sglists, it is preferable to use kzalloc + * rather than going down the vmalloc rat hole. If the size of + * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to + * vmalloc + */ + +static inline void *kgsl_sg_alloc(unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); + else + return vmalloc(sglen * sizeof(struct scatterlist)); +} + +static inline void kgsl_sg_free(void *ptr, unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); +} + static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - struct page *page = phys_to_page(physaddr); + memdesc->sg = kgsl_sg_alloc(1); - memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); - if (memdesc->sg == NULL) - return -ENOMEM; + kmemleak_not_leak(memdesc->sg); memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - sg_set_page(&memdesc->sg[0], page, size, 0); + memdesc->sg[0].length = size; + memdesc->sg[0].offset = 0; + memdesc->sg[0].dma_address = physaddr; return 0; } @@ -98,7 +132,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); + return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); } static inline int @@ -109,7 +143,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 72df148bc..394bc83bd 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,7 +10,6 @@ * GNU General Public License for more details. 
*/ -#include #include #include #include @@ -283,6 +282,12 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; + /* log buffer info to aid in ramdump recovery */ + KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", + device->snapshot, __pa(device->snapshot), + device->snapshot_size); + if (hang) + sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -432,7 +437,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); + device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); if (device->snapshot == NULL) return -ENOMEM; @@ -475,7 +480,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - vfree(device->snapshot); + kfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100755 new mode 100644 index cb3da9075..d721a577a --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,13 +157,6 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, -#ifdef CONFIG_HAS_EARLYSUSPEND - .display_off = { - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, - }, -#endif }, }; @@ -195,13 +188,6 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, - .display_off = { -#ifdef CONFIG_HAS_EARLYSUSPEND - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, -#endif - }, }, }; @@ -407,7 +393,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_memdesc tmp = {0}; + struct kgsl_mem_entry *entry = NULL; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -425,8 +411,30 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - - tmp.hostptr = (void *)*timestamp; + /* + * Get a kernel mapping to the IB for monkey patching. + * See the end of this function. + */ + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, + sizedwords); + if (entry == NULL) { + KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", + cmd, sizedwords); + result = -EINVAL; + goto error; + } + /* + * This will only map memory if it exists, otherwise it will reuse the + * mapping. And the 2d userspace reuses IBs so we likely won't create + * too many mappings. 
+ */ + if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { + KGSL_DRV_ERR(device, + "Cannot make kernel mapping for gpuaddr 0x%x\n", + cmd); + result = -EINVAL; + goto error; + } KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -468,12 +476,13 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - tmp.hostptr = (void *)(tmp.hostptr + - (sizedwords * sizeof(unsigned int))); - tmp.size = 12; - - kgsl_sharedmem_writel(&tmp, 4, nextaddr); - kgsl_sharedmem_writel(&tmp, 8, nextcnt); + /* monkey patch the IB so that it jumps back to the ringbuffer */ + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 1) * sizeof(unsigned int)), + nextaddr); + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 2) * sizeof(unsigned int)), + nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c old mode 100644 new mode 100755 diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100755 new mode 100644 index a1d267893..7837bad21 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,6 +34,16 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 +/* + * Reset status values for context + */ +enum kgsl_ctx_reset_stat { + KGSL_CTX_STAT_NO_ERROR = 0x00000000, + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, + KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 +}; + #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -110,6 +120,7 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, + KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -146,9 +157,7 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From 5b225c80411d97112391f4dc9908289f0f68d4a5 Mon Sep 17 00:00:00 2001 From: forumber Date: Tue, 5 Feb 2013 17:55:43 +0200 Subject: [PATCH 19/19] Revert "Add BFQv5" This reverts commit 89abf031a76bb8afe836b088a1ebe50fd846ec79. 
--- arch/arm/configs/u8800_defconfig | 6 +- block/Kconfig.iosched | 26 - block/Makefile | 1 - block/bfq-cgroup.c | 831 -------- block/bfq-ioc.c | 380 ---- block/bfq-iosched.c | 3047 ------------------------------ block/bfq-sched.c | 1066 ----------- block/bfq.h | 595 ------ block/blk-ioc.c | 29 +- block/cfq-iosched.c | 10 +- fs/ioprio.c | 7 +- include/linux/cgroup_subsys.h | 6 - include/linux/iocontext.h | 18 +- 13 files changed, 22 insertions(+), 6000 deletions(-) delete mode 100644 block/bfq-cgroup.c delete mode 100644 block/bfq-ioc.c delete mode 100644 block/bfq-iosched.c delete mode 100644 block/bfq-sched.c delete mode 100644 block/bfq.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index fbf77b32f..e252fda97 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -172,13 +172,11 @@ CONFIG_LBDAF=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y -# CONFIG_DEFAULT_NOOP is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_CFQ is not set -CONFIG_IOSCHED_BFQ=y -CONFIG_DEFAULT_BFQ=y -CONFIG_DEFAULT_IOSCHED="bfq" +CONFIG_DEFAULT_NOOP=y +CONFIG_DEFAULT_IOSCHED="noop" # CONFIG_INLINE_SPIN_TRYLOCK is not set # CONFIG_INLINE_SPIN_TRYLOCK_BH is not set # CONFIG_INLINE_SPIN_LOCK is not set diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index bceaaecef..3199b76f7 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,28 +43,6 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. -config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - depends on EXPERIMENTAL - default n - ---help--- - The BFQ I/O scheduler tries to distribute bandwidth among - all processes according to their weights. - It aims at distributing the bandwidth as desired, independently of - the disk parameters and with any workload. It also tries to - guarantee low latency to interactive and soft real-time - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. - -config CGROUP_BFQIO - bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y - default n - ---help--- - Enable hierarchical scheduling in BFQ, using the cgroups - filesystem interface. The name of the subsystem will be - bfqio. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -75,9 +53,6 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y - config DEFAULT_BFQ - bool "BFQ" if IOSCHED_BFQ=y - config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y @@ -90,7 +65,6 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ - default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff --git a/block/Makefile b/block/Makefile index a3cf79cc0..0fec4b3fa 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,7 +13,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c deleted file mode 100644 index 74ae73b91..000000000 --- a/block/bfq-cgroup.c +++ /dev/null @@ -1,831 +0,0 @@ -/* - * BFQ: CGROUPS support. 
- * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. - */ - -#ifdef CONFIG_CGROUP_BFQIO -static struct bfqio_cgroup bfqio_root_cgroup = { - .weight = BFQ_DEFAULT_GRP_WEIGHT, - .ioprio = BFQ_DEFAULT_GRP_IOPRIO, - .ioprio_class = BFQ_DEFAULT_GRP_CLASS, -}; - -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -} - -static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), - struct bfqio_cgroup, css); -} - -/* - * Search the bfq_group for bfqd into the hash table (by now only a list) - * of bgrp. Must be called under rcu_read_lock(). - */ -static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, - struct bfq_data *bfqd) -{ - struct bfq_group *bfqg; - struct hlist_node *n; - void *key; - - hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { - key = rcu_dereference(bfqg->bfqd); - if (key == bfqd) - return bfqg; - } - - return NULL; -} - -static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, - struct bfq_group *bfqg) -{ - struct bfq_entity *entity = &bfqg->entity; - - entity->weight = entity->new_weight = bgrp->weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio = bgrp->ioprio; - entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; - entity->ioprio_changed = 1; - entity->my_sched_data = &bfqg->sched_data; -} - -static inline void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; - - BUG_ON(parent == NULL); - BUG_ON(bfqg == NULL); - - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -/** - * bfq_group_chain_alloc - allocate a chain of groups. - * @bfqd: queue descriptor. - * @cgroup: the leaf cgroup this chain starts from. - * - * Allocate a chain of groups starting from the one belonging to - * @cgroup up to the root cgroup. Stop if a cgroup on the chain - * to the root has already an allocated group on @bfqd. - */ -static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; - - for (; cgroup != NULL; cgroup = cgroup->parent) { - bgrp = cgroup_to_bfqio(cgroup); - - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) { - /* - * All the cgroups in the path from there to the - * root must have a bfq_group for bfqd, so we don't - * need any more allocations. - */ - break; - } - - bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); - if (bfqg == NULL) - goto cleanup; - - bfq_group_init_entity(bgrp, bfqg); - bfqg->my_entity = &bfqg->entity; - - if (leaf == NULL) { - leaf = bfqg; - prev = leaf; - } else { - bfq_group_set_parent(prev, bfqg); - /* - * Build a list of allocated nodes using the bfqd - * filed, that is still unused and will be initialized - * only after the node will be connected. 
- */ - prev->bfqd = bfqg; - prev = bfqg; - } - } - - return leaf; - -cleanup: - while (leaf != NULL) { - prev = leaf; - leaf = leaf->bfqd; - kfree(prev); - } - - return NULL; -} - -/** - * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. - * @bfqd: the queue descriptor. - * @cgroup: the leaf cgroup to start from. - * @leaf: the leaf group (to be associated to @cgroup). - * - * Try to link a chain of groups to a cgroup hierarchy, connecting the - * nodes bottom-up, so we can be sure that when we find a cgroup in the - * hierarchy that already as a group associated to @bfqd all the nodes - * in the path to the root cgroup have one too. - * - * On locking: the queue lock protects the hierarchy (there is a hierarchy - * per device) while the bfqio_cgroup lock protects the list of groups - * belonging to the same cgroup. - */ -static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, - struct bfq_group *leaf) -{ - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *next, *prev = NULL; - unsigned long flags; - - assert_spin_locked(bfqd->queue->queue_lock); - - for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { - bgrp = cgroup_to_bfqio(cgroup); - next = leaf->bfqd; - - bfqg = bfqio_lookup_group(bgrp, bfqd); - BUG_ON(bfqg != NULL); - - spin_lock_irqsave(&bgrp->lock, flags); - - rcu_assign_pointer(leaf->bfqd, bfqd); - hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); - hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); - - spin_unlock_irqrestore(&bgrp->lock, flags); - - prev = leaf; - leaf = next; - } - - BUG_ON(cgroup == NULL && leaf != NULL); - if (cgroup != NULL && prev != NULL) { - bgrp = cgroup_to_bfqio(cgroup); - bfqg = bfqio_lookup_group(bgrp, bfqd); - bfq_group_set_parent(prev, bfqg); - } -} - -/** - * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. - * @bfqd: queue descriptor. - * @cgroup: cgroup being searched for. - * - * Return a group associated to @bfqd in @cgroup, allocating one if - * necessary. When a group is returned all the cgroups in the path - * to the root have a group associated to @bfqd. - * - * If the allocation fails, return the root group: this breaks guarantees - * but is a safe fallbak. If this loss becames a problem it can be - * mitigated using the equivalent weight (given by the product of the - * weights of the groups in the path from @group to the root) in the - * root scheduler. - * - * We allocate all the missing nodes in the path from the leaf cgroup - * to the root and we connect the nodes only after all the allocations - * have been successful. - */ -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); - struct bfq_group *bfqg; - - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) - return bfqg; - - bfqg = bfq_group_chain_alloc(bfqd, cgroup); - if (bfqg != NULL) - bfq_group_chain_link(bfqd, cgroup, bfqg); - else - bfqg = bfqd->root_group; - - return bfqg; -} - -/** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. - * @entity: @bfqq's entity. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). 
- */ -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) -{ - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); - - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); - - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); - - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. - */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - - if (busy && resume) - bfq_activate_bfqq(bfqd, bfqq); -} - -/** - * __bfq_cic_change_cgroup - move @cic to @cgroup. - * @bfqd: the queue descriptor. - * @cic: the cic to move. - * @cgroup: the cgroup to move to. - * - * Move cic to cgroup, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. - */ -static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd, - struct cfq_io_context *cic, - struct cgroup *cgroup) -{ - struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0); - struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1); - struct bfq_entity *entity; - struct bfq_group *bfqg; - - bfqg = bfq_find_alloc_group(bfqd, cgroup); - if (async_bfqq != NULL) { - entity = &async_bfqq->entity; - - if (entity->sched_data != &bfqg->sched_data) { - cic_set_bfqq(cic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "cic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); - bfq_put_queue(async_bfqq); - } - } - - if (sync_bfqq != NULL) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); - } - - return bfqg; -} - -/** - * bfq_cic_change_cgroup - move @cic to @cgroup. - * @cic: the cic being migrated. - * @cgroup: the destination cgroup. - * - * When the task owning @cic is moved to @cgroup, @cic is immediately - * moved into its new parent group. - */ -static void bfq_cic_change_cgroup(struct cfq_io_context *cic, - struct cgroup *cgroup) -{ - struct bfq_data *bfqd; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (bfqd != NULL && - !strncmp(bfqd->queue->elevator->elevator_type->elevator_name, - "bfq", ELV_NAME_MAX)) { - __bfq_cic_change_cgroup(bfqd, cic, cgroup); - bfq_put_bfqd_unlock(bfqd, &flags); - } -} - -/** - * bfq_cic_update_cgroup - update the cgroup of @cic. - * @cic: the @cic to update. - * - * Make sure that @cic is enqueued in the cgroup of the current task. - * We need this in addition to moving cics during the cgroup attach - * phase because the task owning @cic could be at its first disk - * access or we may end up in the root cgroup as the result of a - * memory allocation failure and here we try to move to the right - * group. - * - * Must be called under the queue lock. It is safe to use the returned - * value even after the rcu_read_unlock() as the migration/destruction - * paths act under the queue lock too. 
IOW it is impossible to race with - * group migration/destruction and end up with an invalid group as: - * a) here cgroup has not yet been destroyed, nor its destroy callback - * has started execution, as current holds a reference to it, - * b) if it is destroyed after rcu_read_unlock() [after current is - * migrated to a different cgroup] its attach() callback will have - * taken care of remove all the references to the old cgroup data. - */ -static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - struct bfq_group *bfqg; - struct cgroup *cgroup; - - BUG_ON(bfqd == NULL); - - rcu_read_lock(); - cgroup = task_cgroup(current, bfqio_subsys_id); - bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup); - rcu_read_unlock(); - - return bfqg; -} - -/** - * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. - * @st: the service tree being flushed. - */ -static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) -{ - struct bfq_entity *entity = st->first_idle; - - for (; entity != NULL; entity = st->first_idle) - __bfq_deactivate_entity(entity, 0); -} - -/** - * bfq_reparent_leaf_entity - move leaf entity to the root_group. - * @bfqd: the device data structure with the root group. - * @entity: the entity to move. - */ -static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(bfqq == NULL); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); - return; -} - -/** - * bfq_reparent_active_entities - move to the root group all active entities. - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. - */ -static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) -{ - struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); - - for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); - - if (bfqg->sched_data.active_entity != NULL) - bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); - - return; -} - -/** - * bfq_destroy_group - destroy @bfqg. - * @bgrp: the bfqio_cgroup containing @bfqg. - * @bfqg: the group being destroyed. - * - * Destroy @bfqg, making sure that it is not referenced from its parent. - */ -static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) -{ - struct bfq_data *bfqd; - struct bfq_service_tree *st; - struct bfq_entity *entity = bfqg->my_entity; - unsigned long uninitialized_var(flags); - int i; - - hlist_del(&bfqg->group_node); - - /* - * Empty all service_trees belonging to this group before deactivating - * the group itself. - */ - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - st = bfqg->sched_data.service_tree + i; - - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. Noone else - * can access them so it's safe to act without any lock. - */ - bfq_flush_idle_tree(st); - - /* - * It may happen that some queues are still active - * (busy) upon group destruction (if the corresponding - * processes have been forced to terminate). 
We move - * all the leaf entities corresponding to these queues - * to the root_group. - * Also, it may happen that the group has an entity - * under service, which is disconnected from the active - * tree: it must be moved, too. - * There is no need to put the sync queues, as the - * scheduler has taken no reference. - */ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - bfq_reparent_active_entities(bfqd, bfqg, st); - bfq_put_bfqd_unlock(bfqd, &flags); - } - BUG_ON(!RB_EMPTY_ROOT(&st->active)); - BUG_ON(!RB_EMPTY_ROOT(&st->idle)); - } - BUG_ON(bfqg->sched_data.next_active != NULL); - BUG_ON(bfqg->sched_data.active_entity != NULL); - - /* - * We may race with device destruction, take extra care when - * dereferencing bfqg->bfqd. - */ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - hlist_del(&bfqg->bfqd_node); - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - bfq_put_bfqd_unlock(bfqd, &flags); - } - BUG_ON(entity->tree != NULL); - - /* - * No need to defer the kfree() to the end of the RCU grace - * period: we are called from the destroy() callback of our - * cgroup, so we can be sure that noone is a) still using - * this cgroup or b) doing lookups in it. - */ - kfree(bfqg); -} - -/** - * bfq_disconnect_groups - diconnect @bfqd from all its groups. - * @bfqd: the device descriptor being exited. - * - * When the device exits we just make sure that no lookup can return - * the now unused group structures. They will be deallocated on cgroup - * destruction. - */ -static void bfq_disconnect_groups(struct bfq_data *bfqd) -{ - struct hlist_node *pos, *n; - struct bfq_group *bfqg; - - bfq_log(bfqd, "disconnect_groups beginning") ; - hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { - hlist_del(&bfqg->bfqd_node); - - __bfq_deactivate_entity(bfqg->my_entity, 0); - - /* - * Don't remove from the group hash, just set an - * invalid key. No lookups can race with the - * assignment as bfqd is being destroyed; this - * implies also that new elements cannot be added - * to the list. - */ - rcu_assign_pointer(bfqg->bfqd, NULL); - - bfq_log(bfqd, "disconnect_groups: put async for group %p", - bfqg) ; - bfq_put_async_queues(bfqd, bfqg); - } -} - -static inline void bfq_free_root_group(struct bfq_data *bfqd) -{ - struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; - struct bfq_group *bfqg = bfqd->root_group; - - bfq_put_async_queues(bfqd, bfqg); - - spin_lock_irq(&bgrp->lock); - hlist_del_rcu(&bfqg->group_node); - spin_unlock_irq(&bgrp->lock); - - /* - * No need to synchronize_rcu() here: since the device is gone - * there cannot be any read-side access to its root_group. 
- */ - kfree(bfqg); -} - -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - struct bfqio_cgroup *bgrp; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (bfqg == NULL) - return NULL; - - bfqg->entity.parent = NULL; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - bgrp = &bfqio_root_cgroup; - spin_lock_irq(&bgrp->lock); - rcu_assign_pointer(bfqg->bfqd, bfqd); - hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); - spin_unlock_irq(&bgrp->lock); - - return bfqg; -} - -#define SHOW_FUNCTION(__VAR) \ -static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct bfqio_cgroup *bgrp; \ - u64 ret; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - bgrp = cgroup_to_bfqio(cgroup); \ - spin_lock_irq(&bgrp->lock); \ - ret = bgrp->__VAR; \ - spin_unlock_irq(&bgrp->lock); \ - \ - cgroup_unlock(); \ - \ - return ret; \ -} - -SHOW_FUNCTION(weight); -SHOW_FUNCTION(ioprio); -SHOW_FUNCTION(ioprio_class); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ -static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ - struct cftype *cftype, \ - u64 val) \ -{ \ - struct bfqio_cgroup *bgrp; \ - struct bfq_group *bfqg; \ - struct hlist_node *n; \ - \ - if (val < (__MIN) || val > (__MAX)) \ - return -EINVAL; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - bgrp = cgroup_to_bfqio(cgroup); \ - \ - spin_lock_irq(&bgrp->lock); \ - bgrp->__VAR = (unsigned short)val; \ - hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ - bfqg->entity.new_##__VAR = (unsigned short)val; \ - smp_wmb(); \ - bfqg->entity.ioprio_changed = 1; \ - } \ - spin_unlock_irq(&bgrp->lock); \ - \ - cgroup_unlock(); \ - \ - return 0; \ -} - -STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); -STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); -STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); -#undef STORE_FUNCTION - -static struct cftype bfqio_files[] = { - { - .name = "weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - { - .name = "ioprio", - .read_u64 = bfqio_cgroup_ioprio_read, - .write_u64 = bfqio_cgroup_ioprio_write, - }, - { - .name = "ioprio_class", - .read_u64 = bfqio_cgroup_ioprio_class_read, - .write_u64 = bfqio_cgroup_ioprio_class_write, - }, -}; - -static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - return cgroup_add_files(cgroup, subsys, bfqio_files, - ARRAY_SIZE(bfqio_files)); -} - -static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp; - - if (cgroup->parent != NULL) { - bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); - if (bgrp == NULL) - return ERR_PTR(-ENOMEM); - } else - bgrp = &bfqio_root_cgroup; - - spin_lock_init(&bgrp->lock); - INIT_HLIST_HEAD(&bgrp->group_data); - bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; - bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; - - return &bgrp->css; -} - -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic/bfqq data structures. 
By now we allow a task to change - * its cgroup only if it's the only owner of its ioc; the drawback of this - * behavior is that a group containing a task that forked using CLONE_IO - * will not be destroyed until the tasks sharing the ioc die. - */ -static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct task_struct *tsk) -{ - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - task_lock(tsk); - ioc = tsk->io_context; - if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) - /* - * ioc == NULL means that the task is either too young or - * exiting: if it has still no ioc the ioc can't be shared, - * if the task is exiting the attach will fail anyway, no - * matter what we return here. - */ - ret = -EINVAL; - task_unlock(tsk); - - return ret; -} - -static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk) -{ - struct io_context *ioc; - struct cfq_io_context *cic; - struct hlist_node *n; - - task_lock(tsk); - ioc = tsk->io_context; - if (ioc != NULL) { - BUG_ON(atomic_long_read(&ioc->refcount) == 0); - atomic_long_inc(&ioc->refcount); - } - task_unlock(tsk); - - if (ioc == NULL) - return; - - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) - bfq_cic_change_cgroup(cic, cgroup); - rcu_read_unlock(); - - put_io_context(ioc); -} - -static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); - struct hlist_node *n, *tmp; - struct bfq_group *bfqg; - - /* - * Since we are destroying the cgroup, there are no more tasks - * referencing it, and all the RCU grace periods that may have - * referenced it are ended (as the destruction of the parent - * cgroup is RCU-safe); bgrp->group_data will not be accessed by - * anything else and we don't need any synchronization. 
- */ - hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) - bfq_destroy_group(bgrp, bfqg); - - BUG_ON(!hlist_empty(&bgrp->group_data)); - - kfree(bgrp); -} - -struct cgroup_subsys bfqio_subsys = { - .name = "bfqio", - .create = bfqio_create, - .can_attach = bfqio_can_attach, - .attach = bfqio_attach, - .destroy = bfqio_destroy, - .populate = bfqio_populate, - .subsys_id = bfqio_subsys_id, -}; -#else -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; - entity->sched_data = &bfqg->sched_data; -} - -static inline struct bfq_group * -bfq_cic_update_cgroup(struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - return bfqd->root_group; -} - -static inline void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} - -static inline void bfq_disconnect_groups(struct bfq_data *bfqd) -{ - bfq_put_async_queues(bfqd, bfqd->root_group); -} - -static inline void bfq_free_root_group(struct bfq_data *bfqd) -{ - kfree(bfqd->root_group); -} - -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (bfqg == NULL) - return NULL; - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - return bfqg; -} -#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c deleted file mode 100644 index 01f831332..000000000 --- a/block/bfq-ioc.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * BFQ: I/O context handling. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -/** - * bfq_cic_free_rcu - deferred cic freeing. - * @head: RCU head of the cic to free. - * - * Free the cic containing @head and, if it was the last one and - * the module is exiting wake up anyone waiting for its deallocation - * (see bfq_exit()). - */ -static void bfq_cic_free_rcu(struct rcu_head *head) -{ - struct cfq_io_context *cic; - - cic = container_of(head, struct cfq_io_context, rcu_head); - - kmem_cache_free(bfq_ioc_pool, cic); - elv_ioc_count_dec(bfq_ioc_count); - - if (bfq_ioc_gone != NULL) { - spin_lock(&bfq_ioc_gone_lock); - if (bfq_ioc_gone != NULL && - !elv_ioc_count_read(bfq_ioc_count)) { - complete(bfq_ioc_gone); - bfq_ioc_gone = NULL; - } - spin_unlock(&bfq_ioc_gone_lock); - } -} - -static void bfq_cic_free(struct cfq_io_context *cic) -{ - call_rcu(&cic->rcu_head, bfq_cic_free_rcu); -} - -/** - * cic_free_func - disconnect a cic ready to be freed. - * @ioc: the io_context @cic belongs to. - * @cic: the cic to be freed. - * - * Remove @cic from the @ioc radix tree hash and from its cic list, - * deferring the deallocation of @cic to the end of the current RCU - * grace period. This assumes that __bfq_exit_single_io_context() - * has already been called for @cic. 
- */ -static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) -{ - unsigned long flags; - unsigned long dead_key = (unsigned long) cic->key; - - BUG_ON(!(dead_key & CIC_DEAD_KEY)); - - spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->bfq_radix_root, - dead_key >> CIC_DEAD_INDEX_SHIFT); - hlist_del_init_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - bfq_cic_free(cic); -} - -static void bfq_free_io_context(struct io_context *ioc) -{ - /* - * ioc->refcount is zero here, or we are called from elv_unregister(), - * so no more cic's are allowed to be linked into this ioc. So it - * should be ok to iterate over the known list, we will see all cic's - * since no new ones are added. - */ - call_for_each_cic(ioc, cic_free_func); -} - -/** - * __bfq_exit_single_io_context - deassociate @cic from any running task. - * @bfqd: bfq_data on which @cic is valid. - * @cic: the cic being exited. - * - * Whenever no more tasks are using @cic or @bfqd is deallocated we - * need to invalidate its entry in the radix tree hash table and to - * release the queues it refers to. - * - * Called under the queue lock. - */ -static void __bfq_exit_single_io_context(struct bfq_data *bfqd, - struct cfq_io_context *cic) -{ - struct io_context *ioc = cic->ioc; - - list_del_init(&cic->queue_list); - - /* - * Make sure dead mark is seen for dead queues - */ - smp_wmb(); - rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd)); - - /* - * No write-side locking as no task is using @ioc (they're exited - * or bfqd is being deallocated. - */ - rcu_read_lock(); - if (rcu_dereference(ioc->ioc_data) == cic) { - rcu_read_unlock(); - spin_lock(&ioc->lock); - rcu_assign_pointer(ioc->ioc_data, NULL); - spin_unlock(&ioc->lock); - } else - rcu_read_unlock(); - - if (cic->cfqq[BLK_RW_ASYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; - } - - if (cic->cfqq[BLK_RW_SYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; - } -} - -/** - * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version). - * @ioc: the io_context @cic belongs to. - * @cic: the cic being exited. - * - * Take the queue lock and call __bfq_exit_single_io_context() to do the - * rest of the work. We take care of possible races with bfq_exit_queue() - * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism). - */ -static void bfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct bfq_data *bfqd; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (bfqd != NULL) { - __bfq_exit_single_io_context(bfqd, cic); - bfq_put_bfqd_unlock(bfqd, &flags); - } -} - -/** - * bfq_exit_io_context - deassociate @ioc from all cics it owns. - * @ioc: the @ioc being exited. - * - * No more processes are using @ioc we need to clean up and put the - * internal structures we have that belongs to that process. Loop - * through all its cics, locking their queues and exiting them. 
- */ -static void bfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, bfq_exit_single_io_context); -} - -static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, - gfp_t gfp_mask) -{ - struct cfq_io_context *cic; - - cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, - bfqd->queue->node); - if (cic != NULL) { - cic->last_end_request = jiffies; - INIT_LIST_HEAD(&cic->queue_list); - INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = bfq_free_io_context; - cic->exit = bfq_exit_io_context; - elv_ioc_count_inc(bfq_ioc_count); - } - - return cic; -} - -/** - * bfq_drop_dead_cic - free an exited cic. - * @bfqd: bfq data for the device in use. - * @ioc: io_context owning @cic. - * @cic: the @cic to free. - * - * We drop cfq io contexts lazily, so we may find a dead one. - */ -static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic) -{ - unsigned long flags; - - WARN_ON(!list_empty(&cic->queue_list)); - BUG_ON(cic->key != bfqd_dead_key(bfqd)); - - spin_lock_irqsave(&ioc->lock, flags); - - BUG_ON(ioc->ioc_data == cic); - - /* - * With shared I/O contexts two lookups may race and drop the - * same cic more than one time: RCU guarantees that the storage - * will not be freed too early, here we make sure that we do - * not try to remove the cic from the hashing structures multiple - * times. - */ - if (!hlist_unhashed(&cic->cic_list)) { - radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index); - hlist_del_init_rcu(&cic->cic_list); - bfq_cic_free(cic); - } - - spin_unlock_irqrestore(&ioc->lock, flags); -} - -/** - * bfq_cic_lookup - search into @ioc a cic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. - * - * If @ioc already has a cic associated to @bfqd return it, return %NULL - * otherwise. - */ -static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) -{ - struct cfq_io_context *cic; - unsigned long flags; - void *k; - - if (unlikely(ioc == NULL)) - return NULL; - - rcu_read_lock(); - - /* We maintain a last-hit cache, to avoid browsing over the tree. */ - cic = rcu_dereference(ioc->ioc_data); - if (cic != NULL) { - k = rcu_dereference(cic->key); - if (k == bfqd) - goto out; - } - - do { - cic = radix_tree_lookup(&ioc->bfq_radix_root, - bfqd->cic_index); - if (cic == NULL) - goto out; - - k = rcu_dereference(cic->key); - if (unlikely(k != bfqd)) { - rcu_read_unlock(); - bfq_drop_dead_cic(bfqd, ioc, cic); - rcu_read_lock(); - continue; - } - - spin_lock_irqsave(&ioc->lock, flags); - rcu_assign_pointer(ioc->ioc_data, cic); - spin_unlock_irqrestore(&ioc->lock, flags); - break; - } while (1); - -out: - rcu_read_unlock(); - - return cic; -} - -/** - * bfq_cic_link - add @cic to @ioc. - * @bfqd: bfq_data @cic refers to. - * @ioc: io_context @cic belongs to. - * @cic: the cic to link. - * @gfp_mask: the mask to use for radix tree preallocations. - * - * Add @cic to @ioc, using @bfqd as the search key. This enables us to - * lookup the process specific cfq io context when entered from the block - * layer. Also adds @cic to a per-bfqd list, used when this queue is - * removed. - */ -static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic, gfp_t gfp_mask) -{ - unsigned long flags; - int ret; - - ret = radix_tree_preload(gfp_mask); - if (ret == 0) { - cic->ioc = ioc; - - /* No write-side locking, cic is not published yet. 
*/ - rcu_assign_pointer(cic->key, bfqd); - - spin_lock_irqsave(&ioc->lock, flags); - ret = radix_tree_insert(&ioc->bfq_radix_root, - bfqd->cic_index, cic); - if (ret == 0) - hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - radix_tree_preload_end(); - - if (ret == 0) { - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - list_add(&cic->queue_list, &bfqd->cic_list); - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - } - } - - if (ret != 0) - printk(KERN_ERR "bfq: cic link failed!\n"); - - return ret; -} - -/** - * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc. - * @ioc: the io_context changing its priority. - */ -static inline void bfq_ioc_set_ioprio(struct io_context *ioc) -{ - call_for_each_cic(ioc, bfq_changed_ioprio); -} - -/** - * bfq_get_io_context - return the @cic associated to @bfqd in @ioc. - * @bfqd: the search key. - * @gfp_mask: the mask to use for cic allocation. - * - * Setup general io context and cfq io context. There can be several cfq - * io contexts per general io context, if this process is doing io to more - * than one device managed by cfq. - */ -static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd, - gfp_t gfp_mask) -{ - struct io_context *ioc = NULL; - struct cfq_io_context *cic; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - ioc = get_io_context(gfp_mask, bfqd->queue->node); - if (ioc == NULL) - return NULL; - - /* Lookup for an existing cic. */ - cic = bfq_cic_lookup(bfqd, ioc); - if (cic != NULL) - goto out; - - /* Alloc one if needed. */ - cic = bfq_alloc_io_context(bfqd, gfp_mask); - if (cic == NULL) - goto err; - - /* Link it into the ioc's radix tree and cic list. */ - if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) - goto err_free; - -out: - /* - * test_and_clear_bit() implies a memory barrier, paired with - * the wmb() in fs/ioprio.c, so the value seen for ioprio is the - * new one. - */ - if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED, - ioc->ioprio_changed))) - bfq_ioc_set_ioprio(ioc); - - return cic; -err_free: - bfq_cic_free(cic); -err: - put_io_context(ioc); - return NULL; -} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c deleted file mode 100644 index 9f261ee60..000000000 --- a/block/bfq-iosched.c +++ /dev/null @@ -1,3047 +0,0 @@ -/* - * BFQ, or Budget Fair Queueing, disk scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. - * - * BFQ is a proportional share disk scheduling algorithm based on the - * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, - * measured in number of sectors, to tasks instead of time slices. - * The disk is not granted to the active task for a given time slice, - * but until it has exahusted its assigned budget. This change from - * the time to the service domain allows BFQ to distribute the disk - * bandwidth among tasks as desired, without any distortion due to - * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc - * internal scheduler, called B-WF2Q+, to schedule tasks according to - * their budgets. Thanks to this accurate scheduler, BFQ can afford - * to assign high budgets to disk-bound non-seeky tasks (to boost the - * throughput), and yet guarantee low latencies to interactive and - * soft real-time applications. 
- * - * BFQ has been introduced in [1], where the interested reader can - * find an accurate description of the algorithm, the bandwidth - * distribution and latency guarantees it provides, plus formal proofs - * of all the properties. With respect to the algorithm presented in - * the paper, this implementation adds several little heuristics, and - * a hierarchical extension, based on H-WF2Q+. - * - * B-WF2Q+ is based on WF2Q+, that is described in [2], together with - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * - * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling - * with Deterministic Guarantees on Bandwidth Distribution,'', - * IEEE Transactions on Computer, May 2010. - * - * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf - * - * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing - * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, - * Oct 1997. - * - * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz - * - * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline - * First: A Flexible and Accurate Mechanism for Proportional Share - * Resource Allocation,'' technical report. - * - * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "bfq.h" - -/* Max number of dispatches in one round of service. */ -static const int bfq_quantum = 4; - -/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; - -/* Maximum backwards seek, in KiB. */ -static const int bfq_back_max = 16 * 1024; - -/* Penalty of a backwards seek, in number of sectors. */ -static const int bfq_back_penalty = 1; - -/* Idling period duration, in jiffies. */ -static int bfq_slice_idle = 0; - -/* Default maximum budget values, in sectors and number of requests. */ -static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; - -/* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multipled by the factor below - */ -static const int bfq_async_charge_factor = 10; - -/* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; - -struct kmem_cache *bfq_pool; -struct kmem_cache *bfq_ioc_pool; - -static DEFINE_PER_CPU(unsigned long, bfq_ioc_count); -static struct completion *bfq_ioc_gone; -static DEFINE_SPINLOCK(bfq_ioc_gone_lock); - -static DEFINE_SPINLOCK(cic_index_lock); -static DEFINE_IDA(cic_index_ida); - -/* Below this threshold (in ms), we consider thinktime immediate. */ -#define BFQ_MIN_TT 2 - -/* hw_tag detection: parallel requests threshold and min samples needed. */ -#define BFQ_HW_QUEUE_THRESHOLD 4 -#define BFQ_HW_QUEUE_SAMPLES 32 - -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) - -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 - -/* Shift used for peak rate fixed precision calculations. 
*/ -#define BFQ_RATE_SHIFT 16 - -/* - * The duration of the weight raising for interactive applications is - * computed automatically (as default behaviour), using the following - * formula: duration = (R / r) * T, where r is the peak rate of the - * disk, and R and T are two reference parameters. In particular, R is - * the peak rate of a reference disk, and T is about the maximum time - * for starting popular large applications on that disk, under BFQ and - * while reading two files in parallel. Finally, BFQ uses two - * different pairs (R, T) depending on whether the disk is rotational - * or non-rotational. - */ -#define T_rot (msecs_to_jiffies(5500)) -#define T_nonrot (msecs_to_jiffies(2000)) -/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ -#define R_rot 17415 -#define R_nonrot 34791 - -#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -#define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private[0]) -#define RQ_BFQQ(rq) ((rq)->elevator_private[1]) - -#include "bfq-ioc.c" -#include "bfq-sched.c" -#include "bfq-cgroup.c" - -#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_RT) - -#define bfq_sample_valid(samples) ((samples) > 80) - -/* - * We regard a request as SYNC, if either it's a read or has the SYNC bit - * set (in which case it could also be a direct WRITE). - */ -static inline int bfq_bio_sync(struct bio *bio) -{ - if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) - return 1; - - return 0; -} - -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. - */ -static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); - } -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request *bfq_choose_req(struct bfq_data *bfqd, - struct request *rq1, - struct request *rq2, - sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) - return rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * By definition, 1KiB is 2 sectors. - */ - back_max = bfqd->bfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. 
- */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } - - case BFQ_RQ2_WRAP: - return rq1; - case BFQ_RQ1_WRAP: - return rq2; - case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -static struct bfq_queue * -bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct bfq_queue *bfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - bfqq = rb_entry(parent, struct bfq_queue, pos_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. - */ - if (sector > blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - bfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - - bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", - (long long unsigned)sector, - bfqq != NULL ? 
bfqq->pid : 0); - - return bfqq; -} - -static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct rb_node **p, *parent; - struct bfq_queue *__bfqq; - - if (bfqq->pos_root != NULL) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) - return; - - bfqq->pos_root = &bfqd->rq_pos_tree; - __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, - blk_rq_pos(bfqq->next_rq), &parent, &p); - if (__bfqq == NULL) { - rb_link_node(&bfqq->pos_node, parent, p); - rb_insert_color(&bfqq->pos_node, bfqq->pos_root); - } else - bfqq->pos_root = NULL; -} - -static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev != NULL) - prev = rb_entry_rq(rbprev); - - if (rbnext != NULL) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&bfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -} - -static void bfq_del_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - BUG_ON(bfqq->queued[sync] == 0); - bfqq->queued[sync]--; - bfqd->queued--; - - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) - bfq_del_bfqq_busy(bfqd, bfqq, 1); - /* - * Remove queue from request-position tree as it is empty. - */ - if (bfqq->pos_root != NULL) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - } -} - -/* see the definition of bfq_async_charge_factor for details */ -static inline unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) -{ - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * - bfq_async_charge_factor)); -} - -/** - * bfq_updated_next_req - update the queue after a new next_rq selection. - * @bfqd: the device data the queue belongs to. - * @bfqq: the queue to update. - * - * If the first request of a queue changes we make sure that the queue - * has enough budget to serve at least its first request (if the - * request has grown). We do this because if the queue has not enough - * budget for its first request, it has to go through two dispatch - * rounds to actually get it dispatched. - */ -static void bfq_updated_next_req(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct request *next_rq = bfqq->next_rq; - unsigned long new_budget; - - if (next_rq == NULL) - return; - - if (bfqq == bfqd->active_queue) - /* - * In order not to break guarantees, budgets cannot be - * changed after an entity has been selected. 
- */ - return; - - BUG_ON(entity->tree != &st->active); - BUG_ON(entity == entity->sched_data->active_entity); - - new_budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_activate_bfqq(bfqd, bfqq); -} - -static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_raising_max_time > 0) - return bfqd->bfq_raising_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - return dur; -} - -static void bfq_add_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; - unsigned long old_raising_coeff = bfqq->raising_coeff; - int idle_for_long_time = bfqq->budget_timeout + - bfqd->bfq_raising_min_idle_time < jiffies; - - bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* - * Check if this request is a better next-serve candidate. - */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(next_rq == NULL); - bfqq->next_rq = next_rq; - - /* - * Adjust priority tree position, if next_rq changes. - */ - if (prev != bfqq->next_rq) - bfq_rq_pos_tree_add(bfqd, bfqq); - - if (!bfq_bfqq_busy(bfqq)) { - int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && - bfqq->soft_rt_next_start < jiffies; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (! bfqd->low_latency) - goto add_bfqq_busy; - - /* - * If the queue is not being boosted and has been idle - * for enough time, start a weight-raising period - */ - if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { - bfqq->raising_coeff = bfqd->bfq_raising_coeff; - if (idle_for_long_time) - bfqq->raising_cur_max_time = - bfq_wrais_duration(bfqd); - else - bfqq->raising_cur_max_time = - bfqd->bfq_raising_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } else if (old_raising_coeff > 1) { - if (idle_for_long_time) - bfqq->raising_cur_max_time = - bfq_wrais_duration(bfqd); - else if (bfqq->raising_cur_max_time == - bfqd->bfq_raising_rt_max_time && - !soft_rt) { - bfqq->raising_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } - } - if (old_raising_coeff != bfqq->raising_coeff) - entity->ioprio_changed = 1; -add_bfqq_busy: - bfq_add_bfqq_busy(bfqd, bfqq); - } else { - if(bfqd->low_latency && old_raising_coeff == 1 && - !rq_is_sync(rq) && - bfqq->last_rais_start_finish + - bfqd->bfq_raising_min_inter_arr_async < jiffies) { - bfqq->raising_coeff = bfqd->bfq_raising_coeff; - bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); - - entity->ioprio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } - bfq_updated_next_req(bfqd, bfqq); - } - - if(bfqd->low_latency && - (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || - idle_for_long_time)) - bfqq->last_rais_start_finish = jiffies; -} - -static void bfq_reposition_rq_rb(struct 
bfq_queue *bfqq, struct request *rq) -{ - elv_rb_del(&bfqq->sort_list, rq); - bfqq->queued[rq_is_sync(rq)]--; - bfqq->bfqd->queued--; - bfq_add_rq_rb(rq); -} - -static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) -{ - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return NULL; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - if (bfqq != NULL) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&bfqq->sort_list, sector); - } - - return NULL; -} - -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (long long unsigned)bfqd->last_position); -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - WARN_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; -} - -static void bfq_remove_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); - } - - list_del_init(&rq->queuelist); - bfq_del_rq_rb(rq); - - if (rq->cmd_flags & REQ_META) { - WARN_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -} - -static int bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void bfq_merged_request(struct request_queue *q, struct request *req, - int type) -{ - if (type == ELEVATOR_FRONT_MERGE) { - struct bfq_queue *bfqq = RQ_BFQQ(req); - - bfq_reposition_rq_rb(bfqq, req); - } -} - -static void bfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* - * Reposition in fifo if next is older than rq. - */ - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(rq_fifo_time(next), rq_fifo_time(rq))) { - list_move(&rq->queuelist, &next->queuelist); - rq_set_fifo_time(rq, rq_fifo_time(next)); - } - - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - - bfq_remove_request(next); -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* Disallow merge of a sync bio into an async request. */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; - - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. 
- */ - cic = bfq_cic_lookup(bfqd, current->io_context); - if (cic == NULL) - return 0; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - return bfqq == RQ_BFQQ(rq); -} - -static void __bfq_set_active_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (bfqq != NULL) { - bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - - bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", - bfqq->entity.budget); - } - - bfqd->active_queue = bfqq; -} - -/* - * Get and set a new active queue for service. - */ -static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (!bfqq) - bfqq = bfq_get_next_queue(bfqd); - else - bfq_get_next_queue_forced(bfqd, bfqq); - - __bfq_set_active_queue(bfqd, bfqq); - return bfqq; -} - -static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, - struct request *rq) -{ - if (blk_rq_pos(rq) >= bfqd->last_position) - return blk_rq_pos(rq) - bfqd->last_position; - else - return bfqd->last_position - blk_rq_pos(rq); -} - -/* - * Return true if bfqq has no request pending and rq is close enough to - * bfqd->last_position, or if rq is closer to bfqd->last_position than - * bfqq->next_rq - */ -static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) -{ - return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; -} - -static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) -{ - struct rb_root *root = &bfqd->rq_pos_tree; - struct rb_node *parent, *node; - struct bfq_queue *__bfqq; - sector_t sector = bfqd->last_position; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq != NULL) - return __bfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector (rq_pos_tree sorted by next_request - * position). - */ - __bfqq = rb_entry(parent, struct bfq_queue, pos_node); - if (bfq_rq_close(bfqd, __bfqq->next_rq)) - return __bfqq; - - if (blk_rq_pos(__bfqq->next_rq) < sector) - node = rb_next(&__bfqq->pos_node); - else - node = rb_prev(&__bfqq->pos_node); - if (node == NULL) - return NULL; - - __bfqq = rb_entry(node, struct bfq_queue, pos_node); - if (bfq_rq_close(bfqd, __bfqq->next_rq)) - return __bfqq; - - return NULL; -} - -/* - * bfqd - obvious - * cur_bfqq - passed in so that we don't decide that the current queue - * is closely cooperating with itself. - * - * We are assuming that cur_bfqq has dispatched at least one request, - * and that bfqd->last_position reflects a position on the disk associated - * with the I/O issued by cur_bfqq. - */ -static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq) -{ - struct bfq_queue *bfqq; - - if (bfq_class_idle(cur_bfqq)) - return NULL; - if (!bfq_bfqq_sync(cur_bfqq)) - return NULL; - if (BFQQ_SEEKY(cur_bfqq)) - return NULL; - - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - /* - * We should notice if some of the queues are cooperating, e.g. - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. - */ - bfqq = bfqq_close(bfqd); - if (bfqq == NULL || bfqq == cur_bfqq) - return NULL; - - /* - * Do not merge queues from different bfq_groups. 
- */ - if (bfqq->entity.parent != cur_bfqq->entity.parent) - return NULL; - - /* - * It only makes sense to merge sync queues. - */ - if (!bfq_bfqq_sync(bfqq)) - return NULL; - if (BFQQ_SEEKY(bfqq)) - return NULL; - - /* - * Do not merge queues of different priority classes. - */ - if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) - return NULL; - - return bfqq; -} - -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < 194) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < 194) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget / 32; -} - -/* - * Decides whether idling should be done for given device and - * given active queue. - */ -static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, - struct bfq_queue *active_bfqq) -{ - if (active_bfqq == NULL) - return false; - /* - * If device is SSD it has no seek penalty, disable idling; but - * do so only if: - * - device does not support queuing, otherwise we still have - * a problem with sync vs async workloads; - * - the queue is not weight-raised, to preserve guarantees. - */ - return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && - active_bfqq->raising_coeff == 1); -} - -static void bfq_arm_slice_timer(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->active_queue; - struct cfq_io_context *cic; - unsigned long sl; - - WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - if (bfq_queue_nonrot_noidle(bfqd, bfqq)) - return; - - /* Idling is disabled, either manually or by past process history. */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) - return; - - /* Tasks have exited, don't wait. */ - cic = bfqd->active_cic; - if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) - return; - - bfq_mark_bfqq_wait_request(bfqq); - - /* - * We don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. So allow a little bit of time for him to submit a new rq. - * - * To prevent processes with (partly) seeky workloads from - * being too ill-treated, grant them a small fraction of the - * assigned budget before reducing the waiting time to - * BFQ_MIN_TT. This happened to help reduce latency. - */ - sl = bfqd->bfq_slice_idle; - if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && - bfqq->entity.service > bfq_max_budget(bfqd) / 8 && - bfqq->raising_coeff == 1) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->raising_coeff > 1) - sl = sl * 3; - bfqd->last_idling_start = ktime_get(); - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); - bfq_log(bfqd, "arm idle: %u/%u ms", - jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -} - -/* - * Set the maximum time for the active queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
- */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->active_queue; - unsigned int timeout_coeff; - if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * - timeout_coeff)); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - bfq_remove_request(rq); - bfqq->dispatched++; - elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -} - -/* - * Return expired entry, or NULL to just start from scratch in rbtree. - */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -{ - struct request *rq = NULL; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - if (list_empty(&bfqq->fifo)) - return NULL; - - rq = rq_entry_fifo(bfqq->fifo.next); - - if (time_before(jiffies, rq_fifo_time(rq))) - return NULL; - - return rq; -} - -/* - * Must be called with the queue_lock held. - */ -static int bfqq_process_refs(struct bfq_queue *bfqq) -{ - int process_refs, io_refs; - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -} - -static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - int process_refs, new_process_refs; - struct bfq_queue *__bfqq; - - /* - * If there are no process references on the new_bfqq, then it is - * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain - * may have dropped their last reference (not just their last process - * reference). - */ - if (!bfqq_process_refs(new_bfqq)) - return; - - /* Avoid a circular list and skip interim queue merges. */ - while ((__bfqq = new_bfqq->new_bfqq)) { - if (__bfqq == bfqq) - return; - new_bfqq = __bfqq; - } - - process_refs = bfqq_process_refs(bfqq); - new_process_refs = bfqq_process_refs(new_bfqq); - /* - * If the process for the bfqq has gone away, there is no - * sense in merging the queues. - */ - if (process_refs == 0 || new_process_refs == 0) - return; - - /* - * Merge in the direction of the lesser amount of work. 
- */ - if (new_process_refs >= process_refs) { - bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); - } else { - new_bfqq->new_bfqq = bfqq; - atomic_add(new_process_refs, &bfqq->ref); - } - bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", - new_bfqq->pid); -} - -static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - return entity->budget - entity->service; -} - -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfqq != bfqd->active_queue); - - __bfq_bfqd_reset_active(bfqd); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfq_del_bfqq_busy(bfqd, bfqq, 1); - /* - * overloading budget_timeout field to store when - * the queue remains with no backlog, used by - * the weight-raising mechanism - */ - bfqq->budget_timeout = jiffies ; - } - else { - bfq_activate_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_rq_pos_tree_add(bfqd, bfqq); - } - - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os - * within the mean seek distance. If not, it may be time to - * break the queues apart again. - */ - if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) - bfq_mark_bfqq_split_coop(bfqq); -} - -/** - * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. - * @bfqd: device data. - * @bfqq: queue to update. - * @reason: reason for expiration. - * - * Handle the feedback on @bfqq budget. See the body for detailed - * comments. - */ -static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - enum bfqq_expiration reason) -{ - struct request *next_rq; - unsigned long budget, min_budget; - - budget = bfqq->max_budget; - min_budget = bfq_min_budget(bfqd); - - BUG_ON(bfqq != bfqd->active_queue); - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", - budget, bfq_min_budget(bfqd)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); - - if (bfq_bfqq_sync(bfqq)) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency - * for throughput. - */ - case BFQ_BFQQ_TOO_IDLE: - /* - * This is the only case where we may reduce - * the budget: if there is no requets of the - * process still waiting for completion, then - * we assume (tentatively) that the timer has - * expired because the batch of requests of - * the process could have been served with a - * smaller budget. Hence, betting that - * process will behave in the same way when it - * becomes backlogged again, we reduce its - * next budget. As long as we guess right, - * this budget cut reduces the latency - * experienced by the process. - * - * However, if there are still outstanding - * requests, then the process may have not yet - * issued its next request just because it is - * still waiting for the completion of some of - * the still oustanding ones. So in this - * subcase we do not reduce its budget, on the - * contrary we increase it to possibly boost - * the throughput, as discussed in the - * comments to the BUDGET_TIMEOUT case. 
- */ - if (bfqq->dispatched > 0) /* still oustanding reqs */ - budget = min(budget * 2, bfqd->bfq_max_budget); - else { - if (budget > 5 * min_budget) - budget -= 4 * min_budget; - else - budget = min_budget; - } - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. - */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_BUDGET_EXHAUSTED: - /* - * The process still has backlog, and did not - * let either the budget timeout or the disk - * idling timeout expire. Hence it is not - * seeky, has a short thinktime and may be - * happy with a higher budget too. So - * definitely increase the budget of this good - * candidate to boost the disk throughput. - */ - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. - */ - default: - return; - } - } else /* async queue */ - /* async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ - budget = bfqd->bfq_max_budget; - - bfqq->max_budget = budget; - - if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && - bfqq->max_budget > bfqd->bfq_max_budget) - bfqq->max_budget = bfqd->bfq_max_budget; - - /* - * Make sure that we have enough budget for the next request. - * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the - * update. - */ - next_rq = bfqq->next_rq; - if (next_rq != NULL) - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", - next_rq != NULL ? blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); -} - -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -{ - unsigned long max_budget; - - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the - * estimated peak rate. - */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; -} - -/* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. - */ -static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int compensate, enum bfqq_expiration reason) -{ - u64 bw, usecs, expected, timeout; - ktime_t delta; - int update = 0; - - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return 0; - - if (compensate) - delta = bfqd->last_idling_start; - else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); - - /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return 0; - - /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. 
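For reference, the feedback applied by __bfq_bfqq_recalc_budget() above boils down to a few arithmetic rules: shrink the budget only when the queue went idle with nothing in flight, double it on budget timeout, quadruple it on exhaustion, and never exceed the device-wide maximum. A standalone userspace sketch of those rules (enum, helper and demo values are illustrative, not the kernel's symbols):

#include <stdio.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/*
 * Mirror of the sync-queue feedback in __bfq_bfqq_recalc_budget():
 * shrink only on an idle timeout with no request in flight, grow on
 * budget timeout or exhaustion, always cap at max_budget.
 */
static unsigned long next_budget(unsigned long budget, unsigned long min_budget,
				 unsigned long max_budget, int dispatched,
				 enum expiration reason)
{
	switch (reason) {
	case TOO_IDLE:
		if (dispatched > 0)	/* still outstanding requests */
			return budget * 2 > max_budget ? max_budget : budget * 2;
		return budget > 5 * min_budget ? budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_budget ? max_budget : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_budget ? max_budget : budget * 4;
	default:			/* NO_MORE_REQUESTS: leave unchanged */
		return budget;
	}
}

int main(void)
{
	/* a queue that exhausted a 4096-sector budget grows toward the cap */
	printf("%lu\n", next_budget(4096, 512, 16384, 0, BUDGET_EXHAUSTED));
	return 0;
}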
We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); - - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. - */ - if (usecs > 20000) { - if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); - /* - * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw - */ - do_div(bw, 8); - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; - update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update && bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, timeout); - bfq_log(bfqd, "new max_budget=%lu", - bfqd->bfq_max_budget); - } - } - - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. - */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return 0; - - /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. - */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; - - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; -} - -/** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - * @bfqq: the queue to expire. - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * - * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). - * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. 
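The peak-rate estimator above works on fixed-point bandwidth samples (sectors per microsecond, scaled by BFQ_RATE_SHIFT, whose value is not shown here) and smooths them with a 7/8 low-pass filter. A minimal standalone sketch of that arithmetic, with an illustrative shift value:

#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16	/* illustrative stand-in for BFQ_RATE_SHIFT */

/*
 * Fixed-point bandwidth sample: sectors transferred in 'usecs'
 * microseconds, scaled by 2^RATE_SHIFT as in bfq_update_peak_rate().
 */
static uint64_t bw_sample(uint64_t sectors, uint64_t usecs)
{
	return (sectors << RATE_SHIFT) / usecs;
}

/* low-pass filter: new_rate = 7/8 * old_rate + 1/8 * sample */
static uint64_t smooth(uint64_t old_rate, uint64_t sample)
{
	return (old_rate * 7 + sample) / 8;
}

int main(void)
{
	uint64_t rate = 0;

	/* three identical 30 ms slices serving 2048 sectors each */
	for (int i = 0; i < 3; i++)
		rate = smooth(rate, bw_sample(2048, 30000));
	printf("estimated rate (fixed point): %llu\n",
	       (unsigned long long)rate);
	return 0;
}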
Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. - * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. - */ -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int compensate, - enum bfqq_expiration reason) -{ - int slow; - BUG_ON(bfqq != bfqd->active_queue); - - /* Update disk peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). - */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); - - /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. - * - * Processes doing IO in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. - */ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); - - if (bfqd->low_latency && bfqq->raising_coeff == 1) - bfqq->last_rais_start_finish = jiffies; - - if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { - if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) - bfqq->soft_rt_next_start = - jiffies + - HZ * bfqq->entity.service / - bfqd->bfq_raising_max_softrt_rate; - else - bfqq->soft_rt_next_start = -1; /* infinity */ - } - bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, - bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); - - /* Increase, decrease or leave budget unchanged according to reason */ - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - __bfq_bfqq_expire(bfqd, bfqq); -} - -/* - * Budget timeout is not implemented through a dedicated timer, but - * just checked on request arrivals and completions, as well as on - * idle timer expirations. - */ -static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_budget_new(bfqq)) - return 0; - - if (time_before(jiffies, bfqq->budget_timeout)) - return 0; - - return 1; -} - -/* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp backshifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ -static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, - "may_budget_timeout: wr %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); - - return (!bfq_bfqq_wait_request(bfqq) || - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) - && - bfq_bfqq_budget_timeout(bfqq); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. 
- */ -static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct request *next_rq; - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; - - bfqq = bfqd->active_queue; - if (bfqq == NULL) - goto new_queue; - - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. The expire code will check for close - * cooperators and put the close queue at the front of the - * service tree. If possible, merge the expiring queue with the - * new bfqq. - */ - new_bfqq = bfq_close_cooperator(bfqd, bfqq); - if (new_bfqq != NULL && bfqq->new_bfqq == NULL) - bfq_setup_merge(bfqq, new_bfqq); - - if (bfq_may_expire_for_budg_timeout(bfqq)) - goto expire; - - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. - */ - if (next_rq != NULL) { - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { - /* - * The idle timer may be pending because we may not - * disable disk idling even when a new request arrives - */ - if (timer_pending(&bfqd->idle_slice_timer)) { - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the - * timer because the request was too small, - * 2) then the block layer has unplugged the - * device, causing the dispatch to be invoked. - * - * Since the device is unplugged, now the - * requests are probably large enough to - * provide a reasonable throughput. - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - } - if (new_bfqq == NULL) - goto keep_queue; - else - goto expire; - } - } - - /* - * No requests pending. If there is no cooperator, and the active - * queue still has requests in flight or is idling for a new request, - * then keep it. - */ - if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && - !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { - bfqq = NULL; - goto keep_queue; - } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { - /* - * Expiring the queue because there is a close cooperator, - * cancel timer. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - } - - reason = BFQ_BFQQ_NO_MORE_REQUESTS; -expire: - bfq_bfqq_expire(bfqd, bfqq, 0, reason); -new_queue: - bfqq = bfq_set_active_queue(bfqd, new_bfqq); - bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq != NULL ? 
bfqq->pid : 0); -keep_queue: - return bfqq; -} - -static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq->raising_coeff > 1) { /* queue is being boosted */ - struct bfq_entity *entity = &bfqq->entity; - - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, " - "old raising coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time), - bfqq->raising_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - - BUG_ON(bfqq != bfqd->active_queue && entity->weight != - entity->orig_weight * bfqq->raising_coeff); - if(entity->ioprio_changed) - bfq_log_bfqq(bfqd, bfqq, - "WARN: pending prio change"); - /* - * If too much time has elapsed from the beginning - * of this weight-raising period and process is not soft - * real-time, stop it - */ - if (jiffies - bfqq->last_rais_start_finish > - bfqq->raising_cur_max_time) { - int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && - bfqq->soft_rt_next_start < jiffies; - - bfqq->last_rais_start_finish = jiffies; - if (soft_rt) - bfqq->raising_cur_max_time = - bfqd->bfq_raising_rt_max_time; - else { - bfqq->raising_coeff = 1; - entity->ioprio_changed = 1; - __bfq_entity_update_weight_prio( - bfq_entity_service_tree(entity), - entity); - } - } - } -} - - -/* - * Dispatch one request from bfqq, moving it to the request queue - * dispatch list. - */ -static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - int dispatched = 0; - struct request *rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Follow expired path, else get first next available. */ - rq = bfq_check_fifo(bfqq); - if (rq == NULL) - rq = bfqq->next_rq; - service_to_charge = bfq_serv_to_charge(rq, bfqq); - - if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { - /* - * This may happen if the next rq is chosen - * in fifo order instead of sector order. - * The budget is properly dimensioned - * to be always sufficient to serve the next request - * only if it is chosen in sector order. The reason is - * that it would be quite inefficient and little useful - * to always make sure that the budget is large enough - * to serve even the possible next rq in fifo order. - * In fact, requests are seldom served in fifo order. - * - * Expire the queue for budget exhaustion, and - * make sure that the next act_budget is enough - * to serve the next request, even if it comes - * from the fifo expired path. - */ - bfqq->next_rq = rq; - /* - * Since this dispatch is failed, make sure that - * a new one will be performed - */ - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - goto expire; - } - - /* Finally, insert request into driver dispatch list. 
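update_raising_data() above ends a queue's weight-raising once the current raising period has elapsed, unless the queue still qualifies as soft real-time, in which case the period is renewed at the soft-rt duration. A compact standalone restatement of that decision (field names shortened and demo values invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct boosted_queue {
	unsigned int raising_coeff;	/* > 1 while the queue is boosted     */
	unsigned long period_start;	/* time the boost (re)started         */
	unsigned long period_max;	/* current maximum boost duration     */
	unsigned long soft_rt_next;	/* earliest next soft-rt activation   */
};

/* returns true if the boost was kept (soft real-time), false once it ends */
static bool update_boost(struct boosted_queue *q, unsigned long now,
			 unsigned long rt_period, bool softrt_enabled)
{
	if (q->raising_coeff <= 1 || now - q->period_start <= q->period_max)
		return q->raising_coeff > 1;	/* nothing to expire yet */

	q->period_start = now;
	if (softrt_enabled && q->soft_rt_next < now) {
		q->period_max = rt_period;	/* keep boosting as soft rt */
		return true;
	}
	q->raising_coeff = 1;			/* boost over, back to normal */
	return false;
}

int main(void)
{
	struct boosted_queue q = { 20, 0, 300, 100 };

	printf("kept boost: %d\n", update_boost(&q, 1000, 300, true));
	return 0;
}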
*/ - bfq_bfqq_served(bfqq, service_to_charge); - bfq_dispatch_insert(bfqd->queue, rq); - - update_raising_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " - "budg left %lu", - blk_rq_sectors(rq), - (long long unsigned)blk_rq_pos(rq), - bfq_bfqq_budget_left(bfqq)); - - dispatched++; - - if (bfqd->active_cic == NULL) { - atomic_long_inc(&RQ_CIC(rq)->ioc->refcount); - bfqd->active_cic = RQ_CIC(rq); - } - - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) - goto expire; - - return dispatched; - -expire: - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq != NULL) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; -} - -/* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. - */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *n; - struct bfq_service_tree *st; - int dispatched = 0; - - bfqq = bfqd->active_queue; - if (bfqq != NULL) - __bfq_bfqq_expire(bfqd, bfqq); - - /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. - */ - list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); - - bfq_forget_idle(st); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; -} - -static int bfq_dispatch_requests(struct request_queue *q, int force) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - int max_dispatch; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - - if((bfqq = bfq_select_queue(bfqd)) == NULL) - return 0; - - max_dispatch = bfqd->bfq_quantum; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; - - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - if (! bfq_dispatch_request(bfqd, bfqq)) - return 0; - - bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" - "(max_disp %d)", bfqq->pid, max_dispatch); - - return 1; -} - -/* - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Queue lock must be held here. 
- */ -static void bfq_put_queue(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - - BUG_ON(atomic_read(&bfqq->ref) <= 0); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) - return; - - BUG_ON(rb_first(&bfqq->sort_list) != NULL); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree != NULL); - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->active_queue == bfqq); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); -} - -static void bfq_put_cooperator(struct bfq_queue *bfqq) -{ - struct bfq_queue *__bfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. - */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { - if (__bfqq == bfqq) { - WARN(1, "bfqq->new_bfqq loop detected.\n"); - break; - } - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; - } -} - -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq == bfqd->active_queue) { - __bfq_bfqq_expire(bfqd, bfqq); - bfq_schedule_dispatch(bfqd); - } - - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); -} - -/* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. - */ -static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) -{ - struct task_struct *tsk = current; - int ioprio_class; - - if (!bfq_bfqq_prio_changed(bfqq)) - return; - - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); - switch (ioprio_class) { - default: - printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* - * No prio set, inherit CPU scheduling settings. - */ - bfqq->entity.new_ioprio = task_nice_ioprio(tsk); - bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - bfqq->entity.new_ioprio = task_ioprio(ioc); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - bfqq->entity.new_ioprio = task_ioprio(ioc); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->entity.new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); - break; - } - - bfqq->entity.ioprio_changed = 1; - - /* - * Keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue. 
- */ - bfqq->org_ioprio = bfqq->entity.new_ioprio; - bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class; - bfq_clear_bfqq_prio_changed(bfqq); -} - -static void bfq_changed_ioprio(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; - struct bfq_group *bfqg; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (unlikely(bfqd == NULL)) - return; - - bfqq = cic->cfqq[BLK_RW_ASYNC]; - if (bfqq != NULL) { - bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, - sched_data); - new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc, - GFP_ATOMIC); - if (new_bfqq != NULL) { - cic->cfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "changed_ioprio: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } - } - - bfqq = cic->cfqq[BLK_RW_SYNC]; - if (bfqq != NULL) - bfq_mark_bfqq_prio_changed(bfqq); - - bfq_put_bfqd_unlock(bfqd, &flags); -} - -static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - pid_t pid, int is_sync) -{ - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - - atomic_set(&bfqq->ref, 0); - bfqq->bfqd = bfqd; - - bfq_mark_bfqq_prio_changed(bfqq); - - if (is_sync) { - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); - } - - /* Tentative initial value to trade off between thr and lat */ - bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; - bfqq->pid = pid; - - bfqq->raising_coeff = 1; - bfqq->last_rais_start_finish = 0; - bfqq->soft_rt_next_start = -1; -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int is_sync, - struct io_context *ioc, - gfp_t gfp_mask) -{ - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct cfq_io_context *cic; - -retry: - cic = bfq_cic_lookup(bfqd, ioc); - /* cic always exists here */ - bfqq = cic_to_bfqq(cic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
- */ - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq != NULL) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq != NULL) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq != NULL) { - bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - - bfq_init_prio_data(bfqq, ioc); - bfq_init_entity(&bfqq->entity, bfqg); - } - - if (new_bfqq != NULL) - kmem_cache_free(bfq_pool, new_bfqq); - - return bfqq; -} - -static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; - case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; - default: - BUG(); - } -} - -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, - struct io_context *ioc, gfp_t gfp_mask) -{ - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); - struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; - - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; - } - - if (bfqq == NULL) - bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); - - /* - * Pin the queue now that it's allocated, scheduler exit will prune it. - */ - if (!is_sync && *async_bfqq == NULL) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); - *async_bfqq = bfqq; - } - - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - return bfqq; -} - -static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct cfq_io_context *cic) -{ - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); - - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; -} - -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) -{ - sector_t sdist; - u64 total; - - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. 
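bfq_update_io_thinktime() above keeps an exponentially decaying average (decay 7/8) of the gap between a request completion and the next submission; the 256 and 128 constants scale the samples and round the final division. A standalone sketch of the same arithmetic with illustrative inputs:

#include <stdio.h>

struct thinktime {
	unsigned long samples;	/* decayed sample weight, saturates near 256 */
	unsigned long total;	/* decayed, scaled sum of think times        */
	unsigned long mean;	/* rounded average think time                */
};

/* same decaying-average scheme as bfq_update_io_thinktime() */
static void ttime_update(struct thinktime *tt, unsigned long elapsed,
			 unsigned long slice_idle)
{
	unsigned long ttime = elapsed < 2 * slice_idle ? elapsed : 2 * slice_idle;

	tt->samples = (7 * tt->samples + 256) / 8;
	tt->total   = (7 * tt->total + 256 * ttime) / 8;
	tt->mean    = (tt->total + 128) / tt->samples;
}

int main(void)
{
	struct thinktime tt = { 0, 0, 0 };
	unsigned long gaps[] = { 2, 3, 40, 2 };	/* think times in jiffies */

	for (unsigned int i = 0; i < sizeof(gaps) / sizeof(gaps[0]); i++)
		ttime_update(&tt, gaps[i], 8);
	printf("mean think time: %lu\n", tt.mean);
	return 0;
}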
- */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - if (bfq_bfqq_coop(bfqq)) { - /* - * If the mean seektime increases for a (non-seeky) shared - * queue, some cooperator is likely to be idling too much. - * On the contrary, if it decreases, some cooperator has - * probably waked up. - * - */ - if ((sector_t)total < bfqq->seek_mean) - bfq_mark_bfqq_some_coop_idle(bfqq) ; - else if ((sector_t)total > bfqq->seek_mean) - bfq_clear_bfqq_some_coop_idle(bfqq) ; - } - bfqq->seek_mean = (sector_t)total; - - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. - */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct cfq_io_context *cic) -{ - int enable_idle; - - /* Don't idle for async or idle io prio class. */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - - enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&cic->ioc->nr_tasks) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->raising_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > bfqd->bfq_slice_idle && - bfqq->raising_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); - - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -} - -/* - * Called when a new fs request (rq) is added to bfqq. Check if there's - * something we should do about it. - */ -static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - struct cfq_io_context *cic = RQ_CIC(rq); - - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, cic); - bfq_update_io_seektime(bfqd, bfqq, rq); - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, cic); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (long long unsigned)bfqq->seek_mean); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (bfqq == bfqd->active_queue) { - /* - * If there is just this request queued and the request - * is small, just exit. - * In this way, if the disk is being idled to wait for a new - * request from the active queue, we avoid unplugging the - * device now. - * - * By doing so, we spare the disk to be committed - * to serve just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this - * one quickly, then the device will be unplugged - * and larger requests will be dispatched. 
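bfq_update_idle_window() above is essentially a small decision function: idling is dropped when the owning task has exited, when idling is globally disabled, or for seeky queues on a queueing-capable drive, unless the queue is weight-raised; otherwise the measured think time decides. A standalone sketch of that decision for a sync queue, with boolean inputs standing in for the bfqq/cic state:

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether a sync queue keeps its idle window, following the
 * structure of bfq_update_idle_window(); all inputs are illustrative.
 */
static bool keep_idle_window(bool task_alive, unsigned long slice_idle,
			     bool hw_tag, bool seeky, bool weight_raised,
			     bool ttime_valid, unsigned long ttime_mean,
			     bool currently_enabled)
{
	if (!task_alive || slice_idle == 0 ||
	    (hw_tag && seeky && !weight_raised))
		return false;
	if (ttime_valid)
		return !(ttime_mean > slice_idle && !weight_raised);
	return currently_enabled;	/* not enough samples: leave as is */
}

int main(void)
{
	/* a seeky queue on an NCQ drive loses idling unless weight-raised */
	printf("%d\n", keep_idle_window(true, 8, true, true, false,
					true, 4, true));
	return 0;
}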
- */ - if (bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32) { - return; - } - if (bfq_bfqq_wait_request(bfqq)) { - /* - * If we are waiting for a request for this queue, let - * it rip immediately and flag that we must not expire - * this queue just now. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - /* - * Here we can safely expire the queue, in - * case of budget timeout, without wasting - * guarantees - */ - if (bfq_bfqq_budget_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, 0, - BFQ_BFQQ_BUDGET_TIMEOUT); - __blk_run_queue(bfqd->queue); - } - } -} - -static void bfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - assert_spin_locked(bfqd->queue->queue_lock); - bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); - - bfq_add_rq_rb(rq); - - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -} - -static void bfq_update_hw_tag(struct bfq_data *bfqd) -{ - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; - - /* - * This sample is valid if the number of outstanding requests - * is large enough to allow a queueing behavior. Note that the - * sum is not exact, as it's not taking into account deactivated - * requests. - */ - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) - return; - - bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; - bfqd->max_rq_in_driver = 0; - bfqd->hw_tag_samples = 0; -} - -static void bfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", - blk_rq_sectors(rq), sync); - - bfq_update_hw_tag(bfqd); - - WARN_ON(!bfqd->rq_in_driver); - WARN_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight--; - - if (sync) - RQ_CIC(rq)->last_end_request = jiffies; - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->active_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - /* Idling is disabled also for cooperation issues: - * 1) there is a close cooperator for the queue, or - * 2) the queue is shared and some cooperator is likely - * to be idle (in this case, by not arming the idle timer, - * we try to slow down the queue, to prevent the zones - * of the disk accessed by the active cooperators to become - * too distant from the zone that will be accessed by the - * currently idle cooperators) - */ - if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); - else if (sync && - (bfqd->rq_in_driver == 0 || - bfqq->raising_coeff > 1) - && RB_EMPTY_ROOT(&bfqq->sort_list) - && !bfq_close_cooperator(bfqd, bfqq) - && (!bfq_bfqq_coop(bfqq) || - !bfq_bfqq_some_coop_idle(bfqq))) - bfq_arm_slice_timer(bfqd); - } - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -} - -/* - * We temporarily boost lower priority queues if they are holding fs exclusive - * resources. They are boosted to normal prio (CLASS_BE/4). 
- */ -static void bfq_prio_boost(struct bfq_queue *bfqq) -{ - if (has_fs_excl()) { - /* - * Boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (bfq_class_idle(bfqq)) - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; - if (bfqq->entity.new_ioprio > IOPRIO_NORM) - bfqq->entity.new_ioprio = IOPRIO_NORM; - } else { - /* - * Unboost the queue (if needed) - */ - bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class; - bfqq->entity.new_ioprio = bfqq->org_ioprio; - } -} - -static inline int __bfq_may_queue(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int bfq_may_queue(struct request_queue *q, int rw) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* - * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * So just lookup a possibly existing queue, or return 'may queue' - * if that fails. - */ - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return ELV_MQUEUE_MAY; - - bfqq = cic_to_bfqq(cic, rw_is_sync(rw)); - if (bfqq != NULL) { - bfq_init_prio_data(bfqq, cic->ioc); - bfq_prio_boost(bfqq); - - return __bfq_may_queue(bfqq); - } - - return ELV_MQUEUE_MAY; -} - -/* - * Queue lock held here. - */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - if (bfqq != NULL) { - const int rw = rq_data_dir(rq); - - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; - - put_io_context(RQ_CIC(rq)->ioc); - - rq->elevator_private[0] = NULL; - rq->elevator_private[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -static struct bfq_queue * -bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic, - struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (long unsigned)bfqq->new_bfqq->pid); - cic_set_bfqq(cic, bfqq->new_bfqq, 1); - bfq_mark_bfqq_coop(bfqq->new_bfqq); - bfq_put_queue(bfqq); - return cic_to_bfqq(cic, 1); -} - -/* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. - */ -static struct bfq_queue * -bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_some_coop_idle(bfqq); - bfq_clear_bfqq_coop(bfqq); - bfq_clear_bfqq_split_coop(bfqq); - return bfqq; - } - - cic_set_bfqq(cic, NULL, 1); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); - return NULL; -} - -/* - * Allocate bfq data structures associated with this request. 
- */ -static int bfq_set_request(struct request_queue *q, struct request *rq, - gfp_t gfp_mask) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - struct bfq_group *bfqg; - unsigned long flags; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - cic = bfq_get_io_context(bfqd, gfp_mask); - - spin_lock_irqsave(q->queue_lock, flags); - - if (cic == NULL) - goto queue_fail; - - bfqg = bfq_cic_update_cgroup(cic); - -new_queue: - bfqq = cic_to_bfqq(cic, is_sync); - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask); - cic_set_bfqq(cic, bfqq, is_sync); - } else { - /* - * If the queue was seeky for too long, break it apart. - */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - bfqq = bfq_split_bfqq(cic, bfqq); - if (!bfqq) - goto new_queue; - } - - /* - * Check to see if this queue is scheduled to merge with - * another closely cooperating queue. The merging of queues - * happens here as it must be done in process context. - * The reference on new_bfqq was taken in merge_bfqqs. - */ - if (bfqq->new_bfqq != NULL) - bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq); - } - - bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - spin_unlock_irqrestore(q->queue_lock, flags); - - rq->elevator_private[0] = cic; - rq->elevator_private[1] = bfqq; - - return 0; - -queue_fail: - if (cic != NULL) - put_io_context(cic->ioc); - - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 1; -} - -static void bfq_kick_queue(struct work_struct *work) -{ - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/* - * Handler of the expiration of the timer running if the active_queue - * is idling inside its time slice. - */ -static void bfq_idle_slice_timer(unsigned long data) -{ - struct bfq_data *bfqd = (struct bfq_data *)data; - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = bfqd->active_queue; - /* - * Theoretical race here: active_queue can be NULL or different - * from the queue that was idling if the timer handler spins on - * the queue_lock and a new request arrives for the current - * queue and there is a full dispatch cycle that changes the - * active_queue. This can hardly happen, but in the worst case - * we just expire a queue too early. 
- */ - if (bfqq != NULL) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired - * for budget timeout without wasting - * guarantees - */ - reason = BFQ_BFQQ_BUDGET_TIMEOUT; - else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) - /* - * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the first - * request of the active queue arrives during - * disk idling - */ - reason = BFQ_BFQQ_TOO_IDLE; - else - goto schedule_dispatch; - - bfq_bfqq_expire(bfqd, bfqq, 1, reason); - } - -schedule_dispatch: - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -} - -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - del_timer_sync(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); -} - -static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) -{ - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq != NULL) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -} - -/* - * Release all the bfqg references to its async queues. If we are - * deallocating the group these queues may still contain requests, so - * we reparent them to the root cgroup (i.e., the only one that will - * exist for sure untill all the requests on a device are gone). - */ -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -} - -static void bfq_exit_queue(struct elevator_queue *e) -{ - struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - struct cfq_io_context *cic; - - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); - - while (!list_empty(&bfqd->cic_list)) { - cic = list_entry(bfqd->cic_list.next, struct cfq_io_context, - queue_list); - __bfq_exit_single_io_context(bfqd, cic); - } - - BUG_ON(bfqd->active_queue != NULL); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, 0); - - bfq_disconnect_groups(bfqd); - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - - spin_lock(&cic_index_lock); - ida_remove(&cic_index_ida, bfqd->cic_index); - spin_unlock(&cic_index_lock); - - /* Wait for cic->key accessors to exit their grace periods. 
*/ - synchronize_rcu(); - - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - bfq_free_root_group(bfqd); - kfree(bfqd); -} - -static int bfq_alloc_cic_index(void) -{ - int index, error; - - do { - if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&cic_index_lock); - error = ida_get_new(&cic_index_ida, &index); - spin_unlock(&cic_index_lock); - if (error && error != -EAGAIN) - return error; - } while (error); - - return index; -} - -static void *bfq_init_queue(struct request_queue *q) -{ - struct bfq_group *bfqg; - struct bfq_data *bfqd; - int i; - - i = bfq_alloc_cic_index(); - if (i < 0) - return NULL; - - bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (bfqd == NULL) - return NULL; - - bfqd->cic_index = i; - - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. - */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); - - INIT_LIST_HEAD(&bfqd->cic_list); - - bfqd->queue = q; - - bfqg = bfq_alloc_root_group(bfqd, q->node); - if (bfqg == NULL) { - kfree(bfqd); - return NULL; - } - - bfqd->root_group = bfqg; - - init_timer(&bfqd->idle_slice_timer); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - bfqd->rq_pos_tree = RB_ROOT; - - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - - bfqd->hw_tag = -1; - - bfqd->bfq_max_budget = bfq_default_max_budget; - - bfqd->bfq_quantum = bfq_quantum; - bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; - bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - - bfqd->low_latency = true; - - bfqd->bfq_raising_coeff = 20; - bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_raising_max_time = 0; - bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_raising_max_softrt_rate = 7000; - - /* Initially estimate the device's peak rate as the reference rate */ - if (blk_queue_nonrot(bfqd->queue)) { - bfqd->RT_prod = R_nonrot * T_nonrot; - bfqd->peak_rate = R_nonrot; - } else { - bfqd->RT_prod = R_rot * T_rot; - bfqd->peak_rate = R_rot; - } - - return bfqd; -} - -static void bfq_slab_kill(void) -{ - if (bfq_pool != NULL) - kmem_cache_destroy(bfq_pool); - if (bfq_ioc_pool != NULL) - kmem_cache_destroy(bfq_ioc_pool); -} - -static int __init bfq_slab_setup(void) -{ - bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (bfq_pool == NULL) - goto fail; - - bfq_ioc_pool = kmem_cache_create("bfq_io_context", - sizeof(struct cfq_io_context), - __alignof__(struct cfq_io_context), - 0, NULL); - if (bfq_ioc_pool == NULL) - goto fail; - - return 0; -fail: - bfq_slab_kill(); - return -ENOMEM; -} - -static ssize_t bfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) -{ - unsigned long new_val; - int ret = strict_strtoul(page, 10, &new_val); - - if (ret == 0) - *var = new_val; - - return count; -} - -static ssize_t 
bfq_raising_max_time_show(struct elevator_queue *e, char *page) -{ - struct bfq_data *bfqd = e->elevator_data; - return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? - bfqd->bfq_raising_max_time : - bfq_wrais_duration(bfqd)); -} - -static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -{ - struct bfq_queue *bfqq; - struct bfq_data *bfqd = e->elevator_data; - ssize_t num_char = 0; - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time)); - } - num_char += sprintf(page + num_char, "Idle:\n"); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time)); - } - return num_char; -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return bfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); -SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); -SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, - 1); -SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, - bfqd->bfq_raising_min_inter_arr_async, - 1); -SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, - bfqd->bfq_raising_max_softrt_rate, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t \ -__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long __data; \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return ret; \ -} -STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); -STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, 
&bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_min_idle_time_store, - &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, - &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_raising_max_softrt_rate_store, - &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); -#undef STORE_FUNCTION - -/* do nothing for the moment */ -static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -{ - return count; -} - -static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); - else - return bfq_default_max_budget; -} - -static ssize_t bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; - bfqd->bfq_max_budget = __data; - } - - bfqd->bfq_user_max_budget = __data; - - return ret; -} - -static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data < 1) - __data = 1; - else if (__data > INT_MAX) - __data = INT_MAX; - - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - - return ret; -} - -static ssize_t bfq_low_latency_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data = 1; - bfqd->low_latency = __data; - - return ret; -} - -#define BFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) - -static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(quantum), - BFQ_ATTR(fifo_expire_sync), - BFQ_ATTR(fifo_expire_async), - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), - BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), - BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), - BFQ_ATTR(low_latency), - BFQ_ATTR(raising_coeff), - BFQ_ATTR(raising_max_time), - BFQ_ATTR(raising_rt_max_time), - BFQ_ATTR(raising_min_idle_time), - BFQ_ATTR(raising_min_inter_arr_async), - BFQ_ATTR(raising_max_softrt_rate), - BFQ_ATTR(weights), - __ATTR_NULL -}; - -static struct elevator_type iosched_bfq = { - .ops = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, - .elevator_allow_merge_fn = bfq_allow_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - 
.elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, - .trim = bfq_free_io_context, - }, - .elevator_attrs = bfq_attrs, - .elevator_name = "bfq", - .elevator_owner = THIS_MODULE, -}; - -static int __init bfq_init(void) -{ - /* - * Can be 0 on HZ < 1000 setups. - */ - //if (bfq_slice_idle == 0) - // bfq_slice_idle = 1; - - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; - - if (bfq_slab_setup()) - return -ENOMEM; - - elv_register(&iosched_bfq); - - return 0; -} - -static void __exit bfq_exit(void) -{ - DECLARE_COMPLETION_ONSTACK(all_gone); - elv_unregister(&iosched_bfq); - bfq_ioc_gone = &all_gone; - /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */ - smp_wmb(); - if (elv_ioc_count_read(bfq_ioc_count) != 0) - wait_for_completion(&all_gone); - ida_destroy(&cic_index_ida); - bfq_slab_kill(); -} - -module_init(bfq_init); -module_exit(bfq_exit); - -MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c deleted file mode 100644 index fd50b7fd1..000000000 --- a/block/bfq-sched.c +++ /dev/null @@ -1,1066 +0,0 @@ -/* - * BFQ: Hierarchical B-WF2Q+ scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -#ifdef CONFIG_CGROUP_BFQIO -#define for_each_entity(entity) \ - for (; entity != NULL; entity = entity->parent) - -#define for_each_entity_safe(entity, parent) \ - for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd); - -static inline void bfq_update_budget(struct bfq_entity *next_active) -{ - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; - - BUG_ON(next_active == NULL); - - group_sd = next_active->sched_data; - - bfqg = container_of(group_sd, struct bfq_group, sched_data); - /* - * bfq_group's my_entity field is not NULL only if the group - * is not the root group. We must not touch the root entity - * as it must never become an active entity. - */ - bfqg_entity = bfqg->my_entity; - if (bfqg_entity != NULL) - bfqg_entity->budget = next_active->budget; -} - -static int bfq_update_next_active(struct bfq_sched_data *sd) -{ - struct bfq_entity *next_active; - - if (sd->active_entity != NULL) - /* will update/requeue at the end of service */ - return 0; - - /* - * NOTE: this can be improved in many ways, such as returning - * 1 (and thus propagating upwards the update) only when the - * budget changes, or caching the bfqq that will be scheduled - * next from this subtree. By now we worry more about - * correctness than about performance... 
- */ - next_active = bfq_lookup_next_entity(sd, 0, NULL); - sd->next_active = next_active; - - if (next_active != NULL) - bfq_update_budget(next_active); - - return 1; -} - -static inline void bfq_check_next_active(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ - BUG_ON(sd->next_active != entity); -} -#else -#define for_each_entity(entity) \ - for (; entity != NULL; entity = NULL) - -#define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity != NULL; entity = parent) - -static inline int bfq_update_next_active(struct bfq_sched_data *sd) -{ - return 0; -} - -static inline void bfq_check_next_active(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ -} - -static inline void bfq_update_budget(struct bfq_entity *next_active) -{ -} -#endif - -/* - * Shift for timestamp calculations. This actually limits the maximum - * service allowed in one timestamp delta (small shift values increase it), - * the maximum total weight that can be used for the queues in the system - * (big shift values increase it), and the period of virtual time wraparounds. - */ -#define WFQ_SERVICE_SHIFT 22 - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static inline int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - -static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = NULL; - - BUG_ON(entity == NULL); - - if (entity->my_sched_data == NULL) - bfqq = container_of(entity, struct bfq_queue, entity); - - return bfqq; -} - - -/** - * bfq_delta - map service into the virtual time domain. - * @service: amount of service. - * @weight: scale factor (weight of an entity or weight sum). - */ -static inline u64 bfq_delta(unsigned long service, - unsigned long weight) -{ - u64 d = (u64)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; -} - -/** - * bfq_calc_finish - assign the finish time to an entity. - * @entity: the entity to act upon. - * @service: the service to be charged to the entity. - */ -static inline void bfq_calc_finish(struct bfq_entity *entity, - unsigned long service) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - - if (bfqq != NULL) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); - } -} - -/** - * bfq_entity_of - get an entity from a node. - * @node: the node field of the entity. - * - * Convert a node pointer to the relative entity. This is used only - * to simplify the logic of some functions and not as the generic - * conversion mechanism because, e.g., in the tree walking functions, - * the check for a %NULL value would be redundant. - */ -static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) -{ - struct bfq_entity *entity = NULL; - - if (node != NULL) - entity = rb_entry(node, struct bfq_entity, rb_node); - - return entity; -} - -/** - * bfq_extract - remove an entity from a tree. - * @root: the tree root. - * @entity: the entity to remove. 
- */ -static inline void bfq_extract(struct rb_root *root, - struct bfq_entity *entity) -{ - BUG_ON(entity->tree != root); - - entity->tree = NULL; - rb_erase(&entity->rb_node, root); -} - -/** - * bfq_idle_extract - extract an entity from the idle tree. - * @st: the service tree of the owning @entity. - * @entity: the entity being removed. - */ -static void bfq_idle_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *next; - - BUG_ON(entity->tree != &st->idle); - - if (entity == st->first_idle) { - next = rb_next(&entity->rb_node); - st->first_idle = bfq_entity_of(next); - } - - if (entity == st->last_idle) { - next = rb_prev(&entity->rb_node); - st->last_idle = bfq_entity_of(next); - } - - bfq_extract(&st->idle, entity); - - if (bfqq != NULL) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_insert - generic tree insertion. - * @root: tree root. - * @entity: entity to insert. - * - * This is used for the idle and the active tree, since they are both - * ordered by finish time. - */ -static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -{ - struct bfq_entity *entry; - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - - BUG_ON(entity->tree != NULL); - - while (*node != NULL) { - parent = *node; - entry = rb_entry(parent, struct bfq_entity, rb_node); - - if (bfq_gt(entry->finish, entity->finish)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entity->rb_node, parent, node); - rb_insert_color(&entity->rb_node, root); - - entity->tree = root; -} - -/** - * bfq_update_min - update the min_start field of a entity. - * @entity: the entity to update. - * @node: one of its children. - * - * This function is called when @entity may store an invalid value for - * min_start due to updates to the active tree. The function assumes - * that the subtree rooted at @node (which may be its left or its right - * child) has a valid min_start value. - */ -static inline void bfq_update_min(struct bfq_entity *entity, - struct rb_node *node) -{ - struct bfq_entity *child; - - if (node != NULL) { - child = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entity->min_start, child->min_start)) - entity->min_start = child->min_start; - } -} - -/** - * bfq_update_active_node - recalculate min_start. - * @node: the node to update. - * - * @node may have changed position or one of its children may have moved, - * this function updates its min_start value. The left and right subtrees - * are assumed to hold a correct min_start value. - */ -static inline void bfq_update_active_node(struct rb_node *node) -{ - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); -} - -/** - * bfq_update_active_tree - update min_start for the whole active tree. - * @node: the starting node. - * - * @node must be the deepest modified node after an update. This function - * updates its min_start using the values held by its children, assuming - * that they did not change, and then updates all the nodes that may have - * changed in the path to the root. The only nodes that may have changed - * are the ones in the path or their siblings. 
- */ -static void bfq_update_active_tree(struct rb_node *node) -{ - struct rb_node *parent; - -up: - bfq_update_active_node(node); - - parent = rb_parent(node); - if (parent == NULL) - return; - - if (node == parent->rb_left && parent->rb_right != NULL) - bfq_update_active_node(parent->rb_right); - else if (parent->rb_left != NULL) - bfq_update_active_node(parent->rb_left); - - node = parent; - goto up; -} - -/** - * bfq_active_insert - insert an entity in the active tree of its group/device. - * @st: the service tree of the entity. - * @entity: the entity being inserted. - * - * The active tree is ordered by finish time, but an extra key is kept - * per each node, containing the minimum value for the start times of - * its children (and the node itself), so it's possible to search for - * the eligible node with the lowest finish time in logarithmic time. - */ -static void bfq_active_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; - - bfq_insert(&st->active, entity); - - if (node->rb_left != NULL) - node = node->rb_left; - else if (node->rb_right != NULL) - node = node->rb_right; - - bfq_update_active_tree(node); - - if (bfqq != NULL) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -} - -/** - * bfq_ioprio_to_weight - calc a weight from an ioprio. - * @ioprio: the ioprio value to convert. - */ -static unsigned short bfq_ioprio_to_weight(int ioprio) -{ - WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR - ioprio; -} - -/** - * bfq_weight_to_ioprio - calc an ioprio from a weight. - * @weight: the weight value to convert. - * - * To preserve as mush as possible the old only-ioprio user interface, - * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR - */ -static unsigned short bfq_weight_to_ioprio(int weight) -{ - WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; -} - -static inline void bfq_get_entity(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq != NULL) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - } -} - -/** - * bfq_find_deepest - find the deepest node that an extraction can modify. - * @node: the node being removed. - * - * Do the first step of an extraction in an rb tree, looking for the - * node that will replace @node, and returning the deepest node that - * the following modifications to the tree can touch. If @node is the - * last node in the tree return %NULL. - */ -static struct rb_node *bfq_find_deepest(struct rb_node *node) -{ - struct rb_node *deepest; - - if (node->rb_right == NULL && node->rb_left == NULL) - deepest = rb_parent(node); - else if (node->rb_right == NULL) - deepest = node->rb_left; - else if (node->rb_left == NULL) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right != NULL) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} - -/** - * bfq_active_extract - remove an entity from the active tree. - * @st: the service_tree containing the tree. - * @entity: the entity being removed. 
- */ -static void bfq_active_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; - - node = bfq_find_deepest(&entity->rb_node); - bfq_extract(&st->active, entity); - - if (node != NULL) - bfq_update_active_tree(node); - - if (bfqq != NULL) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_idle_insert - insert an entity into the idle tree. - * @st: the service tree containing the tree. - * @entity: the entity to insert. - */ -static void bfq_idle_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) - st->first_idle = entity; - if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) - st->last_idle = entity; - - bfq_insert(&st->idle, entity); - - if (bfqq != NULL) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -} - -/** - * bfq_forget_entity - remove an entity from the wfq trees. - * @st: the service tree. - * @entity: the entity being removed. - * - * Update the device status and forget everything about @entity, putting - * the device reference to it, if it is a queue. Entities belonging to - * groups are not refcounted. - */ -static void bfq_forget_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!entity->on_st); - - entity->on_st = 0; - st->wsum -= entity->weight; - if (bfqq != NULL) { - bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -/** - * bfq_put_idle_entity - release the idle tree ref of an entity. - * @st: service tree for the entity. - * @entity: the entity being released. - */ -static void bfq_put_idle_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - bfq_idle_extract(st, entity); - bfq_forget_entity(st, entity); -} - -/** - * bfq_forget_idle - update the idle tree if necessary. - * @st: the service tree to act upon. - * - * To preserve the global O(log N) complexity we only remove one entry here; - * as the idle tree will not grow indefinitely this can be done safely. - */ -static void bfq_forget_idle(struct bfq_service_tree *st) -{ - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && - !bfq_gt(last_idle->finish, st->vtime)) { - /* - * Forget the whole idle tree, increasing the vtime past - * the last finish time of idle entities. 
- */ - st->vtime = last_idle->finish; - } - - if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) - bfq_put_idle_entity(st, first_idle); -} - -static struct bfq_service_tree * -__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity) -{ - struct bfq_service_tree *new_st = old_st; - - if (entity->ioprio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - - if (entity->new_weight != entity->orig_weight) { - entity->orig_weight = entity->new_weight; - entity->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); - } else if (entity->new_ioprio != entity->ioprio) { - entity->ioprio = entity->new_ioprio; - entity->orig_weight = - bfq_ioprio_to_weight(entity->ioprio); - } else - entity->new_weight = entity->orig_weight = - bfq_ioprio_to_weight(entity->ioprio); - - entity->ioprio_class = entity->new_ioprio_class; - entity->ioprio_changed = 0; - - /* - * NOTE: here we may be changing the weight too early, - * this will cause unfairness. The correct approach - * would have required additional complexity to defer - * weight changes to the proper time instants (i.e., - * when entity->finish <= old_st->vtime). - */ - new_st = bfq_entity_service_tree(entity); - entity->weight = entity->orig_weight * - (bfqq != NULL ? bfqq->raising_coeff : 1); - new_st->wsum += entity->weight; - - if (new_st != old_st) - entity->start = new_st->vtime; - } - - return new_st; -} - -/** - * bfq_bfqq_served - update the scheduler status after selection for service. - * @bfqq: the queue being served. - * @served: bytes to transfer. - * - * NOTE: this can be optimized, as the timestamps of upper level entities - * are synchronized every time a new bfqq is selected for service. By now, - * we keep it to better check consistency. - */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - - entity->service += served; - BUG_ON(entity->service > entity->budget); - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); -} - -/** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. - * @bfqq: the queue that needs a service update. - * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. - */ -static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); - - bfq_bfqq_served(bfqq, entity->budget - entity->service); -} - -/** - * __bfq_activate_entity - activate an entity. - * @entity: the entity being activated. - * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. 
- */ -static void __bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - if (entity == sd->active_entity) { - BUG_ON(entity->tree != NULL); - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->active_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_active entity below it. We reuse the old - * start time. - */ - bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? - st->vtime : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = st->vtime; - st->wsum += entity->weight; - bfq_get_entity(entity); - - BUG_ON(entity->on_st); - entity->on_st = 1; - } - - st = __bfq_entity_update_weight_prio(st, entity); - bfq_calc_finish(entity, entity->budget); - bfq_active_insert(st, entity); -} - -/** - * bfq_activate_entity - activate an entity and its ancestors if necessary. - * @entity: the entity to activate. - * - * Activate @entity and all the entities on the path from it to the root. - */ -static void bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd; - - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_active(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the active entity is rescheduled. - */ - break; - } -} - -/** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. - * @requeue: if false, the entity will not be put into the idle tree. - * - * Deactivate an entity, independently from its previous state. If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. - * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was under service or if it was the next_active for - * its sched_data; return %0 otherwise. 
- */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - int was_active = entity == sd->active_entity; - int ret = 0; - - if (!entity->on_st) - return 0; - - BUG_ON(was_active && entity->tree != NULL); - - if (was_active) { - bfq_calc_finish(entity, entity->service); - sd->active_entity = NULL; - } else if (entity->tree == &st->active) - bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree != NULL) - BUG(); - - if (was_active || sd->next_active == entity) - ret = bfq_update_next_active(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity); - else - bfq_idle_insert(st, entity); - - BUG_ON(sd->active_entity == entity); - BUG_ON(sd->next_active == entity); - - return ret; -} - -/** - * bfq_deactivate_entity - deactivate an entity. - * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree - */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd; - struct bfq_entity *parent; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - - if (!__bfq_deactivate_entity(entity, requeue)) - /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * under service. - */ - break; - - if (sd->next_active != NULL) - /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. - */ - goto update; - - /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. - */ - requeue = 1; - } - - return; - -update: - entity = parent; - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_active(sd)) - break; - } -} - -/** - * bfq_update_vtime - update vtime if necessary. - * @st: the service tree to act upon. - * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated tasks getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. - */ -static void bfq_update_vtime(struct bfq_service_tree *st) -{ - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; - - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; - bfq_forget_idle(st); - } -} - -/** - * bfq_first_active - find the eligible entity with the smallest finish time - * @st: the service tree to select from. - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path - * on the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. 
- */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -{ - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; - - while (node != NULL) { - entry = rb_entry(node, struct bfq_entity, rb_node); -left: - if (!bfq_gt(entry->start, st->vtime)) - first = entry; - - BUG_ON(bfq_gt(entry->min_start, st->vtime)); - - if (node->rb_left != NULL) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { - node = node->rb_left; - goto left; - } - } - if (first != NULL) - break; - node = node->rb_right; - } - - BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); - return first; -} - -/** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * - * Update the virtual time in @st and return the first eligible entity - * it contains. - */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) -{ - struct bfq_entity *entity, *new_next_active = NULL; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); - BUG_ON(bfq_gt(entity->start, st->vtime)); - - /* - * If the chosen entity does not match with the sched_data's - * next_active and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. - */ - if (unlikely(force && entity != entity->sched_data->next_active)) { - new_next_active = entity; - for_each_entity(new_next_active) - bfq_update_budget(new_next_active); - } - - return entity; -} - -/** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. - * - * NOTE: since we cache the next_active entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_active value; - * we prefer to do full lookups to test the consistency of * the data - * structures. - */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) -{ - struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i=0; - - BUG_ON(sd->active_entity != NULL); - - if (bfqd != NULL && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); - if (entity != NULL) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_active = entity; - } - } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity != NULL) { - if (extract) { - bfq_check_next_active(sd, entity); - bfq_active_extract(st + i, entity); - sd->active_entity = entity; - sd->next_active = NULL; - } - break; - } - } - - return entity; -} - -/* - * Get next queue for service. - */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -{ - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; - - BUG_ON(bfqd->active_queue != NULL); - - if (bfqd->busy_queues == 0) - return NULL; - - sd = &bfqd->root_group->sched_data; - for (; sd != NULL; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(entity == NULL); - entity->service = 0; - } - - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(bfqq == NULL); - - return bfqq; -} - -/* - * Forced extraction of the given queue. 
- */ -static void bfq_get_next_queue_forced(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity; - struct bfq_sched_data *sd; - - BUG_ON(bfqd->active_queue != NULL); - - entity = &bfqq->entity; - /* - * Bubble up extraction/update from the leaf to the root. - */ - for_each_entity(entity) { - sd = entity->sched_data; - bfq_update_budget(entity); - bfq_update_vtime(bfq_entity_service_tree(entity)); - bfq_active_extract(bfq_entity_service_tree(entity), entity); - sd->active_entity = entity; - sd->next_active = NULL; - entity->service = 0; - } - - return; -} - -static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) -{ - if (bfqd->active_cic != NULL) { - put_io_context(bfqd->active_cic->ioc); - bfqd->active_cic = NULL; - } - - bfqd->active_queue = NULL; - del_timer(&bfqd->idle_slice_timer); -} - -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq == bfqd->active_queue) - __bfq_bfqd_reset_active(bfqd); - - bfq_deactivate_entity(entity, requeue); -} - -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_entity(entity); -} - -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_log_bfqq(bfqd, bfqq, "del from busy"); - - bfq_clear_bfqq_busy(bfqq); - - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - - bfq_deactivate_bfqq(bfqd, bfqq, requeue); -} - -/* - * Called when an inactive queue receives a new request. - */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqq == bfqd->active_queue); - - bfq_log_bfqq(bfqd, bfqq, "add to busy"); - - bfq_activate_bfqq(bfqd, bfqq); - - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; -} diff --git a/block/bfq.h b/block/bfq.h deleted file mode 100644 index e2ce5052a..000000000 --- a/block/bfq.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * BFQ-v5 for 3.0: data structures and common functions prototypes. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -#ifndef _BFQ_H -#define _BFQ_H - -#include -#include -#include -#include - -#define BFQ_IOPRIO_CLASSES 3 -#define BFQ_CL_IDLE_TIMEOUT HZ/5 - -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 - -#define BFQ_DEFAULT_GRP_WEIGHT 10 -#define BFQ_DEFAULT_GRP_IOPRIO 0 -#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -struct bfq_entity; - -/** - * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own - * bfq_service_tree. All the fields are protected by the queue lock - * of the containing bfqd. 
- */ -struct bfq_service_tree { - struct rb_root active; - struct rb_root idle; - - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; - - u64 vtime; - unsigned long wsum; -}; - -/** - * struct bfq_sched_data - multi-class scheduler. - * @active_entity: entity under service. - * @next_active: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_active points to the active entity of the sched_data service - * trees that will be scheduled next. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. - * All the fields are protected by the queue lock of the containing bfqd. - */ -struct bfq_sched_data { - struct bfq_entity *active_entity; - struct bfq_entity *next_active; - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -}; - -/** - * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. - * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. - * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @ioprio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each - * entity belongs to the sched_data of the parent group in the cgroup - * hierarchy. Non-leaf entities have also their own sched_data, stored - * in @my_sched_data. - * - * Each entity stores independently its priority values; this would - * allow different weights on different devices, but this - * functionality is not exported to userspace by now. Priorities and - * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @ioprio_changed flag. As soon as - * there is a transition in the entity state that allows the priority - * update to take place the effective and the requested priority - * values are synchronized. 
- * - * Unless cgroups are used, the weight value is calculated from the - * ioprio to export the same interface as CFQ. When dealing with - * ``well-behaved'' queues (i.e., queues that do not spend too much - * time to consume their budget and have true sequential behavior, and - * when there are no external factors breaking anticipation) the - * relative weights at each level of the cgroups hierarchy should be - * guaranteed. All the fields are protected by the queue lock of the - * containing bfqd. - */ -struct bfq_entity { - struct rb_node rb_node; - - int on_st; - - u64 finish; - u64 start; - - struct rb_root *tree; - - u64 min_start; - - unsigned long service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; - - struct bfq_entity *parent; - - struct bfq_sched_data *my_sched_data; - struct bfq_sched_data *sched_data; - - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - - int ioprio_changed; -}; - -struct bfq_group; - -/** - * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @org_ioprio: saved ioprio during boosted periods. - * @org_ioprio_class: saved ioprio_class during boosted periods. - * @flags: status flags. - * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @pid: pid of the process owning the queue, used for logging purposes. - * @last_rais_start_time: last (idle -> weight-raised) transition attempt - * @raising_cur_max_time: current max raising time for this queue - * - * A bfq_queue is a leaf request queue; it can be associated to an io_context - * or more (if it is an async one). @cgroup holds a reference to the - * cgroup, to be sure that it does not disappear while a bfqq still - * references it (mostly to avoid races between request issuing and task - * migration followed by cgroup distruction). - * All the fields are protected by the queue lock of the containing bfqd. 
- */ -struct bfq_queue { - atomic_t ref; - struct bfq_data *bfqd; - - /* fields for cooperating queues handling */ - struct bfq_queue *new_bfqq; - struct rb_node pos_node; - struct rb_root *pos_root; - - struct rb_root sort_list; - struct request *next_rq; - int queued[2]; - int allocated[2]; - int meta_pending; - struct list_head fifo; - - struct bfq_entity entity; - - unsigned long max_budget; - unsigned long budget_timeout; - - int dispatched; - - unsigned short org_ioprio; - unsigned short org_ioprio_class; - - unsigned int flags; - - struct list_head bfqq_list; - - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - sector_t last_request_pos; - - pid_t pid; - - /* weight-raising fields */ - unsigned int raising_cur_max_time; - u64 last_rais_start_finish, soft_rt_next_start; - unsigned int raising_coeff; -}; - -/** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. - * @rq_pos_tree: rbtree sorted by next_request position, - * used when determining if two or more queues - * have interleaving requests (see bfq_close_cooperator). - * @busy_queues: number of bfq_queues containing requests (including the - * queue under service, even if it is idling). - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples - * completed requests . - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue under service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @active_queue: bfq_queue under service. - * @active_cic: cfq_io_context (cic) associated with the @active_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. - * @cic_index: use small consequent indexes as radix tree keys to reduce depth - * @cic_list: list of all the cics active on the bfq_data device. - * @group_list: list of all the bfq_groups active on the device. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. - * @bfq_quantum: max number of requests dispatched per dispatch round. - * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. 
- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_raising_coeff: Maximum factor by which the weight of a boosted - * queue is multiplied - * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) - * @bfq_raising_rt_max_time: maximum duration for soft real-time processes - * @bfq_raising_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies) - * @bfq_raising_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies) - * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions - * - * All the fields are protected by the @queue lock. - */ -struct bfq_data { - struct request_queue *queue; - - struct bfq_group *root_group; - - struct rb_root rq_pos_tree; - - int busy_queues; - int queued; - int rq_in_driver; - int sync_flight; - - int max_rq_in_driver; - int hw_tag_samples; - int hw_tag; - - int budgets_assigned; - - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct bfq_queue *active_queue; - struct cfq_io_context *active_cic; - - sector_t last_position; - - ktime_t last_budget_start; - ktime_t last_idling_start; - int peak_rate_samples; - u64 peak_rate; - unsigned long bfq_max_budget; - - unsigned int cic_index; - struct list_head cic_list; - struct hlist_head group_list; - struct list_head active_list; - struct list_head idle_list; - - unsigned int bfq_quantum; - unsigned int bfq_fifo_expire[2]; - unsigned int bfq_back_penalty; - unsigned int bfq_back_max; - unsigned int bfq_slice_idle; - u64 bfq_class_idle_last_service; - - unsigned int bfq_user_max_budget; - unsigned int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - bool low_latency; - - /* parameters of the low_latency heuristics */ - unsigned int bfq_raising_coeff; - unsigned int bfq_raising_max_time; - unsigned int bfq_raising_rt_max_time; - unsigned int bfq_raising_min_idle_time; - unsigned int bfq_raising_min_inter_arr_async; - unsigned int bfq_raising_max_softrt_rate; - u64 RT_prod; - - struct bfq_queue oom_bfqq; -}; - -enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ -}; - -#define BFQ_BFQQ_FNS(name) \ -static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags |= (1 << 
BFQ_BFQQ_FLAG_##name); \ -} \ -static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -{ \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -} - -BFQ_BFQQ_FNS(busy); -BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(must_alloc); -BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); -BFQ_BFQQ_FNS(prio_changed); -BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); -BFQ_BFQQ_FNS(coop); -BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(some_coop_idle); -#undef BFQ_BFQQ_FNS - -/* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) - -#define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) - -/* Expiration reasons. */ -enum bfqq_expiration { - BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -}; - -#ifdef CONFIG_CGROUP_BFQIO -/** - * struct bfq_group - per (device, cgroup) data structure. - * @entity: schedulable entity to insert into the parent group sched_data. - * @sched_data: own sched_data, to contain child entities (they may be - * both bfq_queues and bfq_groups). - * @group_node: node to be inserted into the bfqio_cgroup->group_data - * list of the containing cgroup's bfqio_cgroup. - * @bfqd_node: node to be inserted into the @bfqd->group_list list - * of the groups active on the same device; used for cleanup. - * @bfqd: the bfq_data for the device this group acts upon. - * @async_bfqq: array of async queues for all the tasks belonging to - * the group, one queue per ioprio value per ioprio_class, - * except for the idle class that has only one queue. - * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). - * @my_entity: pointer to @entity, %NULL for the toplevel group; used - * to avoid too many special cases during group creation/migration. - * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level - * entities belonging to the group that are acting on the same device. - * - * Locking works as follows: - * o @group_node is protected by the bfqio_cgroup lock, and is accessed - * via RCU from its readers. - * o @bfqd is protected by the queue lock, RCU is used to access it - * from the readers. - * o All the other fields are protected by the @bfqd queue lock. - */ -struct bfq_group { - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - struct hlist_node group_node; - struct hlist_node bfqd_node; - - void *bfqd; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - struct bfq_entity *my_entity; -}; - -/** - * struct bfqio_cgroup - bfq cgroup data structure. - * @css: subsystem state for bfq in the containing cgroup. - * @weight: cgroup weight. - * @ioprio: cgroup ioprio. - * @ioprio_class: cgroup ioprio_class. - * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. - * @group_data: list containing the bfq_group belonging to this cgroup. - * - * @group_data is accessed using RCU, with @lock protecting the updates, - * @ioprio and @ioprio_class are protected by @lock. 
- */ -struct bfqio_cgroup { - struct cgroup_subsys_state css; - - unsigned short weight, ioprio, ioprio_class; - - spinlock_t lock; - struct hlist_head group_data; -}; -#else -struct bfq_group { - struct bfq_sched_data sched_data; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -}; -#endif - -static inline struct bfq_service_tree * -bfq_entity_service_tree(struct bfq_entity *entity) -{ - struct bfq_sched_data *sched_data = entity->sched_data; - unsigned int idx = entity->ioprio_class - 1; - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - - return sched_data->service_tree + idx; -} - -static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, - int is_sync) -{ - return cic->cfqq[!!is_sync]; -} - -static inline void cic_set_bfqq(struct cfq_io_context *cic, - struct bfq_queue *bfqq, int is_sync) -{ - cic->cfqq[!!is_sync] = bfqq; -} - -static inline void call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, - struct cfq_io_context *)) -{ - struct cfq_io_context *cic; - struct hlist_node *n; - - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) - func(ioc, cic); - rcu_read_unlock(); -} - -#define CIC_DEAD_KEY 1ul -#define CIC_DEAD_INDEX_SHIFT 1 - -static inline void *bfqd_dead_key(struct bfq_data *bfqd) -{ - return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); -} - -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows cic->key and bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. 
- */ -static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, - unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, - unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - -static void bfq_changed_ioprio(struct io_context *ioc, - struct cfq_io_context *cic); -static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, - struct io_context *ioc, gfp_t gfp_mask); -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -#endif diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 0504f530f..342eae9b0 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include /* for max_pfn/max_low_pfn */ @@ -17,12 +16,13 @@ */ static struct kmem_cache *iocontext_cachep; -static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) +static void cfq_dtor(struct io_context *ioc) { - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->dtor(ioc); } } @@ -40,9 +40,7 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - - hlist_sched_dtor(ioc, &ioc->cic_list); - hlist_sched_dtor(ioc, &ioc->bfq_cic_list); + cfq_dtor(ioc); rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); @@ -52,14 +50,15 @@ int put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) +static void cfq_exit(struct io_context *ioc) { rcu_read_lock(); - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->exit(ioc); } rcu_read_unlock(); @@ -75,10 +74,8 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); -if (atomic_dec_and_test(&ioc->nr_tasks)) { - hlist_sched_exit(ioc, &ioc->cic_list); - hlist_sched_exit(ioc, &ioc->bfq_cic_list); - } + if (atomic_dec_and_test(&ioc->nr_tasks)) + cfq_exit(ioc); put_io_context(ioc); } @@ -92,14 +89,12 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ret->refcount, 1); atomic_set(&ret->nr_tasks, 1); spin_lock_init(&ret->lock); - bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + ret->ioprio_changed = 0; ret->ioprio = 0; ret->last_waited = 0; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); - INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->bfq_cic_list); ret->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) ret->cgroup_changed = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b581793ec..ae21919f1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2919,6 +2919,7 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) static void cfq_ioc_set_ioprio(struct io_context *ioc) { call_for_each_cic(ioc, changed_ioprio); + ioc->ioprio_changed = 0; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -3203,13 +3204,8 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err_free; out: - /* - * test_and_clear_bit() implies a memory barrier, paired with - * the wmb() in fs/ioprio.c, so the value seen for ioprio is the - * new one. - */ - if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED, - ioc->ioprio_changed))) + smp_read_barrier_depends(); + if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); #ifdef CONFIG_CFQ_GROUP_IOSCHED diff --git a/fs/ioprio.c b/fs/ioprio.c index 2fdc2a310..7da2a0650 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -30,7 +30,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) { - int err, i; + int err; struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; @@ -60,15 +60,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) err = -ENOMEM; break; } - smp_wmb(); task->io_context = ioc; } while (1); if (!err) { ioc->ioprio = ioprio; - wmb(); - for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++) - set_bit(i, ioc->ioprio_changed); + ioc->ioprio_changed = 1; } task_unlock(task); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index c96663839..ac663c187 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -64,9 +64,3 @@ SUBSYS(perf) #endif /* */ - -#ifdef CONFIG_CGROUP_BFQIO -SUBSYS(bfqio) -#endif - -/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 5f5357748..b2eee896d 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,12 +3,12 @@ #include #include -#include +struct cfq_queue; struct cfq_io_context { void *key; - void *cfqq[2]; + struct cfq_queue *cfqq[2]; struct io_context *ioc; @@ -27,16 +27,6 @@ struct cfq_io_context { struct rcu_head rcu_head; }; -/* - * Indexes into the ioprio_changed bitmap. A bit set indicates that - * the corresponding I/O scheduler needs to see a ioprio update. - */ -enum { - IOC_CFQ_IOPRIO_CHANGED, - IOC_BFQ_IOPRIO_CHANGED, - IOC_IOPRIO_CHANGED_BITS -}; - /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. @@ -49,7 +39,7 @@ struct io_context { spinlock_t lock; unsigned short ioprio; - DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + unsigned short ioprio_changed; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) unsigned short cgroup_changed; @@ -63,8 +53,6 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - struct radix_tree_root bfq_radix_root; - struct hlist_head bfq_cic_list; void __rcu *ioc_data; };