From 723ebcb9f9803cd6a62bbb85fbeb1206d0e63c64 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 19:06:03 +0200 Subject: [PATCH 01/19] changed android@localhost to dzo@martin --- scripts/mkcompile_h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 41bc7a236..d5428594f 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -75,8 +75,8 @@ UTS_TRUNCATE="cut -b -$UTS_LEN" #/* < DTS2011052606009 jiaxianghong 20110527 begin */ #/* < DTS2011030103387 niguodong 20110415 begin */ - echo \#define LINUX_COMPILE_BY \"android\" - echo \#define LINUX_COMPILE_HOST \"localhost\" + echo \#define LINUX_COMPILE_BY \"dzo\" + echo \#define LINUX_COMPILE_HOST \"martin\" #/* DTS2011030103387 niguodong 20110415 end > */ #/* < DTS2011052606009 jiaxianghong 20110527 end */ From cff6a3e34c3ff37f8b0cb4d0c6584a6269885c09 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:04:29 +0200 Subject: [PATCH 02/19] updated ext3 and ext4 filesystems from HTC G2 CM9 kernel --- fs/ext3/ialloc.c | 8 +++-- fs/ext3/inode.c | 41 +++++++++++++++++++---- fs/ext4/Kconfig | 5 +++ fs/ext4/balloc.c | 3 +- fs/ext4/bitmap.c | 8 ++--- fs/ext4/ext4.h | 16 +++++++-- fs/ext4/ext4_jbd2.h | 56 ++++++++++++++++---------------- fs/ext4/extents.c | 10 ++++-- fs/ext4/ialloc.c | 17 ++++++---- fs/ext4/inode.c | 66 ++++++++++++++++++++++++------------- fs/ext4/ioctl.c | 13 +++++--- fs/ext4/mballoc.c | 26 +++++++++++++-- fs/ext4/namei.c | 8 ++--- fs/ext4/page-io.c | 30 ++++++++++++++++- fs/ext4/super.c | 79 ++++++++++++++++++++++++++++++++++++--------- fs/ext4/xattr.c | 13 ++++++-- 16 files changed, 296 insertions(+), 103 deletions(-) mode change 100755 => 100644 fs/ext4/ialloc.c diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc436..0b3da7cc8 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -561,8 +561,12 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23c3..0aedb27fe 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1568,7 +1568,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1642,7 +1648,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1684,7 +1696,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. 
The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; @@ -2995,6 +3013,8 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; again: /* we can't allow multiple procs in here at once, its a bit racey */ @@ -3032,7 +3052,11 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { + need_datasync = 1; + raw_inode->i_size = disksize; + } raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -3048,8 +3072,11 @@ static int ext3_do_update_inode(handle_t *handle, if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); } else { - raw_inode->i_size_high = - cpu_to_le32(ei->i_disksize >> 32); + disksize = cpu_to_le32(ei->i_disksize >> 32); + if (disksize != raw_inode->i_size_high) { + raw_inode->i_size_high = disksize; + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, @@ -3102,6 +3129,8 @@ static int ext3_do_update_inode(handle_t *handle, ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); + if (need_datasync) + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); out_brelse: brelse (bh); ext3_std_error(inode->i_sb, err); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9ed1bb1f3..8a595e01f 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -83,3 +83,8 @@ config EXT4_DEBUG If you select Y here, then you will be able to turn on debugging with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + +config EXT4_E2FSCK_RECOVER + bool "EXT4 e2fsck recovery support" + depends on EXT4_FS + default n diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f69495..ebe95f565 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -514,7 +514,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) if (bitmap_bh == NULL) continue; - x = ext4_count_free(bitmap_bh, sb->s_blocksize); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_BLOCKS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_blks_count(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81ac..012faaaec 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,15 +15,13 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { unsigned int i, sum = 0; - if (!map) - return 0; for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; + sum += nibblemap[bitmap[i] & 0xf] + + nibblemap[(bitmap[i] >> 4) & 0xf]; return sum; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 354619a1a..7fb3bc7d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -175,6 +175,7 @@ struct 
mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 struct ext4_io_page { struct page *p_page; @@ -357,8 +358,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) @@ -1215,6 +1215,12 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + /* workqueue for rebooting oem-22 to run e2fsck */ + struct work_struct reboot_work; + struct workqueue_struct *recover_wq; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1713,7 +1719,7 @@ struct mmpd_data { # define NORET_AND noreturn, /* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, @@ -2178,6 +2184,10 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +extern void ext4_e2fsck(struct super_block *sb); +#endif + /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1da..95af6f878 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,43 +261,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, 
EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f3aacb320..57694395c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -341,6 +341,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -1993,7 +1995,11 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; - BUG_ON(len == 0); + WARN_ON(len == 0); + if (len == 0) { + EXT4_ERROR_INODE(inode, "extent.ee_len = 0"); + return; + } spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; @@ -2844,7 +2850,7 @@ static int ext4_split_extent_at(handle_t *handle, if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); + ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(inode, path, ex); err = ext4_ext_dirty(handle, inode, path + depth); goto out; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c old mode 100755 new mode 100644 index eaa10168e..29272de30 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1021,8 +1021,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -1189,7 +1193,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; @@ -1275,10 +1280,10 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, * used inodes so we need to skip blocks with used inodes in * inode table. 
*/ -/* yanzhijun for remount system */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = (EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp))/sbi->s_inodes_per_block; + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { ext4_error(sb, "Something is wrong with group %u\n" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c94774c32..18fee6dae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -190,9 +190,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); ext4_ioend_wait(inode); if (inode->i_nlink) { @@ -1137,6 +1134,15 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } + if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " + "with only %d reserved metadata blocks\n", __func__, + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks); + WARN_ON(1); + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; + } + /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; @@ -2129,8 +2135,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* skip page if block allocation undone */ - if (buffer_delay(bh) || buffer_unwritten(bh)) + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; @@ -3212,13 +3221,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -3234,7 +3244,7 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) { @@ -3510,12 +3520,17 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); - else { + } else { ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, @@ -3913,18 +3928,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if 
(ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e7..4cbe1c2c9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -35,7 +35,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) handle_t *handle = NULL; int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; if (!inode_owner_or_capable(inode)) @@ -112,9 +112,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; - ei->i_flags = flags; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0f1be7f16..67ee3c92b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -335,6 +335,9 @@ * object * */ +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +extern int ext4_debug_level; +#endif static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_ext_cachep; @@ -1315,6 +1318,12 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_free_blocks_double(inode, e4b, first, count); e4b->bd_info->bb_free += count; +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: (%s) group %d free %d block, remain %d\n", + sb->s_id, e4b->bd_group, count, + e4b->bd_info->bb_free); +#endif if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; @@ -1460,6 +1469,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_mark_used_double(e4b, start, len); e4b->bd_info->bb_free -= len; +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: (%s) group %d mark %d block, remain %d\n", + e4b->bd_sb->s_id, e4b->bd_group, len, + e4b->bd_info->bb_free); +#endif if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; @@ -2255,6 +2270,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i]->bb_free = ext4_free_blks_count(sb, desc); } +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (unlikely(ext4_debug_level)) + pr_info("ext4: group %d bb_free %d\n", + group, meta_group_info[i]->bb_free); +#endif INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); @@ -2528,6 +2548,9 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); 
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2575,8 +2598,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -4583,6 +4604,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, */ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); if (!new_entry) { + ext4_mb_unload_buddy(&e4b); err = -ENOMEM; goto error_return; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 458a394f6..3d36d5a1e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1589,7 +1589,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1615,7 +1615,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1866,7 +1866,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); @@ -2540,7 +2540,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97e5e98fd..d99d74aca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; - mutex_lock(&inode->i_mutex); + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. + */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } ret = ext4_end_io_nolock(io); if (ret < 0) { mutex_unlock(&inode->i_mutex); @@ -389,6 +405,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, block_end = block_start + blocksize; if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. 
"A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." + */ + zero_user_segment(page, block_start, block_end); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 111ed9d3c..e3bcdf645 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,6 +43,9 @@ #include #include +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +#include +#endif #include "ext4.h" #include "ext4_jbd2.h" @@ -83,6 +86,10 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +int ext4_debug_level = 0; +#endif + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -419,6 +426,11 @@ static void ext4_handle_error(struct super_block *sb) if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); + +#ifdef CONFIG_EXT4_E2FSCK_RECOVER + if (!ext4_debug_level && test_opt(sb, ERRORS_RO)) + ext4_e2fsck(sb); +#endif } void __ext4_error(struct super_block *sb, const char *function, @@ -433,10 +445,47 @@ void __ext4_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); + save_error_info(sb, function, line); ext4_handle_error(sb); } +#ifdef CONFIG_EXT4_E2FSCK_RECOVER +static void ext4_reboot(struct work_struct *work) +{ + printk(KERN_ERR "%s: reboot to run e2fsck\n", __func__); + kernel_restart("oem-22"); +} + +void ext4_e2fsck(struct super_block *sb) +{ + static int reboot; + struct workqueue_struct *wq; + struct ext4_sb_info *sb_info; + if (reboot) + return; + printk(KERN_ERR "%s\n", __func__); + reboot = 1; + sb_info = EXT4_SB(sb); + if (!sb_info) { + printk(KERN_ERR "%s: no sb_info\n", __func__); + reboot = 0; + return; + } + sb_info->recover_wq = create_workqueue("ext4-recover"); + if (!sb_info->recover_wq) { + printk(KERN_ERR "EXT4-fs: failed to create recover workqueue\n"); + reboot = 0; + return; + } + + INIT_WORK(&sb_info->reboot_work, ext4_reboot); + wq = sb_info->recover_wq; + /* queue the work to reboot */ + queue_work(wq, &sb_info->reboot_work); +} +#endif + void ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) 
@@ -859,6 +908,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -1113,9 +1163,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",block_validity"); if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_inode_table"); + seq_puts(seq, ",noinit_itable"); else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_inode_table=%u", + seq_printf(seq, ",init_itable=%u", (unsigned) sbi->s_li_wait_mult); ext4_show_quota_options(seq, sb); @@ -1291,8 +1341,7 @@ enum { Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, - Opt_discard, Opt_nodiscard, - Opt_init_inode_table, Opt_noinit_inode_table, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, }; static const match_table_t tokens = { @@ -1365,9 +1414,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, - {Opt_init_inode_table, "init_itable=%u"}, - {Opt_init_inode_table, "init_itable"}, - {Opt_noinit_inode_table, "noinit_itable"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1844,7 +1893,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_dioread_lock: clear_opt(sb, DIOREAD_NOLOCK); break; - case Opt_init_inode_table: + case Opt_init_itable: set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) @@ -1855,7 +1904,7 @@ static int parse_options(char *options, struct super_block *sb, return 0; sbi->s_li_wait_mult = option; break; - case Opt_noinit_inode_table: + case Opt_noinit_itable: clear_opt(sb, INIT_INODE_TABLE); break; default: @@ -1958,17 +2007,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + @@ -3314,7 +3362,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; - sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_itb_per_group = (sbi->s_inodes_per_group + sbi->s_inodes_per_block - 1)/ sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); sbi->s_sbh = bh; @@ -3620,7 +3668,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > 
EXT4_GOOD_OLD_INODE_SIZE) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc97..c2865cc31 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -487,18 +487,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -820,8 +821,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; From 862a013432daeb9a7902c16e76193060a9db3274 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:06:37 +0200 Subject: [PATCH 03/19] updated kgsl and adreno drivers from HTC G2 CM9 kernel --- drivers/gpu/msm/adreno.c | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.c diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100755 new mode 100644 index 36b6e82db..8320003d2 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,6 +114,7 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ + .ib_check_level = 0, }; @@ -273,9 +274,12 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ + /* + * Fix target freeze issue by adding TLB flush for each submit + * on A20X based targets. + */ if (adreno_is_a20x(adreno_dev)) flags |= KGSL_MMUFLAGS_TLBFLUSH; - /* * If possible, then set the state via the command stream to avoid * a CPU idle. 
Otherwise, use the default setstate which uses register @@ -641,6 +645,8 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; + struct kgsl_context *context; + int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -709,6 +715,24 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; + /* + * Set the reset status of all contexts to + * INNOCENT_CONTEXT_RESET_EXT except for the bad context + * since thats the guilty party + */ + while ((context = idr_get_next(&device->context_idr, &next))) { + if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != + context->reset_status) { + if (context->devctxt != drawctxt) + context->reset_status = + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; + else + context->reset_status = + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; + } + next = next + 1; + } + /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -871,15 +895,13 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ + msecs_to_jiffies(adreno_dev->wait_timeout); unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -887,8 +909,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? 
((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -901,7 +921,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } - /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -968,7 +987,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -995,8 +1014,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, - sizeof(unsigned int)); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1040,7 +1058,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - const struct kgsl_memdesc *memdesc; + struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); From 4851316e18de7cbb5cb40b371171a793df974864 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 5 Nov 2012 20:12:34 +0200 Subject: [PATCH 04/19] updated kgsl and adreno drivers from HTC G2 CM9 kernel (2) --- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 +++++-- drivers/gpu/msm/adreno_debugfs.c | 2 + drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 ++- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 ++++++++++++++++- drivers/gpu/msm/adreno_snapshot.c | 129 ++++++++-- drivers/gpu/msm/kgsl.c | 275 ++++++++++++---------- drivers/gpu/msm/kgsl.h | 63 +++-- drivers/gpu/msm/kgsl_cffdump.c | 184 --------------- drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 12 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++--- drivers/gpu/msm/kgsl_pwrctrl.h | 2 - drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++++++++---- drivers/gpu/msm/kgsl_sharedmem.h | 68 ++++-- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 +++-- include/linux/msm_kgsl.h | 13 +- 24 files changed, 910 insertions(+), 534 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.h mode change 100755 => 100644 drivers/gpu/msm/adreno_a2xx.c mode change 100755 => 100644 drivers/gpu/msm/adreno_postmortem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_gpummu.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.h mode change 100755 => 100644 drivers/gpu/msm/z180.c mode change 100755 => 100644 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100755 
new mode 100644 index af5bf51ea..1259507d9 --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,6 +46,8 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 +#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 + enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -74,12 +76,16 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; + unsigned int ib_check_level; }; struct adreno_gpudev { + /* keeps track of when we need to execute the draw workaround code */ + int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); + void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -99,7 +105,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100755 new mode 100644 index 5ce9cf85b..62628e4a3 --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,11 +1421,61 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } +static void a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) +{ + struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[11]; + unsigned int *cmds = &cmd[0]; + + if (adreno_is_a225(adreno_dev)) { + adreno_dev->gpudev->ctx_switches_since_last_draw++; + /* If there have been > than + * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context + * switches w/o gmem being saved then we need to execute + * this workaround */ + if (adreno_dev->gpudev->ctx_switches_since_last_draw > + ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; + else + return; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. + */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + } else { + /* On Adreno 20x/220, if the events for shader space reuse + * gets dropped, the CP block would wait indefinitely. + * Sending CP_SET_SHADER_BASES packet unblocks the CP from + * this wait. 
+ */ + *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); + *cmds++ = adreno_encode_istore_size(adreno_dev) + | adreno_dev->pix_shader_start; + } + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], cmds - cmd); +} + static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[22]; if (context == NULL) return; @@ -1470,33 +1520,11 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a225(adreno_dev)) { - unsigned int *cmds = &cmd[0]; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. - */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], 11); - } + } else if (adreno_is_a2xx(adreno_dev)) + a2xx_drawctxt_workaround(adreno_dev); } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1757,6 +1785,7 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, + .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index c1b9e4ce2..566efa1aa 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,6 +345,8 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); + debugfs_create_u32("ib_check", 0644, device->d_debugfs, + &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index 206a678ee..f0b5741b5 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,8 +243,13 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) + if (adreno_dev->drawctxt_active == drawctxt) { + if (adreno_dev->gpudev->ctxt_draw_workaround && + adreno_is_a225(adreno_dev)) + adreno_dev->gpudev->ctxt_draw_workaround( + adreno_dev); return; + } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 8aea58c95..454b05785 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,11 +29,6 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 -/* indirect buffer dispatch. 
prefetch parser uses this packet type to determine -* whether to pre-fetch the IB -*/ -#define CP_INDIRECT_BUFFER 0x3f - /* indirect buffer dispatch. same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -117,6 +112,9 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e +/* (A2x) sets binning configuration registers */ +#define CP_SET_BIN_DATA 0x2f + /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -157,6 +155,16 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ +/* + * for a3xx + */ + +/* Conditionally load a IB based on a flag */ +#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ +#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ + +/* Load a buffer with pre-fetch enabled */ +#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -178,11 +186,20 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) +#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) + +#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) +#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) + +#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) + +#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) +#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) +#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -190,4 +207,14 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) + +/* Return 1 if the command is an indirect buffer of any kind */ +static inline int adreno_cmd_is_ib(unsigned int cmd) +{ + return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); +} + #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100755 new mode 100644 index 40dfb30cf..63f5caa91 --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER, "IND_BUF_"}, + {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(value)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; 
dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,8 +654,7 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (this_cmd == cp_type3_packet( - CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index ea2889b3a..57883fd45 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,6 +22,7 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" +#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -310,12 +311,10 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); - /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); - /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -554,6 +553,197 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } +static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, + int sizedwords); + +static bool +_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int opcode = cp_type3_opcode(*hostaddr); + switch (opcode) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFD: + return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); + case CP_NOP: + case CP_WAIT_FOR_IDLE: + case CP_WAIT_REG_MEM: + case CP_WAIT_REG_EQ: + case CP_WAT_REG_GTE: + case CP_WAIT_UNTIL_READ: + case CP_WAIT_IB_PFD_COMPLETE: + case CP_REG_RMW: + case CP_REG_TO_MEM: + case CP_MEM_WRITE: + case CP_MEM_WRITE_CNTR: + case CP_COND_EXEC: + case CP_COND_WRITE: + case CP_EVENT_WRITE: + case CP_EVENT_WRITE_SHD: + case CP_EVENT_WRITE_CFL: + case CP_EVENT_WRITE_ZPD: + case CP_DRAW_INDX: + case CP_DRAW_INDX_2: + case CP_DRAW_INDX_BIN: + case CP_DRAW_INDX_2_BIN: + case CP_VIZ_QUERY: + case CP_SET_STATE: + case CP_SET_CONSTANT: + case CP_IM_LOAD: + case CP_IM_LOAD_IMMEDIATE: + case CP_LOAD_CONSTANT_CONTEXT: + case CP_INVALIDATE_STATE: + case CP_SET_SHADER_BASES: + case CP_SET_BIN_MASK: + case CP_SET_BIN_SELECT: + case CP_SET_BIN_BASE_OFFSET: + case CP_SET_BIN_DATA: + case CP_CONTEXT_UPDATE: + case CP_INTERRUPT: + case CP_IM_STORE: + break; + /* these shouldn't come from userspace */ + case CP_ME_INIT: + case CP_SET_PROTECTED_MODE: + default: + KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); + return false; + break; + } + + return true; +} + +static bool +_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int reg = type0_pkt_offset(*hostaddr); + unsigned int cnt = type0_pkt_size(*hostaddr); + if (reg < 0x0192 || (reg + cnt) >= 0x8000) { + KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", + reg, cnt); + return false; + } + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +static bool _parse_ibs(struct kgsl_device_private *dev_priv, + uint gpuaddr, int sizedwords) +{ + static uint level; /* recursion level */ + bool ret = false; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + struct kgsl_mem_entry *entry; + + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hoststart = hostaddr; + + level++; + + KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); + + mb(); + while (dwords_left > 0) { + bool cur_ret = true; + int count = 0; /* dword count including packet header */ + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + cur_ret = _handle_type0(dev_priv, hostaddr); + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = _handle_type3(dev_priv, hostaddr); + break; + default: + KGSL_CMD_ERR(dev_priv->device, "unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + + if (!cur_ret) { + KGSL_CMD_ERR(dev_priv->device, + "bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + if (dwords_left < 0) { + KGSL_CMD_ERR(dev_priv->device, + "bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + } + + ret = true; +done: + if (!ret) + KGSL_DRV_ERR(dev_priv->device, + "parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + level--; + + return ret; +} + int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -601,9 +791,12 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); - + if (unlikely(adreno_dev->ib_check_level >= 1 && + !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, + ibdesc[i].sizedwords))) { + kfree(link); + return -EINVAL; + } *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -757,8 +950,20 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - BUG_ON((copy_rb_contents == 0) && - (value == cur_context)); + + /* + * If other context switches were already lost and + * and the current context is the one that is hanging, + * then we cannot recover. Print an error message + * and leave. + */ + + if ((copy_rb_contents == 0) && (value == cur_context)) { + KGSL_DRV_ERR(device, "GPU recovery could not " + "find the previous context\n"); + return -EINVAL; + } + /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index fb88a72bd..c45dbff48 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,11 +45,19 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* Go through the list and see that object has already been seen */ + /* + * Sometimes IBs can be reused in the same dump. Because we parse from + * oldest to newest, if we come across an IB that has already been used, + * assume that it has been reused and update the list with the newest + * size. 
+ */ + for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) - return; + objbuf[index].ptbase == ptbase) { + objbuf[index].dwords = dwords; + return; + } } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -77,6 +85,25 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } +/* + * Return a 1 if the specified object is already on the list of buffers + * to be dumped + */ + +static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) +{ + int index; + + for (index = 0; index < objbufptr; index++) { + if (objbuf[index].gpuaddr == gpuaddr && + objbuf[index].ptbase == ptbase && + objbuf[index].type == type) + return 1; + } + + return 0; +} + /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -113,6 +140,7 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; + int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -158,9 +186,53 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - index = start; + /* + * We can only reliably dump IBs from the beginning of the context, + * and it turns out that for the vast majority of the time we really + * only care about the current context when it comes to diagnosing + * a hang. So, with an eye to limiting the buffer dumping to what is + * really useful find the beginning of the context and only dump + * IBs from that point + */ + + index = rptr; + ib_parse_start = start; rbptr = rb->buffer_desc.hostptr; + while (index != start) { + index--; + + if (index < 0) { + /* + * The marker we are looking for is 2 dwords long, so + * when wrapping, go back 2 from the end so we don't + * access out of range in the if statement below + */ + index = rb->sizedwords - 2; + + /* + * Account for the possibility that start might be at + * rb->sizedwords - 1 + */ + + if (start == rb->sizedwords - 1) + break; + } + + /* + * Look for a NOP packet with the context switch identifier in + * the second dword + */ + + if (rbptr[index] == cp_nop_packet(1) && + rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { + ib_parse_start = index; + break; + } + } + + index = start; + /* * Loop through the RB, copying the data and looking for indirect * buffers and MMU pagetable changes @@ -169,15 +241,18 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) + /* Only parse IBs between the context start and the rptr */ + + if (index == ib_parse_start) + parse_ibs = 1; + + if (index == rptr) + parse_ibs = 0; + + if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); - /* - * FIXME: Handle upcoming MMU pagetable changes, but only - * between the rptr and the wptr - */ - index = index + 1; if (index == rb->sizedwords) @@ -228,10 +303,9 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(*src)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, 
*(src + 1), *(src + 2)); - } src++; dst++; @@ -288,22 +362,45 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the IBs described in the CP registers are on the - * list of objects + * Make sure that the last IB1 that was being executed is dumped. + * Since this was the last IB1 that was processed, we should have + * already added it to the list during the ringbuffer parse but we + * want to be double plus sure. */ + kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - if (ibsize) + /* + * The problem is that IB size from the register is the unprocessed size + * of the buffer not the original size, so if we didn't catch this + * buffer being directly used in the RB, then we might not be able to + * dump the whle thing. Print a warning message so we can try to + * figure how often this really happens. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " + "Dumping %x dwords of the buffer.\n", ibsize); + } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - if (ibsize) + /* + * Add the last parsed IB2 to the list. The IB2 should be found as we + * parse the objects below, but we try to add it to the list first, so + * it too can be parsed. Don't print an error message in this case - if + * the IB2 is found during parsing, the list will be updated with the + * correct size. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100755 new mode 100644 index 694c09cf3..39dad925f --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,11 +21,10 @@ #include #include #include - +#include #include #include #include -#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -194,8 +193,28 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { + struct rb_node **node; + struct rb_node *parent = NULL; + spin_lock(&process->mem_lock); - list_add(&entry->list, &process->mem_list); + + node = &process->mem_rb.rb_node; + + while (*node) { + struct kgsl_mem_entry *cur; + + parent = *node; + cur = rb_entry(parent, struct kgsl_mem_entry, node); + + if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entry->node, parent, node); + rb_insert_color(&entry->node, &process->mem_rb); + spin_unlock(&process->mem_lock); entry->priv = process; @@ -405,6 +424,10 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -514,8 +537,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - kgsl_pwrctrl_wake(device); device->pwrctrl.restore_slumber = 0; + kgsl_pwrctrl_wake(device); kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -548,8 +571,7 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - - INIT_LIST_HEAD(&private->mem_list); + private->mem_rb = RB_ROOT; if (kgsl_mmu_enabled()) { @@ -578,7 +600,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct kgsl_mem_entry *entry_tmp = NULL; + struct rb_node *node; if (!private) return; @@ -592,11 +614,13 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { - list_del(&entry->list); + for (node = rb_first(&private->mem_rb); node; ) { + entry = rb_entry(node, struct kgsl_mem_entry, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, &private->mem_rb); kgsl_mem_entry_put(entry); } - kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -722,47 +746,43 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } - /*call with private->mem_lock locked */ -static struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, size_t size) { - struct kgsl_mem_entry *entry = NULL, *result = NULL; + struct rb_node *node = private->mem_rb.rb_node; - BUG_ON(private == NULL); + while (node != NULL) { + struct kgsl_mem_entry *entry; - gpuaddr &= PAGE_MASK; + entry = rb_entry(node, struct kgsl_mem_entry, node); - list_for_each_entry(entry, 
&private->mem_list, list) { - if (entry->memdesc.gpuaddr == gpuaddr) { - result = entry; - break; - } - } - return result; -} - -/*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, - size_t size) -{ - struct kgsl_mem_entry *entry = NULL, *result = NULL; - BUG_ON(private == NULL); + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) + return entry; - list_for_each_entry(entry, &private->mem_list, list) { - if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { - result = entry; - break; + if (gpuaddr < entry->memdesc.gpuaddr) + node = node->rb_left; + else if (gpuaddr >= + (entry->memdesc.gpuaddr + entry->memdesc.size)) + node = node->rb_right; + else { + return NULL; } } - return result; + return NULL; } EXPORT_SYMBOL(kgsl_sharedmem_find_region); +/*call with private->mem_lock locked */ +static inline struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +{ + return kgsl_sharedmem_find_region(private, gpuaddr, 1); +} + /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -789,6 +809,40 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } + case KGSL_PROP_GPU_RESET_STAT: + { + /* Return reset status of given context and clear it */ + uint32_t id; + struct kgsl_context *context; + + if (param->sizebytes != sizeof(unsigned int)) { + result = -EINVAL; + break; + } + /* We expect the value passed in to contain the context id */ + if (copy_from_user(&id, param->value, + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + context = kgsl_find_context(dev_priv, id); + if (!context) { + result = -EINVAL; + break; + } + /* + * Copy the reset status to value which also serves as + * the out parameter + */ + if (copy_to_user(param->value, &(context->reset_status), + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + /* Clear reset status once its been queried */ + context->reset_status = KGSL_CTX_STAT_NO_ERROR; + break; + } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -827,40 +881,6 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } -static bool check_ibdesc(struct kgsl_device_private *dev_priv, - struct kgsl_ibdesc *ibdesc, unsigned int numibs, - bool parse) -{ - bool result = true; - unsigned int i; - for (i = 0; i < numibs; i++) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d\n", ibdesc[i].gpuaddr, - ibdesc[i].sizedwords); - result = false; - break; - } - - if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d numibs %d/%d\n", - ibdesc[i].gpuaddr, - ibdesc[i].sizedwords, i+1, numibs); - result = false; - break; - } - } - return result; -} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -930,12 +950,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); - result = -EINVAL; - goto free_ibdesc; - } - result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -945,18 +959,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); - if (result != 0) - goto free_ibdesc; - - /* this is a check to try to detect if a command buffer was freed - * during issueibcmds(). - */ - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); - result = -EINVAL; - goto free_ibdesc; - } - free_ibdesc: kfree(ibdesc); done: @@ -988,7 +990,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - list_del(&entry->list); + rb_erase(&entry->node, &entry->priv->mem_rb); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1080,7 +1082,8 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - list_del(&entry->list); + rb_erase(&entry->node, &private->mem_rb); + spin_unlock(&private->mem_lock); if (entry) { @@ -1164,7 +1167,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, + result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1172,10 +1175,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); + result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); if (result) { - KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); - goto error_free_vmalloc; + KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); + goto error_free_alloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1190,7 +1193,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_vmalloc: +error_free_alloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1313,7 +1316,8 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); + memdesc->sg = kgsl_sg_alloc(sglen); + if (memdesc->sg == NULL) return -ENOMEM; @@ -1353,7 +1357,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, sglen); memdesc->sg = NULL; return -EINVAL; @@ -1488,11 +1492,8 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (kgsl_ion_client == NULL) { - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - if (kgsl_ion_client == NULL) - return -ENODEV; - } + if (IS_ERR_OR_NULL(kgsl_ion_client)) + return -ENODEV; handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1622,10 +1623,20 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; - error_put_file_ptr: - if (entry->priv_data) - 
fput(entry->priv_data); - +error_put_file_ptr: + switch (entry->memtype) { + case KGSL_MEM_ENTRY_PMEM: + case KGSL_MEM_ENTRY_ASHMEM: + if (entry->priv_data) + fput(entry->priv_data); + break; + case KGSL_MEM_ENTRY_ION: + ion_unmap_dma(kgsl_ion_client, entry->priv_data); + ion_free(kgsl_ion_client, entry->priv_data); + break; + default: + break; + } error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2029,7 +2040,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *tmp, *entry = NULL; + struct kgsl_mem_entry *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2040,13 +2051,11 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - list_for_each_entry(tmp, &private->mem_list, list) { - if (vma_offset == tmp->memdesc.gpuaddr) { - kgsl_mem_entry_get(tmp); - entry = tmp; - break; - } - } + entry = kgsl_sharedmem_find(private, vma_offset); + + if (entry) + kgsl_mem_entry_get(entry); + spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2102,8 +2111,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - if (cpu_is_msm8x60()) - wake_lock_destroy(&device->idle_wakelock); + wake_lock_destroy(&device->idle_wakelock); + pm_qos_remove_request(&device->pm_qos_req_dma); idr_destroy(&device->context_idr); @@ -2194,9 +2203,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - if (cpu_is_msm8x60()) - wake_lock_init(&device->idle_wakelock, - WAKE_LOCK_IDLE, device->name); + wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); + pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, + PM_QOS_DEFAULT_VALUE); idr_init(&device->context_idr); @@ -2242,6 +2251,8 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2339,22 +2350,30 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); kgsl_driver.ptpool = NULL; - device_unregister(&kgsl_driver.virtdev); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + + /* + * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() + * only if kgsl_driver.virtdev has been populated. + * We check at least one member of kgsl_driver.virtdev to + * see if it is not NULL (and thus, has been populated). 
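+	 * Otherwise we would tear down a device that was never registered.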
+ */ + if (kgsl_driver.virtdev.class) { + kgsl_sharedmem_uninit_sysfs(); + device_unregister(&kgsl_driver.virtdev); + } if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); - kgsl_sharedmem_uninit_sysfs(); + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index d3ae4b9bb..f027f95c4 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,13 +21,12 @@ #include #include #include +#include #define KGSL_NAME "kgsl" -/*< DTS2012042406822 hanfeng 20120428 begin*/ -/* Timestamp window used to detect rollovers */ +/* Timestamp window used to detect rollovers (half of integer range) */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 -/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -96,6 +95,8 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; + unsigned int page_alloc; + unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -107,7 +108,15 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc_ops; +struct kgsl_memdesc; + +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); + int (*map_kernel_mem)(struct kgsl_memdesc *); +}; /* shared memory allocation */ struct kgsl_memdesc { @@ -136,7 +145,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct list_head list; + struct rb_node node; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -186,27 +195,47 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } -static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, + +static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) +{ + if (memdesc->hostptr == NULL && memdesc->ops && + memdesc->ops->map_kernel_mem) + memdesc->ops->map_kernel_mem(memdesc); + + return memdesc->hostptr; +} + +static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || - (gpuaddr < memdesc->gpuaddr || - gpuaddr >= memdesc->gpuaddr + memdesc->size)) - return NULL; + if (memdesc->gpuaddr == 0 || + gpuaddr < memdesc->gpuaddr || + gpuaddr >= (memdesc->gpuaddr + memdesc->size) || + (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && + memdesc->ops->map_kernel_mem(memdesc))) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int new, unsigned int old) +static inline int timestamp_cmp(unsigned int a, unsigned int b) { - int ts_diff = new - old; - - if (ts_diff == 0) + /* check for equal */ + if (a == b) return 0; - /*< DTS2012042406822 hanfeng 20120428 begin*/ - return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 
1 : -1; - /* DTS2012042406822 hanfeng 20120428 end > */ + /* check for greater-than for non-rollover case */ + if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) + return 1; + + /* check for greater-than for rollover case + * note that <= is required to ensure that consistent + * results are returned for values whose difference is + * equal to the window size + */ + a += KGSL_TIMESTAMP_WINDOW; + b += KGSL_TIMESTAMP_WINDOW; + return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index e9455cb82..77aef1ff0 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,190 +497,6 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); -#define ADDRESS_STACK_SIZE 256 -#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) -static unsigned int kgsl_cffdump_addr_count; - -static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, - uint *hostaddr, bool check_only) -{ - static uint addr_stack[ADDRESS_STACK_SIZE]; - static uint size_stack[ADDRESS_STACK_SIZE]; - - switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER: - { - /* traverse indirect buffers */ - int i; - uint ibaddr = hostaddr[1]; - uint ibsize = hostaddr[2]; - - /* is this address already in encountered? */ - for (i = 0; - i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; - ++i) - ; - - if (kgsl_cffdump_addr_count == i) { - addr_stack[kgsl_cffdump_addr_count] = ibaddr; - size_stack[kgsl_cffdump_addr_count++] = ibsize; - - if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { - KGSL_CORE_ERR("stack overflow\n"); - return false; - } - - return kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibaddr, ibsize, check_only); - } else if (size_stack[i] != ibsize) { - KGSL_CORE_ERR("gpuaddr: 0x%08x, " - "wc: %u, with size wc: %u already on the " - "stack\n", ibaddr, ibsize, size_stack[i]); - return false; - } - } - break; - } - - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, - const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, - bool check_only) -{ - static uint level; /* recursion level */ - bool ret = true; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - - if (level == 0) - kgsl_cffdump_addr_count = 0; - - if (memdesc == NULL) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CORE_ERR("did not find mapping " - "for gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - memdesc = &entry->memdesc; - } - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CORE_ERR("no kernel mapping for " - "gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - - hoststart = hostaddr; - - level++; - - mb(); - kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, - KGSL_CACHE_OP_INV); -#ifdef DEBUG - pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); -#endif - - while (dwords_left > 0) { - int count = 0; /* dword count including packet header */ - bool cur_ret = true; - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = kgsl_cffdump_handle_type3(dev_priv, - hostaddr, check_only); - break; - default: - pr_warn("kgsl: cffdump: parse-ib: unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - ret = ret && cur_ret; - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - cur_ret = dwords_left >= 0; - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - - print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - - ret = ret && cur_ret; - } - - if (!ret) - pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - if (!check_only) { -#ifdef DEBUG - uint offset = gpuaddr - memdesc->gpuaddr; - pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " - "physaddr:%08x, offset:%d, size:%d", hoststart, - gpuaddr, memdesc->physaddr + offset, offset, - sizedwords*4); -#endif - kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, - false); - } - - level--; - - return ret; -} - static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index 2fb1e43f4..efec2af9e 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "kgsl.h" @@ -184,6 +185,7 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; + struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -197,13 +199,18 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; + /* + * Status indicating whether a gpu reset occurred and whether this + * context was responsible for causing it + */ + unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct list_head mem_list; + struct rb_root mem_rb; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index dba2dfcfb..ba48f9c75 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,8 +295,9 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - priv->memdesc.hostptr = - vmalloc_user(obj->size * priv->bufcount); + result = kgsl_sharedmem_page_alloc(&priv->memdesc, + priv->pagetable, + obj->size * priv->bufcount, 0); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1042,17 +1043,18 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) priv->memdesc.hostptr + offset; + i = offset >> PAGE_SHIFT; + page = sg_page(&(priv->memdesc.sg[i])); - page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100755 new mode 100644 index a16b95418..ba8a719e4 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt->priv; - return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; + return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,11 +398,11 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) - pt->priv; + struct kgsl_gpummu_pt *gpummu_pt; if (pt == NULL) return 0; + gpummu_pt = pt->priv; spin_lock(&pt->lock); if (gpummu_pt->tlb_flags && (1<sg, s, memdesc->sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index e4e561cef..5646d682a 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt->priv; - return pt && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt ? pt->priv : NULL; + return domain && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, 0); + memdesc->size, (IOMMU_READ | IOMMU_WRITE)); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100755 new mode 100644 index 003afb953..429e4946d --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -25,6 +24,7 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 +#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,10 +284,7 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); -/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -337,7 +334,8 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, + int requested_state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -349,7 +347,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) if (pwr->grp_clks[i]) 
clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (device->requested_state != KGSL_STATE_NAP)) + (requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -424,8 +422,12 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) - regulator_enable(pwr->gpu_reg); + if (pwr->gpu_reg) { + int status = regulator_enable(pwr->gpu_reg); + if (status) + KGSL_DRV_ERR(device, "regulator_enable " + "failed: %d\n", status); + } } } } @@ -512,10 +514,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -638,10 +637,8 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) - /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -708,10 +705,8 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - kgsl_pwrctrl_set_state(device, device->requested_state); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); + kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -753,10 +748,11 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -783,18 +779,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + device->pwrctrl.restore_slumber = true; device->ftbl->suspend_context(device); device->ftbl->stop(device); - device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -856,16 +852,17 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - - if (device->idle_wakelock.name) - wake_lock(&device->idle_wakelock); + wake_lock(&device->idle_wakelock); + if (device->pwrctrl.restore_slumber == false) + pm_qos_update_request(&device->pm_qos_req_dma, + GPU_SWFI_LATENCY); case KGSL_STATE_ACTIVE: break; default: @@ -881,7 +878,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -890,7 +887,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100755 new mode 100644 index 0c7ec6003..caaed92c8 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,9 +47,7 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100755 new mode 100644 index c2252edcf..d0b2a412c --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,10 +89,8 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ + PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); -/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100755 new mode 100644 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100755 new mode 100644 index 389ed6d4f..ae32e81ff --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -201,6 +203,10 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; + else if (!strncmp(attr->attr.name, "page_alloc", 10)) + val = kgsl_driver.stats.page_alloc; + else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) + val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -230,6 +236,8 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -239,6 +247,8 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, + &dev_attr_page_alloc, + &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -282,7 +292,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -293,17 +303,18 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) memdesc->hostptr + offset; - page = vmalloc_to_page((void *) pg); + i = offset >> PAGE_SHIFT; + page = sg_page(&memdesc->sg[i]); if (page == NULL) return VM_FAULT_SIGBUS; @@ -313,15 +324,23 @@ static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) { - kgsl_driver.stats.vmalloc -= memdesc->size; - vfree(memdesc->hostptr); + int i = 0; + struct scatterlist *sg; + kgsl_driver.stats.page_alloc -= memdesc->size; + if (memdesc->hostptr) { 
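+		/* release the kernel vmap mapping and its vmalloc accounting */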
+ vunmap(memdesc->hostptr); + kgsl_driver.stats.vmalloc -= memdesc->size; + } + if (memdesc->sg) + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + __free_page(sg_page(sg)); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -329,6 +348,42 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } +/* + * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address + * space + * + * @memdesc - The memory descriptor which contains information about the memory + * + * Return: 0 on success else error code + */ +static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) +{ + if (!memdesc->hostptr) { + pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); + struct page **pages = NULL; + struct scatterlist *sg; + int i; + /* create a list of pages to call vmap */ + pages = vmalloc(memdesc->sglen * sizeof(struct page *)); + if (!pages) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + memdesc->sglen * sizeof(struct page *)); + return -ENOMEM; + } + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + pages[i] = sg_page(sg); + memdesc->hostptr = vmap(pages, memdesc->sglen, + VM_IOREMAP, page_prot); + KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); + vfree(pages); + } + if (!memdesc->hostptr) + return -ENOMEM; + + return 0; +} + static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -368,12 +423,13 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_vmalloc_ops = { - .free = kgsl_vmalloc_free, - .vmflags = kgsl_vmalloc_vmflags, - .vmfault = kgsl_vmalloc_vmfault, +struct kgsl_memdesc_ops kgsl_page_alloc_ops = { + .free = kgsl_page_alloc_free, + .vmflags = kgsl_page_alloc_vmflags, + .vmfault = kgsl_page_alloc_vmfault, + .map_kernel_mem = kgsl_page_alloc_map_kernel, }; -EXPORT_SYMBOL(kgsl_vmalloc_ops); +EXPORT_SYMBOL(kgsl_page_alloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -407,9 +463,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - void *ptr, size_t size, unsigned int protflags) + size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -418,36 +474,43 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_vmalloc_ops; - memdesc->hostptr = (void *) ptr; + memdesc->ops = &kgsl_page_alloc_ops; + + memdesc->sg = kgsl_sg_alloc(sglen); - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } + kmemleak_not_leak(memdesc->sg); + memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { - struct page *page = vmalloc_to_page(ptr); + for (i = 0; i < memdesc->sglen; i++) { + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | + __GFP_HIGHMEM); if (!page) { - ret = -EINVAL; + ret = -ENOMEM; + memdesc->sglen = i; goto done; } + flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0); } - - 
kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); + outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, + KGSL_CACHE_OP_FLUSH); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, + kgsl_driver.stats.page_alloc_max); order = get_order(size); @@ -462,51 +525,41 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - void *ptr; - + int ret = 0; BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); - ptr = vmalloc(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", size); - return -ENOMEM; - } - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); + if (!ret) + ret = kgsl_page_alloc_map_kernel(memdesc); + if (ret) + kgsl_sharedmem_free(memdesc); + return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); int -kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { - void *ptr; unsigned int protflags; BUG_ON(size == 0); - ptr = vmalloc_user(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", - size, kgsl_driver.stats.vmalloc); - return -ENOMEM; - } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -554,7 +607,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, memdesc->sglen); memset(memdesc, 0, sizeof(*memdesc)); } @@ -686,3 +739,33 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); + +/* + * kgsl_sharedmem_map_vma - Map a user vma to physical memory + * + * @vma - The user vma to map + * @memdesc - The memory descriptor which contains information about the + * physical memory + * + * Return: 0 on success else error code + */ +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc) +{ + unsigned long addr = vma->vm_start; + unsigned long size = vma->vm_end - vma->vm_start; + int ret, i = 0; + + if (!memdesc->sg || (size != memdesc->size) || + (memdesc->sglen != (size / PAGE_SIZE))) + return -EINVAL; + + for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { + ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100755 new mode 100644 index 67a1c2d7b..a67d9c657 --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,6 +17,8 @@ #include #include #include "kgsl_mmu.h" +#include +#include struct kgsl_device; struct kgsl_process_private; @@ -28,19 +30,12 @@ struct 
kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); -}; - -extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; +extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; -int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -76,19 +71,58 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); +static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) +{ + /* + * Try sg_dma_address first to support ion carveout + * regions which do not work with sg_phys(). + */ + unsigned int pa = sg_dma_address(sg); + if (pa == 0) + pa = sg_phys(sg); + return pa; +} + +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc); + +/* + * For relatively small sglists, it is preferable to use kzalloc + * rather than going down the vmalloc rat hole. If the size of + * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to + * vmalloc + */ + +static inline void *kgsl_sg_alloc(unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); + else + return vmalloc(sglen * sizeof(struct scatterlist)); +} + +static inline void kgsl_sg_free(void *ptr, unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); +} + static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - struct page *page = phys_to_page(physaddr); + memdesc->sg = kgsl_sg_alloc(1); - memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); - if (memdesc->sg == NULL) - return -ENOMEM; + kmemleak_not_leak(memdesc->sg); memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - sg_set_page(&memdesc->sg[0], page, size, 0); + memdesc->sg[0].length = size; + memdesc->sg[0].offset = 0; + memdesc->sg[0].dma_address = physaddr; return 0; } @@ -98,7 +132,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); + return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); } static inline int @@ -109,7 +143,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 72df148bc..394bc83bd 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,7 +10,6 @@ * GNU General Public License for more details. 
*/ -#include #include #include #include @@ -283,6 +282,12 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; + /* log buffer info to aid in ramdump recovery */ + KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", + device->snapshot, __pa(device->snapshot), + device->snapshot_size); + if (hang) + sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -432,7 +437,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); + device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); if (device->snapshot == NULL) return -ENOMEM; @@ -475,7 +480,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - vfree(device->snapshot); + kfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100755 new mode 100644 index cb3da9075..d721a577a --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,13 +157,6 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, -#ifdef CONFIG_HAS_EARLYSUSPEND - .display_off = { - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, - }, -#endif }, }; @@ -195,13 +188,6 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, - .display_off = { -#ifdef CONFIG_HAS_EARLYSUSPEND - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, -#endif - }, }, }; @@ -407,7 +393,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_memdesc tmp = {0}; + struct kgsl_mem_entry *entry = NULL; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -425,8 +411,30 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - - tmp.hostptr = (void *)*timestamp; + /* + * Get a kernel mapping to the IB for monkey patching. + * See the end of this function. + */ + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, + sizedwords); + if (entry == NULL) { + KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", + cmd, sizedwords); + result = -EINVAL; + goto error; + } + /* + * This will only map memory if it exists, otherwise it will reuse the + * mapping. And the 2d userspace reuses IBs so we likely won't create + * too many mappings. 
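+	 * The kernel mapping persists until the memory entry is freed, so
+	 * later submissions of the same IB reuse it.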
+ */ + if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { + KGSL_DRV_ERR(device, + "Cannot make kernel mapping for gpuaddr 0x%x\n", + cmd); + result = -EINVAL; + goto error; + } KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -468,12 +476,13 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - tmp.hostptr = (void *)(tmp.hostptr + - (sizedwords * sizeof(unsigned int))); - tmp.size = 12; - - kgsl_sharedmem_writel(&tmp, 4, nextaddr); - kgsl_sharedmem_writel(&tmp, 8, nextcnt); + /* monkey patch the IB so that it jumps back to the ringbuffer */ + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 1) * sizeof(unsigned int)), + nextaddr); + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 2) * sizeof(unsigned int)), + nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100755 new mode 100644 index a1d267893..7837bad21 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,6 +34,16 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 +/* + * Reset status values for context + */ +enum kgsl_ctx_reset_stat { + KGSL_CTX_STAT_NO_ERROR = 0x00000000, + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, + KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 +}; + #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -110,6 +120,7 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, + KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -146,9 +157,7 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From 127466b8f9d3e953c3b770e9012a814f3f065154 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 13 Jan 2013 17:04:43 +0200 Subject: [PATCH 05/19] updated Atmel TS for fix the touchscreen lag --- .../touchscreen/atmel_i2c_rmi_QT602240.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c index d6a9173f8..9a931c0f3 100755 --- a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c +++ b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c @@ -114,12 +114,13 @@ and the height of the key region is 8.5mm, TS_Y_MAX * 8.5 /91.5 */ #define EXTRA_MAX_TOUCH_KEY 4 #define TS_KEY_DEBOUNCE_TIMER_MS 60 -static int vibrate=30; +static int vibrate=20; module_param(vibrate, int, 00644); void msm_timed_vibrate(int); + /* to define a region of touch panel */ typedef struct { @@ -834,7 +835,7 @@ int write_power_config(int on) /* < DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 1) = 14; //0xff//Active Acquisition + *(tmp + 1) = 16; //0xff//Active Acquisition /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2011042106137 zhangtao 20110509 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -1196,7 +1197,7 @@ int write_gripfacesuppression_config(u8 instance) /* < 
DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* turn off the fripfacesuppression */ - *(tmp + 0) = 0x00; //0x05; //ctrl + *(tmp + 0) = 0x07; //0x05; //ctrl /* DTS2011042106137 zhangtao 20110509 end > */ /* < DTS2010073101113 zhangtao 20100819 begin */ *(tmp + 1) = 0; //xlogrip @@ -1206,7 +1207,7 @@ int write_gripfacesuppression_config(u8 instance) *(tmp + 5) = 0; //maxtchs *(tmp + 6) = 0; //reserved *(tmp + 7) = 80; //szthr1 - *(tmp + 8) = 40; //szthr2 + *(tmp + 8) = 20; //szthr2 *(tmp + 9) = 4; //shpthr1 *(tmp + 10) = 35; //shpthr2 *(tmp + 11) = 10; //supextto @@ -2399,6 +2400,7 @@ static void atmel_ts_work_func(struct work_struct *work) input_report_key(ts->key_input, key_tmp, 0); key_pressed1 = 0; + msm_timed_vibrate(vibrate); ATMEL_DBG_MASK("when the key is released report!\n"); } } @@ -2407,7 +2409,6 @@ static void atmel_ts_work_func(struct work_struct *work) if(0 == key_pressed1) { input_report_key(ts->key_input, key_tmp, 1); - msm_timed_vibrate(vibrate); key_pressed1 = 1; ATMEL_DBG_MASK("the key is pressed report!\n"); } @@ -2457,7 +2458,7 @@ static void atmel_ts_work_func(struct work_struct *work) if (ts->test > 0) key_pressed = KEY_BRL_DOT1; else - key_pressed = KEY_SEARCH; + key_pressed = KEY_SEARCH; touch_input_report_key(ts, key_pressed, 1); input_sync(ts->input_dev); msm_timed_vibrate(vibrate); @@ -2502,8 +2503,8 @@ static void atmel_ts_work_func(struct work_struct *work) default: break; } - - + + break; /* < DTS2010083103149 zhangtao 20100909 begin */ case PROCG_GRIPFACESUPPRESSION_T20: @@ -2515,7 +2516,7 @@ static void atmel_ts_work_func(struct work_struct *work) break; /* DTS2010083103149 zhangtao 20100909 end > */ - + default: TS_DEBUG_TS("T%d detect\n", obj); break; From 40260614bf30a451786dc462fa603f584ce41d41 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 13 Jan 2013 17:08:07 +0200 Subject: [PATCH 06/19] enable swap --- arch/arm/configs/u8800_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 1e9a7a2c9..bb92b53f4 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -45,7 +45,7 @@ CONFIG_HAVE_KERNEL_LZO=y CONFIG_KERNEL_LZMA=y # CONFIG_KERNEL_LZO is not set CONFIG_DEFAULT_HOSTNAME="(none)" -# CONFIG_SWAP is not set +CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y # CONFIG_POSIX_MQUEUE is not set From ea79d6e1b28f75b1d7fdc8f0ddd55c05d9c66799 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:30:34 +0200 Subject: [PATCH 07/19] added smartassV2 governor & change default ARCH to arm --- Makefile | 2 +- arch/arm/configs/u8800_defconfig | 1 + drivers/cpufreq/Kconfig | 15 + drivers/cpufreq/Makefile | 1 + drivers/cpufreq/cpufreq_smartass2.c | 868 ++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 + 6 files changed, 889 insertions(+), 1 deletion(-) create mode 100644 drivers/cpufreq/cpufreq_smartass2.c diff --git a/Makefile b/Makefile index 3ab214703..c51d9cdd9 100755 --- a/Makefile +++ b/Makefile @@ -192,7 +192,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ # Default value for CROSS_COMPILE is not to prefix executables # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile export KBUILD_BUILDHOST := $(SUBARCH) -ARCH ?= $(SUBARCH) +ARCH ?= arm CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) # Architecture as present in compile.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 
bb92b53f4..60c6fc0d0 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -596,6 +596,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_INTERACTIVE=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SMARTASS2=y # CONFIG_CPU_IDLE is not set CONFIG_CPU_FREQ_MSM=y diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 194708850..6f643138e 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -109,6 +109,13 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE loading your cpufreq low-level hardware driver, using the 'interactive' governor for latency-sensitive workloads. +config CPU_FREQ_DEFAULT_GOV_SMARTASS2 + bool "smartass2" + select CPU_FREQ_GOV_SMARTASS2 + select CPU_FREQ_GOV_PERFORMANCE + help + Use the CPUFreq governor 'smartassV2' as default. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -206,6 +213,14 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SMARTASS2 + tristate "'smartassV2' cpufreq governor" + depends on CPU_FREQ + help + 'smartassV2' - a "smart" governor + + If in doubt, say N. + menu "x86 CPU frequency scaling drivers" depends on X86 source "drivers/cpufreq/Kconfig.x86" diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index c044060a4..e9261b0cc 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o +obj-$(CONFIG_CPU_FREQ_GOV_SMARTASS2) += cpufreq_smartass2.o # CPUfreq cross-arch helpers obj-$(CONFIG_CPU_FREQ_TABLE) += freq_table.o diff --git a/drivers/cpufreq/cpufreq_smartass2.c b/drivers/cpufreq/cpufreq_smartass2.c new file mode 100644 index 000000000..e00524992 --- /dev/null +++ b/drivers/cpufreq/cpufreq_smartass2.c @@ -0,0 +1,868 @@ +/* + * drivers/cpufreq/cpufreq_smartass2.c + * + * Copyright (C) 2010 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Erasmux + * + * Based on the interactive governor By Mike Chan (mike@android.com) + * which was adaptated to 2.6.29 kernel by Nadlabak (pavel@doshaska.net) + * + * SMP support based on mod by faux123 + * + * For a general overview of smartassV2 see the relavent part in + * Documentation/cpu-freq/governors.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/******************** Tunable parameters: ********************/ + +/* + * The "ideal" frequency to use when awake. The governor will ramp up faster + * towards the ideal frequency and slower after it has passed it. Similarly, + * lowering the frequency towards the ideal frequency is faster than below it. + */ +#define DEFAULT_AWAKE_IDEAL_FREQ 768000 +static unsigned int awake_ideal_freq; + +/* + * The "ideal" frequency to use when suspended. 
+ * When set to 0, the governor will not track the suspended state (meaning
+ * that practically when sleep_ideal_freq==0 the awake_ideal_freq is used
+ * also when suspended).
+ */
+#define DEFAULT_SLEEP_IDEAL_FREQ 245000
+static unsigned int sleep_ideal_freq;
+
+/*
+ * Frequency delta when ramping up above the ideal frequency.
+ * Zero disables and causes to always jump straight to max frequency.
+ * When below the ideal frequency we always ramp up to the ideal freq.
+ */
+#define DEFAULT_RAMP_UP_STEP 256000
+static unsigned int ramp_up_step;
+
+/*
+ * Frequency delta when ramping down below the ideal frequency.
+ * Zero disables and will calculate ramp down according to load heuristic.
+ * When above the ideal frequency we always ramp down to the ideal freq.
+ */
+#define DEFAULT_RAMP_DOWN_STEP 256000
+static unsigned int ramp_down_step;
+
+/*
+ * CPU freq will be increased if measured load > max_cpu_load;
+ */
+#define DEFAULT_MAX_CPU_LOAD 50
+static unsigned long max_cpu_load;
+
+/*
+ * CPU freq will be decreased if measured load < min_cpu_load;
+ */
+#define DEFAULT_MIN_CPU_LOAD 25
+static unsigned long min_cpu_load;
+
+/*
+ * The minimum amount of time to spend at a frequency before we can ramp up.
+ * Notice we ignore this when we are below the ideal frequency.
+ */
+#define DEFAULT_UP_RATE_US 48000;
+static unsigned long up_rate_us;
+
+/*
+ * The minimum amount of time to spend at a frequency before we can ramp down.
+ * Notice we ignore this when we are above the ideal frequency.
+ */
+#define DEFAULT_DOWN_RATE_US 99000;
+static unsigned long down_rate_us;
+
+/*
+ * The frequency to set when waking up from sleep.
+ * When sleep_ideal_freq=0 this will have no effect.
+ */
+#define DEFAULT_SLEEP_WAKEUP_FREQ 1024000
+static unsigned int sleep_wakeup_freq;
+
+/*
+ * Sampling rate, I highly recommend to leave it at 2.
+ */
+#define DEFAULT_SAMPLE_RATE_JIFFIES 2
+static unsigned int sample_rate_jiffies;
+
+
+/*************** End of tunables ***************/
+
+
+static void (*pm_idle_old)(void);
+static atomic_t active_count = ATOMIC_INIT(0);
+
+struct smartass_info_s {
+	struct cpufreq_policy *cur_policy;
+	struct cpufreq_frequency_table *freq_table;
+	struct timer_list timer;
+	u64 time_in_idle;
+	u64 idle_exit_time;
+	u64 freq_change_time;
+	u64 freq_change_time_in_idle;
+	int cur_cpu_load;
+	int old_freq;
+	int ramp_dir;
+	unsigned int enable;
+	int ideal_speed;
+};
+static DEFINE_PER_CPU(struct smartass_info_s, smartass_info);
+
+/* Workqueues handle frequency scaling */
+static struct workqueue_struct *up_wq;
+static struct workqueue_struct *down_wq;
+static struct work_struct freq_scale_work;
+
+static cpumask_t work_cpumask;
+static spinlock_t cpumask_lock;
+
+static unsigned int suspended;
+
+#define dprintk(flag,msg...) do { \
+	if (debug_mask & flag) printk(KERN_DEBUG msg); \
+	} while (0)
+
+enum {
+	SMARTASS_DEBUG_JUMPS=1,
+	SMARTASS_DEBUG_LOAD=2,
+	SMARTASS_DEBUG_ALG=4
+};
+
+/*
+ * Combination of the above debug flags.
+ */ +static unsigned long debug_mask; + +static int cpufreq_governor_smartass(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2 +static +#endif +struct cpufreq_governor cpufreq_gov_smartass2 = { + .name = "smartassV2", + .governor = cpufreq_governor_smartass, + .max_transition_latency = 6000000, + .owner = THIS_MODULE, +}; + +inline static void smartass_update_min_max(struct smartass_info_s *this_smartass, struct cpufreq_policy *policy, int suspend) { + if (suspend) { + this_smartass->ideal_speed = // sleep_ideal_freq; but make sure it obeys the policy min/max + policy->max > sleep_ideal_freq ? + (sleep_ideal_freq > policy->min ? sleep_ideal_freq : policy->min) : policy->max; + } else { + this_smartass->ideal_speed = // awake_ideal_freq; but make sure it obeys the policy min/max + policy->min < awake_ideal_freq ? + (awake_ideal_freq < policy->max ? awake_ideal_freq : policy->max) : policy->min; + } +} + +inline static void smartass_update_min_max_allcpus(void) { + unsigned int i; + for_each_online_cpu(i) { + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, i); + if (this_smartass->enable) + smartass_update_min_max(this_smartass,this_smartass->cur_policy,suspended); + } +} + +inline static unsigned int validate_freq(struct cpufreq_policy *policy, int freq) { + if (freq > (int)policy->max) + return policy->max; + if (freq < (int)policy->min) + return policy->min; + return freq; +} + +inline static void reset_timer(unsigned long cpu, struct smartass_info_s *this_smartass) { + this_smartass->time_in_idle = get_cpu_idle_time_us(cpu, &this_smartass->idle_exit_time); + mod_timer(&this_smartass->timer, jiffies + sample_rate_jiffies); +} + +inline static void work_cpumask_set(unsigned long cpu) { + unsigned long flags; + spin_lock_irqsave(&cpumask_lock, flags); + cpumask_set_cpu(cpu, &work_cpumask); + spin_unlock_irqrestore(&cpumask_lock, flags); +} + +inline static int work_cpumask_test_and_clear(unsigned long cpu) { + unsigned long flags; + int res = 0; + spin_lock_irqsave(&cpumask_lock, flags); + res = cpumask_test_and_clear_cpu(cpu, &work_cpumask); + spin_unlock_irqrestore(&cpumask_lock, flags); + return res; +} + +inline static int target_freq(struct cpufreq_policy *policy, struct smartass_info_s *this_smartass, + int new_freq, int old_freq, int prefered_relation) { + int index, target; + struct cpufreq_frequency_table *table = this_smartass->freq_table; + + if (new_freq == old_freq) + return 0; + new_freq = validate_freq(policy,new_freq); + if (new_freq == old_freq) + return 0; + + if (table && + !cpufreq_frequency_table_target(policy,table,new_freq,prefered_relation,&index)) + { + target = table[index].frequency; + if (target == old_freq) { + // if for example we are ramping up to *at most* current + ramp_up_step + // but there is no such frequency higher than the current, try also + // to ramp up to *at least* current + ramp_up_step. 
+ if (new_freq > old_freq && prefered_relation==CPUFREQ_RELATION_H + && !cpufreq_frequency_table_target(policy,table,new_freq, + CPUFREQ_RELATION_L,&index)) + target = table[index].frequency; + // simlarly for ramping down: + else if (new_freq < old_freq && prefered_relation==CPUFREQ_RELATION_L + && !cpufreq_frequency_table_target(policy,table,new_freq, + CPUFREQ_RELATION_H,&index)) + target = table[index].frequency; + } + + if (target == old_freq) { + // We should not get here: + // If we got here we tried to change to a validated new_freq which is different + // from old_freq, so there is no reason for us to remain at same frequency. + printk(KERN_WARNING "Smartass: frequency change failed: %d to %d => %d\n", + old_freq,new_freq,target); + return 0; + } + } + else target = new_freq; + + __cpufreq_driver_target(policy, target, prefered_relation); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassQ: jumping from %d to %d => %d (%d)\n", + old_freq,new_freq,target,policy->cur); + + return target; +} + +static void cpufreq_smartass_timer(unsigned long cpu) +{ + u64 delta_idle; + u64 delta_time; + int cpu_load; + int old_freq; + u64 update_time; + u64 now_idle; + int queued_work = 0; + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, cpu); + struct cpufreq_policy *policy = this_smartass->cur_policy; + + now_idle = get_cpu_idle_time_us(cpu, &update_time); + old_freq = policy->cur; + + if (this_smartass->idle_exit_time == 0 || update_time == this_smartass->idle_exit_time) + return; + + delta_idle = cputime64_sub(now_idle, this_smartass->time_in_idle); + delta_time = cputime64_sub(update_time, this_smartass->idle_exit_time); + + // If timer ran less than 1ms after short-term sample started, retry. + if (delta_time < 1000) { + if (!timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + return; + } + + if (delta_idle > delta_time) + cpu_load = 0; + else + cpu_load = 100 * (unsigned int)(delta_time - delta_idle) / (unsigned int)delta_time; + + dprintk(SMARTASS_DEBUG_LOAD,"smartassT @ %d: load %d (delta_time %llu)\n", + old_freq,cpu_load,delta_time); + + this_smartass->cur_cpu_load = cpu_load; + this_smartass->old_freq = old_freq; + + // Scale up if load is above max or if there where no idle cycles since coming out of idle, + // additionally, if we are at or above the ideal_speed, verify we have been at this frequency + // for at least up_rate_us: + if (cpu_load > max_cpu_load || delta_idle == 0) + { + if (old_freq < policy->max && + (old_freq < this_smartass->ideal_speed || delta_idle == 0 || + cputime64_sub(update_time, this_smartass->freq_change_time) >= up_rate_us)) + { + dprintk(SMARTASS_DEBUG_ALG,"smartassT @ %d ramp up: load %d (delta_idle %llu)\n", + old_freq,cpu_load,delta_idle); + this_smartass->ramp_dir = 1; + work_cpumask_set(cpu); + queue_work(up_wq, &freq_scale_work); + queued_work = 1; + } + else this_smartass->ramp_dir = 0; + } + // Similarly for scale down: load should be below min and if we are at or below ideal + // frequency we require that we have been at this frequency for at least down_rate_us: + else if (cpu_load < min_cpu_load && old_freq > policy->min && + (old_freq > this_smartass->ideal_speed || + cputime64_sub(update_time, this_smartass->freq_change_time) >= down_rate_us)) + { + dprintk(SMARTASS_DEBUG_ALG,"smartassT @ %d ramp down: load %d (delta_idle %llu)\n", + old_freq,cpu_load,delta_idle); + this_smartass->ramp_dir = -1; + work_cpumask_set(cpu); + queue_work(down_wq, &freq_scale_work); + queued_work = 1; + } + else this_smartass->ramp_dir = 
0; + + // To avoid unnecessary load when the CPU is already at high load, we don't + // reset ourselves if we are at max speed. If and when there are idle cycles, + // the idle loop will activate the timer. + // Additionally, if we queued some work, the work task will reset the timer + // after it has done its adjustments. + if (!queued_work && old_freq < policy->max) + reset_timer(cpu,this_smartass); +} + +static void cpufreq_idle(void) +{ + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, smp_processor_id()); + struct cpufreq_policy *policy = this_smartass->cur_policy; + + if (!this_smartass->enable) { + pm_idle_old(); + return; + } + + if (policy->cur == policy->min && timer_pending(&this_smartass->timer)) + del_timer(&this_smartass->timer); + + pm_idle_old(); + + if (!timer_pending(&this_smartass->timer)) + reset_timer(smp_processor_id(), this_smartass); +} + +/* We use the same work function to sale up and down */ +static void cpufreq_smartass_freq_change_time_work(struct work_struct *work) +{ + unsigned int cpu; + int new_freq; + int old_freq; + int ramp_dir; + struct smartass_info_s *this_smartass; + struct cpufreq_policy *policy; + unsigned int relation = CPUFREQ_RELATION_L; + for_each_possible_cpu(cpu) { + this_smartass = &per_cpu(smartass_info, cpu); + if (!work_cpumask_test_and_clear(cpu)) + continue; + + ramp_dir = this_smartass->ramp_dir; + this_smartass->ramp_dir = 0; + + old_freq = this_smartass->old_freq; + policy = this_smartass->cur_policy; + + if (old_freq != policy->cur) { + // frequency was changed by someone else? + printk(KERN_WARNING "Smartass: frequency changed by 3rd party: %d to %d\n", + old_freq,policy->cur); + new_freq = old_freq; + } + else if (ramp_dir > 0 && nr_running() > 1) { + // ramp up logic: + if (old_freq < this_smartass->ideal_speed) + new_freq = this_smartass->ideal_speed; + else if (ramp_up_step) { + new_freq = old_freq + ramp_up_step; + relation = CPUFREQ_RELATION_H; + } + else { + new_freq = policy->max; + relation = CPUFREQ_RELATION_H; + } + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d ramp up: ramp_dir=%d ideal=%d\n", + old_freq,ramp_dir,this_smartass->ideal_speed); + } + else if (ramp_dir < 0) { + // ramp down logic: + if (old_freq > this_smartass->ideal_speed) { + new_freq = this_smartass->ideal_speed; + relation = CPUFREQ_RELATION_H; + } + else if (ramp_down_step) + new_freq = old_freq - ramp_down_step; + else { + // Load heuristics: Adjust new_freq such that, assuming a linear + // scaling of load vs. frequency, the load in the new frequency + // will be max_cpu_load: + new_freq = old_freq * this_smartass->cur_cpu_load / max_cpu_load; + if (new_freq > old_freq) // min_cpu_load > max_cpu_load ?! + new_freq = old_freq -1; + } + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d ramp down: ramp_dir=%d ideal=%d\n", + old_freq,ramp_dir,this_smartass->ideal_speed); + } + else { // ramp_dir==0 ?! Could the timer change its mind about a queued ramp up/down + // before the work task gets to run? 
+ // This may also happen if we refused to ramp up because the nr_running()==1 + new_freq = old_freq; + dprintk(SMARTASS_DEBUG_ALG,"smartassQ @ %d nothing: ramp_dir=%d nr_running=%lu\n", + old_freq,ramp_dir,nr_running()); + } + + // do actual ramp up (returns 0, if frequency change failed): + new_freq = target_freq(policy,this_smartass,new_freq,old_freq,relation); + if (new_freq) + this_smartass->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu,&this_smartass->freq_change_time); + + // reset timer: + if (new_freq < policy->max) + reset_timer(cpu,this_smartass); + // if we are maxed out, it is pointless to use the timer + // (idle cycles wake up the timer when the timer comes) + else if (timer_pending(&this_smartass->timer)) + del_timer(&this_smartass->timer); + } +} + +static ssize_t show_debug_mask(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", debug_mask); +} + +static ssize_t store_debug_mask(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0) + debug_mask = input; + return res; +} + +static ssize_t show_up_rate_us(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", up_rate_us); +} + +static ssize_t store_up_rate_us(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0 && input <= 100000000) + up_rate_us = input; + return res; +} + +static ssize_t show_down_rate_us(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", down_rate_us); +} + +static ssize_t store_down_rate_us(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0 && input <= 100000000) + down_rate_us = input; + return res; +} + +static ssize_t show_sleep_ideal_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sleep_ideal_freq); +} + +static ssize_t store_sleep_ideal_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) { + sleep_ideal_freq = input; + if (suspended) + smartass_update_min_max_allcpus(); + } + return res; +} + +static ssize_t show_sleep_wakeup_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", sleep_wakeup_freq); +} + +static ssize_t store_sleep_wakeup_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + sleep_wakeup_freq = input; + return res; +} + +static ssize_t show_awake_ideal_freq(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", awake_ideal_freq); +} + +static ssize_t store_awake_ideal_freq(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) { + awake_ideal_freq = input; + if (!suspended) + smartass_update_min_max_allcpus(); + } + return res; +} + +static ssize_t show_sample_rate_jiffies(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return 
sprintf(buf, "%u\n", sample_rate_jiffies); +} + +static ssize_t store_sample_rate_jiffies(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input <= 1000) + sample_rate_jiffies = input; + return res; +} + +static ssize_t show_ramp_up_step(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ramp_up_step); +} + +static ssize_t store_ramp_up_step(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + ramp_up_step = input; + return res; +} + +static ssize_t show_ramp_down_step(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ramp_down_step); +} + +static ssize_t store_ramp_down_step(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input >= 0) + ramp_down_step = input; + return res; +} + +static ssize_t show_max_cpu_load(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", max_cpu_load); +} + +static ssize_t store_max_cpu_load(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input <= 100) + max_cpu_load = input; + return res; +} + +static ssize_t show_min_cpu_load(struct kobject *kobj, struct attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", min_cpu_load); +} + +static ssize_t store_min_cpu_load(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) +{ + ssize_t res; + unsigned long input; + res = strict_strtoul(buf, 0, &input); + if (res >= 0 && input > 0 && input < 100) + min_cpu_load = input; + return res; +} + +#define define_global_rw_attr(_name) \ +static struct global_attr _name##_attr = \ + __ATTR(_name, 0644, show_##_name, store_##_name) + +define_global_rw_attr(debug_mask); +define_global_rw_attr(up_rate_us); +define_global_rw_attr(down_rate_us); +define_global_rw_attr(sleep_ideal_freq); +define_global_rw_attr(sleep_wakeup_freq); +define_global_rw_attr(awake_ideal_freq); +define_global_rw_attr(sample_rate_jiffies); +define_global_rw_attr(ramp_up_step); +define_global_rw_attr(ramp_down_step); +define_global_rw_attr(max_cpu_load); +define_global_rw_attr(min_cpu_load); + +static struct attribute * smartass_attributes[] = { + &debug_mask_attr.attr, + &up_rate_us_attr.attr, + &down_rate_us_attr.attr, + &sleep_ideal_freq_attr.attr, + &sleep_wakeup_freq_attr.attr, + &awake_ideal_freq_attr.attr, + &sample_rate_jiffies_attr.attr, + &ramp_up_step_attr.attr, + &ramp_down_step_attr.attr, + &max_cpu_load_attr.attr, + &min_cpu_load_attr.attr, + NULL, +}; + +static struct attribute_group smartass_attr_group = { + .attrs = smartass_attributes, + .name = "smartass", +}; + +static int cpufreq_governor_smartass(struct cpufreq_policy *new_policy, + unsigned int event) +{ + unsigned int cpu = new_policy->cpu; + int rc; + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, cpu); + + switch (event) { + case CPUFREQ_GOV_START: + if ((!cpu_online(cpu)) || (!new_policy->cur)) + return -EINVAL; + + this_smartass->cur_policy = new_policy; + + this_smartass->enable = 1; + + 
smartass_update_min_max(this_smartass,new_policy,suspended); + + this_smartass->freq_table = cpufreq_frequency_get_table(cpu); + if (!this_smartass->freq_table) + printk(KERN_WARNING "Smartass: no frequency table for cpu %d?!\n",cpu); + + smp_wmb(); + + // Do not register the idle hook and create sysfs + // entries if we have already done so. + if (atomic_inc_return(&active_count) <= 1) { + rc = sysfs_create_group(cpufreq_global_kobject, + &smartass_attr_group); + if (rc) + return rc; + + pm_idle_old = pm_idle; + pm_idle = cpufreq_idle; + } + + if (this_smartass->cur_policy->cur < new_policy->max && !timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + + break; + + case CPUFREQ_GOV_LIMITS: + smartass_update_min_max(this_smartass,new_policy,suspended); + + if (this_smartass->cur_policy->cur > new_policy->max) { + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassI: jumping to new max freq: %d\n",new_policy->max); + __cpufreq_driver_target(this_smartass->cur_policy, + new_policy->max, CPUFREQ_RELATION_H); + } + else if (this_smartass->cur_policy->cur < new_policy->min) { + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassI: jumping to new min freq: %d\n",new_policy->min); + __cpufreq_driver_target(this_smartass->cur_policy, + new_policy->min, CPUFREQ_RELATION_L); + } + + if (this_smartass->cur_policy->cur < new_policy->max && !timer_pending(&this_smartass->timer)) + reset_timer(cpu,this_smartass); + + break; + + case CPUFREQ_GOV_STOP: + this_smartass->enable = 0; + smp_wmb(); + del_timer(&this_smartass->timer); + flush_work(&freq_scale_work); + this_smartass->idle_exit_time = 0; + + if (atomic_dec_return(&active_count) <= 1) { + sysfs_remove_group(cpufreq_global_kobject, + &smartass_attr_group); + pm_idle = pm_idle_old; + } + break; + } + + return 0; +} + +static void smartass_suspend(int cpu, int suspend) +{ + struct smartass_info_s *this_smartass = &per_cpu(smartass_info, smp_processor_id()); + struct cpufreq_policy *policy = this_smartass->cur_policy; + unsigned int new_freq; + + if (!this_smartass->enable) + return; + + smartass_update_min_max(this_smartass,policy,suspend); + if (!suspend) { // resume at max speed: + new_freq = validate_freq(policy,sleep_wakeup_freq); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassS: awaking at %d\n",new_freq); + + __cpufreq_driver_target(policy, new_freq, + CPUFREQ_RELATION_L); + } else { + // to avoid wakeup issues with quick sleep/wakeup don't change actual frequency when entering sleep + // to allow some time to settle down. Instead we just reset our statistics (and reset the timer). + // Eventually, the timer will adjust the frequency if necessary. 
+ + this_smartass->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu,&this_smartass->freq_change_time); + + dprintk(SMARTASS_DEBUG_JUMPS,"SmartassS: suspending at %d\n",policy->cur); + } + + reset_timer(smp_processor_id(),this_smartass); +} + +static void smartass_early_suspend(struct early_suspend *handler) { + int i; + if (suspended || sleep_ideal_freq==0) // disable behavior for sleep_ideal_freq==0 + return; + suspended = 1; + for_each_online_cpu(i) + smartass_suspend(i,1); +} + +static void smartass_late_resume(struct early_suspend *handler) { + int i; + if (!suspended) // already not suspended so nothing to do + return; + suspended = 0; + for_each_online_cpu(i) + smartass_suspend(i,0); +} + +static struct early_suspend smartass_power_suspend = { + .suspend = smartass_early_suspend, + .resume = smartass_late_resume, +#ifdef CONFIG_MACH_HERO + .level = EARLY_SUSPEND_LEVEL_DISABLE_FB + 1, +#endif +}; + +static int __init cpufreq_smartass_init(void) +{ + unsigned int i; + struct smartass_info_s *this_smartass; + debug_mask = 0; + up_rate_us = DEFAULT_UP_RATE_US; + down_rate_us = DEFAULT_DOWN_RATE_US; + sleep_ideal_freq = DEFAULT_SLEEP_IDEAL_FREQ; + sleep_wakeup_freq = DEFAULT_SLEEP_WAKEUP_FREQ; + awake_ideal_freq = DEFAULT_AWAKE_IDEAL_FREQ; + sample_rate_jiffies = DEFAULT_SAMPLE_RATE_JIFFIES; + ramp_up_step = DEFAULT_RAMP_UP_STEP; + ramp_down_step = DEFAULT_RAMP_DOWN_STEP; + max_cpu_load = DEFAULT_MAX_CPU_LOAD; + min_cpu_load = DEFAULT_MIN_CPU_LOAD; + + spin_lock_init(&cpumask_lock); + + suspended = 0; + + /* Initalize per-cpu data: */ + for_each_possible_cpu(i) { + this_smartass = &per_cpu(smartass_info, i); + this_smartass->enable = 0; + this_smartass->cur_policy = 0; + this_smartass->ramp_dir = 0; + this_smartass->time_in_idle = 0; + this_smartass->idle_exit_time = 0; + this_smartass->freq_change_time = 0; + this_smartass->freq_change_time_in_idle = 0; + this_smartass->cur_cpu_load = 0; + // intialize timer: + init_timer_deferrable(&this_smartass->timer); + this_smartass->timer.function = cpufreq_smartass_timer; + this_smartass->timer.data = i; + work_cpumask_test_and_clear(i); + } + + // Scale up is high priority + up_wq = alloc_workqueue("ksmartass_up", WQ_HIGHPRI, 1); + down_wq = alloc_workqueue("ksmartass_down", 0, 1); + if (!up_wq || !down_wq) + return -ENOMEM; + + INIT_WORK(&freq_scale_work, cpufreq_smartass_freq_change_time_work); + + register_early_suspend(&smartass_power_suspend); + + return cpufreq_register_governor(&cpufreq_gov_smartass2); +} + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2 +fs_initcall(cpufreq_smartass_init); +#else +module_init(cpufreq_smartass_init); +#endif + +static void __exit cpufreq_smartass_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_smartass2); + destroy_workqueue(up_wq); + destroy_workqueue(down_wq); +} + +module_exit(cpufreq_smartass_exit); + +MODULE_AUTHOR ("Erasmux"); +MODULE_DESCRIPTION ("'cpufreq_smartass2' - A smart cpufreq governor"); +MODULE_LICENSE ("GPL"); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 957c5b414..874922d13 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -363,6 +363,9 @@ extern struct cpufreq_governor cpufreq_gov_conservative; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE) extern struct cpufreq_governor cpufreq_gov_interactive; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2) +extern struct cpufreq_governor cpufreq_gov_smartass2; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_smartass2) 
#endif From 526a44be1bd6964cb095a986b488916577243a41 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:31:36 +0200 Subject: [PATCH 08/19] change default toolchain path --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c51d9cdd9..bcede986c 100755 --- a/Makefile +++ b/Makefile @@ -193,7 +193,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile export KBUILD_BUILDHOST := $(SUBARCH) ARCH ?= arm -CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%) +CROSS_COMPILE ?= /home/forumber/prebuilt/linux-x86/toolchain/arm-eabi-4.4.3/bin/arm-eabi- # Architecture as present in compile.h UTS_MACHINE := $(ARCH) From 81617215ee066368a53adb93b5b45dff30541ba2 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:34:11 +0200 Subject: [PATCH 09/19] change dzo@martin to forumber@dzo --- scripts/mkcompile_h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index d5428594f..a7937dbbd 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -75,8 +75,8 @@ UTS_TRUNCATE="cut -b -$UTS_LEN" #/* < DTS2011052606009 jiaxianghong 20110527 begin */ #/* < DTS2011030103387 niguodong 20110415 begin */ - echo \#define LINUX_COMPILE_BY \"dzo\" - echo \#define LINUX_COMPILE_HOST \"martin\" + echo \#define LINUX_COMPILE_BY \"forumber\" + echo \#define LINUX_COMPILE_HOST \"dzo\" #/* DTS2011030103387 niguodong 20110415 end > */ #/* < DTS2011052606009 jiaxianghong 20110527 end */ From 68b7409e06b30522d50e765bae67a0e2866fb3c7 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:43:46 +0200 Subject: [PATCH 10/19] 122 Mhz and 245 Mhz are disabled --- arch/arm/mach-msm/acpuclock-7x30.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index 8ffe61a4d..4b7465358 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -121,10 +121,10 @@ static struct clk *acpuclk_sources[MAX_SOURCE]; static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 0, 24576, LPXO, 0, 0, 30720000, 900, VDD_RAW(900) }, { 0, 61440, PLL_3, 5, 11, 61440000, 900, VDD_RAW(900) }, - { 1, 122880, PLL_3, 5, 5, 61440000, 900, VDD_RAW(900) }, + { 0, 122880, PLL_3, 5, 5, 61440000, 900, VDD_RAW(900) }, { 0, 184320, PLL_3, 5, 4, 61440000, 900, VDD_RAW(900) }, { 0, MAX_AXI_KHZ, AXI, 1, 0, 61440000, 900, VDD_RAW(900) }, - { 1, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, + { 0, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, { 1, 368640, PLL_3, 5, 1, 122800000, 900, VDD_RAW(900) }, { 0, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, { 0, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, From d9cf2bb683b362b6c463f4098439efbeaae79dca Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:45:57 +0200 Subject: [PATCH 11/19] 480 Mhz and 600 Mhz are enabled --- arch/arm/mach-msm/acpuclock-7x30.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index 4b7465358..fb4ab5387 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -126,8 +126,8 @@ static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 0, MAX_AXI_KHZ, AXI, 1, 0, 61440000, 900, VDD_RAW(900) }, { 0, 245760, PLL_3, 5, 2, 61440000, 900, VDD_RAW(900) }, { 1, 368640, PLL_3, 5, 1, 
122800000, 900, VDD_RAW(900) }, - { 0, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, - { 0, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, + { 1, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, + { 1, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, /* AXI has MSMC1 implications. See above. */ { 1, 768000, PLL_1, 2, 0, 153600000, 1050, VDD_RAW(1050) }, /* From 35acafc2e78c5b1c859bf9b2aae2df481445cc91 Mon Sep 17 00:00:00 2001 From: forumber Date: Mon, 14 Jan 2013 17:48:49 +0200 Subject: [PATCH 12/19] undervolt some freq for battery life --- arch/arm/mach-msm/acpuclock-7x30.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c index fb4ab5387..7c8dddf44 100644 --- a/arch/arm/mach-msm/acpuclock-7x30.c +++ b/arch/arm/mach-msm/acpuclock-7x30.c @@ -129,12 +129,12 @@ static struct clkctl_acpu_speed acpu_freq_tbl[] = { { 1, 480000, PLL_2, 3, 0, 122800000, 900, VDD_RAW(900), &pll2_tbl[0]}, { 1, 600000, PLL_2, 3, 0, 122800000, 925, VDD_RAW(925), &pll2_tbl[1]}, /* AXI has MSMC1 implications. See above. */ - { 1, 768000, PLL_1, 2, 0, 153600000, 1050, VDD_RAW(1050) }, + { 1, 768000, PLL_1, 2, 0, 153600000, 975, VDD_RAW(975) }, /* * AXI has MSMC1 implications. See above. */ - { 1, 806400, PLL_2, 3, 0, UINT_MAX, 1100, VDD_RAW(1100), &pll2_tbl[2]}, - { 1, 1024000, PLL_2, 3, 0, UINT_MAX, 1200, VDD_RAW(1200), &pll2_tbl[3]}, + { 1, 806400, PLL_2, 3, 0, UINT_MAX, 1000, VDD_RAW(1000), &pll2_tbl[2]}, + { 1, 1024000, PLL_2, 3, 0, UINT_MAX, 1100, VDD_RAW(1100), &pll2_tbl[3]}, { 1, 1200000, PLL_2, 3, 0, UINT_MAX, 1200, VDD_RAW(1200), &pll2_tbl[4]}, { 1, 1401600, PLL_2, 3, 0, UINT_MAX, 1250, VDD_RAW(1250), &pll2_tbl[5]}, { 1, 1516800, PLL_2, 3, 0, UINT_MAX, 1300, VDD_RAW(1300), &pll2_tbl[6]}, From e61703f6f440ad146a2d59d4eb50e66c0626f1d2 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:31:35 +0200 Subject: [PATCH 13/19] VDD sysfs interface (snq-) --- arch/arm/configs/u8800_defconfig | 6 ++- arch/arm/mach-msm/acpuclock-7x30.c | 41 ++++++++++++++++++++ drivers/cpufreq/Kconfig | 8 ++++ drivers/cpufreq/cpufreq.c | 62 ++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index 60c6fc0d0..e252fda97 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -382,7 +382,7 @@ CONFIG_MSM_DALRPC=y CONFIG_MSM_DALRPC_TEST=m CONFIG_MSM_CPU_FREQ_SET_MIN_MAX=y CONFIG_MSM_CPU_FREQ_MAX=1024000 -CONFIG_MSM_CPU_FREQ_MIN=122880 +CONFIG_MSM_CPU_FREQ_MIN=368640 # CONFIG_MSM_AVS_HW is not set # CONFIG_MSM_HW3D is not set CONFIG_AMSS_7X25_VERSION_2009=y @@ -584,18 +584,20 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=y CONFIG_CPU_FREQ_STAT=y # CONFIG_CPU_FREQ_STAT_DETAILS is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SMARTASS2=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_INTERACTIVE=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_VDD_LEVELS=y CONFIG_CPU_FREQ_GOV_SMARTASS2=y 
 # CONFIG_CPU_IDLE is not set
 CONFIG_CPU_FREQ_MSM=y
diff --git a/arch/arm/mach-msm/acpuclock-7x30.c b/arch/arm/mach-msm/acpuclock-7x30.c
index 7c8dddf44..8f93fcfc4 100644
--- a/arch/arm/mach-msm/acpuclock-7x30.c
+++ b/arch/arm/mach-msm/acpuclock-7x30.c
@@ -52,6 +52,8 @@
 #define VDD_RAW(mv) (((MV(mv) / V_STEP) - 30) | VREG_DATA)
 
 #define MAX_AXI_KHZ 192000
+#define SEMC_ACPU_MIN_UV_MV 750U
+#define SEMC_ACPU_MAX_UV_MV 1500U
 
 struct clock_state {
 	struct clkctl_acpu_speed *current_speed;
@@ -493,3 +495,42 @@ static int __init acpuclk_7x30_init(struct acpuclk_soc_data *soc_data)
 struct acpuclk_soc_data acpuclk_7x30_soc_data __initdata = {
 	.init = acpuclk_7x30_init,
 };
+
+#ifdef CONFIG_CPU_FREQ_VDD_LEVELS
+
+ssize_t acpuclk_get_vdd_levels_str(char *buf)
+{
+	int i, len = 0;
+	if (buf)
+	{
+		mutex_lock(&drv_state.lock);
+		for (i = 0; acpu_freq_tbl[i].acpu_clk_khz; i++)
+		{
+			len += sprintf(buf + len, "%8u: %4d\n", acpu_freq_tbl[i].acpu_clk_khz, acpu_freq_tbl[i].vdd_mv);
+		}
+		mutex_unlock(&drv_state.lock);
+	}
+	return len;
+}
+
+void acpuclk_set_vdd(unsigned int khz, int vdd)
+{
+	int i;
+	unsigned int new_vdd;
+	vdd = vdd / V_STEP * V_STEP;
+	mutex_lock(&drv_state.lock);
+	for (i = 0; acpu_freq_tbl[i].acpu_clk_khz; i++)
+	{
+		if (khz == 0)
+			new_vdd = min(max((acpu_freq_tbl[i].vdd_mv + vdd), SEMC_ACPU_MIN_UV_MV), SEMC_ACPU_MAX_UV_MV);
+		else if (acpu_freq_tbl[i].acpu_clk_khz == khz)
+			new_vdd = min(max((unsigned int)vdd, SEMC_ACPU_MIN_UV_MV), SEMC_ACPU_MAX_UV_MV);
+		else continue;
+
+		acpu_freq_tbl[i].vdd_mv = new_vdd;
+		acpu_freq_tbl[i].vdd_raw = VDD_RAW(new_vdd);
+	}
+	mutex_unlock(&drv_state.lock);
+}
+
+#endif
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 6f643138e..697ce42b6 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -213,6 +213,14 @@ config CPU_FREQ_GOV_CONSERVATIVE
 	  If in doubt, say N.
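The voltage hooks added to acpuclock-7x30.c above work as follows: acpuclk_set_vdd() first rounds the requested millivolt value toward zero to a multiple of V_STEP, then either applies it as a signed offset to every acpu_freq_tbl entry (when khz == 0) or overwrites the single entry whose acpu_clk_khz matches, clamping the result into the SEMC_ACPU_MIN_UV_MV..SEMC_ACPU_MAX_UV_MV window (750 to 1500 mV). The vdd_levels sysfs handlers added to cpufreq.c further below accept either form, a signed global delta such as "-50" or a "khz mv" pair such as "1024000 1075", and forward it to this function. The stand-alone sketch below is only a model of that arithmetic so it can be compiled and checked outside the kernel; the table rows and the 25 mV step are illustrative assumptions, not values taken from this patch.

/*
 * Minimal userspace model of the clamping/rounding done by acpuclk_set_vdd().
 * V_STEP and the table contents are assumed example values; only the
 * arithmetic mirrors the patch: round toward zero to a step multiple,
 * then clamp into [ACPU_MIN_UV_MV, ACPU_MAX_UV_MV].
 */
#include <stdio.h>

#define V_STEP         25      /* assumed regulator step size, mV */
#define ACPU_MIN_UV_MV 750     /* same limits as SEMC_ACPU_MIN/MAX_UV_MV */
#define ACPU_MAX_UV_MV 1500

struct freq_row {
	unsigned int acpu_clk_khz;
	int vdd_mv;
};

static struct freq_row tbl[] = {
	{  368640,  900 },
	{  768000,  975 },
	{ 1024000, 1100 },
	{ 0, 0 }               /* zero-khz terminator, as in acpu_freq_tbl */
};

static int clamp_mv(int mv)
{
	if (mv < ACPU_MIN_UV_MV)
		return ACPU_MIN_UV_MV;
	if (mv > ACPU_MAX_UV_MV)
		return ACPU_MAX_UV_MV;
	return mv;
}

/* khz == 0: offset every row by vdd; otherwise set only the matching row. */
static void set_vdd(unsigned int khz, int vdd)
{
	int i;

	vdd = vdd / V_STEP * V_STEP;   /* round toward zero to a V_STEP multiple */
	for (i = 0; tbl[i].acpu_clk_khz; i++) {
		if (khz == 0)
			tbl[i].vdd_mv = clamp_mv(tbl[i].vdd_mv + vdd);
		else if (tbl[i].acpu_clk_khz == khz)
			tbl[i].vdd_mv = clamp_mv(vdd);
	}
}

int main(void)
{
	int i;

	set_vdd(0, -50);        /* "-50" via sysfs: drop every level by 50 mV */
	set_vdd(1024000, 1075); /* "1024000 1075": pin one frequency's voltage */
	for (i = 0; tbl[i].acpu_clk_khz; i++)
		printf("%8u kHz: %4d mV\n", tbl[i].acpu_clk_khz, tbl[i].vdd_mv);
	return 0;
}

With the interface below, the same two operations would typically be issued as writes to the per-policy cpufreq sysfs file (commonly /sys/devices/system/cpu/cpu0/cpufreq/vdd_levels on this kind of build; the exact path depends on how cpufreq is mounted on the target).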
+config CPU_FREQ_VDD_LEVELS + bool "CPU Vdd levels sysfs interface" + depends on CPU_FREQ_STAT + depends on ARCH_MSM7X30 + default n + help + CPU Vdd levels sysfs interface + config CPU_FREQ_GOV_SMARTASS2 tristate "'smartassV2' cpufreq governor" depends on CPU_FREQ diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ff15497e9..7d92421c1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -557,6 +557,62 @@ static ssize_t show_scaling_setspeed(struct cpufreq_policy *policy, char *buf) return policy->governor->show_setspeed(policy, buf); } +#ifdef CONFIG_CPU_FREQ_VDD_LEVELS + +extern ssize_t acpuclk_get_vdd_levels_str(char *buf); +static ssize_t show_vdd_levels(struct cpufreq_policy *policy, char *buf) +{ + return acpuclk_get_vdd_levels_str(buf); +} + +extern void acpuclk_set_vdd(unsigned acpu_khz, int vdd); +static ssize_t store_vdd_levels(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + int i = 0, j; + int pair[2] = { 0, 0 }; + int sign = 0; + + if (count < 1) + return 0; + + if (buf[0] == '-') { + sign = -1; + i++; + } else if (buf[0] == '+') { + sign = 1; + i++; + } + + for (j = 0; i < count; i++) { + char c = buf[i]; + if ((c >= '0') && (c <= '9')) { + pair[j] *= 10; + pair[j] += (c - '0'); + } else if ((c == ' ') || (c == '\t')) { + if (pair[j] != 0) { + j++; + if ((sign != 0) || (j > 1)) + break; + } + } + else + break; + } + + if (sign != 0) { + if (pair[0] > 0) + acpuclk_set_vdd(0, sign * pair[0]); + } else { + if ((pair[0] > 0) && (pair[1] > 0)) + acpuclk_set_vdd((unsigned)pair[0], pair[1]); + else + return -EINVAL; + } + return count; +} + +#endif + /** * show_scaling_driver - show the current cpufreq HW/BIOS limitation */ @@ -586,6 +642,9 @@ cpufreq_freq_attr_rw(scaling_min_freq); cpufreq_freq_attr_rw(scaling_max_freq); cpufreq_freq_attr_rw(scaling_governor); cpufreq_freq_attr_rw(scaling_setspeed); +#ifdef CONFIG_CPU_FREQ_VDD_LEVELS +cpufreq_freq_attr_rw(vdd_levels); +#endif static struct attribute *default_attrs[] = { &cpuinfo_min_freq.attr, @@ -599,6 +658,9 @@ static struct attribute *default_attrs[] = { &scaling_driver.attr, &scaling_available_governors.attr, &scaling_setspeed.attr, + #ifdef CONFIG_CPU_FREQ_VDD_LEVELS + &vdd_levels.attr, + #endif NULL }; From 6196e001fa3a5a1fab6d3c0cb8a3a244a6feed59 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:35:07 +0200 Subject: [PATCH 14/19] change kgsl and adreno drivers to u8800pro drivers --- drivers/gpu/msm/adreno.c | 43 +-- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 ++--- drivers/gpu/msm/adreno_debugfs.c | 2 - drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 +-- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 +------------- drivers/gpu/msm/adreno_snapshot.c | 129 +------- drivers/gpu/msm/kgsl.c | 277 ++++++++---------- drivers/gpu/msm/kgsl.h | 63 ++-- drivers/gpu/msm/kgsl_cffdump.c | 184 ++++++++++++ drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 14 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++-- drivers/gpu/msm/kgsl_pwrctrl.h | 2 + drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++-------- drivers/gpu/msm/kgsl_sharedmem.h | 68 ++--- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 ++-- .../touchscreen/atmel_i2c_rmi_QT602240.c | 0 include/linux/msm_kgsl.h | 13 +- 26 files changed, 
547 insertions(+), 944 deletions(-) mode change 100644 => 100755 drivers/gpu/msm/adreno.c mode change 100644 => 100755 drivers/gpu/msm/adreno.h mode change 100644 => 100755 drivers/gpu/msm/adreno_a2xx.c mode change 100644 => 100755 drivers/gpu/msm/adreno_postmortem.c mode change 100644 => 100755 drivers/gpu/msm/kgsl.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_gpummu.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrscale.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_sharedmem.c mode change 100644 => 100755 drivers/gpu/msm/kgsl_sharedmem.h mode change 100644 => 100755 drivers/gpu/msm/z180.c mode change 100755 => 100644 drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c mode change 100644 => 100755 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100644 new mode 100755 index 8320003d2..7ff5a4384 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,7 +114,6 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ - .ib_check_level = 0, }; @@ -274,12 +273,6 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ - /* - * Fix target freeze issue by adding TLB flush for each submit - * on A20X based targets. - */ - if (adreno_is_a20x(adreno_dev)) - flags |= KGSL_MMUFLAGS_TLBFLUSH; /* * If possible, then set the state via the command stream to avoid * a CPU idle. Otherwise, use the default setstate which uses register @@ -645,8 +638,6 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; - struct kgsl_context *context; - int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -715,24 +706,6 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; - /* - * Set the reset status of all contexts to - * INNOCENT_CONTEXT_RESET_EXT except for the bad context - * since thats the guilty party - */ - while ((context = idr_get_next(&device->context_idr, &next))) { - if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != - context->reset_status) { - if (context->devctxt != drawctxt) - context->reset_status = - KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; - else - context->reset_status = - KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; - } - next = next + 1; - } - /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -895,13 +868,15 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); + msecs_to_jiffies(adreno_dev->wait_timeout); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - + /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -909,6 +884,8 @@ int 
adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -921,6 +898,7 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } + /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -987,7 +965,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -1014,7 +992,8 @@ struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, + sizeof(unsigned int)); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1058,7 +1037,7 @@ struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - struct kgsl_memdesc *memdesc; + const struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100644 new mode 100755 index 1259507d9..af5bf51ea --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,8 +46,6 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 -#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 - enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -76,16 +74,12 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; - unsigned int ib_check_level; }; struct adreno_gpudev { - /* keeps track of when we need to execute the draw workaround code */ - int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); - void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -105,7 +99,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100644 new mode 100755 index 62628e4a3..5ce9cf85b --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,61 +1421,11 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } -static void 
a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) -{ - struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[11]; - unsigned int *cmds = &cmd[0]; - - if (adreno_is_a225(adreno_dev)) { - adreno_dev->gpudev->ctx_switches_since_last_draw++; - /* If there have been > than - * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context - * switches w/o gmem being saved then we need to execute - * this workaround */ - if (adreno_dev->gpudev->ctx_switches_since_last_draw > - ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) - adreno_dev->gpudev->ctx_switches_since_last_draw = 0; - else - return; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. - */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - } else { - /* On Adreno 20x/220, if the events for shader space reuse - * gets dropped, the CP block would wait indefinitely. - * Sending CP_SET_SHADER_BASES packet unblocks the CP from - * this wait. - */ - *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); - *cmds++ = adreno_encode_istore_size(adreno_dev) - | adreno_dev->pix_shader_start; - } - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], cmds - cmd); -} - static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[22]; if (context == NULL) return; @@ -1520,11 +1470,33 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } - adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a2xx(adreno_dev)) - a2xx_drawctxt_workaround(adreno_dev); + } else if (adreno_is_a225(adreno_dev)) { + unsigned int *cmds = &cmd[0]; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. 
+ */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], 11); + } } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1785,7 +1757,6 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, - .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index 566efa1aa..c1b9e4ce2 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,8 +345,6 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); - debugfs_create_u32("ib_check", 0644, device->d_debugfs, - &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index f0b5741b5..206a678ee 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,13 +243,8 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) { - if (adreno_dev->gpudev->ctxt_draw_workaround && - adreno_is_a225(adreno_dev)) - adreno_dev->gpudev->ctxt_draw_workaround( - adreno_dev); + if (adreno_dev->drawctxt_active == drawctxt) return; - } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 454b05785..8aea58c95 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,6 +29,11 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 +/* indirect buffer dispatch. prefetch parser uses this packet type to determine +* whether to pre-fetch the IB +*/ +#define CP_INDIRECT_BUFFER 0x3f + /* indirect buffer dispatch. 
same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -112,9 +117,6 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e -/* (A2x) sets binning configuration registers */ -#define CP_SET_BIN_DATA 0x2f - /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -155,16 +157,6 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ -/* - * for a3xx - */ - -/* Conditionally load a IB based on a flag */ -#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ -#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ - -/* Load a buffer with pre-fetch enabled */ -#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -186,20 +178,11 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) -#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) - -#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) -#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) - -#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) - -#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) -#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) +#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -207,14 +190,4 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) - -/* Return 1 if the command is an indirect buffer of any kind */ -static inline int adreno_cmd_is_ib(unsigned int cmd) -{ - return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || - cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || - cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || - cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); -} - #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100644 new mode 100755 index 63f5caa91..40dfb30cf --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, + {CP_INDIRECT_BUFFER, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (adreno_cmd_is_ib(value)) { + if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (adreno_cmd_is_ib(this_cmd)) { + if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,7 +654,8 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = 
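As an aside on the packet-header macros above: the following is a minimal, standalone sketch of how cp_type3_packet() packs a PM4 type-3 header and how the removed type3_pkt_size()/cp_type3_opcode() helpers would unpack it again. The CP_TYPE3_PKT value of 0xC0000000 is an assumption inferred from the removed pkt_is_type3() mask; the opcode values used are the ones visible in this header. The last line also checks the dword arithmetic behind the 11-dword A225 empty-draw sequence issued in the a2xx_drawctxt_save() hunk earlier.

/*
 * Standalone sketch, not part of the patch.  CP_TYPE3_PKT = 0xC0000000 is
 * assumed from the pkt_is_type3() mask; opcodes are taken from this header.
 */
#include <stdio.h>
#include <stdint.h>

#define CP_TYPE3_PKT            0xC0000000u    /* assumed "type 3" encoding */
#define CP_INDIRECT_BUFFER_PFD  0x37

#define cp_type3_packet(opcode, cnt) \
	(CP_TYPE3_PKT | (((cnt) - 1) << 16) | ((opcode) << 8))

/* mirrors of the decode helpers removed by this hunk */
#define cp_type3_opcode(pkt)    (((pkt) >> 8) & 0xFF)
#define type3_pkt_size(pkt)     ((((pkt) >> 16) & 0x3FFF) + 1)

int main(void)
{
	uint32_t hdr = cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2);

	printf("header 0x%08x: opcode 0x%02x, %u payload dwords\n",
	       hdr, cp_type3_opcode(hdr), type3_pkt_size(hdr));

	/*
	 * The A225 empty-draw workaround issues SET_CONSTANT (2 payload
	 * dwords), DRAW_INDX (5) and WAIT_FOR_IDLE (1); with one header
	 * dword each that is 3 + 6 + 2 = 11 dwords, matching the count
	 * passed to adreno_ringbuffer_issuecmds().
	 */
	printf("workaround size: %d dwords\n", (2 + 1) + (5 + 1) + (1 + 1));
	return 0;
}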
NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (adreno_cmd_is_ib(this_cmd)) { + if (this_cmd == cp_type3_packet( + CP_INDIRECT_BUFFER_PFD, 2)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index 57883fd45..ea2889b3a 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,7 +22,6 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" -#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -311,10 +310,12 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); + /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); + /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -553,197 +554,6 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } -static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, - int sizedwords); - -static bool -_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) -{ - unsigned int opcode = cp_type3_opcode(*hostaddr); - switch (opcode) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER_PFE: - case CP_COND_INDIRECT_BUFFER_PFE: - case CP_COND_INDIRECT_BUFFER_PFD: - return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); - case CP_NOP: - case CP_WAIT_FOR_IDLE: - case CP_WAIT_REG_MEM: - case CP_WAIT_REG_EQ: - case CP_WAT_REG_GTE: - case CP_WAIT_UNTIL_READ: - case CP_WAIT_IB_PFD_COMPLETE: - case CP_REG_RMW: - case CP_REG_TO_MEM: - case CP_MEM_WRITE: - case CP_MEM_WRITE_CNTR: - case CP_COND_EXEC: - case CP_COND_WRITE: - case CP_EVENT_WRITE: - case CP_EVENT_WRITE_SHD: - case CP_EVENT_WRITE_CFL: - case CP_EVENT_WRITE_ZPD: - case CP_DRAW_INDX: - case CP_DRAW_INDX_2: - case CP_DRAW_INDX_BIN: - case CP_DRAW_INDX_2_BIN: - case CP_VIZ_QUERY: - case CP_SET_STATE: - case CP_SET_CONSTANT: - case CP_IM_LOAD: - case CP_IM_LOAD_IMMEDIATE: - case CP_LOAD_CONSTANT_CONTEXT: - case CP_INVALIDATE_STATE: - case CP_SET_SHADER_BASES: - case CP_SET_BIN_MASK: - case CP_SET_BIN_SELECT: - case CP_SET_BIN_BASE_OFFSET: - case CP_SET_BIN_DATA: - case CP_CONTEXT_UPDATE: - case CP_INTERRUPT: - case CP_IM_STORE: - break; - /* these shouldn't come from userspace */ - case CP_ME_INIT: - case CP_SET_PROTECTED_MODE: - default: - KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); - return false; - break; - } - - return true; -} - -static bool -_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) -{ - unsigned int reg = type0_pkt_offset(*hostaddr); - unsigned int cnt = type0_pkt_size(*hostaddr); - if (reg < 0x0192 || (reg + cnt) >= 0x8000) { - KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", - reg, cnt); - return false; - } - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -static bool _parse_ibs(struct kgsl_device_private *dev_priv, - uint gpuaddr, int sizedwords) -{ - static uint level; /* recursion level */ - bool ret = false; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - struct kgsl_mem_entry *entry; - - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CMD_ERR(dev_priv->device, - "no mapping for gpuaddr: 0x%08x\n", gpuaddr); - return false; - } - - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CMD_ERR(dev_priv->device, - "no mapping for gpuaddr: 0x%08x\n", gpuaddr); - return false; - } - - hoststart = hostaddr; - - level++; - - KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); - - mb(); - while (dwords_left > 0) { - bool cur_ret = true; - int count = 0; /* dword count including packet header */ - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - cur_ret = _handle_type0(dev_priv, hostaddr); - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = _handle_type3(dev_priv, hostaddr); - break; - default: - KGSL_CMD_ERR(dev_priv->device, "unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - - if (!cur_ret) { - KGSL_CMD_ERR(dev_priv->device, - "bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - if (ADRENO_DEVICE(dev_priv->device)->ib_check_level - >= 2) - print_hex_dump(KERN_ERR, - level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - goto done; - } - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - if (dwords_left < 0) { - KGSL_CMD_ERR(dev_priv->device, - "bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - if (ADRENO_DEVICE(dev_priv->device)->ib_check_level - >= 2) - print_hex_dump(KERN_ERR, - level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - goto done; - } - } - - ret = true; -done: - if (!ret) - KGSL_DRV_ERR(dev_priv->device, - "parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - level--; - - return ret; -} - int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -791,12 +601,9 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - if (unlikely(adreno_dev->ib_check_level >= 1 && - !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, - ibdesc[i].sizedwords))) { - kfree(link); - return -EINVAL; - } + (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); + *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -950,20 +757,8 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - - /* - * If other context switches were already lost and - * and the current context is the one that is hanging, - * then we cannot recover. Print an error message - * and leave. - */ - - if ((copy_rb_contents == 0) && (value == cur_context)) { - KGSL_DRV_ERR(device, "GPU recovery could not " - "find the previous context\n"); - return -EINVAL; - } - + BUG_ON((copy_rb_contents == 0) && + (value == cur_context)); /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index c45dbff48..fb88a72bd 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,19 +45,11 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* - * Sometimes IBs can be reused in the same dump. Because we parse from - * oldest to newest, if we come across an IB that has already been used, - * assume that it has been reused and update the list with the newest - * size. 
- */ - + /* Go through the list and see that object has already been seen */ for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) { - objbuf[index].dwords = dwords; - return; - } + objbuf[index].ptbase == ptbase) + return; } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -85,25 +77,6 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } -/* - * Return a 1 if the specified object is already on the list of buffers - * to be dumped - */ - -static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) -{ - int index; - - for (index = 0; index < objbufptr; index++) { - if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase && - objbuf[index].type == type) - return 1; - } - - return 0; -} - /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -140,7 +113,6 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; - int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -186,52 +158,8 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - /* - * We can only reliably dump IBs from the beginning of the context, - * and it turns out that for the vast majority of the time we really - * only care about the current context when it comes to diagnosing - * a hang. So, with an eye to limiting the buffer dumping to what is - * really useful find the beginning of the context and only dump - * IBs from that point - */ - - index = rptr; - ib_parse_start = start; - rbptr = rb->buffer_desc.hostptr; - - while (index != start) { - index--; - - if (index < 0) { - /* - * The marker we are looking for is 2 dwords long, so - * when wrapping, go back 2 from the end so we don't - * access out of range in the if statement below - */ - index = rb->sizedwords - 2; - - /* - * Account for the possibility that start might be at - * rb->sizedwords - 1 - */ - - if (start == rb->sizedwords - 1) - break; - } - - /* - * Look for a NOP packet with the context switch identifier in - * the second dword - */ - - if (rbptr[index] == cp_nop_packet(1) && - rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { - ib_parse_start = index; - break; - } - } - index = start; + rbptr = rb->buffer_desc.hostptr; /* * Loop through the RB, copying the data and looking for indirect @@ -241,18 +169,15 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - /* Only parse IBs between the context start and the rptr */ - - if (index == ib_parse_start) - parse_ibs = 1; - - if (index == rptr) - parse_ibs = 0; - - if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) + if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); + /* + * FIXME: Handle upcoming MMU pagetable changes, but only + * between the rptr and the wptr + */ + index = index + 1; if (index == rb->sizedwords) @@ -303,9 +228,10 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (adreno_cmd_is_ib(*src)) + if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { 
push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, *(src + 1), *(src + 2)); + } src++; dst++; @@ -362,45 +288,22 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the last IB1 that was being executed is dumped. - * Since this was the last IB1 that was processed, we should have - * already added it to the list during the ringbuffer parse but we - * want to be double plus sure. + * Make sure that the IBs described in the CP registers are on the + * list of objects */ - kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - /* - * The problem is that IB size from the register is the unprocessed size - * of the buffer not the original size, so if we didn't catch this - * buffer being directly used in the RB, then we might not be able to - * dump the whle thing. Print a warning message so we can try to - * figure how often this really happens. - */ - - if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { + if (ibsize) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); - KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " - "Dumping %x dwords of the buffer.\n", ibsize); - } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - /* - * Add the last parsed IB2 to the list. The IB2 should be found as we - * parse the objects below, but we try to add it to the list first, so - * it too can be parsed. Don't print an error message in this case - if - * the IB2 is found during parsing, the list will be updated with the - * correct size. - */ - - if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { + if (ibsize) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); - } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100644 new mode 100755 index 39dad925f..d51128979 --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,10 +21,11 @@ #include #include #include -#include + #include #include #include +#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -193,28 +194,8 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { - struct rb_node **node; - struct rb_node *parent = NULL; - spin_lock(&process->mem_lock); - - node = &process->mem_rb.rb_node; - - while (*node) { - struct kgsl_mem_entry *cur; - - parent = *node; - cur = rb_entry(parent, struct kgsl_mem_entry, node); - - if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entry->node, parent, node); - rb_insert_color(&entry->node, &process->mem_rb); - + list_add(&entry->list, &process->mem_list); spin_unlock(&process->mem_lock); entry->priv = process; @@ -424,10 +405,6 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -537,8 +514,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - device->pwrctrl.restore_slumber = 0; kgsl_pwrctrl_wake(device); + device->pwrctrl.restore_slumber = 0; kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -571,7 +548,8 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - private->mem_rb = RB_ROOT; + + INIT_LIST_HEAD(&private->mem_list); if (kgsl_mmu_enabled()) { @@ -600,7 +578,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct rb_node *node; + struct kgsl_mem_entry *entry_tmp = NULL; if (!private) return; @@ -614,13 +592,11 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - for (node = rb_first(&private->mem_rb); node; ) { - entry = rb_entry(node, struct kgsl_mem_entry, node); - node = rb_next(&entry->node); - - rb_erase(&entry->node, &private->mem_rb); + list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { + list_del(&entry->list); kgsl_mem_entry_put(entry); } + kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -746,42 +722,46 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } + /*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, size_t size) +static struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) { - struct rb_node *node = private->mem_rb.rb_node; - - while (node != NULL) { - struct kgsl_mem_entry *entry; - - entry = rb_entry(node, struct kgsl_mem_entry, node); + struct kgsl_mem_entry *entry = NULL, *result = NULL; + BUG_ON(private == NULL); - if 
(kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) - return entry; + gpuaddr &= PAGE_MASK; - if (gpuaddr < entry->memdesc.gpuaddr) - node = node->rb_left; - else if (gpuaddr >= - (entry->memdesc.gpuaddr + entry->memdesc.size)) - node = node->rb_right; - else { - return NULL; + list_for_each_entry(entry, &private->mem_list, list) { + if (entry->memdesc.gpuaddr == gpuaddr) { + result = entry; + break; } } - - return NULL; + return result; } -EXPORT_SYMBOL(kgsl_sharedmem_find_region); /*call with private->mem_lock locked */ -static inline struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, + size_t size) { - return kgsl_sharedmem_find_region(private, gpuaddr, 1); + struct kgsl_mem_entry *entry = NULL, *result = NULL; + + BUG_ON(private == NULL); + + list_for_each_entry(entry, &private->mem_list, list) { + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { + result = entry; + break; + } + } + + return result; } +EXPORT_SYMBOL(kgsl_sharedmem_find_region); /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, @@ -809,40 +789,6 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } - case KGSL_PROP_GPU_RESET_STAT: - { - /* Return reset status of given context and clear it */ - uint32_t id; - struct kgsl_context *context; - - if (param->sizebytes != sizeof(unsigned int)) { - result = -EINVAL; - break; - } - /* We expect the value passed in to contain the context id */ - if (copy_from_user(&id, param->value, - sizeof(unsigned int))) { - result = -EFAULT; - break; - } - context = kgsl_find_context(dev_priv, id); - if (!context) { - result = -EINVAL; - break; - } - /* - * Copy the reset status to value which also serves as - * the out parameter - */ - if (copy_to_user(param->value, &(context->reset_status), - sizeof(unsigned int))) { - result = -EFAULT; - break; - } - /* Clear reset status once its been queried */ - context->reset_status = KGSL_CTX_STAT_NO_ERROR; - break; - } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -881,6 +827,40 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } +static bool check_ibdesc(struct kgsl_device_private *dev_priv, + struct kgsl_ibdesc *ibdesc, unsigned int numibs, + bool parse) +{ + bool result = true; + unsigned int i; + for (i = 0; i < numibs; i++) { + struct kgsl_mem_entry *entry; + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_DRV_ERR(dev_priv->device, + "invalid cmd buffer gpuaddr %08x " \ + "sizedwords %d\n", ibdesc[i].gpuaddr, + ibdesc[i].sizedwords); + result = false; + break; + } + + if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, + ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { + KGSL_DRV_ERR(dev_priv->device, + "invalid cmd buffer gpuaddr %08x " \ + "sizedwords %d numibs %d/%d\n", + ibdesc[i].gpuaddr, + ibdesc[i].sizedwords, i+1, numibs); + result = false; + break; + } + } + return result; +} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -950,6 +930,12 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } + if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { + KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); + result = -EINVAL; + goto free_ibdesc; + } + result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -959,6 +945,18 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); + if (result != 0) + goto free_ibdesc; + + /* this is a check to try to detect if a command buffer was freed + * during issueibcmds(). + */ + if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { + KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); + result = -EINVAL; + goto free_ibdesc; + } + free_ibdesc: kfree(ibdesc); done: @@ -990,7 +988,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - rb_erase(&entry->node, &entry->priv->mem_rb); + list_del(&entry->list); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1082,8 +1080,7 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - rb_erase(&entry->node, &private->mem_rb); - + list_del(&entry->list); spin_unlock(&private->mem_lock); if (entry) { @@ -1167,7 +1164,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, + result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1175,10 +1172,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); + result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); if (result) { - KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); - goto error_free_alloc; + KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); + goto error_free_vmalloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1193,7 +1190,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_alloc: +error_free_vmalloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1316,8 +1313,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = kgsl_sg_alloc(sglen); - + memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) return -ENOMEM; @@ -1357,7 +1353,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - kgsl_sg_free(memdesc->sg, sglen); + vfree(memdesc->sg); memdesc->sg = NULL; return -EINVAL; @@ -1492,8 +1488,11 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (IS_ERR_OR_NULL(kgsl_ion_client)) - return -ENODEV; + if (kgsl_ion_client == NULL) { + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + if (kgsl_ion_client == NULL) + return -ENODEV; + } handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1623,20 +1622,10 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; -error_put_file_ptr: - switch (entry->memtype) { - case 
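The check_ibdesc() helper above accepts an IB only if kgsl_sharedmem_find_region() can map (gpuaddr, sizedwords * sizeof(uint)) onto a single known allocation. Below is a hedged, userspace-only sketch of that containment test; struct range is a stand-in rather than the real struct kgsl_memdesc, and the exact bounds logic of kgsl_gpuaddr_in_memdesc() is assumed, not copied from the driver.

/*
 * Sketch of the containment test check_ibdesc() depends on: an IB is only
 * valid if it lies entirely inside one known allocation.  The wrap-safe
 * formulation below is an assumption about what the real helper must do.
 */
#include <stdio.h>
#include <stdint.h>

struct range {
	uint32_t gpuaddr;
	uint32_t size;          /* bytes */
};

static int range_contains(const struct range *r, uint32_t gpuaddr,
			  uint32_t sizebytes)
{
	return gpuaddr >= r->gpuaddr &&
	       sizebytes <= r->size &&
	       gpuaddr - r->gpuaddr <= r->size - sizebytes;
}

int main(void)
{
	struct range alloc = { .gpuaddr = 0x40000000u, .size = 0x1000 };

	/* 512 dwords starting 0x800 into the allocation: fits exactly */
	printf("%d\n", range_contains(&alloc, 0x40000800u, 512 * 4));
	/* one dword more runs past the end of the allocation */
	printf("%d\n", range_contains(&alloc, 0x40000800u, 513 * 4));
	return 0;
}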
KGSL_MEM_ENTRY_PMEM: - case KGSL_MEM_ENTRY_ASHMEM: - if (entry->priv_data) - fput(entry->priv_data); - break; - case KGSL_MEM_ENTRY_ION: - ion_unmap_dma(kgsl_ion_client, entry->priv_data); - ion_free(kgsl_ion_client, entry->priv_data); - break; - default: - break; - } + error_put_file_ptr: + if (entry->priv_data) + fput(entry->priv_data); + error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2040,7 +2029,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *entry = NULL; + struct kgsl_mem_entry *tmp, *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2051,11 +2040,13 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - entry = kgsl_sharedmem_find(private, vma_offset); - - if (entry) - kgsl_mem_entry_get(entry); - + list_for_each_entry(tmp, &private->mem_list, list) { + if (vma_offset == tmp->memdesc.gpuaddr) { + kgsl_mem_entry_get(tmp); + entry = tmp; + break; + } + } spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2111,8 +2102,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - wake_lock_destroy(&device->idle_wakelock); - pm_qos_remove_request(&device->pm_qos_req_dma); + if (cpu_is_msm8x60()) + wake_lock_destroy(&device->idle_wakelock); idr_destroy(&device->context_idr); @@ -2203,9 +2194,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); - pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, - PM_QOS_DEFAULT_VALUE); + if (cpu_is_msm8x60()) + wake_lock_init(&device->idle_wakelock, + WAKE_LOCK_IDLE, device->name); idr_init(&device->context_idr); @@ -2251,8 +2242,6 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2350,30 +2339,22 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); - kgsl_driver.ptpool = NULL; + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); + kgsl_mmu_ptpool_destroy(&kgsl_driver.ptpool); + kgsl_driver.ptpool = NULL; - /* - * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() - * only if kgsl_driver.virtdev has been populated. - * We check at least one member of kgsl_driver.virtdev to - * see if it is not NULL (and thus, has been populated). 
- */ - if (kgsl_driver.virtdev.class) { - kgsl_sharedmem_uninit_sysfs(); - device_unregister(&kgsl_driver.virtdev); - } + device_unregister(&kgsl_driver.virtdev); if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + kgsl_sharedmem_uninit_sysfs(); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index f027f95c4..d3ae4b9bb 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,12 +21,13 @@ #include #include #include -#include #define KGSL_NAME "kgsl" -/* Timestamp window used to detect rollovers (half of integer range) */ +/*< DTS2012042406822 hanfeng 20120428 begin*/ +/* Timestamp window used to detect rollovers */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 +/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -95,8 +96,6 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; - unsigned int page_alloc; - unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -108,15 +107,7 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc; - -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); - int (*map_kernel_mem)(struct kgsl_memdesc *); -}; +struct kgsl_memdesc_ops; /* shared memory allocation */ struct kgsl_memdesc { @@ -145,7 +136,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct rb_node node; + struct list_head list; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -195,47 +186,27 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } - -static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) -{ - if (memdesc->hostptr == NULL && memdesc->ops && - memdesc->ops->map_kernel_mem) - memdesc->ops->map_kernel_mem(memdesc); - - return memdesc->hostptr; -} - -static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, +static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->gpuaddr == 0 || - gpuaddr < memdesc->gpuaddr || - gpuaddr >= (memdesc->gpuaddr + memdesc->size) || - (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && - memdesc->ops->map_kernel_mem(memdesc))) - return NULL; + if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || + (gpuaddr < memdesc->gpuaddr || + gpuaddr >= memdesc->gpuaddr + memdesc->size)) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int a, unsigned int b) +static inline int timestamp_cmp(unsigned int new, unsigned int old) { - /* check for equal */ - if (a == b) - return 0; + int ts_diff = new - old; - /* check for greater-than for non-rollover case */ - if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) - return 1; + if (ts_diff == 0) + return 0; - /* check for greater-than for rollover case - * note that <= is required to ensure that consistent - * results are returned for values whose difference is - * equal to the window size - */ - a += KGSL_TIMESTAMP_WINDOW; - b += KGSL_TIMESTAMP_WINDOW; 
- return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; + /*< DTS2012042406822 hanfeng 20120428 begin*/ + return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; + /* DTS2012042406822 hanfeng 20120428 end > */ } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index 77aef1ff0..e9455cb82 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,6 +497,190 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); +#define ADDRESS_STACK_SIZE 256 +#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) +static unsigned int kgsl_cffdump_addr_count; + +static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, + uint *hostaddr, bool check_only) +{ + static uint addr_stack[ADDRESS_STACK_SIZE]; + static uint size_stack[ADDRESS_STACK_SIZE]; + + switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER: + { + /* traverse indirect buffers */ + int i; + uint ibaddr = hostaddr[1]; + uint ibsize = hostaddr[2]; + + /* is this address already in encountered? */ + for (i = 0; + i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; + ++i) + ; + + if (kgsl_cffdump_addr_count == i) { + addr_stack[kgsl_cffdump_addr_count] = ibaddr; + size_stack[kgsl_cffdump_addr_count++] = ibsize; + + if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { + KGSL_CORE_ERR("stack overflow\n"); + return false; + } + + return kgsl_cffdump_parse_ibs(dev_priv, NULL, + ibaddr, ibsize, check_only); + } else if (size_stack[i] != ibsize) { + KGSL_CORE_ERR("gpuaddr: 0x%08x, " + "wc: %u, with size wc: %u already on the " + "stack\n", ibaddr, ibsize, size_stack[i]); + return false; + } + } + break; + } + + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
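The reworked timestamp_cmp() in the kgsl.h hunk above replaces the old window-shifting comparison with an unsigned subtraction followed by a sign test against KGSL_TIMESTAMP_WINDOW. A minimal sketch of that rollover handling, collapsing the kernel helper's three-way result into a boolean:

/*
 * Sketch of the rollover-safe comparison idea: two 32-bit timestamps are
 * ordered by their forward distance modulo 2^32.  KGSL_TIMESTAMP_WINDOW
 * (half the counter range) marks where "ahead" turns into "behind".
 */
#include <stdio.h>
#include <stdint.h>

#define KGSL_TIMESTAMP_WINDOW 0x80000000u

static int ts_newer(uint32_t new_ts, uint32_t old_ts)
{
	uint32_t d = new_ts - old_ts;   /* wraps modulo 2^32 */

	return d != 0 && d < KGSL_TIMESTAMP_WINDOW;
}

int main(void)
{
	printf("%d\n", ts_newer(200, 100));             /* 1: plainly newer */
	printf("%d\n", ts_newer(0x10u, 0xFFFFFFF0u));   /* 1: newer across the wrap */
	printf("%d\n", ts_newer(0xFFFFFFF0u, 0x10u));   /* 0: older despite being larger */
	return 0;
}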
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, + const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, + bool check_only) +{ + static uint level; /* recursion level */ + bool ret = true; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + + if (level == 0) + kgsl_cffdump_addr_count = 0; + + if (memdesc == NULL) { + struct kgsl_mem_entry *entry; + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CORE_ERR("did not find mapping " + "for gpuaddr: 0x%08x\n", gpuaddr); + return true; + } + memdesc = &entry->memdesc; + } + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CORE_ERR("no kernel mapping for " + "gpuaddr: 0x%08x\n", gpuaddr); + return true; + } + + hoststart = hostaddr; + + level++; + + mb(); + kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, + KGSL_CACHE_OP_INV); +#ifdef DEBUG + pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); +#endif + + while (dwords_left > 0) { + int count = 0; /* dword count including packet header */ + bool cur_ret = true; + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = kgsl_cffdump_handle_type3(dev_priv, + hostaddr, check_only); + break; + default: + pr_warn("kgsl: cffdump: parse-ib: unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + +#ifdef DEBUG + if (!cur_ret) { + pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + } +#endif + ret = ret && cur_ret; + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + cur_ret = dwords_left >= 0; + +#ifdef DEBUG + if (!cur_ret) { + pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + + print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + } +#endif + + ret = ret && cur_ret; + } + + if (!ret) + pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + if (!check_only) { +#ifdef DEBUG + uint offset = gpuaddr - memdesc->gpuaddr; + pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " + "physaddr:%08x, offset:%d, size:%d", hoststart, + gpuaddr, memdesc->physaddr + offset, offset, + sizedwords*4); +#endif + kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, + false); + } + + level--; + + return ret; +} + static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index efec2af9e..2fb1e43f4 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,7 +15,6 @@ #include #include -#include #include #include "kgsl.h" @@ -185,7 +184,6 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; - struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -199,18 +197,13 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; - /* - * Status indicating whether a gpu reset occurred and whether this - * context was responsible for causing it - */ - unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct rb_root mem_rb; + struct list_head mem_list; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index ba48f9c75..dba2dfcfb 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,9 +295,8 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - result = kgsl_sharedmem_page_alloc(&priv->memdesc, - priv->pagetable, - obj->size * priv->bufcount, 0); + priv->memdesc.hostptr = + vmalloc_user(obj->size * priv->bufcount); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1043,18 +1042,17 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset; + unsigned long offset, pg; struct page *page; - int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - i = offset >> PAGE_SHIFT; - page = sg_page(&(priv->memdesc.sg[i])); + pg = (unsigned long) priv->memdesc.hostptr + offset; + page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100644 new mode 100755 index f038f0491..a16b95418 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; - return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt->priv; + return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,14 +398,14 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt; + struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) + pt->priv; if (pt == NULL) return 0; - gpummu_pt = pt->priv; spin_lock(&pt->lock); - if (gpummu_pt->tlb_flags & (1<tlb_flags && (1<tlb_flags &= ~(1<sg, s, memdesc->sglen, i) { - unsigned int paddr = kgsl_get_sg_pa(s); + unsigned int paddr = sg_phys(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index 5646d682a..e4e561cef 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt ? pt->priv : NULL; - return domain && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt->priv; + return pt && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, (IOMMU_READ | IOMMU_WRITE)); + memdesc->size, 0); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100644 new mode 100755 index 429e4946d..003afb953 --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -24,7 +25,6 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 -#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,7 +284,10 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); +/*< DTS2011123005723 hanfeng 20111230 begin*/ +/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); +/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -334,8 +337,7 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, - int requested_state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -347,7 +349,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, if 
(pwr->grp_clks[i]) clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (requested_state != KGSL_STATE_NAP)) + (device->requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -422,12 +424,8 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) { - int status = regulator_enable(pwr->gpu_reg); - if (status) - KGSL_DRV_ERR(device, "regulator_enable " - "failed: %d\n", status); - } + if (pwr->gpu_reg) + regulator_enable(pwr->gpu_reg); } } } @@ -514,7 +512,10 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -637,8 +638,10 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) + /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -705,8 +708,10 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); - kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ + kgsl_pwrctrl_set_state(device, device->requested_state); + /* DTS2012041906630 zhangxiangdang 20120423 end > */ if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -748,11 +753,10 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -779,18 +783,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - device->pwrctrl.restore_slumber = true; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ device->ftbl->suspend_context(device); device->ftbl->stop(device); + device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); - pm_qos_update_request(&device->pm_qos_req_dma, - PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -852,17 +856,16 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - wake_lock(&device->idle_wakelock); - if (device->pwrctrl.restore_slumber == false) - pm_qos_update_request(&device->pm_qos_req_dma, - GPU_SWFI_LATENCY); + + if (device->idle_wakelock.name) + wake_lock(&device->idle_wakelock); case KGSL_STATE_ACTIVE: break; default: @@ -878,7 +881,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -887,7 +890,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100644 new mode 100755 index caaed92c8..0c7ec6003 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,7 +47,9 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100644 new mode 100755 index d0b2a412c..c2252edcf --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,8 +89,10 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } - +/*< DTS2011123005723 hanfeng 20111230 begin*/ +/*modify the file permission */ PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); +/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100644 new mode 100755 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100644 new mode 100755 index ae32e81ff..389ed6d4f --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -203,10 +201,6 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; - else if (!strncmp(attr->attr.name, "page_alloc", 10)) - val = kgsl_driver.stats.page_alloc; - else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) - val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -236,8 +230,6 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); -DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); -DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -247,8 +239,6 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, - &dev_attr_page_alloc, - &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -292,7 +282,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = kgsl_get_sg_pa(s); + unsigned int paddr = sg_phys(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -303,18 +293,17 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset; + unsigned long offset, pg; struct page *page; - int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; + pg = (unsigned long) memdesc->hostptr + offset; - i = offset >> PAGE_SHIFT; - page = sg_page(&memdesc->sg[i]); + page = vmalloc_to_page((void *) pg); if (page == NULL) return VM_FAULT_SIGBUS; @@ -324,23 +313,15 @@ static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) { - int i = 0; - struct scatterlist *sg; - kgsl_driver.stats.page_alloc -= memdesc->size; - if (memdesc->hostptr) { - vunmap(memdesc->hostptr); - kgsl_driver.stats.vmalloc -= 
memdesc->size; - } - if (memdesc->sg) - for_each_sg(memdesc->sg, sg, memdesc->sglen, i) - __free_page(sg_page(sg)); + kgsl_driver.stats.vmalloc -= memdesc->size; + vfree(memdesc->hostptr); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -348,42 +329,6 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } -/* - * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address - * space - * - * @memdesc - The memory descriptor which contains information about the memory - * - * Return: 0 on success else error code - */ -static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) -{ - if (!memdesc->hostptr) { - pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); - struct page **pages = NULL; - struct scatterlist *sg; - int i; - /* create a list of pages to call vmap */ - pages = vmalloc(memdesc->sglen * sizeof(struct page *)); - if (!pages) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", - memdesc->sglen * sizeof(struct page *)); - return -ENOMEM; - } - for_each_sg(memdesc->sg, sg, memdesc->sglen, i) - pages[i] = sg_page(sg); - memdesc->hostptr = vmap(pages, memdesc->sglen, - VM_IOREMAP, page_prot); - KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); - vfree(pages); - } - if (!memdesc->hostptr) - return -ENOMEM; - - return 0; -} - static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -423,13 +368,12 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_page_alloc_ops = { - .free = kgsl_page_alloc_free, - .vmflags = kgsl_page_alloc_vmflags, - .vmfault = kgsl_page_alloc_vmfault, - .map_kernel_mem = kgsl_page_alloc_map_kernel, +struct kgsl_memdesc_ops kgsl_vmalloc_ops = { + .free = kgsl_vmalloc_free, + .vmflags = kgsl_vmalloc_vmflags, + .vmfault = kgsl_vmalloc_vmfault, }; -EXPORT_SYMBOL(kgsl_page_alloc_ops); +EXPORT_SYMBOL(kgsl_vmalloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -463,9 +407,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - size_t size, unsigned int protflags) + void *ptr, size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -474,43 +418,36 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_page_alloc_ops; - - memdesc->sg = kgsl_sg_alloc(sglen); + memdesc->ops = &kgsl_vmalloc_ops; + memdesc->hostptr = (void *) ptr; + memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", - sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } - kmemleak_not_leak(memdesc->sg); - memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | - __GFP_HIGHMEM); + for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { + struct page *page = vmalloc_to_page(ptr); if (!page) { - ret = -ENOMEM; - memdesc->sglen = i; + ret = -EINVAL; goto done; } - flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 
0); } - outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, - KGSL_CACHE_OP_FLUSH); + + kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, - kgsl_driver.stats.page_alloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); order = get_order(size); @@ -525,41 +462,51 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - int ret = 0; + void *ptr; + BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); + ptr = vmalloc(size); + + if (ptr == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", size); + return -ENOMEM; + } - ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, + return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); - if (!ret) - ret = kgsl_page_alloc_map_kernel(memdesc); - if (ret) - kgsl_sharedmem_free(memdesc); - return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); +EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); int -kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { + void *ptr; unsigned int protflags; BUG_ON(size == 0); + ptr = vmalloc_user(size); + + if (ptr == NULL) { + KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", + size, kgsl_driver.stats.vmalloc); + return -ENOMEM; + } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, + return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -607,7 +554,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - kgsl_sg_free(memdesc->sg, memdesc->sglen); + vfree(memdesc->sg); memset(memdesc, 0, sizeof(*memdesc)); } @@ -739,33 +686,3 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); - -/* - * kgsl_sharedmem_map_vma - Map a user vma to physical memory - * - * @vma - The user vma to map - * @memdesc - The memory descriptor which contains information about the - * physical memory - * - * Return: 0 on success else error code - */ -int -kgsl_sharedmem_map_vma(struct vm_area_struct *vma, - const struct kgsl_memdesc *memdesc) -{ - unsigned long addr = vma->vm_start; - unsigned long size = vma->vm_end - vma->vm_start; - int ret, i = 0; - - if (!memdesc->sg || (size != memdesc->size) || - (memdesc->sglen != (size / PAGE_SIZE))) - return -EINVAL; - - for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { - ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); - if (ret) - return ret; - } - return 0; -} -EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100644 new mode 100755 index a67d9c657..67a1c2d7b --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,8 +17,6 @@ #include #include #include "kgsl_mmu.h" -#include -#include struct kgsl_device; struct kgsl_process_private; @@ -30,12 
+28,19 @@ struct kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); +}; + +extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; -int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -71,58 +76,19 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); -static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) -{ - /* - * Try sg_dma_address first to support ion carveout - * regions which do not work with sg_phys(). - */ - unsigned int pa = sg_dma_address(sg); - if (pa == 0) - pa = sg_phys(sg); - return pa; -} - -int -kgsl_sharedmem_map_vma(struct vm_area_struct *vma, - const struct kgsl_memdesc *memdesc); - -/* - * For relatively small sglists, it is preferable to use kzalloc - * rather than going down the vmalloc rat hole. If the size of - * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to - * vmalloc - */ - -static inline void *kgsl_sg_alloc(unsigned int sglen) -{ - if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) - return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); - else - return vmalloc(sglen * sizeof(struct scatterlist)); -} - -static inline void kgsl_sg_free(void *ptr, unsigned int sglen) -{ - if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); -} - static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - memdesc->sg = kgsl_sg_alloc(1); + struct page *page = phys_to_page(physaddr); - kmemleak_not_leak(memdesc->sg); + memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); + if (memdesc->sg == NULL) + return -ENOMEM; memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - memdesc->sg[0].length = size; - memdesc->sg[0].offset = 0; - memdesc->sg[0].dma_address = physaddr; + sg_set_page(&memdesc->sg[0], page, size, 0); return 0; } @@ -132,7 +98,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); + return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); } static inline int @@ -143,7 +109,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 394bc83bd..72df148bc 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,6 +10,7 @@ * GNU General Public License for more details. 
*/ +#include #include #include #include @@ -282,12 +283,6 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; - /* log buffer info to aid in ramdump recovery */ - KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", - device->snapshot, __pa(device->snapshot), - device->snapshot_size); - if (hang) - sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -437,7 +432,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); + device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); if (device->snapshot == NULL) return -ENOMEM; @@ -480,7 +475,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - kfree(device->snapshot); + vfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100644 new mode 100755 index d721a577a..cb3da9075 --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,6 +157,13 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, +#ifdef CONFIG_HAS_EARLYSUSPEND + .display_off = { + .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, + .suspend = kgsl_early_suspend_driver, + .resume = kgsl_late_resume_driver, + }, +#endif }, }; @@ -188,6 +195,13 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, + .display_off = { +#ifdef CONFIG_HAS_EARLYSUSPEND + .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, + .suspend = kgsl_early_suspend_driver, + .resume = kgsl_late_resume_driver, +#endif + }, }, }; @@ -393,7 +407,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_mem_entry *entry = NULL; + struct kgsl_memdesc tmp = {0}; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -411,30 +425,8 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - /* - * Get a kernel mapping to the IB for monkey patching. - * See the end of this function. - */ - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, - sizedwords); - if (entry == NULL) { - KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", - cmd, sizedwords); - result = -EINVAL; - goto error; - } - /* - * This will only map memory if it exists, otherwise it will reuse the - * mapping. And the 2d userspace reuses IBs so we likely won't create - * too many mappings. 
- */ - if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { - KGSL_DRV_ERR(device, - "Cannot make kernel mapping for gpuaddr 0x%x\n", - cmd); - result = -EINVAL; - goto error; - } + + tmp.hostptr = (void *)*timestamp; KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -476,13 +468,12 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - /* monkey patch the IB so that it jumps back to the ringbuffer */ - kgsl_sharedmem_writel(&entry->memdesc, - ((sizedwords + 1) * sizeof(unsigned int)), - nextaddr); - kgsl_sharedmem_writel(&entry->memdesc, - ((sizedwords + 2) * sizeof(unsigned int)), - nextcnt); + tmp.hostptr = (void *)(tmp.hostptr + + (sizedwords * sizeof(unsigned int))); + tmp.size = 12; + + kgsl_sharedmem_writel(&tmp, 4, nextaddr); + kgsl_sharedmem_writel(&tmp, 8, nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c old mode 100755 new mode 100644 diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100644 new mode 100755 index 7837bad21..a1d267893 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,16 +34,6 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 -/* - * Reset status values for context - */ -enum kgsl_ctx_reset_stat { - KGSL_CTX_STAT_NO_ERROR = 0x00000000, - KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, - KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, - KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 -}; - #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -120,7 +110,6 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, - KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -157,7 +146,9 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; + /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; + /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From af9707f1751b077c4f87e7286da00e1fe60aed6a Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 16:38:10 +0200 Subject: [PATCH 15/19] add gitignore --- .gitignore | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..8faa6c02b --- /dev/null +++ b/.gitignore @@ -0,0 +1,79 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. 
+# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +modules.builtin +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.lzo +*.patch +*.gcno + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +/Module.symvers + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +# +# Generated include files +# +include/config +include/linux/version.h +include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# From a2dec0b448e716236a9d81022d5b029a3d5bd60f Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 17:23:23 +0200 Subject: [PATCH 16/19] updated and fixed lag on Atmel TS --- .../touchscreen/atmel_i2c_rmi_QT602240.c | 167 ++++++------------ 1 file changed, 50 insertions(+), 117 deletions(-) diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c index 9a931c0f3..bbb8a64fc 100644 --- a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c +++ b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c @@ -323,8 +323,7 @@ static u8 atmel_timer = 0; #define DISABLE 0 /* < DTS2011062404739 cuiyu 20110624 begin */ -static uint32_t resume_time = 0; -static u8 cal_check_flag = 1; +static u8 cal_check_flag = 0; /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -835,7 +834,7 @@ int write_power_config(int on) /* < DTS2010083103149 zhangtao 20100909 begin */ /* < DTS2011042106137 zhangtao 20110509 begin */ /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 1) = 16; //0xff//Active Acquisition + *(tmp + 1) = 14; //0xff//Active Acquisition /* DTS2011062404739 cuiyu 20110624 end > */ /* DTS2011042106137 zhangtao 20110509 end > */ /* DTS2010083103149 zhangtao 20100909 end > */ @@ -1062,7 +1061,7 @@ int write_multitouchscreen_config(u8 instance,int flag) *(tmp + 11) = 3; //movhysti /* < DTS2011042106137 zhangtao 20110509 begin */ /* make the point report every pix */ - *(tmp + 12) = 1; //movhystn + *(tmp + 12) = 3; //movhystn /* DTS2011042106137 zhangtao 20110509 end > */ *(tmp + 13) = 0;//0x2e; //movfilter *(tmp + 14) = 2; //numtouch @@ -1207,7 +1206,7 @@ int write_gripfacesuppression_config(u8 instance) *(tmp + 5) = 0; //maxtchs *(tmp + 6) = 0; //reserved *(tmp + 7) = 80; //szthr1 - *(tmp + 8) = 20; //szthr2 + *(tmp + 8) = 40; //szthr2 *(tmp + 9) = 4; //shpthr1 *(tmp + 10) = 35; //shpthr2 *(tmp + 11) = 10; //supextto @@ -1273,7 +1272,7 @@ int write_noisesuppression_config(u8 instance) *(tmp + 6) = 0xff; //GCAFLL *(tmp + 7) = 4; //actvgcafvalid /* < DTS2011062404739 cuiyu 20110624 begin */ - *(tmp + 8) = 30; //noisethr + *(tmp + 8) = 20; //noisethr /* DTS2011062404739 cuiyu 20110624 end > */ *(tmp + 9) = 0; //reserved *(tmp + 10) = 0; //freqhopscale @@ -1890,7 +1889,15 @@ void check_chip_calibration(void) /* Process counters and decide if cal was good or if we must re-calibrate. */ /* < DTS2011062404739 cuiyu 20110624 begin */ /* check error */ - if(atch_ch > 0) + if((tch_ch) && (atch_ch == 0)) + { + /* Calibration may be good */ + cal_maybe_good(); + TS_DEBUG_TS("the func cal_maybe_good is used!\n"); + } + /* CAL_THR is configurable. A starting value of 10 to 20 is suggested. + * * This can then be tuned for the particular design. 
*/ + else if((tch_ch - 25) <= atch_ch && (tch_ch || atch_ch)) /* DTS2011062404739 cuiyu 20110624 end > */ { /* Calibration was bad - must recalibrate and check afterwards. */ @@ -1915,46 +1922,48 @@ void check_chip_calibration(void) } /* < DTS2011062404739 cuiyu 20110624 begin */ /* check point */ -static int check_too_many_point(int num_i, int *x_record) -{ - - while(num_i > 0) - { - if((x_record[num_i] >= x_record[0] - 2) && (x_record[num_i] <= x_record[0] + 2)) - { - num_i--; - continue; - } - else - { - return 1; // no too many point - } - } - return -1; -} /* DTS2011062404739 cuiyu 20110624 end > */ void cal_maybe_good(void) { int ret; - /* < DTS2011062404739 cuiyu 20110624 begin */ - uint8_t data = 1u; - /* shut down */ - if(cal_check_flag == 0) + /* Check if the timer is enabled */ + if(atmel_timer == ENABLE) { - /* shut down calibration */ - if(1 == write_acquisition_config(0, 0)) + TS_DEBUG_TS("cal_maybe_good: the current time is %lu\n", jiffies); + if((jiffies - timer_tick) /10 > 5) /* Check if the timer timedout of 0.5seconds */ { - /* Acquisition config write failed!\n */ - TS_DEBUG_TS("\n[ERROR] line : %d\n", __LINE__); + /* Cal was good - don't need to check any more */ + cal_check_flag = 0; + /* Disable the timer */ + atmel_timer = DISABLE; + timer_tick = 0; + /* Write back the normal acquisition config to chip. */ + if (1 == write_acquisition_config(0,0)) + { + /* "Acquisition config write failed!\n" */ + + printk("\n[ERROR] line : %d\n", __LINE__); + } + + ret = write_multitouchscreen_config(0,1); + + printk("the cal_maybe_good is ok! the ret is %d\n",ret); + } + else + { + cal_check_flag = 1u; + TS_DEBUG_TS("the time is not yet!\n"); } - ret = write_multitouchscreen_config(0, 1); - msleep(50); - ret = write_mem(command_processor_address + CALIBRATE_OFFSET, 1, &data); - TS_DEBUG_TS("the cal_maybe_good is ok! 
the ret is %d\n", ret); } - /* DTS2011062404739 cuiyu 20110624 end > */ + else + { + /* Timer not enabled, so enable it */ + atmel_timer = ENABLE; // enable for 100ms timer + timer_tick = jiffies; + cal_check_flag = 1u; + TS_DEBUG_TS("the cal_maybe_good is enable time!\n"); + } } -/* DTS2010083103149 zhangtao 20100909 end > */ /* < DTS2010062400225 zhangtao 20100624 begin */ static int atmel_ts_initchip(void) @@ -2077,15 +2086,6 @@ static void atmel_ts_work_func(struct work_struct *work) static char first_point_id = 1; static int point_1_x; static int point_1_y; - /* < DTS2011062404739 cuiyu 20110624 begin */ - static int first_in_point = 0; - static int point_1_x_first_down; - static int point_1_y_first_down; - static int num_1; - static int num_2; - static int x_record1[10]; - static int x_record2[5]; - /* DTS2011062404739 cuiyu 20110624 end > */ static int point_1_amplitude; static int point_1_width; static int point_2_x; @@ -2194,35 +2194,6 @@ static void atmel_ts_work_func(struct work_struct *work) point_1_y = ts->touch_y; point_1_amplitude = ts->touchamplitude; point_1_width = ts->sizeoftouch; - /* < DTS2011062404739 cuiyu 20110624 begin */ - /* record point */ - if((cal_check_flag != 0) && !(first_in_point)) - { - first_in_point = 1; - num_1 = 0; - point_1_x_first_down = point_1_x; - point_1_y_first_down = point_1_y; - } - - /* timeout or not */ - if(jiffies - resume_time < 6000) - { - x_record1[num_1] = point_1_x; - if(num_1 >= 9) - { - /* check point */ - if(check_too_many_point(num_1, x_record1) == -1) - { - cal_check_flag = 1; - } - num_1 = 0; - } - else - { - num_1++; - } - } - /* DTS2011062404739 cuiyu 20110624 end > */ } else { @@ -2231,46 +2202,12 @@ static void atmel_ts_work_func(struct work_struct *work) point_2_y = ts->touch_y; point_2_amplitude = ts->touchamplitude; point_2_width = ts->sizeoftouch; - /* < DTS2011062404739 cuiyu 20110624 begin */ - /* timeout or not */ - if(jiffies - resume_time < 6000) - { - x_record2[num_2] = point_2_x; - if(num_2 >= 4) - { - /* check point */ - if(check_too_many_point(num_2, x_record2) == -1) - { - cal_check_flag = 1; - } - num_2 = 0; - } - else - { - num_2++; - } - } - /* DTS2011062404739 cuiyu 20110624 end > */ } } else { if(1 == point_index) { - /* < DTS2011062404739 cuiyu 20110624 begin */ - if(cal_check_flag == 1 && (second_point_pressed == FALSE)) - { - if(((abs(ts->touch_x - point_1_x_first_down) > 100 || abs(ts->touch_y - point_1_y_first_down) > 100) - || jiffies - resume_time > 6000)) - { - /* it is all good */ - cal_maybe_good(); - cal_check_flag = 0; - } - first_in_point = 0; - } - /* DTS2011062404739 cuiyu 20110624 end > */ - /*if index-1 released, index-2 point remains working*/ first_point_id = 2; } @@ -2577,7 +2514,7 @@ static int atmel_ts_probe( goto err_power_on_failed; /* */ if (ret) goto err_power_on_failed; @@ -2949,7 +2886,8 @@ goto succeed_find_device; { /* < DTS2011052101089 shenjinming 20110521 begin */ /* can't use the flag ret here, it will change the return value of probe function */ - vreg_disable(v_gp4); + ret = vreg_disable(v_gp4); + printk(KERN_ERR "the atmel's power is off: gp4 = %d \n ", ret); /* delete a line */ /* DTS2011052101089 shenjinming 20110521 end > */ } @@ -3021,11 +2959,6 @@ static int atmel_ts_resume(struct i2c_client *client) write_power_config(1); /* < DTS2010083103149 zhangtao 20100909 begin */ calibrate_chip_error(); -/* DTS2010083103149 zhangtao 20100909 end > */ - /* < DTS2011062404739 cuiyu 20110624 begin */ - cal_check_flag = 1; - resume_time = jiffies; - /* DTS2011062404739 cuiyu 
20110624 end > */ if (ts->use_irq) { enable_irq(client->irq); From 89abf031a76bb8afe836b088a1ebe50fd846ec79 Mon Sep 17 00:00:00 2001 From: forumber Date: Fri, 25 Jan 2013 18:38:20 +0200 Subject: [PATCH 17/19] Add BFQv5 --- arch/arm/configs/u8800_defconfig | 6 +- block/Kconfig.iosched | 26 + block/Makefile | 1 + block/bfq-cgroup.c | 831 ++++++++ block/bfq-ioc.c | 380 ++++ block/bfq-iosched.c | 3047 ++++++++++++++++++++++++++++++ block/bfq-sched.c | 1066 +++++++++++ block/bfq.h | 595 ++++++ block/blk-ioc.c | 29 +- block/cfq-iosched.c | 10 +- fs/ioprio.c | 7 +- include/linux/cgroup_subsys.h | 6 + include/linux/iocontext.h | 18 +- 13 files changed, 6000 insertions(+), 22 deletions(-) create mode 100644 block/bfq-cgroup.c create mode 100644 block/bfq-ioc.c create mode 100644 block/bfq-iosched.c create mode 100644 block/bfq-sched.c create mode 100644 block/bfq.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index e252fda97..fbf77b32f 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -172,11 +172,13 @@ CONFIG_LBDAF=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y +# CONFIG_DEFAULT_NOOP is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_NOOP=y -CONFIG_DEFAULT_IOSCHED="noop" +CONFIG_IOSCHED_BFQ=y +CONFIG_DEFAULT_BFQ=y +CONFIG_DEFAULT_IOSCHED="bfq" # CONFIG_INLINE_SPIN_TRYLOCK is not set # CONFIG_INLINE_SPIN_TRYLOCK_BH is not set # CONFIG_INLINE_SPIN_LOCK is not set diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f7..bceaaecef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,6 +43,28 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + depends on EXPERIMENTAL + default n + ---help--- + The BFQ I/O scheduler tries to distribute bandwidth among + all processes according to their weights. + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +config CGROUP_BFQIO + bool "BFQ hierarchical scheduling support" + depends on CGROUPS && IOSCHED_BFQ=y + default n + ---help--- + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. 
+ choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -53,6 +75,9 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y + config DEFAULT_BFQ + bool "BFQ" if IOSCHED_BFQ=y + config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y @@ -65,6 +90,7 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ + default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff --git a/block/Makefile b/block/Makefile index 0fec4b3fa..a3cf79cc0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c new file mode 100644 index 000000000..74ae73b91 --- /dev/null +++ b/block/bfq-cgroup.c @@ -0,0 +1,831 @@ +/* + * BFQ: CGROUPS support. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + */ + +#ifdef CONFIG_CGROUP_BFQIO +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; + +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; +} + +static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), + struct bfqio_cgroup, css); +} + +/* + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) +{ + struct bfq_group *bfqg; + struct hlist_node *n; + void *key; + + hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } + + return NULL; +} + +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqg->entity; + + entity->weight = entity->new_weight = bgrp->weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio = bgrp->ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->ioprio_changed = 1; + entity->my_sched_data = &bfqg->sched_data; +} + +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(parent == NULL); + BUG_ON(bfqg == NULL); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @cgroup: the leaf cgroup this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. 
Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. + */ +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; + + for (; cgroup != NULL; cgroup = cgroup->parent) { + bgrp = cgroup_to_bfqio(cgroup); + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } + + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; + + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; + + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be initialized + * only after the node will be connected. + */ + prev->bfqd = bfqg; + prev = bfqg; + } + } + + return leaf; + +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } + + return NULL; +} + +/** + * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. + * @bfqd: the queue descriptor. + * @cgroup: the leaf cgroup to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. + */ +static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, + struct bfq_group *leaf) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(bfqd->queue->queue_lock); + + for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { + bgrp = cgroup_to_bfqio(cgroup); + next = leaf->bfqd; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); + + spin_lock_irqsave(&bgrp->lock, flags); + + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); + + spin_unlock_irqrestore(&bgrp->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(cgroup == NULL && leaf != NULL); + if (cgroup != NULL && prev != NULL) { + bgrp = cgroup_to_bfqio(cgroup); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } +} + +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallbak. If this loss becames a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. 
+ * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); + struct bfq_group *bfqg; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; + + bfqg = bfq_group_chain_alloc(bfqd, cgroup); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, cgroup, bfqg); + else + bfqg = bfqd->root_group; + + return bfqg; +} + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @entity: @bfqq's entity. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_entity *entity, struct bfq_group *bfqg) +{ + int busy, resume; + + busy = bfq_bfqq_busy(bfqq); + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + + BUG_ON(resume && !entity->on_st); + BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); + + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); + + if (!resume) + bfq_del_bfqq_busy(bfqd, bfqq, 0); + else + bfq_deactivate_bfqq(bfqd, bfqq, 0); + } else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. + */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); +} + +/** + * __bfq_cic_change_cgroup - move @cic to @cgroup. + * @bfqd: the queue descriptor. + * @cic: the cic to move. + * @cgroup: the cgroup to move to. + * + * Move cic to cgroup, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd, + struct cfq_io_context *cic, + struct cgroup *cgroup) +{ + struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0); + struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1); + struct bfq_entity *entity; + struct bfq_group *bfqg; + + bfqg = bfq_find_alloc_group(bfqd, cgroup); + if (async_bfqq != NULL) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + cic_set_bfqq(cic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "cic_change_group: %p %d", + async_bfqq, atomic_read(&async_bfqq->ref)); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq != NULL) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + } + + return bfqg; +} + +/** + * bfq_cic_change_cgroup - move @cic to @cgroup. + * @cic: the cic being migrated. + * @cgroup: the destination cgroup. 
+ * + * When the task owning @cic is moved to @cgroup, @cic is immediately + * moved into its new parent group. + */ +static void bfq_cic_change_cgroup(struct cfq_io_context *cic, + struct cgroup *cgroup) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (bfqd != NULL && + !strncmp(bfqd->queue->elevator->elevator_type->elevator_name, + "bfq", ELV_NAME_MAX)) { + __bfq_cic_change_cgroup(bfqd, cic, cgroup); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_cic_update_cgroup - update the cgroup of @cic. + * @cic: the @cic to update. + * + * Make sure that @cic is enqueued in the cgroup of the current task. + * We need this in addition to moving cics during the cgroup attach + * phase because the task owning @cic could be at its first disk + * access or we may end up in the root cgroup as the result of a + * memory allocation failure and here we try to move to the right + * group. + * + * Must be called under the queue lock. It is safe to use the returned + * value even after the rcu_read_unlock() as the migration/destruction + * paths act under the queue lock too. IOW it is impossible to race with + * group migration/destruction and end up with an invalid group as: + * a) here cgroup has not yet been destroyed, nor its destroy callback + * has started execution, as current holds a reference to it, + * b) if it is destroyed after rcu_read_unlock() [after current is + * migrated to a different cgroup] its attach() callback will have + * taken care of remove all the references to the old cgroup data. + */ +static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic) +{ + struct bfq_data *bfqd = cic->key; + struct bfq_group *bfqg; + struct cgroup *cgroup; + + BUG_ON(bfqd == NULL); + + rcu_read_lock(); + cgroup = task_cgroup(current, bfqio_subsys_id); + bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup); + rcu_read_unlock(); + + return bfqg; +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity != NULL; entity = st->first_idle) + __bfq_deactivate_entity(entity, 0); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(bfqq == NULL); + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + return; +} + +/** + * bfq_reparent_active_entities - move to the root group all active entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. + * + * Needs queue_lock to be taken and reference to be valid over the call. 
+ */ +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; + + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + + for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + + if (bfqg->sched_data.active_entity != NULL) + bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); + + return; +} + +/** + * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. + * @bfqg: the group being destroyed. + * + * Destroy @bfqg, making sure that it is not referenced from its parent. + */ +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) +{ + struct bfq_data *bfqd; + struct bfq_service_tree *st; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); + int i; + + hlist_del(&bfqg->group_node); + + /* + * Empty all service_trees belonging to this group before deactivating + * the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. Noone else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * under service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } + BUG_ON(bfqg->sched_data.next_active != NULL); + BUG_ON(bfqg->sched_data.active_entity != NULL); + + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); + + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that noone is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); +} + +/** + * bfq_disconnect_groups - diconnect @bfqd from all its groups. + * @bfqd: the device descriptor being exited. + * + * When the device exits we just make sure that no lookup can return + * the now unused group structures. They will be deallocated on cgroup + * destruction. 
+ */ +static void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + struct hlist_node *pos, *n; + struct bfq_group *bfqg; + + bfq_log(bfqd, "disconnect_groups beginning") ; + hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { + hlist_del(&bfqg->bfqd_node); + + __bfq_deactivate_entity(bfqg->my_entity, 0); + + /* + * Don't remove from the group hash, just set an + * invalid key. No lookups can race with the + * assignment as bfqd is being destroyed; this + * implies also that new elements cannot be added + * to the list. + */ + rcu_assign_pointer(bfqg->bfqd, NULL); + + bfq_log(bfqd, "disconnect_groups: put async for group %p", + bfqg) ; + bfq_put_async_queues(bfqd, bfqg); + } +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); + + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); + + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. + */ + kfree(bfqg); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp; \ + u64 ret; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + bgrp = cgroup_to_bfqio(cgroup); \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ + cgroup_unlock(); \ + \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp; \ + struct bfq_group *bfqg; \ + struct hlist_node *n; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return -EINVAL; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + bgrp = cgroup_to_bfqio(cgroup); \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ + cgroup_unlock(); \ + \ + return 0; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = 
bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, +}; + +static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, bfqio_files, + ARRAY_SIZE(bfqio_files)); +} + +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, + struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp; + + if (cgroup->parent != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; + + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too young or + * exiting: if it has still no ioc the ioc can't be shared, + * if the task is exiting the attach will fail anyway, no + * matter what we return here. + */ + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk) +{ + struct io_context *ioc; + struct cfq_io_context *cic; + struct hlist_node *n; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL) { + BUG_ON(atomic_long_read(&ioc->refcount) == 0); + atomic_long_inc(&ioc->refcount); + } + task_unlock(tsk); + + if (ioc == NULL) + return; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) + bfq_cic_change_cgroup(cic, cgroup); + rcu_read_unlock(); + + put_io_context(ioc); +} + +static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); + struct hlist_node *n, *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. 
+ */ + hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); +} + +struct cgroup_subsys bfqio_subsys = { + .name = "bfqio", + .create = bfqio_create, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .destroy = bfqio_destroy, + .populate = bfqio_populate, + .subsys_id = bfqio_subsys_id, +}; +#else +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->sched_data = &bfqg->sched_data; +} + +static inline struct bfq_group * +bfq_cic_update_cgroup(struct cfq_io_context *cic) +{ + struct bfq_data *bfqd = cic->key; + return bfqd->root_group; +} + +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) +{ +} + +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + bfq_put_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + kfree(bfqd->root_group); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c new file mode 100644 index 000000000..01f831332 --- /dev/null +++ b/block/bfq-ioc.c @@ -0,0 +1,380 @@ +/* + * BFQ: I/O context handling. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +/** + * bfq_cic_free_rcu - deferred cic freeing. + * @head: RCU head of the cic to free. + * + * Free the cic containing @head and, if it was the last one and + * the module is exiting wake up anyone waiting for its deallocation + * (see bfq_exit()). + */ +static void bfq_cic_free_rcu(struct rcu_head *head) +{ + struct cfq_io_context *cic; + + cic = container_of(head, struct cfq_io_context, rcu_head); + + kmem_cache_free(bfq_ioc_pool, cic); + elv_ioc_count_dec(bfq_ioc_count); + + if (bfq_ioc_gone != NULL) { + spin_lock(&bfq_ioc_gone_lock); + if (bfq_ioc_gone != NULL && + !elv_ioc_count_read(bfq_ioc_count)) { + complete(bfq_ioc_gone); + bfq_ioc_gone = NULL; + } + spin_unlock(&bfq_ioc_gone_lock); + } +} + +static void bfq_cic_free(struct cfq_io_context *cic) +{ + call_rcu(&cic->rcu_head, bfq_cic_free_rcu); +} + +/** + * cic_free_func - disconnect a cic ready to be freed. + * @ioc: the io_context @cic belongs to. + * @cic: the cic to be freed. + * + * Remove @cic from the @ioc radix tree hash and from its cic list, + * deferring the deallocation of @cic to the end of the current RCU + * grace period. This assumes that __bfq_exit_single_io_context() + * has already been called for @cic. 
+ */ +static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) +{ + unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; + + BUG_ON(!(dead_key & CIC_DEAD_KEY)); + + spin_lock_irqsave(&ioc->lock, flags); + radix_tree_delete(&ioc->bfq_radix_root, + dead_key >> CIC_DEAD_INDEX_SHIFT); + hlist_del_init_rcu(&cic->cic_list); + spin_unlock_irqrestore(&ioc->lock, flags); + + bfq_cic_free(cic); +} + +static void bfq_free_io_context(struct io_context *ioc) +{ + /* + * ioc->refcount is zero here, or we are called from elv_unregister(), + * so no more cic's are allowed to be linked into this ioc. So it + * should be ok to iterate over the known list, we will see all cic's + * since no new ones are added. + */ + call_for_each_cic(ioc, cic_free_func); +} + +/** + * __bfq_exit_single_io_context - deassociate @cic from any running task. + * @bfqd: bfq_data on which @cic is valid. + * @cic: the cic being exited. + * + * Whenever no more tasks are using @cic or @bfqd is deallocated we + * need to invalidate its entry in the radix tree hash table and to + * release the queues it refers to. + * + * Called under the queue lock. + */ +static void __bfq_exit_single_io_context(struct bfq_data *bfqd, + struct cfq_io_context *cic) +{ + struct io_context *ioc = cic->ioc; + + list_del_init(&cic->queue_list); + + /* + * Make sure dead mark is seen for dead queues + */ + smp_wmb(); + rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd)); + + /* + * No write-side locking as no task is using @ioc (they're exited + * or bfqd is being deallocated. + */ + rcu_read_lock(); + if (rcu_dereference(ioc->ioc_data) == cic) { + rcu_read_unlock(); + spin_lock(&ioc->lock); + rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } else + rcu_read_unlock(); + + if (cic->cfqq[BLK_RW_ASYNC] != NULL) { + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]); + cic->cfqq[BLK_RW_ASYNC] = NULL; + } + + if (cic->cfqq[BLK_RW_SYNC] != NULL) { + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]); + cic->cfqq[BLK_RW_SYNC] = NULL; + } +} + +/** + * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version). + * @ioc: the io_context @cic belongs to. + * @cic: the cic being exited. + * + * Take the queue lock and call __bfq_exit_single_io_context() to do the + * rest of the work. We take care of possible races with bfq_exit_queue() + * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism). + */ +static void bfq_exit_single_io_context(struct io_context *ioc, + struct cfq_io_context *cic) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (bfqd != NULL) { + __bfq_exit_single_io_context(bfqd, cic); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_exit_io_context - deassociate @ioc from all cics it owns. + * @ioc: the @ioc being exited. + * + * No more processes are using @ioc we need to clean up and put the + * internal structures we have that belongs to that process. Loop + * through all its cics, locking their queues and exiting them. 
+ */ +static void bfq_exit_io_context(struct io_context *ioc) +{ + call_for_each_cic(ioc, bfq_exit_single_io_context); +} + +static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, + gfp_t gfp_mask) +{ + struct cfq_io_context *cic; + + cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, + bfqd->queue->node); + if (cic != NULL) { + cic->last_end_request = jiffies; + INIT_LIST_HEAD(&cic->queue_list); + INIT_HLIST_NODE(&cic->cic_list); + cic->dtor = bfq_free_io_context; + cic->exit = bfq_exit_io_context; + elv_ioc_count_inc(bfq_ioc_count); + } + + return cic; +} + +/** + * bfq_drop_dead_cic - free an exited cic. + * @bfqd: bfq data for the device in use. + * @ioc: io_context owning @cic. + * @cic: the @cic to free. + * + * We drop cfq io contexts lazily, so we may find a dead one. + */ +static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, + struct cfq_io_context *cic) +{ + unsigned long flags; + + WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != bfqd_dead_key(bfqd)); + + spin_lock_irqsave(&ioc->lock, flags); + + BUG_ON(ioc->ioc_data == cic); + + /* + * With shared I/O contexts two lookups may race and drop the + * same cic more than one time: RCU guarantees that the storage + * will not be freed too early, here we make sure that we do + * not try to remove the cic from the hashing structures multiple + * times. + */ + if (!hlist_unhashed(&cic->cic_list)) { + radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index); + hlist_del_init_rcu(&cic->cic_list); + bfq_cic_free(cic); + } + + spin_unlock_irqrestore(&ioc->lock, flags); +} + +/** + * bfq_cic_lookup - search into @ioc a cic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * + * If @ioc already has a cic associated to @bfqd return it, return %NULL + * otherwise. + */ +static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ + struct cfq_io_context *cic; + unsigned long flags; + void *k; + + if (unlikely(ioc == NULL)) + return NULL; + + rcu_read_lock(); + + /* We maintain a last-hit cache, to avoid browsing over the tree. */ + cic = rcu_dereference(ioc->ioc_data); + if (cic != NULL) { + k = rcu_dereference(cic->key); + if (k == bfqd) + goto out; + } + + do { + cic = radix_tree_lookup(&ioc->bfq_radix_root, + bfqd->cic_index); + if (cic == NULL) + goto out; + + k = rcu_dereference(cic->key); + if (unlikely(k != bfqd)) { + rcu_read_unlock(); + bfq_drop_dead_cic(bfqd, ioc, cic); + rcu_read_lock(); + continue; + } + + spin_lock_irqsave(&ioc->lock, flags); + rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); + break; + } while (1); + +out: + rcu_read_unlock(); + + return cic; +} + +/** + * bfq_cic_link - add @cic to @ioc. + * @bfqd: bfq_data @cic refers to. + * @ioc: io_context @cic belongs to. + * @cic: the cic to link. + * @gfp_mask: the mask to use for radix tree preallocations. + * + * Add @cic to @ioc, using @bfqd as the search key. This enables us to + * lookup the process specific cfq io context when entered from the block + * layer. Also adds @cic to a per-bfqd list, used when this queue is + * removed. + */ +static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, + struct cfq_io_context *cic, gfp_t gfp_mask) +{ + unsigned long flags; + int ret; + + ret = radix_tree_preload(gfp_mask); + if (ret == 0) { + cic->ioc = ioc; + + /* No write-side locking, cic is not published yet. 
*/ + rcu_assign_pointer(cic->key, bfqd); + + spin_lock_irqsave(&ioc->lock, flags); + ret = radix_tree_insert(&ioc->bfq_radix_root, + bfqd->cic_index, cic); + if (ret == 0) + hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); + spin_unlock_irqrestore(&ioc->lock, flags); + + radix_tree_preload_end(); + + if (ret == 0) { + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + list_add(&cic->queue_list, &bfqd->cic_list); + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + } + } + + if (ret != 0) + printk(KERN_ERR "bfq: cic link failed!\n"); + + return ret; +} + +/** + * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc. + * @ioc: the io_context changing its priority. + */ +static inline void bfq_ioc_set_ioprio(struct io_context *ioc) +{ + call_for_each_cic(ioc, bfq_changed_ioprio); +} + +/** + * bfq_get_io_context - return the @cic associated to @bfqd in @ioc. + * @bfqd: the search key. + * @gfp_mask: the mask to use for cic allocation. + * + * Setup general io context and cfq io context. There can be several cfq + * io contexts per general io context, if this process is doing io to more + * than one device managed by cfq. + */ +static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd, + gfp_t gfp_mask) +{ + struct io_context *ioc = NULL; + struct cfq_io_context *cic; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + ioc = get_io_context(gfp_mask, bfqd->queue->node); + if (ioc == NULL) + return NULL; + + /* Lookup for an existing cic. */ + cic = bfq_cic_lookup(bfqd, ioc); + if (cic != NULL) + goto out; + + /* Alloc one if needed. */ + cic = bfq_alloc_io_context(bfqd, gfp_mask); + if (cic == NULL) + goto err; + + /* Link it into the ioc's radix tree and cic list. */ + if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) + goto err_free; + +out: + /* + * test_and_clear_bit() implies a memory barrier, paired with + * the wmb() in fs/ioprio.c, so the value seen for ioprio is the + * new one. + */ + if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED, + ioc->ioprio_changed))) + bfq_ioc_set_ioprio(ioc); + + return cic; +err_free: + bfq_cic_free(cic); +err: + put_io_context(ioc); + return NULL; +} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000..9f261ee60 --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,3047 @@ +/* + * BFQ, or Budget Fair Queueing, disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + * + * BFQ is a proportional share disk scheduling algorithm based on the + * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to tasks instead of time slices. + * The disk is not granted to the active task for a given time slice, + * but until it has exahusted its assigned budget. This change from + * the time to the service domain allows BFQ to distribute the disk + * bandwidth among tasks as desired, without any distortion due to + * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc + * internal scheduler, called B-WF2Q+, to schedule tasks according to + * their budgets. Thanks to this accurate scheduler, BFQ can afford + * to assign high budgets to disk-bound non-seeky tasks (to boost the + * throughput), and yet guarantee low latencies to interactive and + * soft real-time applications. 
+ *
+ * BFQ has been introduced in [1], where the interested reader can
+ * find an accurate description of the algorithm, the bandwidth
+ * distribution and latency guarantees it provides, plus formal proofs
+ * of all the properties. With respect to the algorithm presented in
+ * the paper, this implementation adds several little heuristics, and
+ * a hierarchical extension, based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
+ * with Deterministic Guarantees on Bandwidth Distribution,''
+ * IEEE Transactions on Computers, May 2010.
+ *
+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+
+/* Max number of dispatches in one round of service. */
+static const int bfq_quantum = 4;
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 1;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = 0;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below.
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+struct kmem_cache *bfq_ioc_pool;
+
+static DEFINE_PER_CPU(unsigned long, bfq_ioc_count);
+static struct completion *bfq_ioc_gone;
+static DEFINE_SPINLOCK(bfq_ioc_gone_lock);
+
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT 2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES 32
+
+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES 32
+
+/* Shift used for peak rate fixed precision calculations.
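+ * For example, with BFQ_RATE_SHIFT == 16 a stored value of 1 << 16
+ * (65536) represents a rate of one sector per usec; the reference
+ * rates R_rot and R_nonrot below are expressed in the same fixed
+ * point format (e.g. 17415 / 65536 is roughly 0.27 sectors/usec).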
*/ +#define BFQ_RATE_SHIFT 16 + +/* + * The duration of the weight raising for interactive applications is + * computed automatically (as default behaviour), using the following + * formula: duration = (R / r) * T, where r is the peak rate of the + * disk, and R and T are two reference parameters. In particular, R is + * the peak rate of a reference disk, and T is about the maximum time + * for starting popular large applications on that disk, under BFQ and + * while reading two files in parallel. Finally, BFQ uses two + * different pairs (R, T) depending on whether the disk is rotational + * or non-rotational. + */ +#define T_rot (msecs_to_jiffies(5500)) +#define T_nonrot (msecs_to_jiffies(2000)) +/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ +#define R_rot 17415 +#define R_nonrot 34791 + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_CIC(rq) \ + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_BFQQ(rq) ((rq)->elevator_private[1]) + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup.c" + +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * We regard a request as SYNC, if either it's a read or has the SYNC bit + * set (in which case it could also be a direct WRITE). + */ +static inline int bfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ + + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
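+ * As a sketch of the computation below: with the head at sector
+ * last, a request N sectors ahead gets distance d = N, a request
+ * N sectors behind (but within back_max) gets
+ * d = N * bfqd->bfq_back_penalty, and anything farther behind is
+ * treated as a wrapped request.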
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? 
bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev != NULL) + prev = rb_entry_rq(rbprev); + + if (rbnext != NULL) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +static void bfq_del_rq_rb(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } +} + +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * + bfq_async_charge_factor)); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->active_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. 
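+ * (The budget determines the entity's virtual finish time in
+ * B-WF2Q+, so changing it for an in-service entity would
+ * retroactively alter the timestamps it was selected with; we
+ * simply leave the budget alone in this case.)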
+ */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->active_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); + bfq_activate_bfqq(bfqd, bfqq); +} + +static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_raising_max_time > 0) + return bfqd->bfq_raising_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static void bfq_add_rq_rb(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_raising_coeff = bfqq->raising_coeff; + int idle_for_long_time = bfqq->budget_timeout + + bfqd->bfq_raising_min_idle_time < jiffies; + + bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && + bfqq->soft_rt_next_start < jiffies; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (! bfqd->low_latency) + goto add_bfqq_busy; + + /* + * If the queue is not being boosted and has been idle + * for enough time, start a weight-raising period + */ + if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { + bfqq->raising_coeff = bfqd->bfq_raising_coeff; + if (idle_for_long_time) + bfqq->raising_cur_max_time = + bfq_wrais_duration(bfqd); + else + bfqq->raising_cur_max_time = + bfqd->bfq_raising_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } else if (old_raising_coeff > 1) { + if (idle_for_long_time) + bfqq->raising_cur_max_time = + bfq_wrais_duration(bfqd); + else if (bfqq->raising_cur_max_time == + bfqd->bfq_raising_rt_max_time && + !soft_rt) { + bfqq->raising_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } + } + if (old_raising_coeff != bfqq->raising_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if(bfqd->low_latency && old_raising_coeff == 1 && + !rq_is_sync(rq) && + bfqq->last_rais_start_finish + + bfqd->bfq_raising_min_inter_arr_async < jiffies) { + bfqq->raising_coeff = bfqd->bfq_raising_coeff; + bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); + + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %llu msec," + "rais_max_time %u", + bfqq->last_rais_start_finish, + jiffies_to_msecs(bfqq-> + raising_cur_max_time)); + } + bfq_updated_next_req(bfqd, bfqq); + } + + if(bfqd->low_latency && + (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || + idle_for_long_time)) + bfqq->last_rais_start_finish = jiffies; +} + +static void bfq_reposition_rq_rb(struct 
bfq_queue *bfqq, struct request *rq) +{ + elv_rb_del(&bfqq->sort_list, rq); + bfqq->queued[rq_is_sync(rq)]--; + bfqq->bfqd->queued--; + bfq_add_rq_rb(rq); +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + cic = bfq_cic_lookup(bfqd, tsk->io_context); + if (cic == NULL) + return NULL; + + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); + if (bfqq != NULL) { + sector_t sector = bio->bi_sector + bio_sectors(bio); + + return elv_rb_find(&bfqq->sort_list, sector); + } + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + WARN_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + list_del_init(&rq->queuelist); + bfq_del_rq_rb(rq); + + if (rq->cmd_flags & REQ_META) { + WARN_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + + bfq_reposition_rq_rb(bfqq, req); + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * Reposition in fifo if next is older than rq. + */ + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(rq_fifo_time(next), rq_fifo_time(rq))) { + list_move(&rq->queuelist, &next->queuelist); + rq_set_fifo_time(rq, rq_fifo_time(next)); + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + /* Disallow merge of a sync bio into an async request. */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. 
+ */ + cic = bfq_cic_lookup(bfqd, current->io_context); + if (cic == NULL) + return 0; + + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_active_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->active_queue = bfqq; +} + +/* + * Get and set a new active queue for service. + */ +static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (!bfqq) + bfqq = bfq_get_next_queue(bfqd); + else + bfq_get_next_queue_forced(bfqd, bfqq); + + __bfq_set_active_queue(bfqd, bfqq); + return bfqq; +} + +static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, + struct request *rq) +{ + if (blk_rq_pos(rq) >= bfqd->last_position) + return blk_rq_pos(rq) - bfqd->last_position; + else + return bfqd->last_position - blk_rq_pos(rq); +} + +/* + * Return true if bfqq has no request pending and rq is close enough to + * bfqd->last_position, or if rq is closer to bfqd->last_position than + * bfqq->next_rq + */ +static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) +{ + return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + sector_t sector = bfqd->last_position; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by next_request + * position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close(bfqd, __bfqq->next_rq)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close(bfqd, __bfqq->next_rq)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself. + * + * We are assuming that cur_bfqq has dispatched at least one request, + * and that bfqd->last_position reflects a position on the disk associated + * with the I/O issued by cur_bfqq. + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. 
+ */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget / 32; +} + +/* + * Decides whether idling should be done for given device and + * given active queue. + */ +static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, + struct bfq_queue *active_bfqq) +{ + if (active_bfqq == NULL) + return false; + /* + * If device is SSD it has no seek penalty, disable idling; but + * do so only if: + * - device does not support queuing, otherwise we still have + * a problem with sync vs async workloads; + * - the queue is not weight-raised, to preserve guarantees. + */ + return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && + active_bfqq->raising_coeff == 1); +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->active_queue; + struct cfq_io_context *cic; + unsigned long sl; + + WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (bfq_queue_nonrot_noidle(bfqd, bfqq)) + return; + + /* Idling is disabled, either manually or by past process history. */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) + return; + + /* Tasks have exited, don't wait. */ + cic = bfqd->active_cic; + if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && + bfqq->entity.service > bfq_max_budget(bfqd) / 8 && + bfqq->raising_coeff == 1) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->raising_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the active queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). 
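+ *
+ * For example, with the weight-based scaling below, a queue whose
+ * weight is currently raised to three times its original weight gets
+ * timeout_coeff = 3 and hence a budget timeout three times as long
+ * as the base one (soft real-time raising is excluded from this
+ * scaling).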
+ */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->active_queue; + unsigned int timeout_coeff; + if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); +} + +/* + * Move request from internal lists to the request queue dispatch list. + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + bfq_remove_request(rq); + bfqq->dispatched++; + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (time_before(jiffies, rq_fifo_time(rq))) + return NULL; + + return rq; +} + +/* + * Must be called with the queue_lock held. + */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return; + + /* + * Merge in the direction of the lesser amount of work. 
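+ * E.g., if bfqq has 2 process references and new_bfqq has 5, bfqq
+ * is scheduled to be merged into new_bfqq and new_bfqq inherits
+ * bfqq's 2 references; in the opposite case the chain is set up in
+ * the other direction.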
+ */ + if (new_process_refs >= process_refs) { + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + } else { + new_bfqq->new_bfqq = bfqq; + atomic_add(new_process_refs, &bfqq->ref); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); +} + +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->active_queue); + + __bfq_bfqd_reset_active(bfqd); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * overloading budget_timeout field to store when + * the queue remains with no backlog, used by + * the weight-raising mechanism + */ + bfqq->budget_timeout = jiffies ; + } + else { + bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_rq_pos_tree_add(bfqd, bfqq); + } + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->active_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no requets of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still oustanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. 
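+ *
+ * Concretely, the code below shrinks the budget by
+ * 4 * min_budget (never below min_budget) when no request is
+ * outstanding, and doubles it (capped at bfqd->bfq_max_budget)
+ * when some still are.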
+ */ + if (bfqq->dispatched > 0) /* still oustanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. + */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. + */ + next_rq = bfqq->next_rq; + if (next_rq != NULL) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + else + bfqq->entity.budget = bfqq->max_budget; + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return 0; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return 0; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. 
We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. + */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update && bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, timeout); + bfq_log(bfqd, "new max_budget=%lu", + bfqd->bfq_max_budget); + } + } + + /* + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. + */ + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return 0; + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. 
Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int compensate, + enum bfqq_expiration reason) +{ + int slow; + BUG_ON(bfqq != bfqd->active_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * + * Processes doing IO in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. + */ + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + + if (bfqd->low_latency && bfqq->raising_coeff == 1) + bfqq->last_rais_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { + if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) + bfqq->soft_rt_next_start = + jiffies + + HZ * bfqq->entity.service / + bfqd->bfq_raising_max_softrt_rate; + else + bfqq->soft_rt_next_start = -1; /* infinity */ + } + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, + bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* Increase, decrease or leave budget unchanged according to reason */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq)) + return 0; + + if (time_before(jiffies, bfqq->budget_timeout)) + return 0; + + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp backshifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wr %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Select a queue for service. If we have a current active queue, + * check whether to continue servicing it, or retrieve and set a new one. 
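+ *
+ * In rough order, the checks below are: possibly merge with a close
+ * cooperator, expire on budget timeout, expire on budget exhaustion
+ * if the next request does not fit in the budget left, and finally
+ * expire for lack of requests unless the queue is still idling or
+ * has requests in flight.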
+ */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->active_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); + + /* + * If another queue has a request waiting within our mean seek + * distance, let it run. The expire code will check for close + * cooperators and put the close queue at the front of the + * service tree. If possible, merge the expiring queue with the + * new bfqq. + */ + new_bfqq = bfq_close_cooperator(bfqd, bfqq); + if (new_bfqq != NULL && bfqq->new_bfqq == NULL) + bfq_setup_merge(bfqq, new_bfqq); + + if (bfq_may_expire_for_budg_timeout(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may not + * disable disk idling even when a new request arrives + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged the + * device, causing the dispatch to be invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + if (new_bfqq == NULL) + goto keep_queue; + else + goto expire; + } + } + + /* + * No requests pending. If there is no cooperator, and the active + * queue still has requests in flight or is idling for a new request, + * then keep it. + */ + if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && + !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { + bfqq = NULL; + goto keep_queue; + } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { + /* + * Expiring the queue because there is a close cooperator, + * cancel timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_active_queue(bfqd, new_bfqq); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? 
bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq->raising_coeff > 1) { /* queue is being boosted */ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, " + "old raising coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time), + bfqq->raising_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->active_queue && entity->weight != + entity->orig_weight * bfqq->raising_coeff); + if(entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, + "WARN: pending prio change"); + /* + * If too much time has elapsed from the beginning + * of this weight-raising period and process is not soft + * real-time, stop it + */ + if (jiffies - bfqq->last_rais_start_finish > + bfqq->raising_cur_max_time) { + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && + bfqq->soft_rt_next_start < jiffies; + + bfqq->last_rais_start_finish = jiffies; + if (soft_rt) + bfqq->raising_cur_max_time = + bfqd->bfq_raising_rt_max_time; + else { + bfqq->raising_coeff = 1; + entity->ioprio_changed = 1; + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); + } + } + } +} + + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen + * in fifo order instead of sector order. + * The budget is properly dimensioned + * to be always sufficient to serve the next request + * only if it is chosen in sector order. The reason is + * that it would be quite inefficient and little useful + * to always make sure that the budget is large enough + * to serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and + * make sure that the next act_budget is enough + * to serve the next request, even if it comes + * from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. 
*/ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + update_raising_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " + "budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->active_cic == NULL) { + atomic_long_inc(&RQ_CIC(rq)->ioc->refcount); + bfqd->active_cic = RQ_CIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. Used for barriers and when switching + * io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->active_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + if((bfqq = bfq_select_queue(bfqd)) == NULL) + return 0; + + max_dispatch = bfqd->bfq_quantum; + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (! bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" + "(max_disp %d)", bfqq->pid, max_dispatch); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. 
+ */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->active_queue == bfqq); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) { + WARN(1, "bfqq->new_bfqq loop detected.\n"); + break; + } + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->active_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) +{ + struct task_struct *tsk = current; + int ioprio_class; + + if (!bfq_bfqq_prio_changed(bfqq)) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + switch (ioprio_class) { + default: + printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + bfqq->entity.ioprio_changed = 1; + + /* + * Keep track of original prio settings in case we have to temporarily + * elevate the priority of this queue. 
+ */ + bfqq->org_ioprio = bfqq->entity.new_ioprio; + bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class; + bfq_clear_bfqq_prio_changed(bfqq); +} + +static void bfq_changed_ioprio(struct io_context *ioc, + struct cfq_io_context *cic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&cic->key, &flags); + if (unlikely(bfqd == NULL)) + return; + + bfqq = cic->cfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc, + GFP_ATOMIC); + if (new_bfqq != NULL) { + cic->cfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "changed_ioprio: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = cic->cfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_mark_bfqq_prio_changed(bfqq); + + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + bfq_mark_bfqq_prio_changed(bfqq); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->raising_coeff = 1; + bfqq->last_rais_start_finish = 0; + bfqq->soft_rt_next_start = -1; +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct io_context *ioc, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct cfq_io_context *cic; + +retry: + cic = bfq_cic_lookup(bfqd, ioc); + /* cic always exists here */ + bfqq = cic_to_bfqq(cic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. 
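+ * Note the allocation pattern below: with __GFP_WAIT we drop the
+ * queue lock, allocate, retake the lock and then retry the cic
+ * lookup, since the situation may have changed while sleeping.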
+ */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + + bfq_init_prio_data(bfqq, ioc); + bfq_init_entity(&bfqq->entity, bfqg); + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) +{ + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will prune it. + */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct cfq_io_context *cic) +{ + unsigned long elapsed = jiffies - cic->last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; + cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; + cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. 
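bfq_update_io_thinktime() above keeps a decaying average of the gap between a request completion and the next submission, using a 7/8 decay factor and a 256 fixed-point scale. A small runnable sketch of the same arithmetic (user-space, names hypothetical):

#include <stdio.h>

struct thinktime {
	unsigned long samples;	/* scaled by 256, converges towards 256 */
	unsigned long total;	/* scaled by 256 */
	unsigned long mean;
};

static void update_thinktime(struct thinktime *tt, unsigned long ttime)
{
	tt->samples = (7 * tt->samples + 256) / 8;
	tt->total   = (7 * tt->total + 256 * ttime) / 8;
	tt->mean    = (tt->total + 128) / tt->samples;
}

int main(void)
{
	struct thinktime tt = { 0, 0, 0 };
	unsigned long s[] = { 4, 4, 4, 40, 4, 4 };	/* think times in jiffies */
	unsigned int i;

	for (i = 0; i < sizeof(s) / sizeof(s[0]); i++) {
		update_thinktime(&tt, s[i]);
		printf("sample %lu -> mean %lu\n", s[i], tt.mean);
	}
	return 0;
}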
+ */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + if (bfq_bfqq_coop(bfqq)) { + /* + * If the mean seektime increases for a (non-seeky) shared + * queue, some cooperator is likely to be idling too much. + * On the contrary, if it decreases, some cooperator has + * probably waked up. + * + */ + if ((sector_t)total < bfqq->seek_mean) + bfq_mark_bfqq_some_coop_idle(bfqq) ; + else if ((sector_t)total > bfqq->seek_mean) + bfq_clear_bfqq_some_coop_idle(bfqq) ; + } + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct cfq_io_context *cic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&cic->ioc->nr_tasks) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->raising_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(cic->ttime_samples)) { + if (cic->ttime_mean > bfqd->bfq_slice_idle && + bfqq->raising_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct cfq_io_context *cic = RQ_CIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, cic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, cic); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->active_queue) { + /* + * If there is just this request queued and the request + * is small, just exit. + * In this way, if the disk is being idled to wait for a new + * request from the active queue, we avoid unplugging the + * device now. + * + * By doing so, we spare the disk to be committed + * to serve just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this + * one quickly, then the device will be unplugged + * and larger requests will be dispatched. 
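The idle-window update that follows this clamping boils down to a predicate: idle only for sync, non-idle-class queues, and stop idling when the device queues internally and the queue is seeky, or when the measured think time exceeds slice_idle, unless the queue is being weight-raised. A loose, hypothetical distillation (the io_context task-count check is omitted and the sample-validity threshold of 80 is an assumption):

#include <stdbool.h>

struct queue_hint {
	bool sync;
	bool class_idle;
	bool seeky;
	bool weight_raised;		/* raising_coeff > 1 */
	bool hw_tag;			/* device has internal queueing */
	unsigned int ttime_samples;
	unsigned long ttime_mean;
	unsigned long slice_idle;
};

static bool update_idle_window(const struct queue_hint *q, bool enabled)
{
	if (!q->sync || q->class_idle)
		return enabled;			/* never changed for these */
	if (q->slice_idle == 0 ||
	    (q->hw_tag && q->seeky && !q->weight_raised))
		return false;
	if (q->ttime_samples > 80)		/* enough samples to trust the mean */
		return !(q->ttime_mean > q->slice_idle && !q->weight_raised);
	return enabled;
}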
+ */ + if (bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32) { + return; + } + if (bfq_bfqq_wait_request(bfqq)) { + /* + * If we are waiting for a request for this queue, let + * it rip immediately and flag that we must not expire + * this queue just now. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + /* + * Here we can safely expire the queue, in + * case of budget timeout, without wasting + * guarantees + */ + if (bfq_bfqq_budget_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_BUDGET_TIMEOUT); + __blk_run_queue(bfqd->queue); + } + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + assert_spin_locked(bfqd->queue->queue_lock); + bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); + + bfq_add_rq_rb(rq); + + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + WARN_ON(!bfqd->rq_in_driver); + WARN_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight--; + + if (sync) + RQ_CIC(rq)->last_end_request = jiffies; + + /* + * If this is the active queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->active_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + /* Idling is disabled also for cooperation issues: + * 1) there is a close cooperator for the queue, or + * 2) the queue is shared and some cooperator is likely + * to be idle (in this case, by not arming the idle timer, + * we try to slow down the queue, to prevent the zones + * of the disk accessed by the active cooperators to become + * too distant from the zone that will be accessed by the + * currently idle cooperators) + */ + if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (sync && + (bfqd->rq_in_driver == 0 || + bfqq->raising_coeff > 1) + && RB_EMPTY_ROOT(&bfqq->sort_list) + && !bfq_close_cooperator(bfqd, bfqq) + && (!bfq_bfqq_coop(bfqq) || + !bfq_bfqq_some_coop_idle(bfqq))) + bfq_arm_slice_timer(bfqd); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +/* + * We temporarily boost lower priority queues if they are holding fs exclusive + * resources. They are boosted to normal prio (CLASS_BE/4). 
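bfq_update_hw_tag() above infers whether the device does internal queueing by sampling how many requests are in flight; only samples taken while enough requests were outstanding are counted. A hypothetical stand-alone sketch of that probe (the threshold and sample-count values are assumed):

#define HW_QUEUE_THRESHOLD	4	/* assumed stand-in for BFQ_HW_QUEUE_THRESHOLD */
#define HW_QUEUE_SAMPLES	32	/* assumed stand-in for BFQ_HW_QUEUE_SAMPLES */

struct depth_probe {
	int hw_tag;		/* -1 unknown, 0 no internal queueing, 1 yes */
	int samples;
	int max_in_driver;
};

static void probe_depth(struct depth_probe *p, int in_driver, int queued)
{
	if (p->max_in_driver < in_driver)
		p->max_in_driver = in_driver;
	if (p->hw_tag == 1)
		return;
	/* Only count samples taken while enough requests were outstanding. */
	if (in_driver + queued < HW_QUEUE_THRESHOLD)
		return;
	if (p->samples++ < HW_QUEUE_SAMPLES)
		return;
	p->hw_tag = p->max_in_driver > HW_QUEUE_THRESHOLD;
	p->samples = 0;
	p->max_in_driver = 0;
}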
+ */ +static void bfq_prio_boost(struct bfq_queue *bfqq) +{ + if (has_fs_excl()) { + /* + * Boost idle prio on transactions that would lock out other + * users of the filesystem + */ + if (bfq_class_idle(bfqq)) + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + if (bfqq->entity.new_ioprio > IOPRIO_NORM) + bfqq->entity.new_ioprio = IOPRIO_NORM; + } else { + /* + * Unboost the queue (if needed) + */ + bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class; + bfqq->entity.new_ioprio = bfqq->org_ioprio; + } +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct cfq_io_context *cic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be queued. + * So just lookup a possibly existing queue, or return 'may queue' + * if that fails. + */ + cic = bfq_cic_lookup(bfqd, tsk->io_context); + if (cic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = cic_to_bfqq(cic, rw_is_sync(rw)); + if (bfqq != NULL) { + bfq_init_prio_data(bfqq, cic->ioc); + bfq_prio_boost(bfqq); + + return __bfq_may_queue(bfqq); + } + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + put_io_context(RQ_CIC(rq)->ioc); + + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +static struct bfq_queue * +bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic, + struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)bfqq->new_bfqq->pid); + cic_set_bfqq(cic, bfqq->new_bfqq, 1); + bfq_mark_bfqq_coop(bfqq->new_bfqq); + bfq_put_queue(bfqq); + return cic_to_bfqq(cic, 1); +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_some_coop_idle(bfqq); + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + cic_set_bfqq(cic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. 
+ */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + cic = bfq_get_io_context(bfqd, gfp_mask); + + spin_lock_irqsave(q->queue_lock, flags); + + if (cic == NULL) + goto queue_fail; + + bfqg = bfq_cic_update_cgroup(cic); + +new_queue: + bfqq = cic_to_bfqq(cic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask); + cic_set_bfqq(cic, bfqq, is_sync); + } else { + /* + * If the queue was seeky for too long, break it apart. + */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(cic, bfqq); + if (!bfqq) + goto new_queue; + } + + /* + * Check to see if this queue is scheduled to merge with + * another closely cooperating queue. The merging of queues + * happens here as it must be done in process context. + * The reference on new_bfqq was taken in merge_bfqqs. + */ + if (bfqq->new_bfqq != NULL) + bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq); + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + spin_unlock_irqrestore(q->queue_lock, flags); + + rq->elevator_private[0] = cic; + rq->elevator_private[1] = bfqq; + + return 0; + +queue_fail: + if (cic != NULL) + put_io_context(cic->ioc); + + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the active_queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->active_queue; + /* + * Theoretical race here: active_queue can be NULL or different + * from the queue that was idling if the timer handler spins on + * the queue_lock and a new request arrives for the current + * queue and there is a full dispatch cycle that changes the + * active_queue. This can hardly happen, but in the worst case + * we just expire a queue too early. 
+ */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the first + * request of the active queue arrives during + * disk idling + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure untill all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + struct cfq_io_context *cic; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + while (!list_empty(&bfqd->cic_list)) { + cic = list_entry(bfqd->cic_list.next, struct cfq_io_context, + queue_list); + __bfq_exit_single_io_context(bfqd, cic); + } + + BUG_ON(bfqd->active_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, bfqd->cic_index); + spin_unlock(&cic_index_lock); + + /* Wait for cic->key accessors to exit their grace periods. 
*/ + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static int bfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + +static void *bfq_init_queue(struct request_queue *q) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + int i; + + i = bfq_alloc_cic_index(); + if (i < 0) + return NULL; + + bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); + if (bfqd == NULL) + return NULL; + + bfqd->cic_index = i; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + + INIT_LIST_HEAD(&bfqd->cic_list); + + bfqd->queue = q; + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + return NULL; + } + + bfqd->root_group = bfqg; + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_quantum = bfq_quantum; + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->low_latency = true; + + bfqd->bfq_raising_coeff = 20; + bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_raising_max_time = 0; + bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_raising_max_softrt_rate = 7000; + + /* Initially estimate the device's peak rate as the reference rate */ + if (blk_queue_nonrot(bfqd->queue)) { + bfqd->RT_prod = R_nonrot * T_nonrot; + bfqd->peak_rate = R_nonrot; + } else { + bfqd->RT_prod = R_rot * T_rot; + bfqd->peak_rate = R_rot; + } + + return bfqd; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); + if (bfq_ioc_pool != NULL) + kmem_cache_destroy(bfq_ioc_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + goto fail; + + bfq_ioc_pool = kmem_cache_create("bfq_io_context", + sizeof(struct cfq_io_context), + __alignof__(struct cfq_io_context), + 0, NULL); + if (bfq_ioc_pool == NULL) + goto fail; + + return 0; +fail: + bfq_slab_kill(); + return -ENOMEM; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) +{ + unsigned long new_val; + int ret = strict_strtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t 
bfq_raising_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? + bfqd->bfq_raising_max_time : + bfq_wrais_duration(bfqd)); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time)); + } + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_rais_start_finish), + jiffies_to_msecs(bfqq->raising_cur_max_time)); + } + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); +SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); +SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, + 1); +SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, + bfqd->bfq_raising_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, + bfqd->bfq_raising_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long __data; \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, 
&bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_raising_min_idle_time_store, + &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, + &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_raising_max_softrt_rate_store, + &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(quantum), + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(raising_coeff), + BFQ_ATTR(raising_max_time), + BFQ_ATTR(raising_rt_max_time), + BFQ_ATTR(raising_min_idle_time), + BFQ_ATTR(raising_min_inter_arr_async), + BFQ_ATTR(raising_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + 
.elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + .trim = bfq_free_io_context, + }, + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + //if (bfq_slice_idle == 0) + // bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + elv_register(&iosched_bfq); + + return 0; +} + +static void __exit bfq_exit(void) +{ + DECLARE_COMPLETION_ONSTACK(all_gone); + elv_unregister(&iosched_bfq); + bfq_ioc_gone = &all_gone; + /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */ + smp_wmb(); + if (elv_ioc_count_read(bfq_ioc_count) != 0) + wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 index 000000000..fd50b7fd1 --- /dev/null +++ b/block/bfq-sched.c @@ -0,0 +1,1066 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_active) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_active == NULL); + + group_sd = next_active->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an active entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_active->budget; +} + +static int bfq_update_next_active(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_active; + + if (sd->active_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... 
+ */ + next_active = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_active = next_active; + + if (next_active != NULL) + bfq_update_budget(next_active); + + return 1; +} + +static inline void bfq_check_next_active(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_active != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_active(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_active(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_active) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. 
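The two helpers just introduced carry most of the timestamp machinery: bfq_gt() compares possibly wrapped 64-bit timestamps via a signed subtraction, and bfq_delta() maps service into virtual time with a fixed-point shift. A runnable user-space sketch of both (names hypothetical, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

/* Wraparound-safe "a > b" on 64-bit timestamps, as in bfq_gt(). */
static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

/* Service mapped into the virtual time domain, as in bfq_delta(). */
static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	uint64_t start = UINT64_MAX - 5;		/* about to wrap */
	uint64_t finish = start + vt_delta(4096, 8);

	/* finish wrapped past zero, yet still compares as "later". */
	printf("finish > start: %d\n", ts_gt(finish, start));
	/* The same service charges a lighter entity more virtual time. */
	printf("w=8 -> %llu, w=1 -> %llu\n",
	       (unsigned long long)vt_delta(4096, 8),
	       (unsigned long long)vt_delta(4096, 1));
	return 0;
}

With WFQ_SERVICE_SHIFT = 22, the same amount of service pushes a weight-1 entity's finish time eight times further than a weight-8 entity's, which is how heavier entities end up being selected more often.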
+ */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. 
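The functions that follow maintain the extra min_start key described above: every active node caches the smallest start time in its subtree, so the eligible-entity search can prune whole branches. A simplified, hypothetical stand-in using plain pointers and a plain "<" instead of the rbtree helpers and the wraparound-safe bfq_gt():

#include <stddef.h>

struct toy_node {
	unsigned long long start;	/* entity start time (the tree key is finish) */
	unsigned long long min_start;	/* minimum start over this subtree */
	struct toy_node *left, *right;
};

/* Fold one child's cached minimum into the parent (cf. bfq_update_min()). */
static void toy_update_min(struct toy_node *n, struct toy_node *child)
{
	if (child != NULL && child->min_start < n->min_start)
		n->min_start = child->min_start;
}

/* Recompute a node whose children changed (cf. bfq_update_active_node()). */
static void toy_update_node(struct toy_node *n)
{
	n->min_start = n->start;
	toy_update_min(n, n->left);
	toy_update_min(n, n->right);
}

bfq_update_active_tree() below repeats this recomputation from the deepest modified node up to the root, so every ancestor's cached minimum stays valid.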
+ */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +/** + * bfq_active_insert - insert an entity in the active tree of its group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static unsigned short bfq_ioprio_to_weight(int ioprio) +{ + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static unsigned short bfq_weight_to_ioprio(int weight) +{ + WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. 
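bfq_ioprio_to_weight() and bfq_weight_to_ioprio() above define the linear mapping between the legacy ioprio levels and B-WF2Q+ weights. A tiny runnable round-trip sketch (IOPRIO_BE_NR mirrored here as 8, matching the kernel's eight best-effort levels):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* BE levels 0 (highest) .. 7 (lowest) */

/* cf. bfq_ioprio_to_weight(): a lower ioprio number means a larger weight. */
static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* cf. bfq_weight_to_ioprio(): weights beyond the ioprio range map to 0. */
static int weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int p;

	for (p = 0; p < IOPRIO_BE_NR; p++)
		printf("ioprio %d -> weight %d\n", p, ioprio_to_weight(p));
	printf("weight 100 -> ioprio %d (escape value)\n", weight_to_ioprio(100));
	return 0;
}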
+ */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. 
+ */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } else if (entity->new_ioprio != entity->ioprio) { + entity->ioprio = entity->new_ioprio; + entity->orig_weight = + bfq_ioprio_to_weight(entity->ioprio); + } else + entity->new_weight = entity->orig_weight = + bfq_ioprio_to_weight(entity->ioprio); + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + entity->weight = entity->orig_weight * + (bfqq != NULL ? bfqq->raising_coeff : 1); + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. 
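bfq_calc_finish() and bfq_bfqq_served() above are the accounting core: an entity's finish time is start + budget/weight, and serving a queue advances the tree's virtual time by served/wsum. A small worked example (user-space sketch, shift constant as in the scheduler, names hypothetical):

#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Two backlogged queues with weights 1 and 3 -> wsum = 4. */
	unsigned long wsum = 4;
	uint64_t vtime = 0, finish_w1, finish_w3;

	/* Both get a 64 KiB budget; finish = start + budget/weight. */
	finish_w1 = 0 + vt_delta(64 * 1024, 1);
	finish_w3 = 0 + vt_delta(64 * 1024, 3);
	printf("finish(w=1)=%llu finish(w=3)=%llu\n",
	       (unsigned long long)finish_w1, (unsigned long long)finish_w3);

	/* Serving 64 KiB advances the tree's vtime by served/wsum. */
	vtime += vt_delta(64 * 1024, wsum);
	printf("vtime after 64 KiB of service: %llu\n",
	       (unsigned long long)vtime);
	return 0;
}

The weight-3 queue's finish time grows a third as fast as the weight-1 queue's, so it is eligible for selection again sooner and receives a proportionally larger share of the device.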
+ */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->active_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->active_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_active entity below it. We reuse the old + * start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the active entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was under service or if it was the next_active for + * its sched_data; return %0 otherwise. 
+ */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_active = entity == sd->active_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_active && entity->tree != NULL); + + if (was_active) { + bfq_calc_finish(entity, entity->service); + sd->active_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_active || sd->next_active == entity) + ret = bfq_update_next_active(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->active_entity == entity); + BUG_ON(sd->next_active == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * under service. + */ + break; + + if (sd->next_active != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. + */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated tasks getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active - find the eligible entity with the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path + * on the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. 
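The search described above can be stated compactly: among the entities whose start time is not after the current vtime, pick the one with the smallest finish time. The sketch below does it with a linear scan (hypothetical, user-space); the real tree gets the same answer in O(log N) by consulting min_start to decide which subtrees can still contain eligible entities.

#include <stdio.h>
#include <stdint.h>

struct entity {
	uint64_t start, finish;
};

/* Pick the eligible (start <= vtime) entity with the minimum finish time. */
static const struct entity *first_eligible(const struct entity *e, int n,
					    uint64_t vtime)
{
	const struct entity *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].start > vtime)
			continue;		/* not eligible yet */
		if (best == NULL || e[i].finish < best->finish)
			best = &e[i];
	}
	return best;
}

int main(void)
{
	struct entity e[] = { { 10, 50 }, { 0, 40 }, { 30, 35 } };
	const struct entity *s = first_eligible(e, 3, 20);

	/* {30,35} has the smallest finish but is not yet eligible at vtime 20. */
	printf("selected: start=%llu finish=%llu\n",
	       (unsigned long long)s->start, (unsigned long long)s->finish);
	return 0;
}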
+ */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_active = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_active and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_active)) { + new_next_active = entity; + for_each_entity(new_next_active) + bfq_update_budget(new_next_active); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_active entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_active value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i=0; + + BUG_ON(sd->active_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_active = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_active(sd, entity); + bfq_active_extract(st + i, entity); + sd->active_entity = entity; + sd->next_active = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->active_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +/* + * Forced extraction of the given queue. 
+ */ +static void bfq_get_next_queue_forced(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity; + struct bfq_sched_data *sd; + + BUG_ON(bfqd->active_queue != NULL); + + entity = &bfqq->entity; + /* + * Bubble up extraction/update from the leaf to the root. + */ + for_each_entity(entity) { + sd = entity->sched_data; + bfq_update_budget(entity); + bfq_update_vtime(bfq_entity_service_tree(entity)); + bfq_active_extract(bfq_entity_service_tree(entity), entity); + sd->active_entity = entity; + sd->next_active = NULL; + entity->service = 0; + } + + return; +} + +static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) +{ + if (bfqd->active_cic != NULL) { + put_io_context(bfqd->active_cic->ioc); + bfqd->active_cic = NULL; + } + + bfqd->active_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->active_queue) + __bfq_bfqd_reset_active(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->active_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 index 000000000..e2ce5052a --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,595 @@ +/* + * BFQ-v5 for 3.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT HZ/5 + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. 
+ */ +struct bfq_service_tree { + struct rb_root active; + struct rb_root idle; + + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; + + u64 vtime; + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * @active_entity: entity under service. + * @next_active: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_active points to the active entity of the sched_data service + * trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *active_entity; + struct bfq_entity *next_active; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. 
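/*
 * Illustrative aside, not part of the patch: the finish timestamp documented
 * above follows F_i = S_i + budget / weight. The sketch below computes it on
 * plain C types; the 16-bit fixed-point shift is an arbitrary choice for
 * readability, not a constant taken from this patch.
 */
#include <stdint.h>

static uint64_t sketch_calc_finish(uint64_t start, unsigned long budget,
				   unsigned short weight)
{
	return start + (((uint64_t)budget << 16) / weight);
}

/*
 * Example: two entities activated at S = 0, each with a 512 KiB budget.
 *   weight 100: F = (524288 << 16) / 100 ~= 343.6e6
 *   weight 300: F = (524288 << 16) / 300 ~= 114.5e6
 * The weight-300 entity gets the earlier finish time, so B-WF2Q+ serves it
 * first; over time each backlogged entity receives service roughly in
 * proportion to its weight.
 */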
+ * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; + + struct bfq_entity *parent; + + struct bfq_sched_data *my_sched_data; + struct bfq_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @org_ioprio: saved ioprio during boosted periods. + * @org_ioprio_class: saved ioprio_class during boosted periods. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @pid: pid of the process owning the queue, used for logging purposes. + * @last_rais_start_time: last (idle -> weight-raised) transition attempt + * @raising_cur_max_time: current max raising time for this queue + * + * A bfq_queue is a leaf request queue; it can be associated to an io_context + * or more (if it is an async one). @cgroup holds a reference to the + * cgroup, to be sure that it does not disappear while a bfqq still + * references it (mostly to avoid races between request issuing and task + * migration followed by cgroup distruction). + * All the fields are protected by the queue lock of the containing bfqd. 
+ */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned short org_ioprio; + unsigned short org_ioprio_class; + + unsigned int flags; + + struct list_head bfqq_list; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + pid_t pid; + + /* weight-raising fields */ + unsigned int raising_cur_max_time; + u64 last_rais_start_finish, soft_rt_next_start; + unsigned int raising_coeff; +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, + * used when determining if two or more queues + * have interleaving requests (see bfq_close_cooperator). + * @busy_queues: number of bfq_queues containing requests (including the + * queue under service, even if it is idling). + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples + * completed requests . + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue under service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @active_queue: bfq_queue under service. + * @active_cic: cfq_io_context (cic) associated with the @active_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. + * @cic_index: use small consequent indexes as radix tree keys to reduce depth + * @cic_list: list of all the cics active on the bfq_data device. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_quantum: max number of requests dispatched per dispatch round. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. + * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. 
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to + * to prevent seeky queues to impose long latencies to well + * behaved ones (this also implies that seeky queues cannot + * receive guarantees in the service domain; after a timeout + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). + * @bfq_raising_coeff: Maximum factor by which the weight of a boosted + * queue is multiplied + * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) + * @bfq_raising_rt_max_time: maximum duration for soft real-time processes + * @bfq_raising_min_idle_time: minimum idle period after which weight-raising + * may be reactivated for a queue (in jiffies) + * @bfq_raising_min_inter_arr_async: minimum period between request arrivals + * after which weight-raising may be + * reactivated for an already busy queue + * (in jiffies) + * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, + * sectors per seconds + * @RT_prod: cached value of the product R*T used for computing the maximum + * duration of the weight raising automatically + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions + * + * All the fields are protected by the @queue lock. + */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + + struct rb_root rq_pos_tree; + + int busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *active_queue; + struct cfq_io_context *active_cic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + unsigned int cic_index; + struct list_head cic_list; + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_quantum; + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_raising_coeff; + unsigned int bfq_raising_max_time; + unsigned int bfq_raising_rt_max_time; + unsigned int bfq_raising_min_idle_time; + unsigned int bfq_raising_min_inter_arr_async; + unsigned int bfq_raising_max_softrt_rate; + u64 RT_prod; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << 
BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(prio_changed); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(some_coop_idle); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/migration. + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. 
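/*
 * Illustrative aside, not part of the patch: the locking rule stated above,
 * i.e. writers take @lock while readers walk @group_data under RCU. Both
 * helpers are hypothetical; only the fields of struct bfq_group above and
 * struct bfqio_cgroup below come from this patch.
 */
static void sketch_add_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
{
	spin_lock(&bgrp->lock);
	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
	spin_unlock(&bgrp->lock);
}

/* Caller must hold rcu_read_lock() across any use of the returned group. */
static struct bfq_group *sketch_lookup_group(struct bfqio_cgroup *bgrp,
					     void *key)
{
	struct bfq_group *bfqg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) {
		if (bfqg->bfqd == key)
			return bfqg;
	}
	return NULL;
}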
+ */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, + int is_sync) +{ + return cic->cfqq[!!is_sync]; +} + +static inline void cic_set_bfqq(struct cfq_io_context *cic, + struct bfq_queue *bfqq, int is_sync) +{ + cic->cfqq[!!is_sync] = bfqq; +} + +static inline void call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, + struct cfq_io_context *)) +{ + struct cfq_io_context *cic; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) + func(ioc, cic); + rcu_read_unlock(); +} + +#define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 + +static inline void *bfqd_dead_key(struct bfq_data *bfqd) +{ + return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows cic->key and bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. 
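/*
 * Hypothetical caller, not part of the patch, illustrating the contract
 * documented above: a non-NULL return means bfqd->queue->queue_lock is held
 * and must be dropped with bfq_put_bfqd_unlock(); NULL means the bfqd went
 * away and no lock is held.
 */
static void sketch_with_bfqd_of(struct cfq_io_context *cic)
{
	struct bfq_data *bfqd;
	unsigned long flags;

	bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
	if (bfqd == NULL)
		return;			/* raced with device/cic teardown */

	/* ... work on bfqd under its queue lock ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}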
+ */ +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + +static void bfq_changed_ioprio(struct io_context *ioc, + struct cfq_io_context *cic); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); +#endif diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 342eae9b0..0504f530f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include /* for max_pfn/max_low_pfn */ @@ -16,13 +17,12 @@ */ static struct kmem_cache *iocontext_cachep; -static void cfq_dtor(struct io_context *ioc) +static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) { - if (!hlist_empty(&ioc->cic_list)) { + if (!hlist_empty(list)) { struct cfq_io_context *cic; - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); + cic = hlist_entry(list->first, struct cfq_io_context, cic_list); cic->dtor(ioc); } } @@ -40,7 +40,9 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - cfq_dtor(ioc); + + hlist_sched_dtor(ioc, &ioc->cic_list); + hlist_sched_dtor(ioc, &ioc->bfq_cic_list); rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); @@ -50,15 +52,14 @@ int put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -static void cfq_exit(struct io_context *ioc) +static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) { rcu_read_lock(); - if (!hlist_empty(&ioc->cic_list)) { + if (!hlist_empty(list)) { struct cfq_io_context *cic; - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); + cic = hlist_entry(list->first, struct cfq_io_context, cic_list); cic->exit(ioc); } rcu_read_unlock(); @@ -74,8 +75,10 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); +if (atomic_dec_and_test(&ioc->nr_tasks)) { + hlist_sched_exit(ioc, &ioc->cic_list); + hlist_sched_exit(ioc, &ioc->bfq_cic_list); + } put_io_context(ioc); } @@ -89,12 +92,14 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ret->refcount, 1); atomic_set(&ret->nr_tasks, 1); spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; + bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS); ret->ioprio = 0; ret->last_waited = 0; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); + INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ret->bfq_cic_list); ret->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) ret->cgroup_changed = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ae21919f1..b581793ec 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2919,7 +2919,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) static void cfq_ioc_set_ioprio(struct io_context *ioc) { call_for_each_cic(ioc, changed_ioprio); - ioc->ioprio_changed = 0; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -3204,8 +3203,13 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err_free; out: - smp_read_barrier_depends(); - if (unlikely(ioc->ioprio_changed)) + /* + * test_and_clear_bit() implies a memory barrier, paired with + * the wmb() in fs/ioprio.c, so the value seen for ioprio is the + * new one. + */ + if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED, + ioc->ioprio_changed))) cfq_ioc_set_ioprio(ioc); #ifdef CONFIG_CFQ_GROUP_IOSCHED diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a0650..2fdc2a310 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -30,7 +30,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) { - int err; + int err, i; struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; @@ -60,12 +60,15 @@ int set_task_ioprio(struct task_struct *task, int ioprio) err = -ENOMEM; break; } + smp_wmb(); task->io_context = ioc; } while (1); if (!err) { ioc->ioprio = ioprio; - ioc->ioprio_changed = 1; + wmb(); + for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++) + set_bit(i, ioc->ioprio_changed); } task_unlock(task); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ac663c187..c96663839 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -64,3 +64,9 @@ SUBSYS(perf) #endif /* */ + +#ifdef CONFIG_CGROUP_BFQIO +SUBSYS(bfqio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee896d..5f5357748 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,12 +3,12 @@ #include #include +#include -struct cfq_queue; struct cfq_io_context { void *key; - struct cfq_queue *cfqq[2]; + void *cfqq[2]; struct io_context *ioc; @@ -27,6 +27,16 @@ struct cfq_io_context { struct rcu_head rcu_head; }; +/* + * Indexes into the ioprio_changed bitmap. A bit set indicates that + * the corresponding I/O scheduler needs to see a ioprio update. + */ +enum { + IOC_CFQ_IOPRIO_CHANGED, + IOC_BFQ_IOPRIO_CHANGED, + IOC_IOPRIO_CHANGED_BITS +}; + /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. 
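/*
 * Illustrative aside, not part of the patch: how a second scheduler consumes
 * its own bit of the new ioprio_changed bitmap, mirroring the cfq-iosched.c
 * hunk above. bfq_ioc_set_ioprio() is a hypothetical helper; the bit names
 * and the test_and_clear_bit() pairing with the wmb() in fs/ioprio.c come
 * from this patch.
 */
static void sketch_check_ioprio_changed(struct io_context *ioc)
{
	if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED,
					ioc->ioprio_changed)))
		bfq_ioc_set_ioprio(ioc);	/* propagate to this scheduler's cics */
}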
@@ -39,7 +49,7 @@ struct io_context { spinlock_t lock; unsigned short ioprio; - unsigned short ioprio_changed; + DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS); #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) unsigned short cgroup_changed; @@ -53,6 +63,8 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; + struct radix_tree_root bfq_radix_root; + struct hlist_head bfq_cic_list; void __rcu *ioc_data; }; From 6f3cf5b0572fdfe57984bb0d56652d81eafbb4d5 Mon Sep 17 00:00:00 2001 From: forumber Date: Sun, 3 Feb 2013 23:05:33 +0200 Subject: [PATCH 18/19] Revert "change kgsl and adreno drivers to u8800pro drivers" This reverts commit 6196e001fa3a5a1fab6d3c0cb8a3a244a6feed59. --- drivers/gpu/msm/adreno.c | 43 ++- drivers/gpu/msm/adreno.h | 8 +- drivers/gpu/msm/adreno_a2xx.c | 81 +++-- drivers/gpu/msm/adreno_debugfs.c | 2 + drivers/gpu/msm/adreno_drawctxt.c | 7 +- drivers/gpu/msm/adreno_pm4types.h | 39 ++- drivers/gpu/msm/adreno_postmortem.c | 9 +- drivers/gpu/msm/adreno_ringbuffer.c | 219 +++++++++++++- drivers/gpu/msm/adreno_snapshot.c | 129 +++++++- drivers/gpu/msm/kgsl.c | 277 ++++++++++-------- drivers/gpu/msm/kgsl.h | 63 ++-- drivers/gpu/msm/kgsl_cffdump.c | 184 ------------ drivers/gpu/msm/kgsl_device.h | 9 +- drivers/gpu/msm/kgsl_drm.c | 12 +- drivers/gpu/msm/kgsl_gpummu.c | 14 +- drivers/gpu/msm/kgsl_iommu.c | 6 +- drivers/gpu/msm/kgsl_pwrctrl.c | 55 ++-- drivers/gpu/msm/kgsl_pwrctrl.h | 2 - drivers/gpu/msm/kgsl_pwrscale.c | 4 +- drivers/gpu/msm/kgsl_pwrscale_idlestats.c | 0 drivers/gpu/msm/kgsl_sharedmem.c | 181 ++++++++---- drivers/gpu/msm/kgsl_sharedmem.h | 68 +++-- drivers/gpu/msm/kgsl_snapshot.c | 11 +- drivers/gpu/msm/z180.c | 55 ++-- .../touchscreen/atmel_i2c_rmi_QT602240.c | 0 include/linux/msm_kgsl.h | 13 +- 26 files changed, 944 insertions(+), 547 deletions(-) mode change 100755 => 100644 drivers/gpu/msm/adreno.c mode change 100755 => 100644 drivers/gpu/msm/adreno.h mode change 100755 => 100644 drivers/gpu/msm/adreno_a2xx.c mode change 100755 => 100644 drivers/gpu/msm/adreno_postmortem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_gpummu.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrctrl.h mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_pwrscale_idlestats.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.c mode change 100755 => 100644 drivers/gpu/msm/kgsl_sharedmem.h mode change 100755 => 100644 drivers/gpu/msm/z180.c mode change 100644 => 100755 drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c mode change 100755 => 100644 include/linux/msm_kgsl.h diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c old mode 100755 new mode 100644 index 7ff5a4384..8320003d2 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -114,6 +114,7 @@ static struct adreno_device device_3d0 = { .pfp_fw = NULL, .pm4_fw = NULL, .wait_timeout = 10000, /* in milliseconds */ + .ib_check_level = 0, }; @@ -273,6 +274,12 @@ static void adreno_setstate(struct kgsl_device *device, int sizedwords = 0; unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */ + /* + * Fix target freeze issue by adding TLB flush for each submit + * on A20X based targets. + */ + if (adreno_is_a20x(adreno_dev)) + flags |= KGSL_MMUFLAGS_TLBFLUSH; /* * If possible, then set the state via the command stream to avoid * a CPU idle. 
Otherwise, use the default setstate which uses register @@ -638,6 +645,8 @@ adreno_recover_hang(struct kgsl_device *device) unsigned int soptimestamp; unsigned int eoptimestamp; struct adreno_context *drawctxt; + struct kgsl_context *context; + int next = 0; KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n"); rb_buffer = vmalloc(rb->buffer_desc.size); @@ -706,6 +715,24 @@ adreno_recover_hang(struct kgsl_device *device) drawctxt->flags |= CTXT_FLAGS_GPU_HANG; + /* + * Set the reset status of all contexts to + * INNOCENT_CONTEXT_RESET_EXT except for the bad context + * since thats the guilty party + */ + while ((context = idr_get_next(&device->context_idr, &next))) { + if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT != + context->reset_status) { + if (context->devctxt != drawctxt) + context->reset_status = + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT; + else + context->reset_status = + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT; + } + next = next + 1; + } + /* Restore valid commands in ringbuffer */ adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents); rb->timestamp = timestamp; @@ -868,15 +895,13 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer; unsigned int rbbm_status; unsigned long wait_timeout = - msecs_to_jiffies(adreno_dev->wait_timeout); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ + msecs_to_jiffies(adreno_dev->wait_timeout); unsigned long wait_time; unsigned long wait_time_part; unsigned int msecs; unsigned int msecs_first; unsigned int msecs_part; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_cffdump_regpoll(device->id, REG_RBBM_STATUS << 2, 0x00000000, 0x80000000); /* first, wait until the CP has consumed all the commands in @@ -884,8 +909,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) */ retry: if (rb->flags & KGSL_FLAGS_STARTED) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ msecs = adreno_dev->wait_timeout; msecs_first = (msecs <= 100) ? 
((msecs + 4) / 5) : 100; msecs_part = (msecs - msecs_first + 3) / 4; @@ -898,7 +921,6 @@ int adreno_idle(struct kgsl_device *device, unsigned int timeout) wait_time_part = jiffies + msecs_to_jiffies(msecs_part); } - /* DTS2012041906630 zhangxiangdang 20120423 end > */ GSL_RB_GET_READPTR(rb, &rb->rptr); if (time_after(jiffies, wait_time)) { KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n", @@ -965,7 +987,7 @@ static int adreno_suspend_context(struct kgsl_device *device) return status; } -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) @@ -992,8 +1014,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, if (!kgsl_mmu_pt_equal(priv->pagetable, pt_base)) continue; spin_lock(&priv->mem_lock); - entry = kgsl_sharedmem_find_region(priv, gpuaddr, - sizeof(unsigned int)); + entry = kgsl_sharedmem_find_region(priv, gpuaddr, size); if (entry) { result = &entry->memdesc; spin_unlock(&priv->mem_lock); @@ -1037,7 +1058,7 @@ const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, uint8_t *adreno_convertaddr(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size) { - const struct kgsl_memdesc *memdesc; + struct kgsl_memdesc *memdesc; memdesc = adreno_find_region(device, pt_base, gpuaddr, size); diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h old mode 100755 new mode 100644 index af5bf51ea..1259507d9 --- a/drivers/gpu/msm/adreno.h +++ b/drivers/gpu/msm/adreno.h @@ -46,6 +46,8 @@ #define ADRENO_ISTORE_WORDS 3 #define ADRENO_ISTORE_START 0x5000 +#define ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW 50 + enum adreno_gpurev { ADRENO_REV_UNKNOWN = 0, ADRENO_REV_A200 = 200, @@ -74,12 +76,16 @@ struct adreno_device { unsigned int wait_timeout; unsigned int istore_size; unsigned int pix_shader_start; + unsigned int ib_check_level; }; struct adreno_gpudev { + /* keeps track of when we need to execute the draw workaround code */ + int ctx_switches_since_last_draw; int (*ctxt_create)(struct adreno_device *, struct adreno_context *); void (*ctxt_save)(struct adreno_device *, struct adreno_context *); void (*ctxt_restore)(struct adreno_device *, struct adreno_context *); + void (*ctxt_draw_workaround)(struct adreno_device *); irqreturn_t (*irq_handler)(struct adreno_device *); void (*irq_control)(struct adreno_device *, int); void * (*snapshot)(struct adreno_device *, void *, int *, int); @@ -99,7 +105,7 @@ void adreno_regread(struct kgsl_device *device, unsigned int offsetwords, void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords, unsigned int value); -const struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, +struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device, unsigned int pt_base, unsigned int gpuaddr, unsigned int size); diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c old mode 100755 new mode 100644 index 5ce9cf85b..62628e4a3 --- a/drivers/gpu/msm/adreno_a2xx.c +++ b/drivers/gpu/msm/adreno_a2xx.c @@ -1421,11 +1421,61 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev, return ret; } +static void a2xx_drawctxt_workaround(struct adreno_device *adreno_dev) +{ + struct kgsl_device *device = &adreno_dev->dev; + unsigned int cmd[11]; + unsigned int *cmds = &cmd[0]; + + if (adreno_is_a225(adreno_dev)) { + adreno_dev->gpudev->ctx_switches_since_last_draw++; + /* If there have been > than 
+ * ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW calls to context + * switches w/o gmem being saved then we need to execute + * this workaround */ + if (adreno_dev->gpudev->ctx_switches_since_last_draw > + ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW) + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; + else + return; + /* + * Issue an empty draw call to avoid possible hangs due to + * repeated idles without intervening draw calls. + * On adreno 225 the PC block has a cache that is only + * flushed on draw calls and repeated idles can make it + * overflow. The gmem save path contains draw calls so + * this workaround isn't needed there. + */ + *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); + *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); + *cmds++ = 0; + *cmds++ = 1<<14; + *cmds++ = 0; + *cmds++ = device->mmu.setstate_memory.gpuaddr; + *cmds++ = 0; + *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); + *cmds++ = 0x00000000; + } else { + /* On Adreno 20x/220, if the events for shader space reuse + * gets dropped, the CP block would wait indefinitely. + * Sending CP_SET_SHADER_BASES packet unblocks the CP from + * this wait. + */ + *cmds++ = cp_type3_packet(CP_SET_SHADER_BASES, 1); + *cmds++ = adreno_encode_istore_size(adreno_dev) + | adreno_dev->pix_shader_start; + } + + adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, + &cmd[0], cmds - cmd); +} + static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, struct adreno_context *context) { struct kgsl_device *device = &adreno_dev->dev; - unsigned int cmd[22]; if (context == NULL) return; @@ -1470,33 +1520,11 @@ static void a2xx_drawctxt_save(struct adreno_device *adreno_dev, adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_NONE, context->chicken_restore, 3); } + adreno_dev->gpudev->ctx_switches_since_last_draw = 0; context->flags |= CTXT_FLAGS_GMEM_RESTORE; - } else if (adreno_is_a225(adreno_dev)) { - unsigned int *cmds = &cmd[0]; - /* - * Issue an empty draw call to avoid possible hangs due to - * repeated idles without intervening draw calls. - * On adreno 225 the PC block has a cache that is only - * flushed on draw calls and repeated idles can make it - * overflow. The gmem save path contains draw calls so - * this workaround isn't needed there. 
- */ - *cmds++ = cp_type3_packet(CP_SET_CONSTANT, 2); - *cmds++ = (0x4 << 16) | (REG_PA_SU_SC_MODE_CNTL - 0x2000); - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_DRAW_INDX, 5); - *cmds++ = 0; - *cmds++ = 1<<14; - *cmds++ = 0; - *cmds++ = device->mmu.setstate_memory.gpuaddr; - *cmds++ = 0; - *cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); - *cmds++ = 0x00000000; - - adreno_ringbuffer_issuecmds(device, KGSL_CMD_FLAGS_PMODE, - &cmd[0], 11); - } + } else if (adreno_is_a2xx(adreno_dev)) + a2xx_drawctxt_workaround(adreno_dev); } static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev, @@ -1757,6 +1785,7 @@ struct adreno_gpudev adreno_a2xx_gpudev = { .ctxt_create = a2xx_drawctxt_create, .ctxt_save = a2xx_drawctxt_save, .ctxt_restore = a2xx_drawctxt_restore, + .ctxt_draw_workaround = a2xx_drawctxt_workaround, .irq_handler = a2xx_irq_handler, .irq_control = a2xx_irq_control, .snapshot = a2xx_snapshot, diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c index c1b9e4ce2..566efa1aa 100644 --- a/drivers/gpu/msm/adreno_debugfs.c +++ b/drivers/gpu/msm/adreno_debugfs.c @@ -345,6 +345,8 @@ void adreno_debugfs_init(struct kgsl_device *device) &kgsl_cff_dump_enable_fops); debugfs_create_u32("wait_timeout", 0644, device->d_debugfs, &adreno_dev->wait_timeout); + debugfs_create_u32("ib_check", 0644, device->d_debugfs, + &adreno_dev->ib_check_level); /* Create post mortem control files */ diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c index 206a678ee..f0b5741b5 100644 --- a/drivers/gpu/msm/adreno_drawctxt.c +++ b/drivers/gpu/msm/adreno_drawctxt.c @@ -243,8 +243,13 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev, } /* already current? */ - if (adreno_dev->drawctxt_active == drawctxt) + if (adreno_dev->drawctxt_active == drawctxt) { + if (adreno_dev->gpudev->ctxt_draw_workaround && + adreno_is_a225(adreno_dev)) + adreno_dev->gpudev->ctxt_draw_workaround( + adreno_dev); return; + } KGSL_CTXT_INFO(device, "from %p to %p flags %d\n", adreno_dev->drawctxt_active, drawctxt, flags); diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h index 8aea58c95..454b05785 100644 --- a/drivers/gpu/msm/adreno_pm4types.h +++ b/drivers/gpu/msm/adreno_pm4types.h @@ -29,11 +29,6 @@ /* skip N 32-bit words to get to the next packet */ #define CP_NOP 0x10 -/* indirect buffer dispatch. prefetch parser uses this packet type to determine -* whether to pre-fetch the IB -*/ -#define CP_INDIRECT_BUFFER 0x3f - /* indirect buffer dispatch. 
same as IB, but init is pipelined */ #define CP_INDIRECT_BUFFER_PFD 0x37 @@ -117,6 +112,9 @@ /* load constants from a location in memory */ #define CP_LOAD_CONSTANT_CONTEXT 0x2e +/* (A2x) sets binning configuration registers */ +#define CP_SET_BIN_DATA 0x2f + /* selective invalidation of state pointers */ #define CP_INVALIDATE_STATE 0x3b @@ -157,6 +155,16 @@ #define CP_SET_PROTECTED_MODE 0x5f /* sets the register protection mode */ +/* + * for a3xx + */ + +/* Conditionally load a IB based on a flag */ +#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ +#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ + +/* Load a buffer with pre-fetch enabled */ +#define CP_INDIRECT_BUFFER_PFE 0x3F /* packet header building macros */ #define cp_type0_packet(regindx, cnt) \ @@ -178,11 +186,20 @@ #define cp_nop_packet(cnt) \ (CP_TYPE3_PKT | (((cnt)-1) << 16) | (CP_NOP << 8)) +#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) + +#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) +#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) + +#define pkt_is_type3(pkt) (((pkt) & 0xC0000000) == CP_TYPE3_PKT) + +#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) +#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) /* packet headers */ #define CP_HDR_ME_INIT cp_type3_packet(CP_ME_INIT, 18) #define CP_HDR_INDIRECT_BUFFER_PFD cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) -#define CP_HDR_INDIRECT_BUFFER cp_type3_packet(CP_INDIRECT_BUFFER, 2) +#define CP_HDR_INDIRECT_BUFFER_PFE cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) /* dword base address of the GFX decode space */ #define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) @@ -190,4 +207,14 @@ /* gmem command buffer length */ #define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) + +/* Return 1 if the command is an indirect buffer of any kind */ +static inline int adreno_cmd_is_ib(unsigned int cmd) +{ + return (cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFE, 2) || + cmd == cp_type3_packet(CP_COND_INDIRECT_BUFFER_PFD, 2)); +} + #endif /* __ADRENO_PM4TYPES_H */ diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c old mode 100755 new mode 100644 index 40dfb30cf..63f5caa91 --- a/drivers/gpu/msm/adreno_postmortem.c +++ b/drivers/gpu/msm/adreno_postmortem.c @@ -53,7 +53,7 @@ static const struct pm_id_name pm3_types[] = { {CP_IM_LOAD, "IN__LOAD"}, {CP_IM_LOAD_IMMEDIATE, "IM_LOADI"}, {CP_IM_STORE, "IM_STORE"}, - {CP_INDIRECT_BUFFER, "IND_BUF_"}, + {CP_INDIRECT_BUFFER_PFE, "IND_BUF_"}, {CP_INDIRECT_BUFFER_PFD, "IND_BUFP"}, {CP_INTERRUPT, "PM4_INTR"}, {CP_INVALIDATE_STATE, "INV_STAT"}, @@ -200,7 +200,7 @@ static void dump_ib1(struct kgsl_device *device, uint32_t pt_base, for (i = 0; i+3 < ib1_size; ) { value = ib1_addr[i++]; - if (value == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(value)) { uint32_t ib2_base = ib1_addr[i++]; uint32_t ib2_size = ib1_addr[i++]; @@ -611,7 +611,7 @@ static int adreno_dump(struct kgsl_device *device) i = 0; for (read_idx = 0; read_idx < num_item; ) { uint32_t this_cmd = rb_copy[read_idx++]; - if (this_cmd == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx++]; uint32_t ib_size = rb_copy[read_idx++]; dump_ib1(device, cur_pt_base, (read_idx-3)<<2, ib_addr, @@ -654,8 +654,7 @@ static int adreno_dump(struct kgsl_device *device) for (read_idx = 
NUM_DWORDS_OF_RINGBUFFER_HISTORY; read_idx >= 0; --read_idx) { uint32_t this_cmd = rb_copy[read_idx]; - if (this_cmd == cp_type3_packet( - CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(this_cmd)) { uint32_t ib_addr = rb_copy[read_idx+1]; uint32_t ib_size = rb_copy[read_idx+2]; if (ib_size && cp_ib1_base == ib_addr) { diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c index ea2889b3a..57883fd45 100644 --- a/drivers/gpu/msm/adreno_ringbuffer.c +++ b/drivers/gpu/msm/adreno_ringbuffer.c @@ -22,6 +22,7 @@ #include "adreno.h" #include "adreno_pm4types.h" #include "adreno_ringbuffer.h" +#include "adreno_debugfs.h" #include "a2xx_reg.h" @@ -310,12 +311,10 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram) adreno_regwrite(device, REG_SCRATCH_UMSK, GSL_RB_MEMPTRS_SCRATCH_MASK); - /*< DTS2012042406822 hanfeng 20120428 begin*/ /* update the eoptimestamp field with the last retired timestamp */ kgsl_sharedmem_writel(&device->memstore, KGSL_DEVICE_MEMSTORE_OFFSET(eoptimestamp), rb->timestamp); - /* DTS2012042406822 hanfeng 20120428 end > */ /* load the CP ucode */ @@ -554,6 +553,197 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device, adreno_ringbuffer_addcmds(rb, flags, cmds, sizedwords); } +static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr, + int sizedwords); + +static bool +_handle_type3(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int opcode = cp_type3_opcode(*hostaddr); + switch (opcode) { + case CP_INDIRECT_BUFFER_PFD: + case CP_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFE: + case CP_COND_INDIRECT_BUFFER_PFD: + return _parse_ibs(dev_priv, hostaddr[1], hostaddr[2]); + case CP_NOP: + case CP_WAIT_FOR_IDLE: + case CP_WAIT_REG_MEM: + case CP_WAIT_REG_EQ: + case CP_WAT_REG_GTE: + case CP_WAIT_UNTIL_READ: + case CP_WAIT_IB_PFD_COMPLETE: + case CP_REG_RMW: + case CP_REG_TO_MEM: + case CP_MEM_WRITE: + case CP_MEM_WRITE_CNTR: + case CP_COND_EXEC: + case CP_COND_WRITE: + case CP_EVENT_WRITE: + case CP_EVENT_WRITE_SHD: + case CP_EVENT_WRITE_CFL: + case CP_EVENT_WRITE_ZPD: + case CP_DRAW_INDX: + case CP_DRAW_INDX_2: + case CP_DRAW_INDX_BIN: + case CP_DRAW_INDX_2_BIN: + case CP_VIZ_QUERY: + case CP_SET_STATE: + case CP_SET_CONSTANT: + case CP_IM_LOAD: + case CP_IM_LOAD_IMMEDIATE: + case CP_LOAD_CONSTANT_CONTEXT: + case CP_INVALIDATE_STATE: + case CP_SET_SHADER_BASES: + case CP_SET_BIN_MASK: + case CP_SET_BIN_SELECT: + case CP_SET_BIN_BASE_OFFSET: + case CP_SET_BIN_DATA: + case CP_CONTEXT_UPDATE: + case CP_INTERRUPT: + case CP_IM_STORE: + break; + /* these shouldn't come from userspace */ + case CP_ME_INIT: + case CP_SET_PROTECTED_MODE: + default: + KGSL_CMD_ERR(dev_priv->device, "bad CP opcode %0x\n", opcode); + return false; + break; + } + + return true; +} + +static bool +_handle_type0(struct kgsl_device_private *dev_priv, uint *hostaddr) +{ + unsigned int reg = type0_pkt_offset(*hostaddr); + unsigned int cnt = type0_pkt_size(*hostaddr); + if (reg < 0x0192 || (reg + cnt) >= 0x8000) { + KGSL_CMD_ERR(dev_priv->device, "bad type0 reg: 0x%0x cnt: %d\n", + reg, cnt); + return false; + } + return true; +} + +/* + * Traverse IBs and dump them to test vector. 
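/*
 * Illustrative aside, not part of the patch: the IB checker added in this
 * hunk walks a command stream one packet at a time by decoding each packet
 * header. A standalone sketch of that decode, using only the macros added
 * to adreno_pm4types.h in this patch:
 */
static void sketch_decode_pm4_header(unsigned int header)
{
	if (pkt_is_type0(header))
		printk(KERN_DEBUG "type0: %d dwords for regs from 0x%x\n",
		       type0_pkt_size(header), type0_pkt_offset(header));
	else if (pkt_is_type3(header))
		printk(KERN_DEBUG "type3: opcode 0x%x, %d payload dwords\n",
		       cp_type3_opcode(header), type3_pkt_size(header));
	else
		printk(KERN_DEBUG "type1 or unrecognized packet\n");
	/*
	 * e.g. the header built by cp_type3_packet(CP_WAIT_FOR_IDLE, 1)
	 * decodes back to opcode CP_WAIT_FOR_IDLE with one payload dword;
	 * adreno_cmd_is_ib() is simply this test specialised to the four
	 * indirect-buffer opcodes.
	 */
}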
Detect swap by inspecting + * register writes, keeping note of the current state, and dump + * framebuffer config to test vector + */ +static bool _parse_ibs(struct kgsl_device_private *dev_priv, + uint gpuaddr, int sizedwords) +{ + static uint level; /* recursion level */ + bool ret = false; + uint *hostaddr, *hoststart; + int dwords_left = sizedwords; /* dwords left in the current command + buffer */ + struct kgsl_mem_entry *entry; + + spin_lock(&dev_priv->process_priv->mem_lock); + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, + gpuaddr, sizedwords * sizeof(uint)); + spin_unlock(&dev_priv->process_priv->mem_lock); + if (entry == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(&entry->memdesc, gpuaddr); + if (hostaddr == NULL) { + KGSL_CMD_ERR(dev_priv->device, + "no mapping for gpuaddr: 0x%08x\n", gpuaddr); + return false; + } + + hoststart = hostaddr; + + level++; + + KGSL_CMD_INFO(dev_priv->device, "ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", + gpuaddr, sizedwords, hostaddr); + + mb(); + while (dwords_left > 0) { + bool cur_ret = true; + int count = 0; /* dword count including packet header */ + + switch (*hostaddr >> 30) { + case 0x0: /* type-0 */ + count = (*hostaddr >> 16)+2; + cur_ret = _handle_type0(dev_priv, hostaddr); + break; + case 0x1: /* type-1 */ + count = 2; + break; + case 0x3: /* type-3 */ + count = ((*hostaddr >> 16) & 0x3fff) + 2; + cur_ret = _handle_type3(dev_priv, hostaddr); + break; + default: + KGSL_CMD_ERR(dev_priv->device, "unexpected type: " + "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", + *hostaddr >> 30, *hostaddr, hostaddr, + gpuaddr+4*(sizedwords-dwords_left)); + cur_ret = false; + count = dwords_left; + break; + } + + if (!cur_ret) { + KGSL_CMD_ERR(dev_priv->device, + "bad sub-type: #:%d/%d, v:0x%08x" + " @ 0x%p[gb:0x%08x], level:%d\n", + sizedwords-dwords_left, sizedwords, *hostaddr, + hostaddr, gpuaddr+4*(sizedwords-dwords_left), + level); + + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? "IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + + /* jump to next packet */ + dwords_left -= count; + hostaddr += count; + if (dwords_left < 0) { + KGSL_CMD_ERR(dev_priv->device, + "bad count: c:%d, #:%d/%d, " + "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", + count, sizedwords-(dwords_left+count), + sizedwords, *(hostaddr-count), hostaddr-count, + gpuaddr+4*(sizedwords-(dwords_left+count)), + level); + if (ADRENO_DEVICE(dev_priv->device)->ib_check_level + >= 2) + print_hex_dump(KERN_ERR, + level == 1 ? 
"IB1:" : "IB2:", + DUMP_PREFIX_OFFSET, 32, 4, hoststart, + sizedwords*4, 0); + goto done; + } + } + + ret = true; +done: + if (!ret) + KGSL_DRV_ERR(dev_priv->device, + "parsing failed: gpuaddr:0x%08x, " + "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); + + level--; + + return ret; +} + int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, struct kgsl_context *context, @@ -601,9 +791,12 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv, start_index = 1; for (i = start_index; i < numibs; i++) { - (void)kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, false); - + if (unlikely(adreno_dev->ib_check_level >= 1 && + !_parse_ibs(dev_priv, ibdesc[i].gpuaddr, + ibdesc[i].sizedwords))) { + kfree(link); + return -EINVAL; + } *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD; *cmds++ = ibdesc[i].gpuaddr; *cmds++ = ibdesc[i].sizedwords; @@ -757,8 +950,20 @@ int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb, kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr); rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, rb->buffer_desc.size); - BUG_ON((copy_rb_contents == 0) && - (value == cur_context)); + + /* + * If other context switches were already lost and + * and the current context is the one that is hanging, + * then we cannot recover. Print an error message + * and leave. + */ + + if ((copy_rb_contents == 0) && (value == cur_context)) { + KGSL_DRV_ERR(device, "GPU recovery could not " + "find the previous context\n"); + return -EINVAL; + } + /* * If we were copying the commands and got to this point * then we need to remove the 3 commands that appear diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c index fb88a72bd..c45dbff48 100644 --- a/drivers/gpu/msm/adreno_snapshot.c +++ b/drivers/gpu/msm/adreno_snapshot.c @@ -45,11 +45,19 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, int index; void *ptr; - /* Go through the list and see that object has already been seen */ + /* + * Sometimes IBs can be reused in the same dump. Because we parse from + * oldest to newest, if we come across an IB that has already been used, + * assume that it has been reused and update the list with the newest + * size. 
+ */ + for (index = 0; index < objbufptr; index++) { if (objbuf[index].gpuaddr == gpuaddr && - objbuf[index].ptbase == ptbase) - return; + objbuf[index].ptbase == ptbase) { + objbuf[index].dwords = dwords; + return; + } } if (objbufptr == SNAPSHOT_OBJ_BUFSIZE) { @@ -77,6 +85,25 @@ static void push_object(struct kgsl_device *device, int type, uint32_t ptbase, objbuf[objbufptr++].ptr = ptr; } +/* + * Return a 1 if the specified object is already on the list of buffers + * to be dumped + */ + +static int find_object(int type, unsigned int gpuaddr, unsigned int ptbase) +{ + int index; + + for (index = 0; index < objbufptr; index++) { + if (objbuf[index].gpuaddr == gpuaddr && + objbuf[index].ptbase == ptbase && + objbuf[index].type == type) + return 1; + } + + return 0; +} + /* Snapshot the istore memory */ static int snapshot_istore(struct kgsl_device *device, void *snapshot, int remain, void *priv) @@ -113,6 +140,7 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, unsigned int rbbase, ptbase, rptr, *rbptr; int start, stop, index; int numitems, size; + int parse_ibs = 0, ib_parse_start; /* Get the GPU address of the ringbuffer */ kgsl_regread(device, REG_CP_RB_BASE, &rbbase); @@ -158,9 +186,53 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, header->rbsize = rb->sizedwords; header->count = numitems; - index = start; + /* + * We can only reliably dump IBs from the beginning of the context, + * and it turns out that for the vast majority of the time we really + * only care about the current context when it comes to diagnosing + * a hang. So, with an eye to limiting the buffer dumping to what is + * really useful find the beginning of the context and only dump + * IBs from that point + */ + + index = rptr; + ib_parse_start = start; rbptr = rb->buffer_desc.hostptr; + while (index != start) { + index--; + + if (index < 0) { + /* + * The marker we are looking for is 2 dwords long, so + * when wrapping, go back 2 from the end so we don't + * access out of range in the if statement below + */ + index = rb->sizedwords - 2; + + /* + * Account for the possibility that start might be at + * rb->sizedwords - 1 + */ + + if (start == rb->sizedwords - 1) + break; + } + + /* + * Look for a NOP packet with the context switch identifier in + * the second dword + */ + + if (rbptr[index] == cp_nop_packet(1) && + rbptr[index + 1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) { + ib_parse_start = index; + break; + } + } + + index = start; + /* * Loop through the RB, copying the data and looking for indirect * buffers and MMU pagetable changes @@ -169,15 +241,18 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot, while (index != rb->wptr) { *data = rbptr[index]; - if (rbptr[index] == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) + /* Only parse IBs between the context start and the rptr */ + + if (index == ib_parse_start) + parse_ibs = 1; + + if (index == rptr) + parse_ibs = 0; + + if (parse_ibs && adreno_cmd_is_ib(rbptr[index])) push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, rbptr[index + 1], rbptr[index + 2]); - /* - * FIXME: Handle upcoming MMU pagetable changes, but only - * between the rptr and the wptr - */ - index = index + 1; if (index == rb->sizedwords) @@ -228,10 +303,9 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot, *dst = *src; /* If another IB is discovered, then push it on the list too */ - if (*src == cp_type3_packet(CP_INDIRECT_BUFFER_PFD, 2)) { + if (adreno_cmd_is_ib(*src)) push_object(device, SNAPSHOT_OBJ_TYPE_IB, obj->ptbase, 
*(src + 1), *(src + 2)); - } src++; dst++; @@ -288,22 +362,45 @@ void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain, snapshot, remain, snapshot_rb, NULL); /* - * Make sure that the IBs described in the CP registers are on the - * list of objects + * Make sure that the last IB1 that was being executed is dumped. + * Since this was the last IB1 that was processed, we should have + * already added it to the list during the ringbuffer parse but we + * want to be double plus sure. */ + kgsl_regread(device, REG_CP_IB1_BASE, &ibbase); kgsl_regread(device, REG_CP_IB1_BUFSZ, &ibsize); - if (ibsize) + /* + * The problem is that IB size from the register is the unprocessed size + * of the buffer not the original size, so if we didn't catch this + * buffer being directly used in the RB, then we might not be able to + * dump the whle thing. Print a warning message so we can try to + * figure how often this really happens. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + KGSL_DRV_ERR(device, "CP_IB1_BASE not found in the ringbuffer. " + "Dumping %x dwords of the buffer.\n", ibsize); + } kgsl_regread(device, REG_CP_IB2_BASE, &ibbase); kgsl_regread(device, REG_CP_IB2_BUFSZ, &ibsize); - if (ibsize) + /* + * Add the last parsed IB2 to the list. The IB2 should be found as we + * parse the objects below, but we try to add it to the list first, so + * it too can be parsed. Don't print an error message in this case - if + * the IB2 is found during parsing, the list will be updated with the + * correct size. + */ + + if (!find_object(SNAPSHOT_OBJ_TYPE_IB, ibbase, ptbase) && ibsize) { push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase, ibbase, ibsize); + } /* * Go through the list of found objects and dump each one. 
As the IBs diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c old mode 100755 new mode 100644 index d51128979..39dad925f --- a/drivers/gpu/msm/kgsl.c +++ b/drivers/gpu/msm/kgsl.c @@ -21,11 +21,10 @@ #include #include #include - +#include #include #include #include -#include #include "kgsl.h" #include "kgsl_debugfs.h" @@ -194,8 +193,28 @@ static void kgsl_mem_entry_attach_process(struct kgsl_mem_entry *entry, struct kgsl_process_private *process) { + struct rb_node **node; + struct rb_node *parent = NULL; + spin_lock(&process->mem_lock); - list_add(&entry->list, &process->mem_list); + + node = &process->mem_rb.rb_node; + + while (*node) { + struct kgsl_mem_entry *cur; + + parent = *node; + cur = rb_entry(parent, struct kgsl_mem_entry, node); + + if (entry->memdesc.gpuaddr < cur->memdesc.gpuaddr) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entry->node, parent, node); + rb_insert_color(&entry->node, &process->mem_rb); + spin_unlock(&process->mem_lock); entry->priv = process; @@ -405,6 +424,10 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state) INIT_COMPLETION(device->hwaccess_gate); device->ftbl->suspend_context(device); device->ftbl->stop(device); + if (device->idle_wakelock.name) + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); kgsl_pwrctrl_set_state(device, KGSL_STATE_SUSPEND); break; case KGSL_STATE_SLUMBER: @@ -514,8 +537,8 @@ void kgsl_late_resume_driver(struct early_suspend *h) struct kgsl_device, display_off); KGSL_PWR_WARN(device, "late resume start\n"); mutex_lock(&device->mutex); - kgsl_pwrctrl_wake(device); device->pwrctrl.restore_slumber = 0; + kgsl_pwrctrl_wake(device); kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO); mutex_unlock(&device->mutex); kgsl_check_idle(device); @@ -548,8 +571,7 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv) spin_lock_init(&private->mem_lock); private->refcnt = 1; private->pid = task_tgid_nr(current); - - INIT_LIST_HEAD(&private->mem_list); + private->mem_rb = RB_ROOT; if (kgsl_mmu_enabled()) { @@ -578,7 +600,7 @@ kgsl_put_process_private(struct kgsl_device *device, struct kgsl_process_private *private) { struct kgsl_mem_entry *entry = NULL; - struct kgsl_mem_entry *entry_tmp = NULL; + struct rb_node *node; if (!private) return; @@ -592,11 +614,13 @@ kgsl_put_process_private(struct kgsl_device *device, list_del(&private->list); - list_for_each_entry_safe(entry, entry_tmp, &private->mem_list, list) { - list_del(&entry->list); + for (node = rb_first(&private->mem_rb); node; ) { + entry = rb_entry(node, struct kgsl_mem_entry, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, &private->mem_rb); kgsl_mem_entry_put(entry); } - kgsl_mmu_putpagetable(private->pagetable); kfree(private); unlock: @@ -722,47 +746,43 @@ static int kgsl_open(struct inode *inodep, struct file *filep) return result; } - /*call with private->mem_lock locked */ -static struct kgsl_mem_entry * -kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +struct kgsl_mem_entry * +kgsl_sharedmem_find_region(struct kgsl_process_private *private, + unsigned int gpuaddr, size_t size) { - struct kgsl_mem_entry *entry = NULL, *result = NULL; + struct rb_node *node = private->mem_rb.rb_node; - BUG_ON(private == NULL); + while (node != NULL) { + struct kgsl_mem_entry *entry; - gpuaddr &= PAGE_MASK; + entry = rb_entry(node, struct kgsl_mem_entry, node); - list_for_each_entry(entry, 
&private->mem_list, list) { - if (entry->memdesc.gpuaddr == gpuaddr) { - result = entry; - break; - } - } - return result; -} - -/*call with private->mem_lock locked */ -struct kgsl_mem_entry * -kgsl_sharedmem_find_region(struct kgsl_process_private *private, - unsigned int gpuaddr, - size_t size) -{ - struct kgsl_mem_entry *entry = NULL, *result = NULL; - BUG_ON(private == NULL); + if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) + return entry; - list_for_each_entry(entry, &private->mem_list, list) { - if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) { - result = entry; - break; + if (gpuaddr < entry->memdesc.gpuaddr) + node = node->rb_left; + else if (gpuaddr >= + (entry->memdesc.gpuaddr + entry->memdesc.size)) + node = node->rb_right; + else { + return NULL; } } - return result; + return NULL; } EXPORT_SYMBOL(kgsl_sharedmem_find_region); +/*call with private->mem_lock locked */ +static inline struct kgsl_mem_entry * +kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr) +{ + return kgsl_sharedmem_find_region(private, gpuaddr, 1); +} + /*call all ioctl sub functions with driver locked*/ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -789,6 +809,40 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv, break; } + case KGSL_PROP_GPU_RESET_STAT: + { + /* Return reset status of given context and clear it */ + uint32_t id; + struct kgsl_context *context; + + if (param->sizebytes != sizeof(unsigned int)) { + result = -EINVAL; + break; + } + /* We expect the value passed in to contain the context id */ + if (copy_from_user(&id, param->value, + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + context = kgsl_find_context(dev_priv, id); + if (!context) { + result = -EINVAL; + break; + } + /* + * Copy the reset status to value which also serves as + * the out parameter + */ + if (copy_to_user(param->value, &(context->reset_status), + sizeof(unsigned int))) { + result = -EFAULT; + break; + } + /* Clear reset status once its been queried */ + context->reset_status = KGSL_CTX_STAT_NO_ERROR; + break; + } default: result = dev_priv->device->ftbl->getproperty( dev_priv->device, param->type, @@ -827,40 +881,6 @@ static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private return result; } -static bool check_ibdesc(struct kgsl_device_private *dev_priv, - struct kgsl_ibdesc *ibdesc, unsigned int numibs, - bool parse) -{ - bool result = true; - unsigned int i; - for (i = 0; i < numibs; i++) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d\n", ibdesc[i].gpuaddr, - ibdesc[i].sizedwords); - result = false; - break; - } - - if (parse && !kgsl_cffdump_parse_ibs(dev_priv, &entry->memdesc, - ibdesc[i].gpuaddr, ibdesc[i].sizedwords, true)) { - KGSL_DRV_ERR(dev_priv->device, - "invalid cmd buffer gpuaddr %08x " \ - "sizedwords %d numibs %d/%d\n", - ibdesc[i].gpuaddr, - ibdesc[i].sizedwords, i+1, numibs); - result = false; - break; - } - } - return result; -} static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int cmd, void *data) @@ -930,12 +950,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct 
kgsl_device_private *dev_priv, param->numibs = 1; } - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, true)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc"); - result = -EINVAL; - goto free_ibdesc; - } - result = dev_priv->device->ftbl->issueibcmds(dev_priv, context, ibdesc, @@ -945,18 +959,6 @@ static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv, trace_kgsl_issueibcmds(dev_priv->device, param, result); - if (result != 0) - goto free_ibdesc; - - /* this is a check to try to detect if a command buffer was freed - * during issueibcmds(). - */ - if (!check_ibdesc(dev_priv, ibdesc, param->numibs, false)) { - KGSL_DRV_ERR(dev_priv->device, "bad ibdesc AFTER issue"); - result = -EINVAL; - goto free_ibdesc; - } - free_ibdesc: kfree(ibdesc); done: @@ -988,7 +990,7 @@ static void kgsl_freemem_event_cb(struct kgsl_device *device, { struct kgsl_mem_entry *entry = priv; spin_lock(&entry->priv->mem_lock); - list_del(&entry->list); + rb_erase(&entry->node, &entry->priv->mem_rb); spin_unlock(&entry->priv->mem_lock); kgsl_mem_entry_put(entry); } @@ -1080,7 +1082,8 @@ static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv, spin_lock(&private->mem_lock); entry = kgsl_sharedmem_find(private, param->gpuaddr); if (entry) - list_del(&entry->list); + rb_erase(&entry->node, &private->mem_rb); + spin_unlock(&private->mem_lock); if (entry) { @@ -1164,7 +1167,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, goto error; } - result = kgsl_sharedmem_vmalloc_user(&entry->memdesc, + result = kgsl_sharedmem_page_alloc_user(&entry->memdesc, private->pagetable, len, param->flags); if (result != 0) @@ -1172,10 +1175,10 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - result = remap_vmalloc_range(vma, (void *) entry->memdesc.hostptr, 0); + result = kgsl_sharedmem_map_vma(vma, &entry->memdesc); if (result) { - KGSL_CORE_ERR("remap_vmalloc_range failed: %d\n", result); - goto error_free_vmalloc; + KGSL_CORE_ERR("kgsl_sharedmem_map_vma failed: %d\n", result); + goto error_free_alloc; } param->gpuaddr = entry->memdesc.gpuaddr; @@ -1190,7 +1193,7 @@ kgsl_ioctl_sharedmem_from_vmalloc(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return 0; -error_free_vmalloc: +error_free_alloc: kgsl_sharedmem_free(&entry->memdesc); error_free_entry: @@ -1313,7 +1316,8 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, int sglen = PAGE_ALIGN(size) / PAGE_SIZE; unsigned long paddr = (unsigned long) addr; - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); + memdesc->sg = kgsl_sg_alloc(sglen); + if (memdesc->sg == NULL) return -ENOMEM; @@ -1353,7 +1357,7 @@ static int memdesc_sg_virt(struct kgsl_memdesc *memdesc, err: spin_unlock(¤t->mm->page_table_lock); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, sglen); memdesc->sg = NULL; return -EINVAL; @@ -1488,11 +1492,8 @@ static int kgsl_setup_ion(struct kgsl_mem_entry *entry, struct scatterlist *s; unsigned long flags; - if (kgsl_ion_client == NULL) { - kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); - if (kgsl_ion_client == NULL) - return -ENODEV; - } + if (IS_ERR_OR_NULL(kgsl_ion_client)) + return -ENODEV; handle = ion_import_fd(kgsl_ion_client, fd); if (IS_ERR_OR_NULL(handle)) @@ -1622,10 +1623,20 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv, kgsl_check_idle(dev_priv->device); return result; - error_put_file_ptr: - if (entry->priv_data) - 
fput(entry->priv_data); - +error_put_file_ptr: + switch (entry->memtype) { + case KGSL_MEM_ENTRY_PMEM: + case KGSL_MEM_ENTRY_ASHMEM: + if (entry->priv_data) + fput(entry->priv_data); + break; + case KGSL_MEM_ENTRY_ION: + ion_unmap_dma(kgsl_ion_client, entry->priv_data); + ion_free(kgsl_ion_client, entry->priv_data); + break; + default: + break; + } error: kfree(entry); kgsl_check_idle(dev_priv->device); @@ -2029,7 +2040,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_offset = vma->vm_pgoff << PAGE_SHIFT; struct kgsl_device_private *dev_priv = file->private_data; struct kgsl_process_private *private = dev_priv->process_priv; - struct kgsl_mem_entry *tmp, *entry = NULL; + struct kgsl_mem_entry *entry = NULL; struct kgsl_device *device = dev_priv->device; /* Handle leagacy behavior for memstore */ @@ -2040,13 +2051,11 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma) /* Find a chunk of GPU memory */ spin_lock(&private->mem_lock); - list_for_each_entry(tmp, &private->mem_list, list) { - if (vma_offset == tmp->memdesc.gpuaddr) { - kgsl_mem_entry_get(tmp); - entry = tmp; - break; - } - } + entry = kgsl_sharedmem_find(private, vma_offset); + + if (entry) + kgsl_mem_entry_get(entry); + spin_unlock(&private->mem_lock); if (entry == NULL) @@ -2102,8 +2111,8 @@ void kgsl_unregister_device(struct kgsl_device *device) kgsl_cffdump_close(device->id); kgsl_pwrctrl_uninit_sysfs(device); - if (cpu_is_msm8x60()) - wake_lock_destroy(&device->idle_wakelock); + wake_lock_destroy(&device->idle_wakelock); + pm_qos_remove_request(&device->pm_qos_req_dma); idr_destroy(&device->context_idr); @@ -2194,9 +2203,9 @@ kgsl_register_device(struct kgsl_device *device) if (ret != 0) goto err_close_mmu; - if (cpu_is_msm8x60()) - wake_lock_init(&device->idle_wakelock, - WAKE_LOCK_IDLE, device->name); + wake_lock_init(&device->idle_wakelock, WAKE_LOCK_IDLE, device->name); + pm_qos_add_request(&device->pm_qos_req_dma, PM_QOS_CPU_DMA_LATENCY, + PM_QOS_DEFAULT_VALUE); idr_init(&device->context_idr); @@ -2242,6 +2251,8 @@ int kgsl_device_platform_probe(struct kgsl_device *device, if (status) goto error; + kgsl_ion_client = msm_ion_client_create(UINT_MAX, KGSL_NAME); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, device->iomemname); if (res == NULL) { @@ -2339,22 +2350,30 @@ kgsl_ptdata_init(void) static void kgsl_core_exit(void) { - unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); - - kgsl_mmu_ptpool_destroy(&kgsl_driver.ptpool); + kgsl_mmu_ptpool_destroy(kgsl_driver.ptpool); kgsl_driver.ptpool = NULL; - device_unregister(&kgsl_driver.virtdev); + kgsl_drm_exit(); + kgsl_cffdump_destroy(); + kgsl_core_debugfs_close(); + + /* + * We call kgsl_sharedmem_uninit_sysfs() and device_unregister() + * only if kgsl_driver.virtdev has been populated. + * We check at least one member of kgsl_driver.virtdev to + * see if it is not NULL (and thus, has been populated). 
+ */ + if (kgsl_driver.virtdev.class) { + kgsl_sharedmem_uninit_sysfs(); + device_unregister(&kgsl_driver.virtdev); + } if (kgsl_driver.class) { class_destroy(kgsl_driver.class); kgsl_driver.class = NULL; } - kgsl_drm_exit(); - kgsl_cffdump_destroy(); - kgsl_core_debugfs_close(); - kgsl_sharedmem_uninit_sysfs(); + unregister_chrdev_region(kgsl_driver.major, KGSL_DEVICE_MAX); } static int __init kgsl_core_init(void) diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h index d3ae4b9bb..f027f95c4 100644 --- a/drivers/gpu/msm/kgsl.h +++ b/drivers/gpu/msm/kgsl.h @@ -21,13 +21,12 @@ #include #include #include +#include #define KGSL_NAME "kgsl" -/*< DTS2012042406822 hanfeng 20120428 begin*/ -/* Timestamp window used to detect rollovers */ +/* Timestamp window used to detect rollovers (half of integer range) */ #define KGSL_TIMESTAMP_WINDOW 0x80000000 -/* DTS2012042406822 hanfeng 20120428 end > */ /*cache coherency ops */ #define DRM_KGSL_GEM_CACHE_OP_TO_DEV 0x0001 @@ -96,6 +95,8 @@ struct kgsl_driver { struct { unsigned int vmalloc; unsigned int vmalloc_max; + unsigned int page_alloc; + unsigned int page_alloc_max; unsigned int coherent; unsigned int coherent_max; unsigned int mapped; @@ -107,7 +108,15 @@ struct kgsl_driver { extern struct kgsl_driver kgsl_driver; struct kgsl_pagetable; -struct kgsl_memdesc_ops; +struct kgsl_memdesc; + +struct kgsl_memdesc_ops { + int (*vmflags)(struct kgsl_memdesc *); + int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, + struct vm_fault *); + void (*free)(struct kgsl_memdesc *memdesc); + int (*map_kernel_mem)(struct kgsl_memdesc *); +}; /* shared memory allocation */ struct kgsl_memdesc { @@ -136,7 +145,7 @@ struct kgsl_mem_entry { struct kgsl_memdesc memdesc; int memtype; void *priv_data; - struct list_head list; + struct rb_node node; uint32_t free_timestamp; /* back pointer to private structure under whose context this * allocation is made */ @@ -186,27 +195,47 @@ static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc, } return 0; } -static inline uint8_t *kgsl_gpuaddr_to_vaddr(const struct kgsl_memdesc *memdesc, + +static inline void *kgsl_memdesc_map(struct kgsl_memdesc *memdesc) +{ + if (memdesc->hostptr == NULL && memdesc->ops && + memdesc->ops->map_kernel_mem) + memdesc->ops->map_kernel_mem(memdesc); + + return memdesc->hostptr; +} + +static inline uint8_t *kgsl_gpuaddr_to_vaddr(struct kgsl_memdesc *memdesc, unsigned int gpuaddr) { - if (memdesc->hostptr == NULL || memdesc->gpuaddr == 0 || - (gpuaddr < memdesc->gpuaddr || - gpuaddr >= memdesc->gpuaddr + memdesc->size)) - return NULL; + if (memdesc->gpuaddr == 0 || + gpuaddr < memdesc->gpuaddr || + gpuaddr >= (memdesc->gpuaddr + memdesc->size) || + (NULL == memdesc->hostptr && memdesc->ops->map_kernel_mem && + memdesc->ops->map_kernel_mem(memdesc))) + return NULL; return memdesc->hostptr + (gpuaddr - memdesc->gpuaddr); } -static inline int timestamp_cmp(unsigned int new, unsigned int old) +static inline int timestamp_cmp(unsigned int a, unsigned int b) { - int ts_diff = new - old; - - if (ts_diff == 0) + /* check for equal */ + if (a == b) return 0; - /*< DTS2012042406822 hanfeng 20120428 begin*/ - return ((ts_diff > 0) || (ts_diff < -KGSL_TIMESTAMP_WINDOW)) ? 
1 : -1; - /* DTS2012042406822 hanfeng 20120428 end > */ + /* check for greater-than for non-rollover case */ + if ((a > b) && (a - b < KGSL_TIMESTAMP_WINDOW)) + return 1; + + /* check for greater-than for rollover case + * note that <= is required to ensure that consistent + * results are returned for values whose difference is + * equal to the window size + */ + a += KGSL_TIMESTAMP_WINDOW; + b += KGSL_TIMESTAMP_WINDOW; + return ((a > b) && (a - b <= KGSL_TIMESTAMP_WINDOW)) ? 1 : -1; } static inline void diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c index e9455cb82..77aef1ff0 100644 --- a/drivers/gpu/msm/kgsl_cffdump.c +++ b/drivers/gpu/msm/kgsl_cffdump.c @@ -497,190 +497,6 @@ int kgsl_cffdump_waitirq(void) } EXPORT_SYMBOL(kgsl_cffdump_waitirq); -#define ADDRESS_STACK_SIZE 256 -#define GET_PM4_TYPE3_OPCODE(x) ((*(x) >> 8) & 0xFF) -static unsigned int kgsl_cffdump_addr_count; - -static bool kgsl_cffdump_handle_type3(struct kgsl_device_private *dev_priv, - uint *hostaddr, bool check_only) -{ - static uint addr_stack[ADDRESS_STACK_SIZE]; - static uint size_stack[ADDRESS_STACK_SIZE]; - - switch (GET_PM4_TYPE3_OPCODE(hostaddr)) { - case CP_INDIRECT_BUFFER_PFD: - case CP_INDIRECT_BUFFER: - { - /* traverse indirect buffers */ - int i; - uint ibaddr = hostaddr[1]; - uint ibsize = hostaddr[2]; - - /* is this address already in encountered? */ - for (i = 0; - i < kgsl_cffdump_addr_count && addr_stack[i] != ibaddr; - ++i) - ; - - if (kgsl_cffdump_addr_count == i) { - addr_stack[kgsl_cffdump_addr_count] = ibaddr; - size_stack[kgsl_cffdump_addr_count++] = ibsize; - - if (kgsl_cffdump_addr_count >= ADDRESS_STACK_SIZE) { - KGSL_CORE_ERR("stack overflow\n"); - return false; - } - - return kgsl_cffdump_parse_ibs(dev_priv, NULL, - ibaddr, ibsize, check_only); - } else if (size_stack[i] != ibsize) { - KGSL_CORE_ERR("gpuaddr: 0x%08x, " - "wc: %u, with size wc: %u already on the " - "stack\n", ibaddr, ibsize, size_stack[i]); - return false; - } - } - break; - } - - return true; -} - -/* - * Traverse IBs and dump them to test vector. 
Detect swap by inspecting - * register writes, keeping note of the current state, and dump - * framebuffer config to test vector - */ -bool kgsl_cffdump_parse_ibs(struct kgsl_device_private *dev_priv, - const struct kgsl_memdesc *memdesc, uint gpuaddr, int sizedwords, - bool check_only) -{ - static uint level; /* recursion level */ - bool ret = true; - uint *hostaddr, *hoststart; - int dwords_left = sizedwords; /* dwords left in the current command - buffer */ - - if (level == 0) - kgsl_cffdump_addr_count = 0; - - if (memdesc == NULL) { - struct kgsl_mem_entry *entry; - spin_lock(&dev_priv->process_priv->mem_lock); - entry = kgsl_sharedmem_find_region(dev_priv->process_priv, - gpuaddr, sizedwords * sizeof(uint)); - spin_unlock(&dev_priv->process_priv->mem_lock); - if (entry == NULL) { - KGSL_CORE_ERR("did not find mapping " - "for gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - memdesc = &entry->memdesc; - } - hostaddr = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr); - if (hostaddr == NULL) { - KGSL_CORE_ERR("no kernel mapping for " - "gpuaddr: 0x%08x\n", gpuaddr); - return true; - } - - hoststart = hostaddr; - - level++; - - mb(); - kgsl_cache_range_op((struct kgsl_memdesc *)memdesc, - KGSL_CACHE_OP_INV); -#ifdef DEBUG - pr_info("kgsl: cffdump: ib: gpuaddr:0x%08x, wc:%d, hptr:%p\n", - gpuaddr, sizedwords, hostaddr); -#endif - - while (dwords_left > 0) { - int count = 0; /* dword count including packet header */ - bool cur_ret = true; - - switch (*hostaddr >> 30) { - case 0x0: /* type-0 */ - count = (*hostaddr >> 16)+2; - break; - case 0x1: /* type-1 */ - count = 2; - break; - case 0x3: /* type-3 */ - count = ((*hostaddr >> 16) & 0x3fff) + 2; - cur_ret = kgsl_cffdump_handle_type3(dev_priv, - hostaddr, check_only); - break; - default: - pr_warn("kgsl: cffdump: parse-ib: unexpected type: " - "type:%d, word:0x%08x @ 0x%p, gpu:0x%08x\n", - *hostaddr >> 30, *hostaddr, hostaddr, - gpuaddr+4*(sizedwords-dwords_left)); - cur_ret = false; - count = dwords_left; - break; - } - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad sub-type: #:%d/%d, v:0x%08x" - " @ 0x%p[gb:0x%08x], level:%d\n", - sizedwords-dwords_left, sizedwords, *hostaddr, - hostaddr, gpuaddr+4*(sizedwords-dwords_left), - level); - - print_hex_dump(KERN_ERR, level == 1 ? "IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - ret = ret && cur_ret; - - /* jump to next packet */ - dwords_left -= count; - hostaddr += count; - cur_ret = dwords_left >= 0; - -#ifdef DEBUG - if (!cur_ret) { - pr_info("kgsl: cffdump: bad count: c:%d, #:%d/%d, " - "v:0x%08x @ 0x%p[gb:0x%08x], level:%d\n", - count, sizedwords-(dwords_left+count), - sizedwords, *(hostaddr-count), hostaddr-count, - gpuaddr+4*(sizedwords-(dwords_left+count)), - level); - - print_hex_dump(KERN_ERR, level == 1 ? 
"IB1:" : "IB2:", - DUMP_PREFIX_OFFSET, 32, 4, hoststart, - sizedwords*4, 0); - } -#endif - - ret = ret && cur_ret; - } - - if (!ret) - pr_info("kgsl: cffdump: parsing failed: gpuaddr:0x%08x, " - "host:0x%p, wc:%d\n", gpuaddr, hoststart, sizedwords); - - if (!check_only) { -#ifdef DEBUG - uint offset = gpuaddr - memdesc->gpuaddr; - pr_info("kgsl: cffdump: ib-dump: hostptr:%p, gpuaddr:%08x, " - "physaddr:%08x, offset:%d, size:%d", hoststart, - gpuaddr, memdesc->physaddr + offset, offset, - sizedwords*4); -#endif - kgsl_cffdump_syncmem(dev_priv, memdesc, gpuaddr, sizedwords*4, - false); - } - - level--; - - return ret; -} - static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, uint prev_padding) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index 2fb1e43f4..efec2af9e 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "kgsl.h" @@ -184,6 +185,7 @@ struct kgsl_device { struct wake_lock idle_wakelock; struct kgsl_pwrscale pwrscale; struct kobject pwrscale_kobj; + struct pm_qos_request_list pm_qos_req_dma; struct work_struct ts_expired_ws; struct list_head events; s64 on_time; @@ -197,13 +199,18 @@ struct kgsl_context { /* Pointer to the device specific context information */ void *devctxt; + /* + * Status indicating whether a gpu reset occurred and whether this + * context was responsible for causing it + */ + unsigned int reset_status; }; struct kgsl_process_private { unsigned int refcnt; pid_t pid; spinlock_t mem_lock; - struct list_head mem_list; + struct rb_root mem_rb; struct kgsl_pagetable *pagetable; struct list_head list; struct kobject kobj; diff --git a/drivers/gpu/msm/kgsl_drm.c b/drivers/gpu/msm/kgsl_drm.c index dba2dfcfb..ba48f9c75 100644 --- a/drivers/gpu/msm/kgsl_drm.c +++ b/drivers/gpu/msm/kgsl_drm.c @@ -295,8 +295,9 @@ kgsl_gem_alloc_memory(struct drm_gem_object *obj) priv->memdesc.size = obj->size * priv->bufcount; } else if (TYPE_IS_MEM(priv->type)) { - priv->memdesc.hostptr = - vmalloc_user(obj->size * priv->bufcount); + result = kgsl_sharedmem_page_alloc(&priv->memdesc, + priv->pagetable, + obj->size * priv->bufcount, 0); if (priv->memdesc.hostptr == NULL) { DRM_ERROR("Unable to allocate vmalloc memory\n"); @@ -1042,17 +1043,18 @@ int kgsl_gem_kmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct drm_kgsl_gem_object *priv; - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; mutex_lock(&dev->struct_mutex); priv = obj->driver_private; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) priv->memdesc.hostptr + offset; + i = offset >> PAGE_SHIFT; + page = sg_page(&(priv->memdesc.sg[i])); - page = vmalloc_to_page((void *) pg); if (!page) { mutex_unlock(&dev->struct_mutex); return VM_FAULT_SIGBUS; diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c old mode 100755 new mode 100644 index a16b95418..f038f0491 --- a/drivers/gpu/msm/kgsl_gpummu.c +++ b/drivers/gpu/msm/kgsl_gpummu.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -354,8 +354,8 @@ void *kgsl_gpummu_ptpool_init(int ptsize, int entries) int kgsl_gpummu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct kgsl_gpummu_pt *gpummu_pt = pt->priv; - return pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); + struct kgsl_gpummu_pt *gpummu_pt = pt ? pt->priv : NULL; + return gpummu_pt && pt_base && (gpummu_pt->base.gpuaddr == pt_base); } void kgsl_gpummu_destroy_pagetable(void *mmu_specific_pt) @@ -398,14 +398,14 @@ static unsigned int kgsl_gpummu_pt_get_flags(struct kgsl_pagetable *pt, enum kgsl_deviceid id) { unsigned int result = 0; - struct kgsl_gpummu_pt *gpummu_pt = (struct kgsl_gpummu_pt *) - pt->priv; + struct kgsl_gpummu_pt *gpummu_pt; if (pt == NULL) return 0; + gpummu_pt = pt->priv; spin_lock(&pt->lock); - if (gpummu_pt->tlb_flags && (1<tlb_flags & (1<tlb_flags &= ~(1<sg, s, memdesc->sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); unsigned int j; /* Each sg entry might be multiple pages long */ diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c index e4e561cef..5646d682a 100644 --- a/drivers/gpu/msm/kgsl_iommu.c +++ b/drivers/gpu/msm/kgsl_iommu.c @@ -34,8 +34,8 @@ struct kgsl_iommu { static int kgsl_iommu_pt_equal(struct kgsl_pagetable *pt, unsigned int pt_base) { - struct iommu_domain *domain = pt->priv; - return pt && pt_base && ((unsigned int)domain == pt_base); + struct iommu_domain *domain = pt ? pt->priv : NULL; + return domain && pt_base && ((unsigned int)domain == pt_base); } static void kgsl_iommu_destroy_pagetable(void *mmu_specific_pt) @@ -262,7 +262,7 @@ kgsl_iommu_map(void *mmu_specific_pt, iommu_virt_addr = memdesc->gpuaddr; ret = iommu_map_range(domain, iommu_virt_addr, memdesc->sg, - memdesc->size, 0); + memdesc->size, (IOMMU_READ | IOMMU_WRITE)); if (ret) { KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) " "failed with err: %d\n", domain, diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c old mode 100755 new mode 100644 index 003afb953..429e4946d --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "kgsl.h" #include "kgsl_pwrscale.h" @@ -25,6 +24,7 @@ #define KGSL_PWRFLAGS_AXI_ON 2 #define KGSL_PWRFLAGS_IRQ_ON 3 +#define GPU_SWFI_LATENCY 3 #define UPDATE_BUSY_VAL 1000000 #define UPDATE_BUSY 50 @@ -284,10 +284,7 @@ static int kgsl_pwrctrl_gpubusy_show(struct device *dev, DEVICE_ATTR(gpuclk, 0644, kgsl_pwrctrl_gpuclk_show, kgsl_pwrctrl_gpuclk_store); DEVICE_ATTR(max_gpuclk, 0644, kgsl_pwrctrl_max_gpuclk_show, kgsl_pwrctrl_max_gpuclk_store); -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ DEVICE_ATTR(pwrnap, 0664, kgsl_pwrctrl_pwrnap_show, kgsl_pwrctrl_pwrnap_store); -/* DTS2011123005723 hanfeng 20111230 end >*/ DEVICE_ATTR(idle_timer, 0644, kgsl_pwrctrl_idle_timer_show, kgsl_pwrctrl_idle_timer_store); DEVICE_ATTR(gpubusy, 0644, kgsl_pwrctrl_gpubusy_show, @@ -337,7 +334,8 @@ static void kgsl_pwrctrl_busy_time(struct kgsl_device *device, bool on_time) do_gettimeofday(&(b->start)); } -void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) +void kgsl_pwrctrl_clk(struct kgsl_device *device, int state, + int requested_state) { struct kgsl_pwrctrl *pwr = &device->pwrctrl; int i = 0; @@ -349,7 +347,7 @@ void kgsl_pwrctrl_clk(struct kgsl_device *device, int state) if 
(pwr->grp_clks[i]) clk_disable(pwr->grp_clks[i]); if ((pwr->pwrlevels[0].gpu_freq > 0) && - (device->requested_state != KGSL_STATE_NAP)) + (requested_state != KGSL_STATE_NAP)) clk_set_rate(pwr->grp_clks[0], pwr->pwrlevels[pwr->num_pwrlevels - 1]. gpu_freq); @@ -424,8 +422,12 @@ void kgsl_pwrctrl_pwrrail(struct kgsl_device *device, int state) if (!test_and_set_bit(KGSL_PWRFLAGS_POWER_ON, &pwr->power_flags)) { trace_kgsl_rail(device, state); - if (pwr->gpu_reg) - regulator_enable(pwr->gpu_reg); + if (pwr->gpu_reg) { + int status = regulator_enable(pwr->gpu_reg); + if (status) + KGSL_DRV_ERR(device, "regulator_enable " + "failed: %d\n", status); + } } } } @@ -512,10 +514,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->nap_allowed = pdata->nap_allowed; pwr->idle_needed = pdata->idle_needed; pwr->interval_timeout = pdata->idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - /*merge qc patch to fix kgsl issue.*/ pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) pwr->ebi1_clk = NULL; @@ -638,10 +637,8 @@ void kgsl_timer(unsigned long data) KGSL_PWR_INFO(device, "idle timer expired device %d\n", device->id); if (device->requested_state != KGSL_STATE_SUSPEND) { - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (device->pwrctrl.restore_slumber || device->pwrctrl.strtstp_sleepwake) - /* DTS2012041906630 zhangxiangdang 20120423 end > */ kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER); else kgsl_pwrctrl_request_state(device, KGSL_STATE_SLEEP); @@ -708,10 +705,8 @@ _nap(struct kgsl_device *device) return -EBUSY; } kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ - kgsl_pwrctrl_set_state(device, device->requested_state); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP); + kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); case KGSL_STATE_NAP: @@ -753,10 +748,11 @@ _sleep(struct kgsl_device *device) pwr->pwrlevels[pwr->num_pwrlevels - 1]. 
gpu_freq); _sleep_accounting(device); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP); - if (device->idle_wakelock.name) - wake_unlock(&device->idle_wakelock); + wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLEEP: case KGSL_STATE_SLUMBER: @@ -783,18 +779,18 @@ _slumber(struct kgsl_device *device) case KGSL_STATE_NAP: case KGSL_STATE_SLEEP: del_timer_sync(&device->idle_timer); - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ if (!device->pwrctrl.strtstp_sleepwake) kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_NOMINAL); - /* DTS2012041906630 zhangxiangdang 20120423 end > */ + device->pwrctrl.restore_slumber = true; device->ftbl->suspend_context(device); device->ftbl->stop(device); - device->pwrctrl.restore_slumber = true; _sleep_accounting(device); kgsl_pwrctrl_set_state(device, KGSL_STATE_SLUMBER); if (device->idle_wakelock.name) wake_unlock(&device->idle_wakelock); + pm_qos_update_request(&device->pm_qos_req_dma, + PM_QOS_DEFAULT_VALUE); break; case KGSL_STATE_SLUMBER: break; @@ -856,16 +852,17 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device) /* fall through */ case KGSL_STATE_NAP: /* Turn on the core clocks */ - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); /* Enable state before turning on irq */ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE); kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON); /* Re-enable HW access */ mod_timer(&device->idle_timer, jiffies + device->pwrctrl.interval_timeout); - - if (device->idle_wakelock.name) - wake_lock(&device->idle_wakelock); + wake_lock(&device->idle_wakelock); + if (device->pwrctrl.restore_slumber == false) + pm_qos_update_request(&device->pm_qos_req_dma, + GPU_SWFI_LATENCY); case KGSL_STATE_ACTIVE: break; default: @@ -881,7 +878,7 @@ void kgsl_pwrctrl_enable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_ON); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_ON, KGSL_STATE_ACTIVE); kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_ON); } EXPORT_SYMBOL(kgsl_pwrctrl_enable); @@ -890,7 +887,7 @@ void kgsl_pwrctrl_disable(struct kgsl_device *device) { /* Order pwrrail/clk sequence based upon platform */ kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF); - kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF); + kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP); kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF); } EXPORT_SYMBOL(kgsl_pwrctrl_disable); diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h old mode 100755 new mode 100644 index 0c7ec6003..caaed92c8 --- a/drivers/gpu/msm/kgsl_pwrctrl.h +++ b/drivers/gpu/msm/kgsl_pwrctrl.h @@ -47,9 +47,7 @@ struct kgsl_pwrctrl { int thermal_pwrlevel; unsigned int num_pwrlevels; unsigned int interval_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ struct regulator *gpu_reg; uint32_t pcl; unsigned int nap_allowed; diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c old mode 100755 new mode 100644 index c2252edcf..d0b2a412c --- a/drivers/gpu/msm/kgsl_pwrscale.c +++ b/drivers/gpu/msm/kgsl_pwrscale.c @@ -89,10 +89,8 @@ static ssize_t pwrscale_policy_show(struct kgsl_device *device, char *buf) 
return ret; } -/*< DTS2011123005723 hanfeng 20111230 begin*/ -/*modify the file permission */ + PWRSCALE_ATTR(policy, 0664, pwrscale_policy_show, pwrscale_policy_store); -/*DTS2011123005723 hanfeng 20111230 end >*/ static ssize_t pwrscale_avail_policies_show(struct kgsl_device *device, char *buf) diff --git a/drivers/gpu/msm/kgsl_pwrscale_idlestats.c b/drivers/gpu/msm/kgsl_pwrscale_idlestats.c old mode 100755 new mode 100644 diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c old mode 100755 new mode 100644 index 389ed6d4f..ae32e81ff --- a/drivers/gpu/msm/kgsl_sharedmem.c +++ b/drivers/gpu/msm/kgsl_sharedmem.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "kgsl.h" #include "kgsl_sharedmem.h" @@ -201,6 +203,10 @@ static int kgsl_drv_memstat_show(struct device *dev, val = kgsl_driver.stats.vmalloc; else if (!strncmp(attr->attr.name, "vmalloc_max", 11)) val = kgsl_driver.stats.vmalloc_max; + else if (!strncmp(attr->attr.name, "page_alloc", 10)) + val = kgsl_driver.stats.page_alloc; + else if (!strncmp(attr->attr.name, "page_alloc_max", 14)) + val = kgsl_driver.stats.page_alloc_max; else if (!strncmp(attr->attr.name, "coherent", 8)) val = kgsl_driver.stats.coherent; else if (!strncmp(attr->attr.name, "coherent_max", 12)) @@ -230,6 +236,8 @@ static int kgsl_drv_histogram_show(struct device *dev, DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL); +DEVICE_ATTR(page_alloc_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL); DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL); @@ -239,6 +247,8 @@ DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL); static const struct device_attribute *drv_attr_list[] = { &dev_attr_vmalloc, &dev_attr_vmalloc_max, + &dev_attr_page_alloc, + &dev_attr_page_alloc_max, &dev_attr_coherent, &dev_attr_coherent_max, &dev_attr_mapped, @@ -282,7 +292,7 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) int i; for_each_sg(sg, s, sglen, i) { - unsigned int paddr = sg_phys(s); + unsigned int paddr = kgsl_get_sg_pa(s); _outer_cache_range_op(op, paddr, s->length); } } @@ -293,17 +303,18 @@ static void outer_cache_range_op_sg(struct scatterlist *sg, int sglen, int op) } #endif -static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, +static int kgsl_page_alloc_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long offset, pg; + unsigned long offset; struct page *page; + int i; offset = (unsigned long) vmf->virtual_address - vma->vm_start; - pg = (unsigned long) memdesc->hostptr + offset; - page = vmalloc_to_page((void *) pg); + i = offset >> PAGE_SHIFT; + page = sg_page(&memdesc->sg[i]); if (page == NULL) return VM_FAULT_SIGBUS; @@ -313,15 +324,23 @@ static int kgsl_vmalloc_vmfault(struct kgsl_memdesc *memdesc, return 0; } -static int kgsl_vmalloc_vmflags(struct kgsl_memdesc *memdesc) +static int kgsl_page_alloc_vmflags(struct kgsl_memdesc *memdesc) { return VM_RESERVED | VM_DONTEXPAND; } -static void kgsl_vmalloc_free(struct kgsl_memdesc *memdesc) +static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc) { - kgsl_driver.stats.vmalloc -= memdesc->size; - vfree(memdesc->hostptr); + int i = 0; + struct scatterlist *sg; + kgsl_driver.stats.page_alloc -= memdesc->size; + if (memdesc->hostptr) { 
+ vunmap(memdesc->hostptr); + kgsl_driver.stats.vmalloc -= memdesc->size; + } + if (memdesc->sg) + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + __free_page(sg_page(sg)); } static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) @@ -329,6 +348,42 @@ static int kgsl_contiguous_vmflags(struct kgsl_memdesc *memdesc) return VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; } +/* + * kgsl_page_alloc_map_kernel - Map the memory in memdesc to kernel address + * space + * + * @memdesc - The memory descriptor which contains information about the memory + * + * Return: 0 on success else error code + */ +static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc) +{ + if (!memdesc->hostptr) { + pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL); + struct page **pages = NULL; + struct scatterlist *sg; + int i; + /* create a list of pages to call vmap */ + pages = vmalloc(memdesc->sglen * sizeof(struct page *)); + if (!pages) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + memdesc->sglen * sizeof(struct page *)); + return -ENOMEM; + } + for_each_sg(memdesc->sg, sg, memdesc->sglen, i) + pages[i] = sg_page(sg); + memdesc->hostptr = vmap(pages, memdesc->sglen, + VM_IOREMAP, page_prot); + KGSL_STATS_ADD(memdesc->size, kgsl_driver.stats.vmalloc, + kgsl_driver.stats.vmalloc_max); + vfree(pages); + } + if (!memdesc->hostptr) + return -ENOMEM; + + return 0; +} + static int kgsl_contiguous_vmfault(struct kgsl_memdesc *memdesc, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -368,12 +423,13 @@ static void kgsl_coherent_free(struct kgsl_memdesc *memdesc) } /* Global - also used by kgsl_drm.c */ -struct kgsl_memdesc_ops kgsl_vmalloc_ops = { - .free = kgsl_vmalloc_free, - .vmflags = kgsl_vmalloc_vmflags, - .vmfault = kgsl_vmalloc_vmfault, +struct kgsl_memdesc_ops kgsl_page_alloc_ops = { + .free = kgsl_page_alloc_free, + .vmflags = kgsl_page_alloc_vmflags, + .vmfault = kgsl_page_alloc_vmfault, + .map_kernel_mem = kgsl_page_alloc_map_kernel, }; -EXPORT_SYMBOL(kgsl_vmalloc_ops); +EXPORT_SYMBOL(kgsl_page_alloc_ops); static struct kgsl_memdesc_ops kgsl_ebimem_ops = { .free = kgsl_ebimem_free, @@ -407,9 +463,9 @@ void kgsl_cache_range_op(struct kgsl_memdesc *memdesc, int op) EXPORT_SYMBOL(kgsl_cache_range_op); static int -_kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +_kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, - void *ptr, size_t size, unsigned int protflags) + size_t size, unsigned int protflags) { int order, ret = 0; int sglen = PAGE_ALIGN(size) / PAGE_SIZE; @@ -418,36 +474,43 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, memdesc->size = size; memdesc->pagetable = pagetable; memdesc->priv = KGSL_MEMFLAGS_CACHED; - memdesc->ops = &kgsl_vmalloc_ops; - memdesc->hostptr = (void *) ptr; + memdesc->ops = &kgsl_page_alloc_ops; + + memdesc->sg = kgsl_sg_alloc(sglen); - memdesc->sg = vmalloc(sglen * sizeof(struct scatterlist)); if (memdesc->sg == NULL) { + KGSL_CORE_ERR("vmalloc(%d) failed\n", + sglen * sizeof(struct scatterlist)); ret = -ENOMEM; goto done; } + kmemleak_not_leak(memdesc->sg); + memdesc->sglen = sglen; sg_init_table(memdesc->sg, sglen); - for (i = 0; i < memdesc->sglen; i++, ptr += PAGE_SIZE) { - struct page *page = vmalloc_to_page(ptr); + for (i = 0; i < memdesc->sglen; i++) { + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO | + __GFP_HIGHMEM); if (!page) { - ret = -EINVAL; + ret = -ENOMEM; + memdesc->sglen = i; goto done; } + flush_dcache_page(page); sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0); } - - 
kgsl_cache_range_op(memdesc, KGSL_CACHE_OP_INV); + outer_cache_range_op_sg(memdesc->sg, memdesc->sglen, + KGSL_CACHE_OP_FLUSH); ret = kgsl_mmu_map(pagetable, memdesc, protflags); if (ret) goto done; - KGSL_STATS_ADD(size, kgsl_driver.stats.vmalloc, - kgsl_driver.stats.vmalloc_max); + KGSL_STATS_ADD(size, kgsl_driver.stats.page_alloc, + kgsl_driver.stats.page_alloc_max); order = get_order(size); @@ -462,51 +525,41 @@ _kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, } int -kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size) { - void *ptr; - + int ret = 0; BUG_ON(size == 0); size = ALIGN(size, PAGE_SIZE * 2); - ptr = vmalloc(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc(%d) failed\n", size); - return -ENOMEM; - } - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + ret = _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, GSL_PT_PAGE_RV | GSL_PT_PAGE_WV); + if (!ret) + ret = kgsl_page_alloc_map_kernel(memdesc); + if (ret) + kgsl_sharedmem_free(memdesc); + return ret; } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc); int -kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags) { - void *ptr; unsigned int protflags; BUG_ON(size == 0); - ptr = vmalloc_user(size); - - if (ptr == NULL) { - KGSL_CORE_ERR("vmalloc_user(%d) failed: allocated=%d\n", - size, kgsl_driver.stats.vmalloc); - return -ENOMEM; - } protflags = GSL_PT_PAGE_RV; if (!(flags & KGSL_MEMFLAGS_GPUREADONLY)) protflags |= GSL_PT_PAGE_WV; - return _kgsl_sharedmem_vmalloc(memdesc, pagetable, ptr, size, + return _kgsl_sharedmem_page_alloc(memdesc, pagetable, size, protflags); } -EXPORT_SYMBOL(kgsl_sharedmem_vmalloc_user); +EXPORT_SYMBOL(kgsl_sharedmem_page_alloc_user); int kgsl_sharedmem_alloc_coherent(struct kgsl_memdesc *memdesc, size_t size) @@ -554,7 +607,7 @@ void kgsl_sharedmem_free(struct kgsl_memdesc *memdesc) if (memdesc->ops && memdesc->ops->free) memdesc->ops->free(memdesc); - vfree(memdesc->sg); + kgsl_sg_free(memdesc->sg, memdesc->sglen); memset(memdesc, 0, sizeof(*memdesc)); } @@ -686,3 +739,33 @@ kgsl_sharedmem_set(const struct kgsl_memdesc *memdesc, unsigned int offsetbytes, return 0; } EXPORT_SYMBOL(kgsl_sharedmem_set); + +/* + * kgsl_sharedmem_map_vma - Map a user vma to physical memory + * + * @vma - The user vma to map + * @memdesc - The memory descriptor which contains information about the + * physical memory + * + * Return: 0 on success else error code + */ +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc) +{ + unsigned long addr = vma->vm_start; + unsigned long size = vma->vm_end - vma->vm_start; + int ret, i = 0; + + if (!memdesc->sg || (size != memdesc->size) || + (memdesc->sglen != (size / PAGE_SIZE))) + return -EINVAL; + + for (; addr < vma->vm_end; addr += PAGE_SIZE, i++) { + ret = vm_insert_page(vma, addr, sg_page(&memdesc->sg[i])); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(kgsl_sharedmem_map_vma); diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h old mode 100755 new mode 100644 index 67a1c2d7b..a67d9c657 --- a/drivers/gpu/msm/kgsl_sharedmem.h +++ b/drivers/gpu/msm/kgsl_sharedmem.h @@ -17,6 +17,8 @@ #include #include #include "kgsl_mmu.h" +#include +#include struct kgsl_device; struct kgsl_process_private; @@ -28,19 +30,12 @@ struct 
kgsl_process_private; /** Set if the memdesc describes cached memory */ #define KGSL_MEMFLAGS_CACHED 0x00000001 -struct kgsl_memdesc_ops { - int (*vmflags)(struct kgsl_memdesc *); - int (*vmfault)(struct kgsl_memdesc *, struct vm_area_struct *, - struct vm_fault *); - void (*free)(struct kgsl_memdesc *memdesc); -}; - -extern struct kgsl_memdesc_ops kgsl_vmalloc_ops; +extern struct kgsl_memdesc_ops kgsl_page_alloc_ops; -int kgsl_sharedmem_vmalloc(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size); -int kgsl_sharedmem_vmalloc_user(struct kgsl_memdesc *memdesc, +int kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc, struct kgsl_pagetable *pagetable, size_t size, int flags); @@ -76,19 +71,58 @@ void kgsl_process_uninit_sysfs(struct kgsl_process_private *private); int kgsl_sharedmem_init_sysfs(void); void kgsl_sharedmem_uninit_sysfs(void); +static inline unsigned int kgsl_get_sg_pa(struct scatterlist *sg) +{ + /* + * Try sg_dma_address first to support ion carveout + * regions which do not work with sg_phys(). + */ + unsigned int pa = sg_dma_address(sg); + if (pa == 0) + pa = sg_phys(sg); + return pa; +} + +int +kgsl_sharedmem_map_vma(struct vm_area_struct *vma, + const struct kgsl_memdesc *memdesc); + +/* + * For relatively small sglists, it is preferable to use kzalloc + * rather than going down the vmalloc rat hole. If the size of + * the sglist is < PAGE_SIZE use kzalloc otherwise fallback to + * vmalloc + */ + +static inline void *kgsl_sg_alloc(unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + return kzalloc(sglen * sizeof(struct scatterlist), GFP_KERNEL); + else + return vmalloc(sglen * sizeof(struct scatterlist)); +} + +static inline void kgsl_sg_free(void *ptr, unsigned int sglen) +{ + if ((sglen * sizeof(struct scatterlist)) < PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); +} + static inline int memdesc_sg_phys(struct kgsl_memdesc *memdesc, unsigned int physaddr, unsigned int size) { - struct page *page = phys_to_page(physaddr); + memdesc->sg = kgsl_sg_alloc(1); - memdesc->sg = vmalloc(sizeof(struct scatterlist) * 1); - if (memdesc->sg == NULL) - return -ENOMEM; + kmemleak_not_leak(memdesc->sg); memdesc->sglen = 1; sg_init_table(memdesc->sg, 1); - sg_set_page(&memdesc->sg[0], page, size, 0); + memdesc->sg[0].length = size; + memdesc->sg[0].offset = 0; + memdesc->sg[0].dma_address = physaddr; return 0; } @@ -98,7 +132,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc, { if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem(memdesc, pagetable, size); - return kgsl_sharedmem_vmalloc(memdesc, pagetable, size); + return kgsl_sharedmem_page_alloc(memdesc, pagetable, size); } static inline int @@ -109,7 +143,7 @@ kgsl_allocate_user(struct kgsl_memdesc *memdesc, if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_NONE) return kgsl_sharedmem_ebimem_user(memdesc, pagetable, size, flags); - return kgsl_sharedmem_vmalloc_user(memdesc, pagetable, size, flags); + return kgsl_sharedmem_page_alloc_user(memdesc, pagetable, size, flags); } static inline int diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c index 72df148bc..394bc83bd 100644 --- a/drivers/gpu/msm/kgsl_snapshot.c +++ b/drivers/gpu/msm/kgsl_snapshot.c @@ -10,7 +10,6 @@ * GNU General Public License for more details. 
*/ -#include #include #include #include @@ -283,6 +282,12 @@ int kgsl_device_snapshot(struct kgsl_device *device, int hang) /* Freeze the snapshot on a hang until it gets read */ device->snapshot_frozen = (hang) ? 1 : 0; + /* log buffer info to aid in ramdump recovery */ + KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n", + device->snapshot, __pa(device->snapshot), + device->snapshot_size); + if (hang) + sysfs_notify(&device->snapshot_kobj, NULL, "timestamp"); return 0; } EXPORT_SYMBOL(kgsl_device_snapshot); @@ -432,7 +437,7 @@ int kgsl_device_snapshot_init(struct kgsl_device *device) int ret; if (device->snapshot == NULL) - device->snapshot = vmalloc(KGSL_SNAPSHOT_MEMSIZE); + device->snapshot = kzalloc(KGSL_SNAPSHOT_MEMSIZE, GFP_KERNEL); if (device->snapshot == NULL) return -ENOMEM; @@ -475,7 +480,7 @@ void kgsl_device_snapshot_close(struct kgsl_device *device) kobject_put(&device->snapshot_kobj); - vfree(device->snapshot); + kfree(device->snapshot); device->snapshot = NULL; device->snapshot_maxsize = 0; diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c old mode 100755 new mode 100644 index cb3da9075..d721a577a --- a/drivers/gpu/msm/z180.c +++ b/drivers/gpu/msm/z180.c @@ -157,13 +157,6 @@ static struct z180_device device_2d0 = { .active_cnt = 0, .iomemname = KGSL_2D0_REG_MEMORY, .ftbl = &z180_functable, -#ifdef CONFIG_HAS_EARLYSUSPEND - .display_off = { - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, - }, -#endif }, }; @@ -195,13 +188,6 @@ static struct z180_device device_2d1 = { .active_cnt = 0, .iomemname = KGSL_2D1_REG_MEMORY, .ftbl = &z180_functable, - .display_off = { -#ifdef CONFIG_HAS_EARLYSUSPEND - .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, - .suspend = kgsl_early_suspend_driver, - .resume = kgsl_late_resume_driver, -#endif - }, }, }; @@ -407,7 +393,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, unsigned int index = 0; unsigned int nextindex; unsigned int nextcnt = Z180_STREAM_END_CMD | 5; - struct kgsl_memdesc tmp = {0}; + struct kgsl_mem_entry *entry = NULL; unsigned int cmd; struct kgsl_device *device = dev_priv->device; struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable; @@ -425,8 +411,30 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, } cmd = ibdesc[0].gpuaddr; sizedwords = ibdesc[0].sizedwords; - - tmp.hostptr = (void *)*timestamp; + /* + * Get a kernel mapping to the IB for monkey patching. + * See the end of this function. + */ + entry = kgsl_sharedmem_find_region(dev_priv->process_priv, cmd, + sizedwords); + if (entry == NULL) { + KGSL_DRV_ERR(device, "Bad ibdesc: gpuaddr 0x%x size %d\n", + cmd, sizedwords); + result = -EINVAL; + goto error; + } + /* + * This will only map memory if it exists, otherwise it will reuse the + * mapping. And the 2d userspace reuses IBs so we likely won't create + * too many mappings. 
+ */ + if (kgsl_gpuaddr_to_vaddr(&entry->memdesc, cmd) == NULL) { + KGSL_DRV_ERR(device, + "Cannot make kernel mapping for gpuaddr 0x%x\n", + cmd); + result = -EINVAL; + goto error; + } KGSL_CMD_INFO(device, "ctxt %d ibaddr 0x%08x sizedwords %d\n", context->id, cmd, sizedwords); @@ -468,12 +476,13 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv, nextaddr = z180_dev->ringbuffer.cmdbufdesc.gpuaddr + rb_offset(nextindex); - tmp.hostptr = (void *)(tmp.hostptr + - (sizedwords * sizeof(unsigned int))); - tmp.size = 12; - - kgsl_sharedmem_writel(&tmp, 4, nextaddr); - kgsl_sharedmem_writel(&tmp, 8, nextcnt); + /* monkey patch the IB so that it jumps back to the ringbuffer */ + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 1) * sizeof(unsigned int)), + nextaddr); + kgsl_sharedmem_writel(&entry->memdesc, + ((sizedwords + 2) * sizeof(unsigned int)), + nextcnt); /* sync memory before activating the hardware for the new command*/ mb(); diff --git a/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c b/drivers/input/touchscreen/atmel_i2c_rmi_QT602240.c old mode 100644 new mode 100755 diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h old mode 100755 new mode 100644 index a1d267893..7837bad21 --- a/include/linux/msm_kgsl.h +++ b/include/linux/msm_kgsl.h @@ -34,6 +34,16 @@ #define KGSL_CLK_MEM_IFACE 0x00000010 #define KGSL_CLK_AXI 0x00000020 +/* + * Reset status values for context + */ +enum kgsl_ctx_reset_stat { + KGSL_CTX_STAT_NO_ERROR = 0x00000000, + KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT = 0x00000001, + KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT = 0x00000002, + KGSL_CTX_STAT_UNKNOWN_CONTEXT_RESET_EXT = 0x00000003 +}; + #define KGSL_MAX_PWRLEVELS 5 #define KGSL_CONVERT_TO_MBPS(val) \ @@ -110,6 +120,7 @@ enum kgsl_property_type { KGSL_PROP_MMU_ENABLE = 0x00000006, KGSL_PROP_INTERRUPT_WAITS = 0x00000007, KGSL_PROP_VERSION = 0x00000008, + KGSL_PROP_GPU_RESET_STAT = 0x00000009 }; struct kgsl_shadowprop { @@ -146,9 +157,7 @@ struct kgsl_device_platform_data { int num_levels; int (*set_grp_async)(void); unsigned int idle_timeout; - /*< DTS2012041906630 zhangxiangdang 20120423 begin */ bool strtstp_sleepwake; - /* DTS2012041906630 zhangxiangdang 20120423 end > */ unsigned int nap_allowed; unsigned int clk_map; unsigned int idle_needed; From 5b225c80411d97112391f4dc9908289f0f68d4a5 Mon Sep 17 00:00:00 2001 From: forumber Date: Tue, 5 Feb 2013 17:55:43 +0200 Subject: [PATCH 19/19] Revert "Add BFQv5" This reverts commit 89abf031a76bb8afe836b088a1ebe50fd846ec79. 
--- arch/arm/configs/u8800_defconfig | 6 +- block/Kconfig.iosched | 26 - block/Makefile | 1 - block/bfq-cgroup.c | 831 -------- block/bfq-ioc.c | 380 ---- block/bfq-iosched.c | 3047 ------------------------------ block/bfq-sched.c | 1066 ----------- block/bfq.h | 595 ------ block/blk-ioc.c | 29 +- block/cfq-iosched.c | 10 +- fs/ioprio.c | 7 +- include/linux/cgroup_subsys.h | 6 - include/linux/iocontext.h | 18 +- 13 files changed, 22 insertions(+), 6000 deletions(-) delete mode 100644 block/bfq-cgroup.c delete mode 100644 block/bfq-ioc.c delete mode 100644 block/bfq-iosched.c delete mode 100644 block/bfq-sched.c delete mode 100644 block/bfq.h diff --git a/arch/arm/configs/u8800_defconfig b/arch/arm/configs/u8800_defconfig index fbf77b32f..e252fda97 100644 --- a/arch/arm/configs/u8800_defconfig +++ b/arch/arm/configs/u8800_defconfig @@ -172,13 +172,11 @@ CONFIG_LBDAF=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y -# CONFIG_DEFAULT_NOOP is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_CFQ is not set -CONFIG_IOSCHED_BFQ=y -CONFIG_DEFAULT_BFQ=y -CONFIG_DEFAULT_IOSCHED="bfq" +CONFIG_DEFAULT_NOOP=y +CONFIG_DEFAULT_IOSCHED="noop" # CONFIG_INLINE_SPIN_TRYLOCK is not set # CONFIG_INLINE_SPIN_TRYLOCK_BH is not set # CONFIG_INLINE_SPIN_LOCK is not set diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index bceaaecef..3199b76f7 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -43,28 +43,6 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. -config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - depends on EXPERIMENTAL - default n - ---help--- - The BFQ I/O scheduler tries to distribute bandwidth among - all processes according to their weights. - It aims at distributing the bandwidth as desired, independently of - the disk parameters and with any workload. It also tries to - guarantee low latency to interactive and soft real-time - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. - -config CGROUP_BFQIO - bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y - default n - ---help--- - Enable hierarchical scheduling in BFQ, using the cgroups - filesystem interface. The name of the subsystem will be - bfqio. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -75,9 +53,6 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y - config DEFAULT_BFQ - bool "BFQ" if IOSCHED_BFQ=y - config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y @@ -90,7 +65,6 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ - default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff --git a/block/Makefile b/block/Makefile index a3cf79cc0..0fec4b3fa 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,7 +13,6 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c deleted file mode 100644 index 74ae73b91..000000000 --- a/block/bfq-cgroup.c +++ /dev/null @@ -1,831 +0,0 @@ -/* - * BFQ: CGROUPS support. 
- * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. - */ - -#ifdef CONFIG_CGROUP_BFQIO -static struct bfqio_cgroup bfqio_root_cgroup = { - .weight = BFQ_DEFAULT_GRP_WEIGHT, - .ioprio = BFQ_DEFAULT_GRP_IOPRIO, - .ioprio_class = BFQ_DEFAULT_GRP_CLASS, -}; - -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -} - -static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), - struct bfqio_cgroup, css); -} - -/* - * Search the bfq_group for bfqd into the hash table (by now only a list) - * of bgrp. Must be called under rcu_read_lock(). - */ -static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, - struct bfq_data *bfqd) -{ - struct bfq_group *bfqg; - struct hlist_node *n; - void *key; - - hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { - key = rcu_dereference(bfqg->bfqd); - if (key == bfqd) - return bfqg; - } - - return NULL; -} - -static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, - struct bfq_group *bfqg) -{ - struct bfq_entity *entity = &bfqg->entity; - - entity->weight = entity->new_weight = bgrp->weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio = bgrp->ioprio; - entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; - entity->ioprio_changed = 1; - entity->my_sched_data = &bfqg->sched_data; -} - -static inline void bfq_group_set_parent(struct bfq_group *bfqg, - struct bfq_group *parent) -{ - struct bfq_entity *entity; - - BUG_ON(parent == NULL); - BUG_ON(bfqg == NULL); - - entity = &bfqg->entity; - entity->parent = parent->my_entity; - entity->sched_data = &parent->sched_data; -} - -/** - * bfq_group_chain_alloc - allocate a chain of groups. - * @bfqd: queue descriptor. - * @cgroup: the leaf cgroup this chain starts from. - * - * Allocate a chain of groups starting from the one belonging to - * @cgroup up to the root cgroup. Stop if a cgroup on the chain - * to the root has already an allocated group on @bfqd. - */ -static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; - - for (; cgroup != NULL; cgroup = cgroup->parent) { - bgrp = cgroup_to_bfqio(cgroup); - - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) { - /* - * All the cgroups in the path from there to the - * root must have a bfq_group for bfqd, so we don't - * need any more allocations. - */ - break; - } - - bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); - if (bfqg == NULL) - goto cleanup; - - bfq_group_init_entity(bgrp, bfqg); - bfqg->my_entity = &bfqg->entity; - - if (leaf == NULL) { - leaf = bfqg; - prev = leaf; - } else { - bfq_group_set_parent(prev, bfqg); - /* - * Build a list of allocated nodes using the bfqd - * filed, that is still unused and will be initialized - * only after the node will be connected. 
- */ - prev->bfqd = bfqg; - prev = bfqg; - } - } - - return leaf; - -cleanup: - while (leaf != NULL) { - prev = leaf; - leaf = leaf->bfqd; - kfree(prev); - } - - return NULL; -} - -/** - * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. - * @bfqd: the queue descriptor. - * @cgroup: the leaf cgroup to start from. - * @leaf: the leaf group (to be associated to @cgroup). - * - * Try to link a chain of groups to a cgroup hierarchy, connecting the - * nodes bottom-up, so we can be sure that when we find a cgroup in the - * hierarchy that already as a group associated to @bfqd all the nodes - * in the path to the root cgroup have one too. - * - * On locking: the queue lock protects the hierarchy (there is a hierarchy - * per device) while the bfqio_cgroup lock protects the list of groups - * belonging to the same cgroup. - */ -static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, - struct bfq_group *leaf) -{ - struct bfqio_cgroup *bgrp; - struct bfq_group *bfqg, *next, *prev = NULL; - unsigned long flags; - - assert_spin_locked(bfqd->queue->queue_lock); - - for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { - bgrp = cgroup_to_bfqio(cgroup); - next = leaf->bfqd; - - bfqg = bfqio_lookup_group(bgrp, bfqd); - BUG_ON(bfqg != NULL); - - spin_lock_irqsave(&bgrp->lock, flags); - - rcu_assign_pointer(leaf->bfqd, bfqd); - hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); - hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); - - spin_unlock_irqrestore(&bgrp->lock, flags); - - prev = leaf; - leaf = next; - } - - BUG_ON(cgroup == NULL && leaf != NULL); - if (cgroup != NULL && prev != NULL) { - bgrp = cgroup_to_bfqio(cgroup); - bfqg = bfqio_lookup_group(bgrp, bfqd); - bfq_group_set_parent(prev, bfqg); - } -} - -/** - * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. - * @bfqd: queue descriptor. - * @cgroup: cgroup being searched for. - * - * Return a group associated to @bfqd in @cgroup, allocating one if - * necessary. When a group is returned all the cgroups in the path - * to the root have a group associated to @bfqd. - * - * If the allocation fails, return the root group: this breaks guarantees - * but is a safe fallbak. If this loss becames a problem it can be - * mitigated using the equivalent weight (given by the product of the - * weights of the groups in the path from @group to the root) in the - * root scheduler. - * - * We allocate all the missing nodes in the path from the leaf cgroup - * to the root and we connect the nodes only after all the allocations - * have been successful. - */ -static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); - struct bfq_group *bfqg; - - bfqg = bfqio_lookup_group(bgrp, bfqd); - if (bfqg != NULL) - return bfqg; - - bfqg = bfq_group_chain_alloc(bfqd, cgroup); - if (bfqg != NULL) - bfq_group_chain_link(bfqd, cgroup, bfqg); - else - bfqg = bfqd->root_group; - - return bfqg; -} - -/** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. - * @entity: @bfqq's entity. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). 
- */ -static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_entity *entity, struct bfq_group *bfqg) -{ - int busy, resume; - - busy = bfq_bfqq_busy(bfqq); - resume = !RB_EMPTY_ROOT(&bfqq->sort_list); - - BUG_ON(resume && !entity->on_st); - BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); - - if (busy) { - BUG_ON(atomic_read(&bfqq->ref) < 2); - - if (!resume) - bfq_del_bfqq_busy(bfqd, bfqq, 0); - else - bfq_deactivate_bfqq(bfqd, bfqq, 0); - } else if (entity->on_st) - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. - */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - - if (busy && resume) - bfq_activate_bfqq(bfqd, bfqq); -} - -/** - * __bfq_cic_change_cgroup - move @cic to @cgroup. - * @bfqd: the queue descriptor. - * @cic: the cic to move. - * @cgroup: the cgroup to move to. - * - * Move cic to cgroup, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. - */ -static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd, - struct cfq_io_context *cic, - struct cgroup *cgroup) -{ - struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0); - struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1); - struct bfq_entity *entity; - struct bfq_group *bfqg; - - bfqg = bfq_find_alloc_group(bfqd, cgroup); - if (async_bfqq != NULL) { - entity = &async_bfqq->entity; - - if (entity->sched_data != &bfqg->sched_data) { - cic_set_bfqq(cic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "cic_change_group: %p %d", - async_bfqq, atomic_read(&async_bfqq->ref)); - bfq_put_queue(async_bfqq); - } - } - - if (sync_bfqq != NULL) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); - } - - return bfqg; -} - -/** - * bfq_cic_change_cgroup - move @cic to @cgroup. - * @cic: the cic being migrated. - * @cgroup: the destination cgroup. - * - * When the task owning @cic is moved to @cgroup, @cic is immediately - * moved into its new parent group. - */ -static void bfq_cic_change_cgroup(struct cfq_io_context *cic, - struct cgroup *cgroup) -{ - struct bfq_data *bfqd; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (bfqd != NULL && - !strncmp(bfqd->queue->elevator->elevator_type->elevator_name, - "bfq", ELV_NAME_MAX)) { - __bfq_cic_change_cgroup(bfqd, cic, cgroup); - bfq_put_bfqd_unlock(bfqd, &flags); - } -} - -/** - * bfq_cic_update_cgroup - update the cgroup of @cic. - * @cic: the @cic to update. - * - * Make sure that @cic is enqueued in the cgroup of the current task. - * We need this in addition to moving cics during the cgroup attach - * phase because the task owning @cic could be at its first disk - * access or we may end up in the root cgroup as the result of a - * memory allocation failure and here we try to move to the right - * group. - * - * Must be called under the queue lock. It is safe to use the returned - * value even after the rcu_read_unlock() as the migration/destruction - * paths act under the queue lock too. 
IOW it is impossible to race with - * group migration/destruction and end up with an invalid group as: - * a) here cgroup has not yet been destroyed, nor its destroy callback - * has started execution, as current holds a reference to it, - * b) if it is destroyed after rcu_read_unlock() [after current is - * migrated to a different cgroup] its attach() callback will have - * taken care of remove all the references to the old cgroup data. - */ -static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - struct bfq_group *bfqg; - struct cgroup *cgroup; - - BUG_ON(bfqd == NULL); - - rcu_read_lock(); - cgroup = task_cgroup(current, bfqio_subsys_id); - bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup); - rcu_read_unlock(); - - return bfqg; -} - -/** - * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. - * @st: the service tree being flushed. - */ -static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) -{ - struct bfq_entity *entity = st->first_idle; - - for (; entity != NULL; entity = st->first_idle) - __bfq_deactivate_entity(entity, 0); -} - -/** - * bfq_reparent_leaf_entity - move leaf entity to the root_group. - * @bfqd: the device data structure with the root group. - * @entity: the entity to move. - */ -static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(bfqq == NULL); - bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); - return; -} - -/** - * bfq_reparent_active_entities - move to the root group all active entities. - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. - */ -static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, - struct bfq_service_tree *st) -{ - struct rb_root *active = &st->active; - struct bfq_entity *entity = NULL; - - if (!RB_EMPTY_ROOT(&st->active)) - entity = bfq_entity_of(rb_first(active)); - - for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) - bfq_reparent_leaf_entity(bfqd, entity); - - if (bfqg->sched_data.active_entity != NULL) - bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); - - return; -} - -/** - * bfq_destroy_group - destroy @bfqg. - * @bgrp: the bfqio_cgroup containing @bfqg. - * @bfqg: the group being destroyed. - * - * Destroy @bfqg, making sure that it is not referenced from its parent. - */ -static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) -{ - struct bfq_data *bfqd; - struct bfq_service_tree *st; - struct bfq_entity *entity = bfqg->my_entity; - unsigned long uninitialized_var(flags); - int i; - - hlist_del(&bfqg->group_node); - - /* - * Empty all service_trees belonging to this group before deactivating - * the group itself. - */ - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - st = bfqg->sched_data.service_tree + i; - - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. Noone else - * can access them so it's safe to act without any lock. - */ - bfq_flush_idle_tree(st); - - /* - * It may happen that some queues are still active - * (busy) upon group destruction (if the corresponding - * processes have been forced to terminate). 
We move - * all the leaf entities corresponding to these queues - * to the root_group. - * Also, it may happen that the group has an entity - * under service, which is disconnected from the active - * tree: it must be moved, too. - * There is no need to put the sync queues, as the - * scheduler has taken no reference. - */ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - bfq_reparent_active_entities(bfqd, bfqg, st); - bfq_put_bfqd_unlock(bfqd, &flags); - } - BUG_ON(!RB_EMPTY_ROOT(&st->active)); - BUG_ON(!RB_EMPTY_ROOT(&st->idle)); - } - BUG_ON(bfqg->sched_data.next_active != NULL); - BUG_ON(bfqg->sched_data.active_entity != NULL); - - /* - * We may race with device destruction, take extra care when - * dereferencing bfqg->bfqd. - */ - bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); - if (bfqd != NULL) { - hlist_del(&bfqg->bfqd_node); - __bfq_deactivate_entity(entity, 0); - bfq_put_async_queues(bfqd, bfqg); - bfq_put_bfqd_unlock(bfqd, &flags); - } - BUG_ON(entity->tree != NULL); - - /* - * No need to defer the kfree() to the end of the RCU grace - * period: we are called from the destroy() callback of our - * cgroup, so we can be sure that noone is a) still using - * this cgroup or b) doing lookups in it. - */ - kfree(bfqg); -} - -/** - * bfq_disconnect_groups - diconnect @bfqd from all its groups. - * @bfqd: the device descriptor being exited. - * - * When the device exits we just make sure that no lookup can return - * the now unused group structures. They will be deallocated on cgroup - * destruction. - */ -static void bfq_disconnect_groups(struct bfq_data *bfqd) -{ - struct hlist_node *pos, *n; - struct bfq_group *bfqg; - - bfq_log(bfqd, "disconnect_groups beginning") ; - hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { - hlist_del(&bfqg->bfqd_node); - - __bfq_deactivate_entity(bfqg->my_entity, 0); - - /* - * Don't remove from the group hash, just set an - * invalid key. No lookups can race with the - * assignment as bfqd is being destroyed; this - * implies also that new elements cannot be added - * to the list. - */ - rcu_assign_pointer(bfqg->bfqd, NULL); - - bfq_log(bfqd, "disconnect_groups: put async for group %p", - bfqg) ; - bfq_put_async_queues(bfqd, bfqg); - } -} - -static inline void bfq_free_root_group(struct bfq_data *bfqd) -{ - struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; - struct bfq_group *bfqg = bfqd->root_group; - - bfq_put_async_queues(bfqd, bfqg); - - spin_lock_irq(&bgrp->lock); - hlist_del_rcu(&bfqg->group_node); - spin_unlock_irq(&bgrp->lock); - - /* - * No need to synchronize_rcu() here: since the device is gone - * there cannot be any read-side access to its root_group. 
- */ - kfree(bfqg); -} - -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - struct bfqio_cgroup *bgrp; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (bfqg == NULL) - return NULL; - - bfqg->entity.parent = NULL; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - bgrp = &bfqio_root_cgroup; - spin_lock_irq(&bgrp->lock); - rcu_assign_pointer(bfqg->bfqd, bfqd); - hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); - spin_unlock_irq(&bgrp->lock); - - return bfqg; -} - -#define SHOW_FUNCTION(__VAR) \ -static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct bfqio_cgroup *bgrp; \ - u64 ret; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - bgrp = cgroup_to_bfqio(cgroup); \ - spin_lock_irq(&bgrp->lock); \ - ret = bgrp->__VAR; \ - spin_unlock_irq(&bgrp->lock); \ - \ - cgroup_unlock(); \ - \ - return ret; \ -} - -SHOW_FUNCTION(weight); -SHOW_FUNCTION(ioprio); -SHOW_FUNCTION(ioprio_class); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ -static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ - struct cftype *cftype, \ - u64 val) \ -{ \ - struct bfqio_cgroup *bgrp; \ - struct bfq_group *bfqg; \ - struct hlist_node *n; \ - \ - if (val < (__MIN) || val > (__MAX)) \ - return -EINVAL; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - bgrp = cgroup_to_bfqio(cgroup); \ - \ - spin_lock_irq(&bgrp->lock); \ - bgrp->__VAR = (unsigned short)val; \ - hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ - bfqg->entity.new_##__VAR = (unsigned short)val; \ - smp_wmb(); \ - bfqg->entity.ioprio_changed = 1; \ - } \ - spin_unlock_irq(&bgrp->lock); \ - \ - cgroup_unlock(); \ - \ - return 0; \ -} - -STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); -STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); -STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); -#undef STORE_FUNCTION - -static struct cftype bfqio_files[] = { - { - .name = "weight", - .read_u64 = bfqio_cgroup_weight_read, - .write_u64 = bfqio_cgroup_weight_write, - }, - { - .name = "ioprio", - .read_u64 = bfqio_cgroup_ioprio_read, - .write_u64 = bfqio_cgroup_ioprio_write, - }, - { - .name = "ioprio_class", - .read_u64 = bfqio_cgroup_ioprio_class_read, - .write_u64 = bfqio_cgroup_ioprio_class_write, - }, -}; - -static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - return cgroup_add_files(cgroup, subsys, bfqio_files, - ARRAY_SIZE(bfqio_files)); -} - -static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, - struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp; - - if (cgroup->parent != NULL) { - bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); - if (bgrp == NULL) - return ERR_PTR(-ENOMEM); - } else - bgrp = &bfqio_root_cgroup; - - spin_lock_init(&bgrp->lock); - INIT_HLIST_HEAD(&bgrp->group_data); - bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; - bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; - - return &bgrp->css; -} - -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic/bfqq data structures. 
By now we allow a task to change - * its cgroup only if it's the only owner of its ioc; the drawback of this - * behavior is that a group containing a task that forked using CLONE_IO - * will not be destroyed until the tasks sharing the ioc die. - */ -static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct task_struct *tsk) -{ - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - task_lock(tsk); - ioc = tsk->io_context; - if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) - /* - * ioc == NULL means that the task is either too young or - * exiting: if it has still no ioc the ioc can't be shared, - * if the task is exiting the attach will fail anyway, no - * matter what we return here. - */ - ret = -EINVAL; - task_unlock(tsk); - - return ret; -} - -static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk) -{ - struct io_context *ioc; - struct cfq_io_context *cic; - struct hlist_node *n; - - task_lock(tsk); - ioc = tsk->io_context; - if (ioc != NULL) { - BUG_ON(atomic_long_read(&ioc->refcount) == 0); - atomic_long_inc(&ioc->refcount); - } - task_unlock(tsk); - - if (ioc == NULL) - return; - - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) - bfq_cic_change_cgroup(cic, cgroup); - rcu_read_unlock(); - - put_io_context(ioc); -} - -static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); - struct hlist_node *n, *tmp; - struct bfq_group *bfqg; - - /* - * Since we are destroying the cgroup, there are no more tasks - * referencing it, and all the RCU grace periods that may have - * referenced it are ended (as the destruction of the parent - * cgroup is RCU-safe); bgrp->group_data will not be accessed by - * anything else and we don't need any synchronization. 
- */ - hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) - bfq_destroy_group(bgrp, bfqg); - - BUG_ON(!hlist_empty(&bgrp->group_data)); - - kfree(bgrp); -} - -struct cgroup_subsys bfqio_subsys = { - .name = "bfqio", - .create = bfqio_create, - .can_attach = bfqio_can_attach, - .attach = bfqio_attach, - .destroy = bfqio_destroy, - .populate = bfqio_populate, - .subsys_id = bfqio_subsys_id, -}; -#else -static inline void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -{ - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; - entity->ioprio = entity->new_ioprio; - entity->ioprio_class = entity->new_ioprio_class; - entity->sched_data = &bfqg->sched_data; -} - -static inline struct bfq_group * -bfq_cic_update_cgroup(struct cfq_io_context *cic) -{ - struct bfq_data *bfqd = cic->key; - return bfqd->root_group; -} - -static inline void bfq_bfqq_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_entity *entity, - struct bfq_group *bfqg) -{ -} - -static inline void bfq_disconnect_groups(struct bfq_data *bfqd) -{ - bfq_put_async_queues(bfqd, bfqd->root_group); -} - -static inline void bfq_free_root_group(struct bfq_data *bfqd) -{ - kfree(bfqd->root_group); -} - -static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) -{ - struct bfq_group *bfqg; - int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (bfqg == NULL) - return NULL; - - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - - return bfqg; -} -#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c deleted file mode 100644 index 01f831332..000000000 --- a/block/bfq-ioc.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * BFQ: I/O context handling. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -/** - * bfq_cic_free_rcu - deferred cic freeing. - * @head: RCU head of the cic to free. - * - * Free the cic containing @head and, if it was the last one and - * the module is exiting wake up anyone waiting for its deallocation - * (see bfq_exit()). - */ -static void bfq_cic_free_rcu(struct rcu_head *head) -{ - struct cfq_io_context *cic; - - cic = container_of(head, struct cfq_io_context, rcu_head); - - kmem_cache_free(bfq_ioc_pool, cic); - elv_ioc_count_dec(bfq_ioc_count); - - if (bfq_ioc_gone != NULL) { - spin_lock(&bfq_ioc_gone_lock); - if (bfq_ioc_gone != NULL && - !elv_ioc_count_read(bfq_ioc_count)) { - complete(bfq_ioc_gone); - bfq_ioc_gone = NULL; - } - spin_unlock(&bfq_ioc_gone_lock); - } -} - -static void bfq_cic_free(struct cfq_io_context *cic) -{ - call_rcu(&cic->rcu_head, bfq_cic_free_rcu); -} - -/** - * cic_free_func - disconnect a cic ready to be freed. - * @ioc: the io_context @cic belongs to. - * @cic: the cic to be freed. - * - * Remove @cic from the @ioc radix tree hash and from its cic list, - * deferring the deallocation of @cic to the end of the current RCU - * grace period. This assumes that __bfq_exit_single_io_context() - * has already been called for @cic. 
- */ -static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) -{ - unsigned long flags; - unsigned long dead_key = (unsigned long) cic->key; - - BUG_ON(!(dead_key & CIC_DEAD_KEY)); - - spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->bfq_radix_root, - dead_key >> CIC_DEAD_INDEX_SHIFT); - hlist_del_init_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - bfq_cic_free(cic); -} - -static void bfq_free_io_context(struct io_context *ioc) -{ - /* - * ioc->refcount is zero here, or we are called from elv_unregister(), - * so no more cic's are allowed to be linked into this ioc. So it - * should be ok to iterate over the known list, we will see all cic's - * since no new ones are added. - */ - call_for_each_cic(ioc, cic_free_func); -} - -/** - * __bfq_exit_single_io_context - deassociate @cic from any running task. - * @bfqd: bfq_data on which @cic is valid. - * @cic: the cic being exited. - * - * Whenever no more tasks are using @cic or @bfqd is deallocated we - * need to invalidate its entry in the radix tree hash table and to - * release the queues it refers to. - * - * Called under the queue lock. - */ -static void __bfq_exit_single_io_context(struct bfq_data *bfqd, - struct cfq_io_context *cic) -{ - struct io_context *ioc = cic->ioc; - - list_del_init(&cic->queue_list); - - /* - * Make sure dead mark is seen for dead queues - */ - smp_wmb(); - rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd)); - - /* - * No write-side locking as no task is using @ioc (they're exited - * or bfqd is being deallocated. - */ - rcu_read_lock(); - if (rcu_dereference(ioc->ioc_data) == cic) { - rcu_read_unlock(); - spin_lock(&ioc->lock); - rcu_assign_pointer(ioc->ioc_data, NULL); - spin_unlock(&ioc->lock); - } else - rcu_read_unlock(); - - if (cic->cfqq[BLK_RW_ASYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; - } - - if (cic->cfqq[BLK_RW_SYNC] != NULL) { - bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; - } -} - -/** - * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version). - * @ioc: the io_context @cic belongs to. - * @cic: the cic being exited. - * - * Take the queue lock and call __bfq_exit_single_io_context() to do the - * rest of the work. We take care of possible races with bfq_exit_queue() - * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism). - */ -static void bfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct bfq_data *bfqd; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (bfqd != NULL) { - __bfq_exit_single_io_context(bfqd, cic); - bfq_put_bfqd_unlock(bfqd, &flags); - } -} - -/** - * bfq_exit_io_context - deassociate @ioc from all cics it owns. - * @ioc: the @ioc being exited. - * - * No more processes are using @ioc we need to clean up and put the - * internal structures we have that belongs to that process. Loop - * through all its cics, locking their queues and exiting them. 
- */ -static void bfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, bfq_exit_single_io_context); -} - -static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, - gfp_t gfp_mask) -{ - struct cfq_io_context *cic; - - cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, - bfqd->queue->node); - if (cic != NULL) { - cic->last_end_request = jiffies; - INIT_LIST_HEAD(&cic->queue_list); - INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = bfq_free_io_context; - cic->exit = bfq_exit_io_context; - elv_ioc_count_inc(bfq_ioc_count); - } - - return cic; -} - -/** - * bfq_drop_dead_cic - free an exited cic. - * @bfqd: bfq data for the device in use. - * @ioc: io_context owning @cic. - * @cic: the @cic to free. - * - * We drop cfq io contexts lazily, so we may find a dead one. - */ -static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic) -{ - unsigned long flags; - - WARN_ON(!list_empty(&cic->queue_list)); - BUG_ON(cic->key != bfqd_dead_key(bfqd)); - - spin_lock_irqsave(&ioc->lock, flags); - - BUG_ON(ioc->ioc_data == cic); - - /* - * With shared I/O contexts two lookups may race and drop the - * same cic more than one time: RCU guarantees that the storage - * will not be freed too early, here we make sure that we do - * not try to remove the cic from the hashing structures multiple - * times. - */ - if (!hlist_unhashed(&cic->cic_list)) { - radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index); - hlist_del_init_rcu(&cic->cic_list); - bfq_cic_free(cic); - } - - spin_unlock_irqrestore(&ioc->lock, flags); -} - -/** - * bfq_cic_lookup - search into @ioc a cic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. - * - * If @ioc already has a cic associated to @bfqd return it, return %NULL - * otherwise. - */ -static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd, - struct io_context *ioc) -{ - struct cfq_io_context *cic; - unsigned long flags; - void *k; - - if (unlikely(ioc == NULL)) - return NULL; - - rcu_read_lock(); - - /* We maintain a last-hit cache, to avoid browsing over the tree. */ - cic = rcu_dereference(ioc->ioc_data); - if (cic != NULL) { - k = rcu_dereference(cic->key); - if (k == bfqd) - goto out; - } - - do { - cic = radix_tree_lookup(&ioc->bfq_radix_root, - bfqd->cic_index); - if (cic == NULL) - goto out; - - k = rcu_dereference(cic->key); - if (unlikely(k != bfqd)) { - rcu_read_unlock(); - bfq_drop_dead_cic(bfqd, ioc, cic); - rcu_read_lock(); - continue; - } - - spin_lock_irqsave(&ioc->lock, flags); - rcu_assign_pointer(ioc->ioc_data, cic); - spin_unlock_irqrestore(&ioc->lock, flags); - break; - } while (1); - -out: - rcu_read_unlock(); - - return cic; -} - -/** - * bfq_cic_link - add @cic to @ioc. - * @bfqd: bfq_data @cic refers to. - * @ioc: io_context @cic belongs to. - * @cic: the cic to link. - * @gfp_mask: the mask to use for radix tree preallocations. - * - * Add @cic to @ioc, using @bfqd as the search key. This enables us to - * lookup the process specific cfq io context when entered from the block - * layer. Also adds @cic to a per-bfqd list, used when this queue is - * removed. - */ -static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, - struct cfq_io_context *cic, gfp_t gfp_mask) -{ - unsigned long flags; - int ret; - - ret = radix_tree_preload(gfp_mask); - if (ret == 0) { - cic->ioc = ioc; - - /* No write-side locking, cic is not published yet. 
*/ - rcu_assign_pointer(cic->key, bfqd); - - spin_lock_irqsave(&ioc->lock, flags); - ret = radix_tree_insert(&ioc->bfq_radix_root, - bfqd->cic_index, cic); - if (ret == 0) - hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - radix_tree_preload_end(); - - if (ret == 0) { - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - list_add(&cic->queue_list, &bfqd->cic_list); - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - } - } - - if (ret != 0) - printk(KERN_ERR "bfq: cic link failed!\n"); - - return ret; -} - -/** - * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc. - * @ioc: the io_context changing its priority. - */ -static inline void bfq_ioc_set_ioprio(struct io_context *ioc) -{ - call_for_each_cic(ioc, bfq_changed_ioprio); -} - -/** - * bfq_get_io_context - return the @cic associated to @bfqd in @ioc. - * @bfqd: the search key. - * @gfp_mask: the mask to use for cic allocation. - * - * Setup general io context and cfq io context. There can be several cfq - * io contexts per general io context, if this process is doing io to more - * than one device managed by cfq. - */ -static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd, - gfp_t gfp_mask) -{ - struct io_context *ioc = NULL; - struct cfq_io_context *cic; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - ioc = get_io_context(gfp_mask, bfqd->queue->node); - if (ioc == NULL) - return NULL; - - /* Lookup for an existing cic. */ - cic = bfq_cic_lookup(bfqd, ioc); - if (cic != NULL) - goto out; - - /* Alloc one if needed. */ - cic = bfq_alloc_io_context(bfqd, gfp_mask); - if (cic == NULL) - goto err; - - /* Link it into the ioc's radix tree and cic list. */ - if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) - goto err_free; - -out: - /* - * test_and_clear_bit() implies a memory barrier, paired with - * the wmb() in fs/ioprio.c, so the value seen for ioprio is the - * new one. - */ - if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED, - ioc->ioprio_changed))) - bfq_ioc_set_ioprio(ioc); - - return cic; -err_free: - bfq_cic_free(cic); -err: - put_io_context(ioc); - return NULL; -} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c deleted file mode 100644 index 9f261ee60..000000000 --- a/block/bfq-iosched.c +++ /dev/null @@ -1,3047 +0,0 @@ -/* - * BFQ, or Budget Fair Queueing, disk scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. - * - * BFQ is a proportional share disk scheduling algorithm based on the - * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, - * measured in number of sectors, to tasks instead of time slices. - * The disk is not granted to the active task for a given time slice, - * but until it has exahusted its assigned budget. This change from - * the time to the service domain allows BFQ to distribute the disk - * bandwidth among tasks as desired, without any distortion due to - * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc - * internal scheduler, called B-WF2Q+, to schedule tasks according to - * their budgets. Thanks to this accurate scheduler, BFQ can afford - * to assign high budgets to disk-bound non-seeky tasks (to boost the - * throughput), and yet guarantee low latencies to interactive and - * soft real-time applications. 
- * - * BFQ has been introduced in [1], where the interested reader can - * find an accurate description of the algorithm, the bandwidth - * distribution and latency guarantees it provides, plus formal proofs - * of all the properties. With respect to the algorithm presented in - * the paper, this implementation adds several little heuristics, and - * a hierarchical extension, based on H-WF2Q+. - * - * B-WF2Q+ is based on WF2Q+, that is described in [2], together with - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * - * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling - * with Deterministic Guarantees on Bandwidth Distribution,'', - * IEEE Transactions on Computer, May 2010. - * - * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf - * - * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing - * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, - * Oct 1997. - * - * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz - * - * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline - * First: A Flexible and Accurate Mechanism for Proportional Share - * Resource Allocation,'' technical report. - * - * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "bfq.h" - -/* Max number of dispatches in one round of service. */ -static const int bfq_quantum = 4; - -/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; - -/* Maximum backwards seek, in KiB. */ -static const int bfq_back_max = 16 * 1024; - -/* Penalty of a backwards seek, in number of sectors. */ -static const int bfq_back_penalty = 1; - -/* Idling period duration, in jiffies. */ -static int bfq_slice_idle = 0; - -/* Default maximum budget values, in sectors and number of requests. */ -static const int bfq_default_max_budget = 16 * 1024; -static const int bfq_max_budget_async_rq = 4; - -/* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multipled by the factor below - */ -static const int bfq_async_charge_factor = 10; - -/* Default timeout values, in jiffies, approximating CFQ defaults. */ -static const int bfq_timeout_sync = HZ / 8; -static int bfq_timeout_async = HZ / 25; - -struct kmem_cache *bfq_pool; -struct kmem_cache *bfq_ioc_pool; - -static DEFINE_PER_CPU(unsigned long, bfq_ioc_count); -static struct completion *bfq_ioc_gone; -static DEFINE_SPINLOCK(bfq_ioc_gone_lock); - -static DEFINE_SPINLOCK(cic_index_lock); -static DEFINE_IDA(cic_index_ida); - -/* Below this threshold (in ms), we consider thinktime immediate. */ -#define BFQ_MIN_TT 2 - -/* hw_tag detection: parallel requests threshold and min samples needed. */ -#define BFQ_HW_QUEUE_THRESHOLD 4 -#define BFQ_HW_QUEUE_SAMPLES 32 - -#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) - -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 - -/* Shift used for peak rate fixed precision calculations. 
*/ -#define BFQ_RATE_SHIFT 16 - -/* - * The duration of the weight raising for interactive applications is - * computed automatically (as default behaviour), using the following - * formula: duration = (R / r) * T, where r is the peak rate of the - * disk, and R and T are two reference parameters. In particular, R is - * the peak rate of a reference disk, and T is about the maximum time - * for starting popular large applications on that disk, under BFQ and - * while reading two files in parallel. Finally, BFQ uses two - * different pairs (R, T) depending on whether the disk is rotational - * or non-rotational. - */ -#define T_rot (msecs_to_jiffies(5500)) -#define T_nonrot (msecs_to_jiffies(2000)) -/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ -#define R_rot 17415 -#define R_nonrot 34791 - -#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -#define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private[0]) -#define RQ_BFQQ(rq) ((rq)->elevator_private[1]) - -#include "bfq-ioc.c" -#include "bfq-sched.c" -#include "bfq-cgroup.c" - -#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ - IOPRIO_CLASS_RT) - -#define bfq_sample_valid(samples) ((samples) > 80) - -/* - * We regard a request as SYNC, if either it's a read or has the SYNC bit - * set (in which case it could also be a direct WRITE). - */ -static inline int bfq_bio_sync(struct bio *bio) -{ - if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) - return 1; - - return 0; -} - -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. - */ -static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); - } -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request *bfq_choose_req(struct bfq_data *bfqd, - struct request *rq1, - struct request *rq2, - sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) - return rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * By definition, 1KiB is 2 sectors. - */ - back_max = bfqd->bfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. 
- */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * bfqd->bfq_back_penalty; - else - wrap |= BFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } - - case BFQ_RQ2_WRAP: - return rq1; - case BFQ_RQ1_WRAP: - return rq2; - case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -static struct bfq_queue * -bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct bfq_queue *bfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - bfqq = rb_entry(parent, struct bfq_queue, pos_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. - */ - if (sector > blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(bfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - bfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - - bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", - (long long unsigned)sector, - bfqq != NULL ? 
bfqq->pid : 0); - - return bfqq; -} - -static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct rb_node **p, *parent; - struct bfq_queue *__bfqq; - - if (bfqq->pos_root != NULL) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) - return; - - bfqq->pos_root = &bfqd->rq_pos_tree; - __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, - blk_rq_pos(bfqq->next_rq), &parent, &p); - if (__bfqq == NULL) { - rb_link_node(&bfqq->pos_node, parent, p); - rb_insert_color(&bfqq->pos_node, bfqq->pos_root); - } else - bfqq->pos_root = NULL; -} - -static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev != NULL) - prev = rb_entry_rq(rbprev); - - if (rbnext != NULL) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&bfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -} - -static void bfq_del_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - BUG_ON(bfqq->queued[sync] == 0); - bfqq->queued[sync]--; - bfqd->queued--; - - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) - bfq_del_bfqq_busy(bfqd, bfqq, 1); - /* - * Remove queue from request-position tree as it is empty. - */ - if (bfqq->pos_root != NULL) { - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } - } -} - -/* see the definition of bfq_async_charge_factor for details */ -static inline unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) -{ - return blk_rq_sectors(rq) * - (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * - bfq_async_charge_factor)); -} - -/** - * bfq_updated_next_req - update the queue after a new next_rq selection. - * @bfqd: the device data the queue belongs to. - * @bfqq: the queue to update. - * - * If the first request of a queue changes we make sure that the queue - * has enough budget to serve at least its first request (if the - * request has grown). We do this because if the queue has not enough - * budget for its first request, it has to go through two dispatch - * rounds to actually get it dispatched. - */ -static void bfq_updated_next_req(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - struct request *next_rq = bfqq->next_rq; - unsigned long new_budget; - - if (next_rq == NULL) - return; - - if (bfqq == bfqd->active_queue) - /* - * In order not to break guarantees, budgets cannot be - * changed after an entity has been selected. 
- */ - return; - - BUG_ON(entity->tree != &st->active); - BUG_ON(entity == entity->sched_data->active_entity); - - new_budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_activate_bfqq(bfqd, bfqq); -} - -static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_raising_max_time > 0) - return bfqd->bfq_raising_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - return dur; -} - -static void bfq_add_rq_rb(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_entity *entity = &bfqq->entity; - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; - unsigned long old_raising_coeff = bfqq->raising_coeff; - int idle_for_long_time = bfqq->budget_timeout + - bfqd->bfq_raising_min_idle_time < jiffies; - - bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* - * Check if this request is a better next-serve candidate. - */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(next_rq == NULL); - bfqq->next_rq = next_rq; - - /* - * Adjust priority tree position, if next_rq changes. - */ - if (prev != bfqq->next_rq) - bfq_rq_pos_tree_add(bfqd, bfqq); - - if (!bfq_bfqq_busy(bfqq)) { - int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && - bfqq->soft_rt_next_start < jiffies; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - - if (! bfqd->low_latency) - goto add_bfqq_busy; - - /* - * If the queue is not being boosted and has been idle - * for enough time, start a weight-raising period - */ - if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { - bfqq->raising_coeff = bfqd->bfq_raising_coeff; - if (idle_for_long_time) - bfqq->raising_cur_max_time = - bfq_wrais_duration(bfqd); - else - bfqq->raising_cur_max_time = - bfqd->bfq_raising_rt_max_time; - bfq_log_bfqq(bfqd, bfqq, - "wrais starting at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } else if (old_raising_coeff > 1) { - if (idle_for_long_time) - bfqq->raising_cur_max_time = - bfq_wrais_duration(bfqd); - else if (bfqq->raising_cur_max_time == - bfqd->bfq_raising_rt_max_time && - !soft_rt) { - bfqq->raising_coeff = 1; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } - } - if (old_raising_coeff != bfqq->raising_coeff) - entity->ioprio_changed = 1; -add_bfqq_busy: - bfq_add_bfqq_busy(bfqd, bfqq); - } else { - if(bfqd->low_latency && old_raising_coeff == 1 && - !rq_is_sync(rq) && - bfqq->last_rais_start_finish + - bfqd->bfq_raising_min_inter_arr_async < jiffies) { - bfqq->raising_coeff = bfqd->bfq_raising_coeff; - bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); - - entity->ioprio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %llu msec," - "rais_max_time %u", - bfqq->last_rais_start_finish, - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - } - bfq_updated_next_req(bfqd, bfqq); - } - - if(bfqd->low_latency && - (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || - idle_for_long_time)) - bfqq->last_rais_start_finish = jiffies; -} - -static void bfq_reposition_rq_rb(struct 
bfq_queue *bfqq, struct request *rq) -{ - elv_rb_del(&bfqq->sort_list, rq); - bfqq->queued[rq_is_sync(rq)]--; - bfqq->bfqd->queued--; - bfq_add_rq_rb(rq); -} - -static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) -{ - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return NULL; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - if (bfqq != NULL) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&bfqq->sort_list, sector); - } - - return NULL; -} - -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (long long unsigned)bfqd->last_position); -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - WARN_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; -} - -static void bfq_remove_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); - } - - list_del_init(&rq->queuelist); - bfq_del_rq_rb(rq); - - if (rq->cmd_flags & REQ_META) { - WARN_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -} - -static int bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); - if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void bfq_merged_request(struct request_queue *q, struct request *req, - int type) -{ - if (type == ELEVATOR_FRONT_MERGE) { - struct bfq_queue *bfqq = RQ_BFQQ(req); - - bfq_reposition_rq_rb(bfqq, req); - } -} - -static void bfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* - * Reposition in fifo if next is older than rq. - */ - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(rq_fifo_time(next), rq_fifo_time(rq))) { - list_move(&rq->queuelist, &next->queuelist); - rq_set_fifo_time(rq, rq_fifo_time(next)); - } - - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - - bfq_remove_request(next); -} - -static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* Disallow merge of a sync bio into an async request. */ - if (bfq_bio_sync(bio) && !rq_is_sync(rq)) - return 0; - - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. 
- */ - cic = bfq_cic_lookup(bfqd, current->io_context); - if (cic == NULL) - return 0; - - bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); - return bfqq == RQ_BFQQ(rq); -} - -static void __bfq_set_active_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (bfqq != NULL) { - bfq_mark_bfqq_must_alloc(bfqq); - bfq_mark_bfqq_budget_new(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - - bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", - bfqq->entity.budget); - } - - bfqd->active_queue = bfqq; -} - -/* - * Get and set a new active queue for service. - */ -static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - if (!bfqq) - bfqq = bfq_get_next_queue(bfqd); - else - bfq_get_next_queue_forced(bfqd, bfqq); - - __bfq_set_active_queue(bfqd, bfqq); - return bfqq; -} - -static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, - struct request *rq) -{ - if (blk_rq_pos(rq) >= bfqd->last_position) - return blk_rq_pos(rq) - bfqd->last_position; - else - return bfqd->last_position - blk_rq_pos(rq); -} - -/* - * Return true if bfqq has no request pending and rq is close enough to - * bfqd->last_position, or if rq is closer to bfqd->last_position than - * bfqq->next_rq - */ -static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) -{ - return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; -} - -static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) -{ - struct rb_root *root = &bfqd->rq_pos_tree; - struct rb_node *parent, *node; - struct bfq_queue *__bfqq; - sector_t sector = bfqd->last_position; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); - if (__bfqq != NULL) - return __bfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector (rq_pos_tree sorted by next_request - * position). - */ - __bfqq = rb_entry(parent, struct bfq_queue, pos_node); - if (bfq_rq_close(bfqd, __bfqq->next_rq)) - return __bfqq; - - if (blk_rq_pos(__bfqq->next_rq) < sector) - node = rb_next(&__bfqq->pos_node); - else - node = rb_prev(&__bfqq->pos_node); - if (node == NULL) - return NULL; - - __bfqq = rb_entry(node, struct bfq_queue, pos_node); - if (bfq_rq_close(bfqd, __bfqq->next_rq)) - return __bfqq; - - return NULL; -} - -/* - * bfqd - obvious - * cur_bfqq - passed in so that we don't decide that the current queue - * is closely cooperating with itself. - * - * We are assuming that cur_bfqq has dispatched at least one request, - * and that bfqd->last_position reflects a position on the disk associated - * with the I/O issued by cur_bfqq. - */ -static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, - struct bfq_queue *cur_bfqq) -{ - struct bfq_queue *bfqq; - - if (bfq_class_idle(cur_bfqq)) - return NULL; - if (!bfq_bfqq_sync(cur_bfqq)) - return NULL; - if (BFQQ_SEEKY(cur_bfqq)) - return NULL; - - /* If device has only one backlogged bfq_queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - /* - * We should notice if some of the queues are cooperating, e.g. - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. - */ - bfqq = bfqq_close(bfqd); - if (bfqq == NULL || bfqq == cur_bfqq) - return NULL; - - /* - * Do not merge queues from different bfq_groups. 
- */ - if (bfqq->entity.parent != cur_bfqq->entity.parent) - return NULL; - - /* - * It only makes sense to merge sync queues. - */ - if (!bfq_bfqq_sync(bfqq)) - return NULL; - if (BFQQ_SEEKY(bfqq)) - return NULL; - - /* - * Do not merge queues of different priority classes. - */ - if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) - return NULL; - - return bfqq; -} - -/* - * If enough samples have been computed, return the current max budget - * stored in bfqd, which is dynamically updated according to the - * estimated disk peak rate; otherwise return the default max budget - */ -static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < 194) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget; -} - -/* - * Return min budget, which is a fraction of the current or default - * max budget (trying with 1/32) - */ -static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) -{ - if (bfqd->budgets_assigned < 194) - return bfq_default_max_budget; - else - return bfqd->bfq_max_budget / 32; -} - -/* - * Decides whether idling should be done for given device and - * given active queue. - */ -static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, - struct bfq_queue *active_bfqq) -{ - if (active_bfqq == NULL) - return false; - /* - * If device is SSD it has no seek penalty, disable idling; but - * do so only if: - * - device does not support queuing, otherwise we still have - * a problem with sync vs async workloads; - * - the queue is not weight-raised, to preserve guarantees. - */ - return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && - active_bfqq->raising_coeff == 1); -} - -static void bfq_arm_slice_timer(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->active_queue; - struct cfq_io_context *cic; - unsigned long sl; - - WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - if (bfq_queue_nonrot_noidle(bfqd, bfqq)) - return; - - /* Idling is disabled, either manually or by past process history. */ - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) - return; - - /* Tasks have exited, don't wait. */ - cic = bfqd->active_cic; - if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) - return; - - bfq_mark_bfqq_wait_request(bfqq); - - /* - * We don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. So allow a little bit of time for him to submit a new rq. - * - * To prevent processes with (partly) seeky workloads from - * being too ill-treated, grant them a small fraction of the - * assigned budget before reducing the waiting time to - * BFQ_MIN_TT. This happened to help reduce latency. - */ - sl = bfqd->bfq_slice_idle; - if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && - bfqq->entity.service > bfq_max_budget(bfqd) / 8 && - bfqq->raising_coeff == 1) - sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); - else if (bfqq->raising_coeff > 1) - sl = sl * 3; - bfqd->last_idling_start = ktime_get(); - mod_timer(&bfqd->idle_slice_timer, jiffies + sl); - bfq_log(bfqd, "arm idle: %u/%u ms", - jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -} - -/* - * Set the maximum time for the active queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). 
- */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->active_queue; - unsigned int timeout_coeff; - if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) - timeout_coeff = 1; - else - timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * - timeout_coeff)); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - bfq_remove_request(rq); - bfqq->dispatched++; - elv_dispatch_sort(q, rq); - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight++; -} - -/* - * Return expired entry, or NULL to just start from scratch in rbtree. - */ -static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -{ - struct request *rq = NULL; - - if (bfq_bfqq_fifo_expire(bfqq)) - return NULL; - - bfq_mark_bfqq_fifo_expire(bfqq); - - if (list_empty(&bfqq->fifo)) - return NULL; - - rq = rq_entry_fifo(bfqq->fifo.next); - - if (time_before(jiffies, rq_fifo_time(rq))) - return NULL; - - return rq; -} - -/* - * Must be called with the queue_lock held. - */ -static int bfqq_process_refs(struct bfq_queue *bfqq) -{ - int process_refs, io_refs; - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; - process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -} - -static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -{ - int process_refs, new_process_refs; - struct bfq_queue *__bfqq; - - /* - * If there are no process references on the new_bfqq, then it is - * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain - * may have dropped their last reference (not just their last process - * reference). - */ - if (!bfqq_process_refs(new_bfqq)) - return; - - /* Avoid a circular list and skip interim queue merges. */ - while ((__bfqq = new_bfqq->new_bfqq)) { - if (__bfqq == bfqq) - return; - new_bfqq = __bfqq; - } - - process_refs = bfqq_process_refs(bfqq); - new_process_refs = bfqq_process_refs(new_bfqq); - /* - * If the process for the bfqq has gone away, there is no - * sense in merging the queues. - */ - if (process_refs == 0 || new_process_refs == 0) - return; - - /* - * Merge in the direction of the lesser amount of work. 
- */ - if (new_process_refs >= process_refs) { - bfqq->new_bfqq = new_bfqq; - atomic_add(process_refs, &new_bfqq->ref); - } else { - new_bfqq->new_bfqq = bfqq; - atomic_add(new_process_refs, &bfqq->ref); - } - bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", - new_bfqq->pid); -} - -static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - return entity->budget - entity->service; -} - -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfqq != bfqd->active_queue); - - __bfq_bfqd_reset_active(bfqd); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfq_del_bfqq_busy(bfqd, bfqq, 1); - /* - * overloading budget_timeout field to store when - * the queue remains with no backlog, used by - * the weight-raising mechanism - */ - bfqq->budget_timeout = jiffies ; - } - else { - bfq_activate_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_rq_pos_tree_add(bfqd, bfqq); - } - - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os - * within the mean seek distance. If not, it may be time to - * break the queues apart again. - */ - if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) - bfq_mark_bfqq_split_coop(bfqq); -} - -/** - * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. - * @bfqd: device data. - * @bfqq: queue to update. - * @reason: reason for expiration. - * - * Handle the feedback on @bfqq budget. See the body for detailed - * comments. - */ -static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - enum bfqq_expiration reason) -{ - struct request *next_rq; - unsigned long budget, min_budget; - - budget = bfqq->max_budget; - min_budget = bfq_min_budget(bfqd); - - BUG_ON(bfqq != bfqd->active_queue); - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", - budget, bfq_min_budget(bfqd)); - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); - - if (bfq_bfqq_sync(bfqq)) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency - * for throughput. - */ - case BFQ_BFQQ_TOO_IDLE: - /* - * This is the only case where we may reduce - * the budget: if there is no requets of the - * process still waiting for completion, then - * we assume (tentatively) that the timer has - * expired because the batch of requests of - * the process could have been served with a - * smaller budget. Hence, betting that - * process will behave in the same way when it - * becomes backlogged again, we reduce its - * next budget. As long as we guess right, - * this budget cut reduces the latency - * experienced by the process. - * - * However, if there are still outstanding - * requests, then the process may have not yet - * issued its next request just because it is - * still waiting for the completion of some of - * the still oustanding ones. So in this - * subcase we do not reduce its budget, on the - * contrary we increase it to possibly boost - * the throughput, as discussed in the - * comments to the BUDGET_TIMEOUT case. 
- */ - if (bfqq->dispatched > 0) /* still oustanding reqs */ - budget = min(budget * 2, bfqd->bfq_max_budget); - else { - if (budget > 5 * min_budget) - budget -= 4 * min_budget; - else - budget = min_budget; - } - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* - * We double the budget here because: 1) it - * gives the chance to boost the throughput if - * this is not a seeky process (which may have - * bumped into this timeout because of, e.g., - * ZBR), 2) together with charge_full_budget - * it helps give seeky processes higher - * timestamps, and hence be served less - * frequently. - */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_BUDGET_EXHAUSTED: - /* - * The process still has backlog, and did not - * let either the budget timeout or the disk - * idling timeout expire. Hence it is not - * seeky, has a short thinktime and may be - * happy with a higher budget too. So - * definitely increase the budget of this good - * candidate to boost the disk throughput. - */ - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: - /* - * Leave the budget unchanged. - */ - default: - return; - } - } else /* async queue */ - /* async queues get always the maximum possible budget - * (their ability to dispatch is limited by - * @bfqd->bfq_max_budget_async_rq). - */ - budget = bfqd->bfq_max_budget; - - bfqq->max_budget = budget; - - if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && - bfqq->max_budget > bfqd->bfq_max_budget) - bfqq->max_budget = bfqd->bfq_max_budget; - - /* - * Make sure that we have enough budget for the next request. - * Since the finish time of the bfqq must be kept in sync with - * the budget, be sure to call __bfq_bfqq_expire() after the - * update. - */ - next_rq = bfqq->next_rq; - if (next_rq != NULL) - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - else - bfqq->entity.budget = bfqq->max_budget; - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", - next_rq != NULL ? blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); -} - -static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -{ - unsigned long max_budget; - - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transfered in timeout_sync at the - * estimated peak rate. - */ - max_budget = (unsigned long)(peak_rate * 1000 * - timeout >> BFQ_RATE_SHIFT); - - return max_budget; -} - -/* - * In addition to updating the peak rate, checks whether the process - * is "slow", and returns 1 if so. This slow flag is used, in addition - * to the budget timeout, to reduce the amount of service provided to - * seeky processes, and hence reduce their chances to lower the - * throughput. See the code for more details. - */ -static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int compensate, enum bfqq_expiration reason) -{ - u64 bw, usecs, expected, timeout; - ktime_t delta; - int update = 0; - - if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) - return 0; - - if (compensate) - delta = bfqd->last_idling_start; - else - delta = ktime_get(); - delta = ktime_sub(delta, bfqd->last_budget_start); - usecs = ktime_to_us(delta); - - /* Don't trust short/unrealistic values. */ - if (usecs < 100 || usecs >= LONG_MAX) - return 0; - - /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. 
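For reference, the feedback applied by __bfq_bfqq_recalc_budget() above boils down to a few arithmetic rules: shrink the budget only when the queue went idle with nothing in flight, double it on budget timeout, quadruple it on exhaustion, and never exceed the device-wide maximum. A standalone userspace sketch of those rules (enum, helper and demo values are illustrative, not the kernel's symbols):

#include <stdio.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/*
 * Mirror of the sync-queue feedback in __bfq_bfqq_recalc_budget():
 * shrink only on an idle timeout with no request in flight, grow on
 * budget timeout or exhaustion, always cap at max_budget.
 */
static unsigned long next_budget(unsigned long budget, unsigned long min_budget,
				 unsigned long max_budget, int dispatched,
				 enum expiration reason)
{
	switch (reason) {
	case TOO_IDLE:
		if (dispatched > 0)	/* still outstanding requests */
			return budget * 2 > max_budget ? max_budget : budget * 2;
		return budget > 5 * min_budget ? budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_budget ? max_budget : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_budget ? max_budget : budget * 4;
	default:			/* NO_MORE_REQUESTS: leave unchanged */
		return budget;
	}
}

int main(void)
{
	/* a queue that exhausted a 4096-sector budget grows toward the cap */
	printf("%lu\n", next_budget(4096, 512, 16384, 0, BUDGET_EXHAUSTED));
	return 0;
}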
We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)usecs); - - timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. - */ - if (usecs > 20000) { - if (bw > bfqd->peak_rate || - (!BFQQ_SEEKY(bfqq) && - reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { - bfq_log(bfqd, "measured bw =%llu", bw); - /* - * To smooth oscillations use a low-pass filter with - * alpha=7/8, i.e., - * new_rate = (7/8) * old_rate + (1/8) * bw - */ - do_div(bw, 8); - bfqd->peak_rate *= 7; - do_div(bfqd->peak_rate, 8); - bfqd->peak_rate += bw; - update = 1; - bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update && bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd->peak_rate, timeout); - bfq_log(bfqd, "new max_budget=%lu", - bfqd->bfq_max_budget); - } - } - - /* - * If the process has been served for a too short time - * interval to let its possible sequential accesses prevail on - * the initial seek time needed to move the disk head on the - * first sector it requested, then give the process a chance - * and for the moment return false. - */ - if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) - return 0; - - /* - * A process is considered ``slow'' (i.e., seeky, so that we - * cannot treat it fairly in the service domain, as it would - * slow down too much the other processes) if, when a slice - * ends for whatever reason, it has received service at a - * rate that would not be high enough to complete the budget - * before the budget timeout expiration. - */ - expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; - - /* - * Caveat: processes doing IO in the slower disk zones will - * tend to be slow(er) even if not seeky. And the estimated - * peak rate will actually be an average over the disk - * surface. Hence, to not be too harsh with unlucky processes, - * we keep a budget/3 margin of safety before declaring a - * process slow. - */ - return expected > (4 * bfqq->entity.budget) / 3; -} - -/** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - * @bfqq: the queue to expire. - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * - * - * If the process associated to the queue is slow (i.e., seeky), or in - * case of budget timeout, or, finally, if it is async, we - * artificially charge it an entire budget (independently of the - * actual service it received). As a consequence, the queue will get - * higher timestamps than the correct ones upon reactivation, and - * hence it will be rescheduled as if it had received more service - * than what it actually received. In the end, this class of processes - * will receive less service in proportion to how slowly they consume - * their budgets (and hence how seriously they tend to lower the - * throughput). - * - * In contrast, when a queue expires because it has been idling for - * too much or because it exhausted its budget, we do not touch the - * amount of service it has received. 
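The peak-rate estimator above works on fixed-point bandwidth samples (sectors per microsecond, scaled by BFQ_RATE_SHIFT, whose value is not shown here) and smooths them with a 7/8 low-pass filter. A minimal standalone sketch of that arithmetic, with an illustrative shift value:

#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16	/* illustrative stand-in for BFQ_RATE_SHIFT */

/*
 * Fixed-point bandwidth sample: sectors transferred in 'usecs'
 * microseconds, scaled by 2^RATE_SHIFT as in bfq_update_peak_rate().
 */
static uint64_t bw_sample(uint64_t sectors, uint64_t usecs)
{
	return (sectors << RATE_SHIFT) / usecs;
}

/* low-pass filter: new_rate = 7/8 * old_rate + 1/8 * sample */
static uint64_t smooth(uint64_t old_rate, uint64_t sample)
{
	return (old_rate * 7 + sample) / 8;
}

int main(void)
{
	uint64_t rate = 0;

	/* three identical 30 ms slices serving 2048 sectors each */
	for (int i = 0; i < 3; i++)
		rate = smooth(rate, bw_sample(2048, 30000));
	printf("estimated rate (fixed point): %llu\n",
	       (unsigned long long)rate);
	return 0;
}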
Hence when the queue will be - * reactivated and its timestamps updated, the latter will be in sync - * with the actual service received by the queue until expiration. - * - * Charging a full budget to the first type of queues and the exact - * service to the others has the effect of using the WF2Q+ policy to - * schedule the former on a timeslice basis, without violating the - * service domain guarantees of the latter. - */ -static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - int compensate, - enum bfqq_expiration reason) -{ - int slow; - BUG_ON(bfqq != bfqd->active_queue); - - /* Update disk peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). - */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); - - /* - * As above explained, 'punish' slow (i.e., seeky), timed-out - * and async queues, to favor sequential sync workloads. - * - * Processes doing IO in the slower disk zones will tend to be - * slow(er) even if not seeky. Hence, since the estimated peak - * rate is actually an average over the disk surface, these - * processes may timeout just for bad luck. To avoid punishing - * them we do not charge a full budget to a process that - * succeeded in consuming at least 2/3 of its budget. - */ - if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) - bfq_bfqq_charge_full_budget(bfqq); - - if (bfqd->low_latency && bfqq->raising_coeff == 1) - bfqq->last_rais_start_finish = jiffies; - - if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { - if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) - bfqq->soft_rt_next_start = - jiffies + - HZ * bfqq->entity.service / - bfqd->bfq_raising_max_softrt_rate; - else - bfqq->soft_rt_next_start = -1; /* infinity */ - } - bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, - bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); - - /* Increase, decrease or leave budget unchanged according to reason */ - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - __bfq_bfqq_expire(bfqd, bfqq); -} - -/* - * Budget timeout is not implemented through a dedicated timer, but - * just checked on request arrivals and completions, as well as on - * idle timer expirations. - */ -static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_budget_new(bfqq)) - return 0; - - if (time_before(jiffies, bfqq->budget_timeout)) - return 0; - - return 1; -} - -/* - * If we expire a queue that is waiting for the arrival of a new - * request, we may prevent the fictitious timestamp backshifting that - * allows the guarantees of the queue to be preserved (see [1] for - * this tricky aspect). Hence we return true only if this condition - * does not hold, or if the queue is slow enough to deserve only to be - * kicked off for preserving a high throughput. -*/ -static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, - "may_budget_timeout: wr %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); - - return (!bfq_bfqq_wait_request(bfqq) || - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) - && - bfq_bfqq_budget_timeout(bfqq); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. 
- */ -static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct request *next_rq; - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; - - bfqq = bfqd->active_queue; - if (bfqq == NULL) - goto new_queue; - - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. The expire code will check for close - * cooperators and put the close queue at the front of the - * service tree. If possible, merge the expiring queue with the - * new bfqq. - */ - new_bfqq = bfq_close_cooperator(bfqd, bfqq); - if (new_bfqq != NULL && bfqq->new_bfqq == NULL) - bfq_setup_merge(bfqq, new_bfqq); - - if (bfq_may_expire_for_budg_timeout(bfqq)) - goto expire; - - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. - */ - if (next_rq != NULL) { - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { - /* - * The idle timer may be pending because we may not - * disable disk idling even when a new request arrives - */ - if (timer_pending(&bfqd->idle_slice_timer)) { - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the - * timer because the request was too small, - * 2) then the block layer has unplugged the - * device, causing the dispatch to be invoked. - * - * Since the device is unplugged, now the - * requests are probably large enough to - * provide a reasonable throughput. - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - } - if (new_bfqq == NULL) - goto keep_queue; - else - goto expire; - } - } - - /* - * No requests pending. If there is no cooperator, and the active - * queue still has requests in flight or is idling for a new request, - * then keep it. - */ - if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && - !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { - bfqq = NULL; - goto keep_queue; - } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { - /* - * Expiring the queue because there is a close cooperator, - * cancel timer. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - } - - reason = BFQ_BFQQ_NO_MORE_REQUESTS; -expire: - bfq_bfqq_expire(bfqd, bfqq, 0, reason); -new_queue: - bfqq = bfq_set_active_queue(bfqd, new_bfqq); - bfq_log(bfqd, "select_queue: new queue %d returned", - bfqq != NULL ? 
bfqq->pid : 0); -keep_queue: - return bfqq; -} - -static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq->raising_coeff > 1) { /* queue is being boosted */ - struct bfq_entity *entity = &bfqq->entity; - - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, " - "old raising coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time), - bfqq->raising_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); - - BUG_ON(bfqq != bfqd->active_queue && entity->weight != - entity->orig_weight * bfqq->raising_coeff); - if(entity->ioprio_changed) - bfq_log_bfqq(bfqd, bfqq, - "WARN: pending prio change"); - /* - * If too much time has elapsed from the beginning - * of this weight-raising period and process is not soft - * real-time, stop it - */ - if (jiffies - bfqq->last_rais_start_finish > - bfqq->raising_cur_max_time) { - int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && - bfqq->soft_rt_next_start < jiffies; - - bfqq->last_rais_start_finish = jiffies; - if (soft_rt) - bfqq->raising_cur_max_time = - bfqd->bfq_raising_rt_max_time; - else { - bfqq->raising_coeff = 1; - entity->ioprio_changed = 1; - __bfq_entity_update_weight_prio( - bfq_entity_service_tree(entity), - entity); - } - } - } -} - - -/* - * Dispatch one request from bfqq, moving it to the request queue - * dispatch list. - */ -static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - int dispatched = 0; - struct request *rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - - /* Follow expired path, else get first next available. */ - rq = bfq_check_fifo(bfqq); - if (rq == NULL) - rq = bfqq->next_rq; - service_to_charge = bfq_serv_to_charge(rq, bfqq); - - if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { - /* - * This may happen if the next rq is chosen - * in fifo order instead of sector order. - * The budget is properly dimensioned - * to be always sufficient to serve the next request - * only if it is chosen in sector order. The reason is - * that it would be quite inefficient and little useful - * to always make sure that the budget is large enough - * to serve even the possible next rq in fifo order. - * In fact, requests are seldom served in fifo order. - * - * Expire the queue for budget exhaustion, and - * make sure that the next act_budget is enough - * to serve the next request, even if it comes - * from the fifo expired path. - */ - bfqq->next_rq = rq; - /* - * Since this dispatch is failed, make sure that - * a new one will be performed - */ - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - goto expire; - } - - /* Finally, insert request into driver dispatch list. 
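update_raising_data() above ends a queue's weight-raising once the current raising period has elapsed, unless the queue still qualifies as soft real-time, in which case the period is renewed at the soft-rt duration. A compact standalone restatement of that decision (field names shortened and demo values invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct boosted_queue {
	unsigned int raising_coeff;	/* > 1 while the queue is boosted     */
	unsigned long period_start;	/* time the boost (re)started         */
	unsigned long period_max;	/* current maximum boost duration     */
	unsigned long soft_rt_next;	/* earliest next soft-rt activation   */
};

/* returns true if the boost was kept (soft real-time), false once it ends */
static bool update_boost(struct boosted_queue *q, unsigned long now,
			 unsigned long rt_period, bool softrt_enabled)
{
	if (q->raising_coeff <= 1 || now - q->period_start <= q->period_max)
		return q->raising_coeff > 1;	/* nothing to expire yet */

	q->period_start = now;
	if (softrt_enabled && q->soft_rt_next < now) {
		q->period_max = rt_period;	/* keep boosting as soft rt */
		return true;
	}
	q->raising_coeff = 1;			/* boost over, back to normal */
	return false;
}

int main(void)
{
	struct boosted_queue q = { 20, 0, 300, 100 };

	printf("kept boost: %d\n", update_boost(&q, 1000, 300, true));
	return 0;
}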
*/ - bfq_bfqq_served(bfqq, service_to_charge); - bfq_dispatch_insert(bfqd->queue, rq); - - update_raising_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " - "budg left %lu", - blk_rq_sectors(rq), - (long long unsigned)blk_rq_pos(rq), - bfq_bfqq_budget_left(bfqq)); - - dispatched++; - - if (bfqd->active_cic == NULL) { - atomic_long_inc(&RQ_CIC(rq)->ioc->refcount); - bfqd->active_cic = RQ_CIC(rq); - } - - if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && - dispatched >= bfqd->bfq_max_budget_async_rq) || - bfq_class_idle(bfqq))) - goto expire; - - return dispatched; - -expire: - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq != NULL) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; -} - -/* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. - */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq, *n; - struct bfq_service_tree *st; - int dispatched = 0; - - bfqq = bfqd->active_queue; - if (bfqq != NULL) - __bfq_bfqq_expire(bfqd, bfqq); - - /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. - */ - list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - bfqq->max_budget = bfq_max_budget(bfqd); - - bfq_forget_idle(st); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; -} - -static int bfq_dispatch_requests(struct request_queue *q, int force) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - int max_dispatch; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - - if((bfqq = bfq_select_queue(bfqd)) == NULL) - return 0; - - max_dispatch = bfqd->bfq_quantum; - if (bfq_class_idle(bfqq)) - max_dispatch = 1; - - if (!bfq_bfqq_sync(bfqq)) - max_dispatch = bfqd->bfq_max_budget_async_rq; - - if (bfqq->dispatched >= max_dispatch) { - if (bfqd->busy_queues > 1) - return 0; - if (bfqq->dispatched >= 4 * max_dispatch) - return 0; - } - - if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) - return 0; - - bfq_clear_bfqq_wait_request(bfqq); - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - if (! bfq_dispatch_request(bfqd, bfqq)) - return 0; - - bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" - "(max_disp %d)", bfqq->pid, max_dispatch); - - return 1; -} - -/* - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Queue lock must be held here. 
- */ -static void bfq_put_queue(struct bfq_queue *bfqq) -{ - struct bfq_data *bfqd = bfqq->bfqd; - - BUG_ON(atomic_read(&bfqq->ref) <= 0); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, - atomic_read(&bfqq->ref)); - if (!atomic_dec_and_test(&bfqq->ref)) - return; - - BUG_ON(rb_first(&bfqq->sort_list) != NULL); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree != NULL); - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqd->active_queue == bfqq); - - bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); -} - -static void bfq_put_cooperator(struct bfq_queue *bfqq) -{ - struct bfq_queue *__bfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. - */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { - if (__bfqq == bfqq) { - WARN(1, "bfqq->new_bfqq loop detected.\n"); - break; - } - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; - } -} - -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - if (bfqq == bfqd->active_queue) { - __bfq_bfqq_expire(bfqd, bfqq); - bfq_schedule_dispatch(bfqd); - } - - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); -} - -/* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. - */ -static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) -{ - struct task_struct *tsk = current; - int ioprio_class; - - if (!bfq_bfqq_prio_changed(bfqq)) - return; - - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); - switch (ioprio_class) { - default: - printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* - * No prio set, inherit CPU scheduling settings. - */ - bfqq->entity.new_ioprio = task_nice_ioprio(tsk); - bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - bfqq->entity.new_ioprio = task_ioprio(ioc); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - bfqq->entity.new_ioprio = task_ioprio(ioc); - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->entity.new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); - break; - } - - bfqq->entity.ioprio_changed = 1; - - /* - * Keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue. 
- */ - bfqq->org_ioprio = bfqq->entity.new_ioprio; - bfqq->org_ioprio_class = bfqq->entity.new_ioprio_class; - bfq_clear_bfqq_prio_changed(bfqq); -} - -static void bfq_changed_ioprio(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct bfq_data *bfqd; - struct bfq_queue *bfqq, *new_bfqq; - struct bfq_group *bfqg; - unsigned long uninitialized_var(flags); - - bfqd = bfq_get_bfqd_locked(&cic->key, &flags); - if (unlikely(bfqd == NULL)) - return; - - bfqq = cic->cfqq[BLK_RW_ASYNC]; - if (bfqq != NULL) { - bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, - sched_data); - new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc, - GFP_ATOMIC); - if (new_bfqq != NULL) { - cic->cfqq[BLK_RW_ASYNC] = new_bfqq; - bfq_log_bfqq(bfqd, bfqq, - "changed_ioprio: bfqq %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } - } - - bfqq = cic->cfqq[BLK_RW_SYNC]; - if (bfqq != NULL) - bfq_mark_bfqq_prio_changed(bfqq); - - bfq_put_bfqd_unlock(bfqd, &flags); -} - -static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - pid_t pid, int is_sync) -{ - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - - atomic_set(&bfqq->ref, 0); - bfqq->bfqd = bfqd; - - bfq_mark_bfqq_prio_changed(bfqq); - - if (is_sync) { - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); - } - - /* Tentative initial value to trade off between thr and lat */ - bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; - bfqq->pid = pid; - - bfqq->raising_coeff = 1; - bfqq->last_rais_start_finish = 0; - bfqq->soft_rt_next_start = -1; -} - -static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int is_sync, - struct io_context *ioc, - gfp_t gfp_mask) -{ - struct bfq_queue *bfqq, *new_bfqq = NULL; - struct cfq_io_context *cic; - -retry: - cic = bfq_cic_lookup(bfqd, ioc); - /* cic always exists here */ - bfqq = cic_to_bfqq(cic, is_sync); - - /* - * Always try a new alloc if we fall back to the OOM bfqq - * originally, since it should just be a temporary situation. 
- */ - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { - bfqq = NULL; - if (new_bfqq != NULL) { - bfqq = new_bfqq; - new_bfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(bfqd->queue->queue_lock); - new_bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - spin_lock_irq(bfqd->queue->queue_lock); - if (new_bfqq != NULL) - goto retry; - } else { - bfqq = kmem_cache_alloc_node(bfq_pool, - gfp_mask | __GFP_ZERO, - bfqd->queue->node); - } - - if (bfqq != NULL) { - bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { - bfqq = &bfqd->oom_bfqq; - bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); - } - - bfq_init_prio_data(bfqq, ioc); - bfq_init_entity(&bfqq->entity, bfqg); - } - - if (new_bfqq != NULL) - kmem_cache_free(bfq_pool, new_bfqq); - - return bfqq; -} - -static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, - int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; - case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; - default: - BUG(); - } -} - -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, - struct io_context *ioc, gfp_t gfp_mask) -{ - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); - struct bfq_queue **async_bfqq = NULL; - struct bfq_queue *bfqq = NULL; - - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; - } - - if (bfqq == NULL) - bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); - - /* - * Pin the queue now that it's allocated, scheduler exit will prune it. - */ - if (!is_sync && *async_bfqq == NULL) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", - bfqq, atomic_read(&bfqq->ref)); - *async_bfqq = bfqq; - } - - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - return bfqq; -} - -static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct cfq_io_context *cic) -{ - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); - - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; -} - -static void bfq_update_io_seektime(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *rq) -{ - sector_t sdist; - u64 total; - - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc. 
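bfq_update_io_thinktime() above keeps an exponentially decaying average (decay 7/8) of the gap between a request completion and the next submission; the 256 and 128 constants scale the samples and round the final division. A standalone sketch of the same arithmetic with illustrative inputs:

#include <stdio.h>

struct thinktime {
	unsigned long samples;	/* decayed sample weight, saturates near 256 */
	unsigned long total;	/* decayed, scaled sum of think times        */
	unsigned long mean;	/* rounded average think time                */
};

/* same decaying-average scheme as bfq_update_io_thinktime() */
static void ttime_update(struct thinktime *tt, unsigned long elapsed,
			 unsigned long slice_idle)
{
	unsigned long ttime = elapsed < 2 * slice_idle ? elapsed : 2 * slice_idle;

	tt->samples = (7 * tt->samples + 256) / 8;
	tt->total   = (7 * tt->total + 256 * ttime) / 8;
	tt->mean    = (tt->total + 128) / tt->samples;
}

int main(void)
{
	struct thinktime tt = { 0, 0, 0 };
	unsigned long gaps[] = { 2, 3, 40, 2 };	/* think times in jiffies */

	for (unsigned int i = 0; i < sizeof(gaps) / sizeof(gaps[0]); i++)
		ttime_update(&tt, gaps[i], 8);
	printf("mean think time: %lu\n", tt.mean);
	return 0;
}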
- */ - if (bfqq->seek_samples == 0) /* first request, not really a seek */ - sdist = 0; - else if (bfqq->seek_samples <= 60) /* second & third seek */ - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); - - bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; - bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; - total = bfqq->seek_total + (bfqq->seek_samples/2); - do_div(total, bfqq->seek_samples); - if (bfq_bfqq_coop(bfqq)) { - /* - * If the mean seektime increases for a (non-seeky) shared - * queue, some cooperator is likely to be idling too much. - * On the contrary, if it decreases, some cooperator has - * probably waked up. - * - */ - if ((sector_t)total < bfqq->seek_mean) - bfq_mark_bfqq_some_coop_idle(bfqq) ; - else if ((sector_t)total > bfqq->seek_mean) - bfq_clear_bfqq_some_coop_idle(bfqq) ; - } - bfqq->seek_mean = (sector_t)total; - - bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, - (u64)bfqq->seek_mean); -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. - */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct cfq_io_context *cic) -{ - int enable_idle; - - /* Don't idle for async or idle io prio class. */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - - enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&cic->ioc->nr_tasks) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->raising_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > bfqd->bfq_slice_idle && - bfqq->raising_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); - - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -} - -/* - * Called when a new fs request (rq) is added to bfqq. Check if there's - * something we should do about it. - */ -static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) -{ - struct cfq_io_context *cic = RQ_CIC(rq); - - if (rq->cmd_flags & REQ_META) - bfqq->meta_pending++; - - bfq_update_io_thinktime(bfqd, cic); - bfq_update_io_seektime(bfqd, bfqq, rq); - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, cic); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), - (long long unsigned)bfqq->seek_mean); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (bfqq == bfqd->active_queue) { - /* - * If there is just this request queued and the request - * is small, just exit. - * In this way, if the disk is being idled to wait for a new - * request from the active queue, we avoid unplugging the - * device now. - * - * By doing so, we spare the disk to be committed - * to serve just a small request. On the contrary, we wait for - * the block layer to decide when to unplug the device: - * hopefully, new requests will be merged to this - * one quickly, then the device will be unplugged - * and larger requests will be dispatched. 
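bfq_update_idle_window() above is essentially a small decision function: idling is dropped when the owning task has exited, when idling is globally disabled, or for seeky queues on a queueing-capable drive, unless the queue is weight-raised; otherwise the measured think time decides. A standalone sketch of that decision for a sync queue, with boolean inputs standing in for the bfqq/cic state:

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether a sync queue keeps its idle window, following the
 * structure of bfq_update_idle_window(); all inputs are illustrative.
 */
static bool keep_idle_window(bool task_alive, unsigned long slice_idle,
			     bool hw_tag, bool seeky, bool weight_raised,
			     bool ttime_valid, unsigned long ttime_mean,
			     bool currently_enabled)
{
	if (!task_alive || slice_idle == 0 ||
	    (hw_tag && seeky && !weight_raised))
		return false;
	if (ttime_valid)
		return !(ttime_mean > slice_idle && !weight_raised);
	return currently_enabled;	/* not enough samples: leave as is */
}

int main(void)
{
	/* a seeky queue on an NCQ drive loses idling unless weight-raised */
	printf("%d\n", keep_idle_window(true, 8, true, true, false,
					true, 4, true));
	return 0;
}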
- */ - if (bfqq->queued[rq_is_sync(rq)] == 1 && - blk_rq_sectors(rq) < 32) { - return; - } - if (bfq_bfqq_wait_request(bfqq)) { - /* - * If we are waiting for a request for this queue, let - * it rip immediately and flag that we must not expire - * this queue just now. - */ - bfq_clear_bfqq_wait_request(bfqq); - del_timer(&bfqd->idle_slice_timer); - /* - * Here we can safely expire the queue, in - * case of budget timeout, without wasting - * guarantees - */ - if (bfq_bfqq_budget_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, 0, - BFQ_BFQQ_BUDGET_TIMEOUT); - __blk_run_queue(bfqd->queue); - } - } -} - -static void bfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - assert_spin_locked(bfqd->queue->queue_lock); - bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); - - bfq_add_rq_rb(rq); - - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -} - -static void bfq_update_hw_tag(struct bfq_data *bfqd) -{ - bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, - bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; - - /* - * This sample is valid if the number of outstanding requests - * is large enough to allow a queueing behavior. Note that the - * sum is not exact, as it's not taking into account deactivated - * requests. - */ - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) - return; - - bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; - bfqd->max_rq_in_driver = 0; - bfqd->hw_tag_samples = 0; -} - -static void bfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - - bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", - blk_rq_sectors(rq), sync); - - bfq_update_hw_tag(bfqd); - - WARN_ON(!bfqd->rq_in_driver); - WARN_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; - - if (bfq_bfqq_sync(bfqq)) - bfqd->sync_flight--; - - if (sync) - RQ_CIC(rq)->last_end_request = jiffies; - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->active_queue == bfqq) { - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - - /* Idling is disabled also for cooperation issues: - * 1) there is a close cooperator for the queue, or - * 2) the queue is shared and some cooperator is likely - * to be idle (in this case, by not arming the idle timer, - * we try to slow down the queue, to prevent the zones - * of the disk accessed by the active cooperators to become - * too distant from the zone that will be accessed by the - * currently idle cooperators) - */ - if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); - else if (sync && - (bfqd->rq_in_driver == 0 || - bfqq->raising_coeff > 1) - && RB_EMPTY_ROOT(&bfqq->sort_list) - && !bfq_close_cooperator(bfqd, bfqq) - && (!bfq_bfqq_coop(bfqq) || - !bfq_bfqq_some_coop_idle(bfqq))) - bfq_arm_slice_timer(bfqd); - } - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -} - -/* - * We temporarily boost lower priority queues if they are holding fs exclusive - * resources. They are boosted to normal prio (CLASS_BE/4). 
- */ -static void bfq_prio_boost(struct bfq_queue *bfqq) -{ - if (has_fs_excl()) { - /* - * Boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (bfq_class_idle(bfqq)) - bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; - if (bfqq->entity.new_ioprio > IOPRIO_NORM) - bfqq->entity.new_ioprio = IOPRIO_NORM; - } else { - /* - * Unboost the queue (if needed) - */ - bfqq->entity.new_ioprio_class = bfqq->org_ioprio_class; - bfqq->entity.new_ioprio = bfqq->org_ioprio; - } -} - -static inline int __bfq_may_queue(struct bfq_queue *bfqq) -{ - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int bfq_may_queue(struct request_queue *q, int rw) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct cfq_io_context *cic; - struct bfq_queue *bfqq; - - /* - * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * So just lookup a possibly existing queue, or return 'may queue' - * if that fails. - */ - cic = bfq_cic_lookup(bfqd, tsk->io_context); - if (cic == NULL) - return ELV_MQUEUE_MAY; - - bfqq = cic_to_bfqq(cic, rw_is_sync(rw)); - if (bfqq != NULL) { - bfq_init_prio_data(bfqq, cic->ioc); - bfq_prio_boost(bfqq); - - return __bfq_may_queue(bfqq); - } - - return ELV_MQUEUE_MAY; -} - -/* - * Queue lock held here. - */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - if (bfqq != NULL) { - const int rw = rq_data_dir(rq); - - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; - - put_io_context(RQ_CIC(rq)->ioc); - - rq->elevator_private[0] = NULL; - rq->elevator_private[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -static struct bfq_queue * -bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic, - struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (long unsigned)bfqq->new_bfqq->pid); - cic_set_bfqq(cic, bfqq->new_bfqq, 1); - bfq_mark_bfqq_coop(bfqq->new_bfqq); - bfq_put_queue(bfqq); - return cic_to_bfqq(cic, 1); -} - -/* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this - * was the last process referring to said bfqq. - */ -static struct bfq_queue * -bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq) -{ - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_some_coop_idle(bfqq); - bfq_clear_bfqq_coop(bfqq); - bfq_clear_bfqq_split_coop(bfqq); - return bfqq; - } - - cic_set_bfqq(cic, NULL, 1); - - bfq_put_cooperator(bfqq); - - bfq_put_queue(bfqq); - return NULL; -} - -/* - * Allocate bfq data structures associated with this request. 
- */ -static int bfq_set_request(struct request_queue *q, struct request *rq, - gfp_t gfp_mask) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; - const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - struct bfq_group *bfqg; - unsigned long flags; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - cic = bfq_get_io_context(bfqd, gfp_mask); - - spin_lock_irqsave(q->queue_lock, flags); - - if (cic == NULL) - goto queue_fail; - - bfqg = bfq_cic_update_cgroup(cic); - -new_queue: - bfqq = cic_to_bfqq(cic, is_sync); - if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask); - cic_set_bfqq(cic, bfqq, is_sync); - } else { - /* - * If the queue was seeky for too long, break it apart. - */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - bfqq = bfq_split_bfqq(cic, bfqq); - if (!bfqq) - goto new_queue; - } - - /* - * Check to see if this queue is scheduled to merge with - * another closely cooperating queue. The merging of queues - * happens here as it must be done in process context. - * The reference on new_bfqq was taken in merge_bfqqs. - */ - if (bfqq->new_bfqq != NULL) - bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq); - } - - bfqq->allocated[rw]++; - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, - atomic_read(&bfqq->ref)); - - spin_unlock_irqrestore(q->queue_lock, flags); - - rq->elevator_private[0] = cic; - rq->elevator_private[1] = bfqq; - - return 0; - -queue_fail: - if (cic != NULL) - put_io_context(cic->ioc); - - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 1; -} - -static void bfq_kick_queue(struct work_struct *work) -{ - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/* - * Handler of the expiration of the timer running if the active_queue - * is idling inside its time slice. - */ -static void bfq_idle_slice_timer(unsigned long data) -{ - struct bfq_data *bfqd = (struct bfq_data *)data; - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); - - bfqq = bfqd->active_queue; - /* - * Theoretical race here: active_queue can be NULL or different - * from the queue that was idling if the timer handler spins on - * the queue_lock and a new request arrives for the current - * queue and there is a full dispatch cycle that changes the - * active_queue. This can hardly happen, but in the worst case - * we just expire a queue too early. 
- */ - if (bfqq != NULL) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired - * for budget timeout without wasting - * guarantees - */ - reason = BFQ_BFQQ_BUDGET_TIMEOUT; - else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) - /* - * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the first - * request of the active queue arrives during - * disk idling - */ - reason = BFQ_BFQQ_TOO_IDLE; - else - goto schedule_dispatch; - - bfq_bfqq_expire(bfqd, bfqq, 1, reason); - } - -schedule_dispatch: - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -} - -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - del_timer_sync(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); -} - -static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) -{ - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq != NULL) { - bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -} - -/* - * Release all the bfqg references to its async queues. If we are - * deallocating the group these queues may still contain requests, so - * we reparent them to the root cgroup (i.e., the only one that will - * exist for sure untill all the requests on a device are gone). - */ -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -{ - int i, j; - - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -} - -static void bfq_exit_queue(struct elevator_queue *e) -{ - struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - struct cfq_io_context *cic; - - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); - - while (!list_empty(&bfqd->cic_list)) { - cic = list_entry(bfqd->cic_list.next, struct cfq_io_context, - queue_list); - __bfq_exit_single_io_context(bfqd, cic); - } - - BUG_ON(bfqd->active_queue != NULL); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, 0); - - bfq_disconnect_groups(bfqd); - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - - spin_lock(&cic_index_lock); - ida_remove(&cic_index_ida, bfqd->cic_index); - spin_unlock(&cic_index_lock); - - /* Wait for cic->key accessors to exit their grace periods. 
*/ - synchronize_rcu(); - - BUG_ON(timer_pending(&bfqd->idle_slice_timer)); - - bfq_free_root_group(bfqd); - kfree(bfqd); -} - -static int bfq_alloc_cic_index(void) -{ - int index, error; - - do { - if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&cic_index_lock); - error = ida_get_new(&cic_index_ida, &index); - spin_unlock(&cic_index_lock); - if (error && error != -EAGAIN) - return error; - } while (error); - - return index; -} - -static void *bfq_init_queue(struct request_queue *q) -{ - struct bfq_group *bfqg; - struct bfq_data *bfqd; - int i; - - i = bfq_alloc_cic_index(); - if (i < 0) - return NULL; - - bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (bfqd == NULL) - return NULL; - - bfqd->cic_index = i; - - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. - */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); - atomic_inc(&bfqd->oom_bfqq.ref); - - INIT_LIST_HEAD(&bfqd->cic_list); - - bfqd->queue = q; - - bfqg = bfq_alloc_root_group(bfqd, q->node); - if (bfqg == NULL) { - kfree(bfqd); - return NULL; - } - - bfqd->root_group = bfqg; - - init_timer(&bfqd->idle_slice_timer); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; - bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - bfqd->rq_pos_tree = RB_ROOT; - - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - - bfqd->hw_tag = -1; - - bfqd->bfq_max_budget = bfq_default_max_budget; - - bfqd->bfq_quantum = bfq_quantum; - bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; - bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; - bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - - bfqd->low_latency = true; - - bfqd->bfq_raising_coeff = 20; - bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_raising_max_time = 0; - bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_raising_max_softrt_rate = 7000; - - /* Initially estimate the device's peak rate as the reference rate */ - if (blk_queue_nonrot(bfqd->queue)) { - bfqd->RT_prod = R_nonrot * T_nonrot; - bfqd->peak_rate = R_nonrot; - } else { - bfqd->RT_prod = R_rot * T_rot; - bfqd->peak_rate = R_rot; - } - - return bfqd; -} - -static void bfq_slab_kill(void) -{ - if (bfq_pool != NULL) - kmem_cache_destroy(bfq_pool); - if (bfq_ioc_pool != NULL) - kmem_cache_destroy(bfq_ioc_pool); -} - -static int __init bfq_slab_setup(void) -{ - bfq_pool = KMEM_CACHE(bfq_queue, 0); - if (bfq_pool == NULL) - goto fail; - - bfq_ioc_pool = kmem_cache_create("bfq_io_context", - sizeof(struct cfq_io_context), - __alignof__(struct cfq_io_context), - 0, NULL); - if (bfq_ioc_pool == NULL) - goto fail; - - return 0; -fail: - bfq_slab_kill(); - return -ENOMEM; -} - -static ssize_t bfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) -{ - unsigned long new_val; - int ret = strict_strtoul(page, 10, &new_val); - - if (ret == 0) - *var = new_val; - - return count; -} - -static ssize_t 
bfq_raising_max_time_show(struct elevator_queue *e, char *page) -{ - struct bfq_data *bfqd = e->elevator_data; - return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? - bfqd->bfq_raising_max_time : - bfq_wrais_duration(bfqd)); -} - -static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -{ - struct bfq_queue *bfqq; - struct bfq_data *bfqd = e->elevator_data; - ssize_t num_char = 0; - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time)); - } - num_char += sprintf(page + num_char, "Idle:\n"); - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { - num_char += sprintf(page + num_char, - "pid%d: weight %hu, dur %d/%u\n", - bfqq->pid, - bfqq->entity.weight, - jiffies_to_msecs(jiffies - - bfqq->last_rais_start_finish), - jiffies_to_msecs(bfqq->raising_cur_max_time)); - } - return num_char; -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return bfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); -SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); -SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); -SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); -SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, - 1); -SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, - bfqd->bfq_raising_min_inter_arr_async, - 1); -SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, - bfqd->bfq_raising_max_softrt_rate, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t \ -__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct bfq_data *bfqd = e->elevator_data; \ - unsigned long __data; \ - int ret = bfq_var_store(&__data, (page), count); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return ret; \ -} -STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); -STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, - INT_MAX, 1); -STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_slice_idle_store, 
&bfqd->bfq_slice_idle, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, - 1, INT_MAX, 0); -STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, - INT_MAX, 0); -STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, - INT_MAX, 1); -STORE_FUNCTION(bfq_raising_min_idle_time_store, - &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, - &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); -STORE_FUNCTION(bfq_raising_max_softrt_rate_store, - &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); -#undef STORE_FUNCTION - -/* do nothing for the moment */ -static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -{ - return count; -} - -static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); - - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd->peak_rate, timeout); - else - return bfq_default_max_budget; -} - -static ssize_t bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; - bfqd->bfq_max_budget = __data; - } - - bfqd->bfq_user_max_budget = __data; - - return ret; -} - -static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data < 1) - __data = 1; - else if (__data > INT_MAX) - __data = INT_MAX; - - bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); - - return ret; -} - -static ssize_t bfq_low_latency_store(struct elevator_queue *e, - const char *page, size_t count) -{ - struct bfq_data *bfqd = e->elevator_data; - unsigned long __data; - int ret = bfq_var_store(&__data, (page), count); - - if (__data > 1) - __data = 1; - bfqd->low_latency = __data; - - return ret; -} - -#define BFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) - -static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(quantum), - BFQ_ATTR(fifo_expire_sync), - BFQ_ATTR(fifo_expire_async), - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), - BFQ_ATTR(max_budget), - BFQ_ATTR(max_budget_async_rq), - BFQ_ATTR(timeout_sync), - BFQ_ATTR(timeout_async), - BFQ_ATTR(low_latency), - BFQ_ATTR(raising_coeff), - BFQ_ATTR(raising_max_time), - BFQ_ATTR(raising_rt_max_time), - BFQ_ATTR(raising_min_idle_time), - BFQ_ATTR(raising_min_inter_arr_async), - BFQ_ATTR(raising_max_softrt_rate), - BFQ_ATTR(weights), - __ATTR_NULL -}; - -static struct elevator_type iosched_bfq = { - .ops = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, - .elevator_allow_merge_fn = bfq_allow_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - 
.elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, - .trim = bfq_free_io_context, - }, - .elevator_attrs = bfq_attrs, - .elevator_name = "bfq", - .elevator_owner = THIS_MODULE, -}; - -static int __init bfq_init(void) -{ - /* - * Can be 0 on HZ < 1000 setups. - */ - //if (bfq_slice_idle == 0) - // bfq_slice_idle = 1; - - if (bfq_timeout_async == 0) - bfq_timeout_async = 1; - - if (bfq_slab_setup()) - return -ENOMEM; - - elv_register(&iosched_bfq); - - return 0; -} - -static void __exit bfq_exit(void) -{ - DECLARE_COMPLETION_ONSTACK(all_gone); - elv_unregister(&iosched_bfq); - bfq_ioc_gone = &all_gone; - /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */ - smp_wmb(); - if (elv_ioc_count_read(bfq_ioc_count) != 0) - wait_for_completion(&all_gone); - ida_destroy(&cic_index_ida); - bfq_slab_kill(); -} - -module_init(bfq_init); -module_exit(bfq_exit); - -MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c deleted file mode 100644 index fd50b7fd1..000000000 --- a/block/bfq-sched.c +++ /dev/null @@ -1,1066 +0,0 @@ -/* - * BFQ: Hierarchical B-WF2Q+ scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -#ifdef CONFIG_CGROUP_BFQIO -#define for_each_entity(entity) \ - for (; entity != NULL; entity = entity->parent) - -#define for_each_entity_safe(entity, parent) \ - for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd); - -static inline void bfq_update_budget(struct bfq_entity *next_active) -{ - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; - - BUG_ON(next_active == NULL); - - group_sd = next_active->sched_data; - - bfqg = container_of(group_sd, struct bfq_group, sched_data); - /* - * bfq_group's my_entity field is not NULL only if the group - * is not the root group. We must not touch the root entity - * as it must never become an active entity. - */ - bfqg_entity = bfqg->my_entity; - if (bfqg_entity != NULL) - bfqg_entity->budget = next_active->budget; -} - -static int bfq_update_next_active(struct bfq_sched_data *sd) -{ - struct bfq_entity *next_active; - - if (sd->active_entity != NULL) - /* will update/requeue at the end of service */ - return 0; - - /* - * NOTE: this can be improved in many ways, such as returning - * 1 (and thus propagating upwards the update) only when the - * budget changes, or caching the bfqq that will be scheduled - * next from this subtree. By now we worry more about - * correctness than about performance... 
- */ - next_active = bfq_lookup_next_entity(sd, 0, NULL); - sd->next_active = next_active; - - if (next_active != NULL) - bfq_update_budget(next_active); - - return 1; -} - -static inline void bfq_check_next_active(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ - BUG_ON(sd->next_active != entity); -} -#else -#define for_each_entity(entity) \ - for (; entity != NULL; entity = NULL) - -#define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity != NULL; entity = parent) - -static inline int bfq_update_next_active(struct bfq_sched_data *sd) -{ - return 0; -} - -static inline void bfq_check_next_active(struct bfq_sched_data *sd, - struct bfq_entity *entity) -{ -} - -static inline void bfq_update_budget(struct bfq_entity *next_active) -{ -} -#endif - -/* - * Shift for timestamp calculations. This actually limits the maximum - * service allowed in one timestamp delta (small shift values increase it), - * the maximum total weight that can be used for the queues in the system - * (big shift values increase it), and the period of virtual time wraparounds. - */ -#define WFQ_SERVICE_SHIFT 22 - -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static inline int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - -static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = NULL; - - BUG_ON(entity == NULL); - - if (entity->my_sched_data == NULL) - bfqq = container_of(entity, struct bfq_queue, entity); - - return bfqq; -} - - -/** - * bfq_delta - map service into the virtual time domain. - * @service: amount of service. - * @weight: scale factor (weight of an entity or weight sum). - */ -static inline u64 bfq_delta(unsigned long service, - unsigned long weight) -{ - u64 d = (u64)service << WFQ_SERVICE_SHIFT; - - do_div(d, weight); - return d; -} - -/** - * bfq_calc_finish - assign the finish time to an entity. - * @entity: the entity to act upon. - * @service: the service to be charged to the entity. - */ -static inline void bfq_calc_finish(struct bfq_entity *entity, - unsigned long service) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - - if (bfqq != NULL) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - entity->start, entity->finish, - bfq_delta(service, entity->weight)); - } -} - -/** - * bfq_entity_of - get an entity from a node. - * @node: the node field of the entity. - * - * Convert a node pointer to the relative entity. This is used only - * to simplify the logic of some functions and not as the generic - * conversion mechanism because, e.g., in the tree walking functions, - * the check for a %NULL value would be redundant. - */ -static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) -{ - struct bfq_entity *entity = NULL; - - if (node != NULL) - entity = rb_entry(node, struct bfq_entity, rb_node); - - return entity; -} - -/** - * bfq_extract - remove an entity from a tree. - * @root: the tree root. - * @entity: the entity to remove. 
- */ -static inline void bfq_extract(struct rb_root *root, - struct bfq_entity *entity) -{ - BUG_ON(entity->tree != root); - - entity->tree = NULL; - rb_erase(&entity->rb_node, root); -} - -/** - * bfq_idle_extract - extract an entity from the idle tree. - * @st: the service tree of the owning @entity. - * @entity: the entity being removed. - */ -static void bfq_idle_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *next; - - BUG_ON(entity->tree != &st->idle); - - if (entity == st->first_idle) { - next = rb_next(&entity->rb_node); - st->first_idle = bfq_entity_of(next); - } - - if (entity == st->last_idle) { - next = rb_prev(&entity->rb_node); - st->last_idle = bfq_entity_of(next); - } - - bfq_extract(&st->idle, entity); - - if (bfqq != NULL) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_insert - generic tree insertion. - * @root: tree root. - * @entity: entity to insert. - * - * This is used for the idle and the active tree, since they are both - * ordered by finish time. - */ -static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -{ - struct bfq_entity *entry; - struct rb_node **node = &root->rb_node; - struct rb_node *parent = NULL; - - BUG_ON(entity->tree != NULL); - - while (*node != NULL) { - parent = *node; - entry = rb_entry(parent, struct bfq_entity, rb_node); - - if (bfq_gt(entry->finish, entity->finish)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&entity->rb_node, parent, node); - rb_insert_color(&entity->rb_node, root); - - entity->tree = root; -} - -/** - * bfq_update_min - update the min_start field of a entity. - * @entity: the entity to update. - * @node: one of its children. - * - * This function is called when @entity may store an invalid value for - * min_start due to updates to the active tree. The function assumes - * that the subtree rooted at @node (which may be its left or its right - * child) has a valid min_start value. - */ -static inline void bfq_update_min(struct bfq_entity *entity, - struct rb_node *node) -{ - struct bfq_entity *child; - - if (node != NULL) { - child = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entity->min_start, child->min_start)) - entity->min_start = child->min_start; - } -} - -/** - * bfq_update_active_node - recalculate min_start. - * @node: the node to update. - * - * @node may have changed position or one of its children may have moved, - * this function updates its min_start value. The left and right subtrees - * are assumed to hold a correct min_start value. - */ -static inline void bfq_update_active_node(struct rb_node *node) -{ - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); -} - -/** - * bfq_update_active_tree - update min_start for the whole active tree. - * @node: the starting node. - * - * @node must be the deepest modified node after an update. This function - * updates its min_start using the values held by its children, assuming - * that they did not change, and then updates all the nodes that may have - * changed in the path to the root. The only nodes that may have changed - * are the ones in the path or their siblings. 
- */ -static void bfq_update_active_tree(struct rb_node *node) -{ - struct rb_node *parent; - -up: - bfq_update_active_node(node); - - parent = rb_parent(node); - if (parent == NULL) - return; - - if (node == parent->rb_left && parent->rb_right != NULL) - bfq_update_active_node(parent->rb_right); - else if (parent->rb_left != NULL) - bfq_update_active_node(parent->rb_left); - - node = parent; - goto up; -} - -/** - * bfq_active_insert - insert an entity in the active tree of its group/device. - * @st: the service tree of the entity. - * @entity: the entity being inserted. - * - * The active tree is ordered by finish time, but an extra key is kept - * per each node, containing the minimum value for the start times of - * its children (and the node itself), so it's possible to search for - * the eligible node with the lowest finish time in logarithmic time. - */ -static void bfq_active_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; - - bfq_insert(&st->active, entity); - - if (node->rb_left != NULL) - node = node->rb_left; - else if (node->rb_right != NULL) - node = node->rb_right; - - bfq_update_active_tree(node); - - if (bfqq != NULL) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -} - -/** - * bfq_ioprio_to_weight - calc a weight from an ioprio. - * @ioprio: the ioprio value to convert. - */ -static unsigned short bfq_ioprio_to_weight(int ioprio) -{ - WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); - return IOPRIO_BE_NR - ioprio; -} - -/** - * bfq_weight_to_ioprio - calc an ioprio from a weight. - * @weight: the weight value to convert. - * - * To preserve as mush as possible the old only-ioprio user interface, - * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR - */ -static unsigned short bfq_weight_to_ioprio(int weight) -{ - WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); - return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; -} - -static inline void bfq_get_entity(struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq != NULL) { - atomic_inc(&bfqq->ref); - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - } -} - -/** - * bfq_find_deepest - find the deepest node that an extraction can modify. - * @node: the node being removed. - * - * Do the first step of an extraction in an rb tree, looking for the - * node that will replace @node, and returning the deepest node that - * the following modifications to the tree can touch. If @node is the - * last node in the tree return %NULL. - */ -static struct rb_node *bfq_find_deepest(struct rb_node *node) -{ - struct rb_node *deepest; - - if (node->rb_right == NULL && node->rb_left == NULL) - deepest = rb_parent(node); - else if (node->rb_right == NULL) - deepest = node->rb_left; - else if (node->rb_left == NULL) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right != NULL) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} - -/** - * bfq_active_extract - remove an entity from the active tree. - * @st: the service_tree containing the tree. - * @entity: the entity being removed. 
- */ -static void bfq_active_extract(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; - - node = bfq_find_deepest(&entity->rb_node); - bfq_extract(&st->active, entity); - - if (node != NULL) - bfq_update_active_tree(node); - - if (bfqq != NULL) - list_del(&bfqq->bfqq_list); -} - -/** - * bfq_idle_insert - insert an entity into the idle tree. - * @st: the service tree containing the tree. - * @entity: the entity to insert. - */ -static void bfq_idle_insert(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) - st->first_idle = entity; - if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) - st->last_idle = entity; - - bfq_insert(&st->idle, entity); - - if (bfqq != NULL) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -} - -/** - * bfq_forget_entity - remove an entity from the wfq trees. - * @st: the service tree. - * @entity: the entity being removed. - * - * Update the device status and forget everything about @entity, putting - * the device reference to it, if it is a queue. Entities belonging to - * groups are not refcounted. - */ -static void bfq_forget_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!entity->on_st); - - entity->on_st = 0; - st->wsum -= entity->weight; - if (bfqq != NULL) { - bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", - bfqq, atomic_read(&bfqq->ref)); - bfq_put_queue(bfqq); - } -} - -/** - * bfq_put_idle_entity - release the idle tree ref of an entity. - * @st: service tree for the entity. - * @entity: the entity being released. - */ -static void bfq_put_idle_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) -{ - bfq_idle_extract(st, entity); - bfq_forget_entity(st, entity); -} - -/** - * bfq_forget_idle - update the idle tree if necessary. - * @st: the service tree to act upon. - * - * To preserve the global O(log N) complexity we only remove one entry here; - * as the idle tree will not grow indefinitely this can be done safely. - */ -static void bfq_forget_idle(struct bfq_service_tree *st) -{ - struct bfq_entity *first_idle = st->first_idle; - struct bfq_entity *last_idle = st->last_idle; - - if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && - !bfq_gt(last_idle->finish, st->vtime)) { - /* - * Forget the whole idle tree, increasing the vtime past - * the last finish time of idle entities. 
- */ - st->vtime = last_idle->finish; - } - - if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) - bfq_put_idle_entity(st, first_idle); -} - -static struct bfq_service_tree * -__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity) -{ - struct bfq_service_tree *new_st = old_st; - - if (entity->ioprio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - - if (entity->new_weight != entity->orig_weight) { - entity->orig_weight = entity->new_weight; - entity->ioprio = - bfq_weight_to_ioprio(entity->orig_weight); - } else if (entity->new_ioprio != entity->ioprio) { - entity->ioprio = entity->new_ioprio; - entity->orig_weight = - bfq_ioprio_to_weight(entity->ioprio); - } else - entity->new_weight = entity->orig_weight = - bfq_ioprio_to_weight(entity->ioprio); - - entity->ioprio_class = entity->new_ioprio_class; - entity->ioprio_changed = 0; - - /* - * NOTE: here we may be changing the weight too early, - * this will cause unfairness. The correct approach - * would have required additional complexity to defer - * weight changes to the proper time instants (i.e., - * when entity->finish <= old_st->vtime). - */ - new_st = bfq_entity_service_tree(entity); - entity->weight = entity->orig_weight * - (bfqq != NULL ? bfqq->raising_coeff : 1); - new_st->wsum += entity->weight; - - if (new_st != old_st) - entity->start = new_st->vtime; - } - - return new_st; -} - -/** - * bfq_bfqq_served - update the scheduler status after selection for service. - * @bfqq: the queue being served. - * @served: bytes to transfer. - * - * NOTE: this can be optimized, as the timestamps of upper level entities - * are synchronized every time a new bfqq is selected for service. By now, - * we keep it to better check consistency. - */ -static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) -{ - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - - entity->service += served; - BUG_ON(entity->service > entity->budget); - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); -} - -/** - * bfq_bfqq_charge_full_budget - set the service to the entity budget. - * @bfqq: the queue that needs a service update. - * - * When it's not possible to be fair in the service domain, because - * a queue is not consuming its budget fast enough (the meaning of - * fast depends on the timeout parameter), we charge it a full - * budget. In this way we should obtain a sort of time-domain - * fairness among all the seeky/slow queues. - */ -static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); - - bfq_bfqq_served(bfqq, entity->budget - entity->service); -} - -/** - * __bfq_activate_entity - activate an entity. - * @entity: the entity being activated. - * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. 
- */ -static void __bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - - if (entity == sd->active_entity) { - BUG_ON(entity->tree != NULL); - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->active_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_active entity below it. We reuse the old - * start time. - */ - bfq_active_extract(st, entity); - } else if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(st->vtime, entity->finish) ? - st->vtime : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = st->vtime; - st->wsum += entity->weight; - bfq_get_entity(entity); - - BUG_ON(entity->on_st); - entity->on_st = 1; - } - - st = __bfq_entity_update_weight_prio(st, entity); - bfq_calc_finish(entity, entity->budget); - bfq_active_insert(st, entity); -} - -/** - * bfq_activate_entity - activate an entity and its ancestors if necessary. - * @entity: the entity to activate. - * - * Activate @entity and all the entities on the path from it to the root. - */ -static void bfq_activate_entity(struct bfq_entity *entity) -{ - struct bfq_sched_data *sd; - - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_active(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the active entity is rescheduled. - */ - break; - } -} - -/** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. - * @requeue: if false, the entity will not be put into the idle tree. - * - * Deactivate an entity, independently from its previous state. If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. - * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was under service or if it was the next_active for - * its sched_data; return %0 otherwise. 
- */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - int was_active = entity == sd->active_entity; - int ret = 0; - - if (!entity->on_st) - return 0; - - BUG_ON(was_active && entity->tree != NULL); - - if (was_active) { - bfq_calc_finish(entity, entity->service); - sd->active_entity = NULL; - } else if (entity->tree == &st->active) - bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree != NULL) - BUG(); - - if (was_active || sd->next_active == entity) - ret = bfq_update_next_active(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity); - else - bfq_idle_insert(st, entity); - - BUG_ON(sd->active_entity == entity); - BUG_ON(sd->next_active == entity); - - return ret; -} - -/** - * bfq_deactivate_entity - deactivate an entity. - * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree - */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -{ - struct bfq_sched_data *sd; - struct bfq_entity *parent; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - - if (!__bfq_deactivate_entity(entity, requeue)) - /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * under service. - */ - break; - - if (sd->next_active != NULL) - /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. - */ - goto update; - - /* - * If we reach there the parent is no more backlogged and - * we want to propagate the dequeue upwards. - */ - requeue = 1; - } - - return; - -update: - entity = parent; - for_each_entity(entity) { - __bfq_activate_entity(entity); - - sd = entity->sched_data; - if (!bfq_update_next_active(sd)) - break; - } -} - -/** - * bfq_update_vtime - update vtime if necessary. - * @st: the service tree to act upon. - * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated tasks getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. - */ -static void bfq_update_vtime(struct bfq_service_tree *st) -{ - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; - - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; - bfq_forget_idle(st); - } -} - -/** - * bfq_first_active - find the eligible entity with the smallest finish time - * @st: the service tree to select from. - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path - * on the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. 
- */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -{ - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; - - while (node != NULL) { - entry = rb_entry(node, struct bfq_entity, rb_node); -left: - if (!bfq_gt(entry->start, st->vtime)) - first = entry; - - BUG_ON(bfq_gt(entry->min_start, st->vtime)); - - if (node->rb_left != NULL) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { - node = node->rb_left; - goto left; - } - } - if (first != NULL) - break; - node = node->rb_right; - } - - BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); - return first; -} - -/** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * - * Update the virtual time in @st and return the first eligible entity - * it contains. - */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) -{ - struct bfq_entity *entity, *new_next_active = NULL; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); - BUG_ON(bfq_gt(entity->start, st->vtime)); - - /* - * If the chosen entity does not match with the sched_data's - * next_active and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. - */ - if (unlikely(force && entity != entity->sched_data->next_active)) { - new_next_active = entity; - for_each_entity(new_next_active) - bfq_update_budget(new_next_active); - } - - return entity; -} - -/** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. - * - * NOTE: since we cache the next_active entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_active value; - * we prefer to do full lookups to test the consistency of * the data - * structures. - */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) -{ - struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i=0; - - BUG_ON(sd->active_entity != NULL); - - if (bfqd != NULL && - jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); - if (entity != NULL) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_active = entity; - } - } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity != NULL) { - if (extract) { - bfq_check_next_active(sd, entity); - bfq_active_extract(st + i, entity); - sd->active_entity = entity; - sd->next_active = NULL; - } - break; - } - } - - return entity; -} - -/* - * Get next queue for service. - */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -{ - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; - - BUG_ON(bfqd->active_queue != NULL); - - if (bfqd->busy_queues == 0) - return NULL; - - sd = &bfqd->root_group->sched_data; - for (; sd != NULL; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - BUG_ON(entity == NULL); - entity->service = 0; - } - - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(bfqq == NULL); - - return bfqq; -} - -/* - * Forced extraction of the given queue. 
- */ -static void bfq_get_next_queue_forced(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity; - struct bfq_sched_data *sd; - - BUG_ON(bfqd->active_queue != NULL); - - entity = &bfqq->entity; - /* - * Bubble up extraction/update from the leaf to the root. - */ - for_each_entity(entity) { - sd = entity->sched_data; - bfq_update_budget(entity); - bfq_update_vtime(bfq_entity_service_tree(entity)); - bfq_active_extract(bfq_entity_service_tree(entity), entity); - sd->active_entity = entity; - sd->next_active = NULL; - entity->service = 0; - } - - return; -} - -static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) -{ - if (bfqd->active_cic != NULL) { - put_io_context(bfqd->active_cic->ioc); - bfqd->active_cic = NULL; - } - - bfqd->active_queue = NULL; - del_timer(&bfqd->idle_slice_timer); -} - -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq == bfqd->active_queue) - __bfq_bfqd_reset_active(bfqd); - - bfq_deactivate_entity(entity, requeue); -} - -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = &bfqq->entity; - - bfq_activate_entity(entity); -} - -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) -{ - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - - bfq_log_bfqq(bfqd, bfqq, "del from busy"); - - bfq_clear_bfqq_busy(bfqq); - - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - - bfq_deactivate_bfqq(bfqd, bfqq, requeue); -} - -/* - * Called when an inactive queue receives a new request. - */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -{ - BUG_ON(bfq_bfqq_busy(bfqq)); - BUG_ON(bfqq == bfqd->active_queue); - - bfq_log_bfqq(bfqd, bfqq, "add to busy"); - - bfq_activate_bfqq(bfqd, bfqq); - - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; -} diff --git a/block/bfq.h b/block/bfq.h deleted file mode 100644 index e2ce5052a..000000000 --- a/block/bfq.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * BFQ-v5 for 3.0: data structures and common functions prototypes. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - */ - -#ifndef _BFQ_H -#define _BFQ_H - -#include -#include -#include -#include - -#define BFQ_IOPRIO_CLASSES 3 -#define BFQ_CL_IDLE_TIMEOUT HZ/5 - -#define BFQ_MIN_WEIGHT 1 -#define BFQ_MAX_WEIGHT 1000 - -#define BFQ_DEFAULT_GRP_WEIGHT 10 -#define BFQ_DEFAULT_GRP_IOPRIO 0 -#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -struct bfq_entity; - -/** - * struct bfq_service_tree - per ioprio_class service tree. - * @active: tree for active entities (i.e., those backlogged). - * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). - * @first_idle: idle entity with minimum F_i. - * @last_idle: idle entity with maximum F_i. - * @vtime: scheduler virtual time. - * @wsum: scheduler weight sum; active and idle entities contribute to it. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own - * bfq_service_tree. All the fields are protected by the queue lock - * of the containing bfqd. 
- */ -struct bfq_service_tree { - struct rb_root active; - struct rb_root idle; - - struct bfq_entity *first_idle; - struct bfq_entity *last_idle; - - u64 vtime; - unsigned long wsum; -}; - -/** - * struct bfq_sched_data - multi-class scheduler. - * @active_entity: entity under service. - * @next_active: head-of-the-line entity in the scheduler. - * @service_tree: array of service trees, one per ioprio_class. - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_active points to the active entity of the sched_data service - * trees that will be scheduled next. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. - * All the fields are protected by the queue lock of the containing bfqd. - */ -struct bfq_sched_data { - struct bfq_entity *active_entity; - struct bfq_entity *next_active; - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -}; - -/** - * struct bfq_entity - schedulable entity. - * @rb_node: service_tree member. - * @on_st: flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). - * @finish: B-WF2Q+ finish timestamp (aka F_i). - * @start: B-WF2Q+ start timestamp (aka S_i). - * @tree: tree the entity is enqueued into; %NULL if not on a tree. - * @min_start: minimum start time of the (active) subtree rooted at - * this entity; used for O(log N) lookups into active trees. - * @service: service received during the last round of service. - * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. - * @weight: weight of the queue - * @parent: parent entity, for hierarchical scheduling. - * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the - * associated scheduler queue, %NULL on leaf nodes. - * @sched_data: the scheduler queue this entity belongs to. - * @ioprio: the ioprio in use. - * @new_weight: when a weight change is requested, the new weight value. - * @orig_weight: original weight, used to implement weight boosting - * @new_ioprio: when an ioprio change is requested, the new ioprio value. - * @ioprio_class: the ioprio_class in use. - * @new_ioprio_class: when an ioprio_class change is requested, the new - * ioprio_class value. - * @ioprio_changed: flag, true when the user requested a weight, ioprio or - * ioprio_class change. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each - * entity belongs to the sched_data of the parent group in the cgroup - * hierarchy. Non-leaf entities have also their own sched_data, stored - * in @my_sched_data. - * - * Each entity stores independently its priority values; this would - * allow different weights on different devices, but this - * functionality is not exported to userspace by now. Priorities and - * weights are updated lazily, first storing the new values into the - * new_* fields, then setting the @ioprio_changed flag. As soon as - * there is a transition in the entity state that allows the priority - * update to take place the effective and the requested priority - * values are synchronized. 
- * - * Unless cgroups are used, the weight value is calculated from the - * ioprio to export the same interface as CFQ. When dealing with - * ``well-behaved'' queues (i.e., queues that do not spend too much - * time to consume their budget and have true sequential behavior, and - * when there are no external factors breaking anticipation) the - * relative weights at each level of the cgroups hierarchy should be - * guaranteed. All the fields are protected by the queue lock of the - * containing bfqd. - */ -struct bfq_entity { - struct rb_node rb_node; - - int on_st; - - u64 finish; - u64 start; - - struct rb_root *tree; - - u64 min_start; - - unsigned long service, budget; - unsigned short weight, new_weight; - unsigned short orig_weight; - - struct bfq_entity *parent; - - struct bfq_sched_data *my_sched_data; - struct bfq_sched_data *sched_data; - - unsigned short ioprio, new_ioprio; - unsigned short ioprio_class, new_ioprio_class; - - int ioprio_changed; -}; - -struct bfq_group; - -/** - * struct bfq_queue - leaf schedulable entity. - * @ref: reference counter. - * @bfqd: parent bfq_data. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. - * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). - * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. - * @allocated: currently allocated requests. - * @meta_pending: pending metadata requests. - * @fifo: fifo list of requests in sort_list. - * @entity: entity representing this queue in the scheduler. - * @max_budget: maximum budget allowed from the feedback mechanism. - * @budget_timeout: budget expiration (in jiffies). - * @dispatched: number of requests on the dispatch list or inside driver. - * @org_ioprio: saved ioprio during boosted periods. - * @org_ioprio_class: saved ioprio_class during boosted periods. - * @flags: status flags. - * @bfqq_list: node for active/idle bfqq list inside our bfqd. - * @seek_samples: number of seeks sampled - * @seek_total: sum of the distances of the seeks sampled - * @seek_mean: mean seek distance - * @last_request_pos: position of the last request enqueued - * @pid: pid of the process owning the queue, used for logging purposes. - * @last_rais_start_time: last (idle -> weight-raised) transition attempt - * @raising_cur_max_time: current max raising time for this queue - * - * A bfq_queue is a leaf request queue; it can be associated to an io_context - * or more (if it is an async one). @cgroup holds a reference to the - * cgroup, to be sure that it does not disappear while a bfqq still - * references it (mostly to avoid races between request issuing and task - * migration followed by cgroup distruction). - * All the fields are protected by the queue lock of the containing bfqd. 
- */ -struct bfq_queue { - atomic_t ref; - struct bfq_data *bfqd; - - /* fields for cooperating queues handling */ - struct bfq_queue *new_bfqq; - struct rb_node pos_node; - struct rb_root *pos_root; - - struct rb_root sort_list; - struct request *next_rq; - int queued[2]; - int allocated[2]; - int meta_pending; - struct list_head fifo; - - struct bfq_entity entity; - - unsigned long max_budget; - unsigned long budget_timeout; - - int dispatched; - - unsigned short org_ioprio; - unsigned short org_ioprio_class; - - unsigned int flags; - - struct list_head bfqq_list; - - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - sector_t last_request_pos; - - pid_t pid; - - /* weight-raising fields */ - unsigned int raising_cur_max_time; - u64 last_rais_start_finish, soft_rt_next_start; - unsigned int raising_coeff; -}; - -/** - * struct bfq_data - per device data structure. - * @queue: request queue for the managed device. - * @root_group: root bfq_group for the device. - * @rq_pos_tree: rbtree sorted by next_request position, - * used when determining if two or more queues - * have interleaving requests (see bfq_close_cooperator). - * @busy_queues: number of bfq_queues containing requests (including the - * queue under service, even if it is idling). - * @queued: number of queued requests. - * @rq_in_driver: number of requests dispatched and waiting for completion. - * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples - * completed requests . - * @hw_tag_samples: nr of samples used to calculate hw_tag. - * @hw_tag: flag set to one if the driver is showing a queueing behavior. - * @budgets_assigned: number of budgets assigned. - * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue under service. - * @unplug_work: delayed work to restart dispatching on the request queue. - * @active_queue: bfq_queue under service. - * @active_cic: cfq_io_context (cic) associated with the @active_queue. - * @last_position: on-disk position of the last served request. - * @last_budget_start: beginning of the last budget. - * @last_idling_start: beginning of the last idle slice. - * @peak_rate: peak transfer rate observed for a budget. - * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. - * @cic_index: use small consequent indexes as radix tree keys to reduce depth - * @cic_list: list of all the cics active on the bfq_data device. - * @group_list: list of all the bfq_groups active on the device. - * @active_list: list of all the bfq_queues active on the device. - * @idle_list: list of all the bfq_queues idle on the device. - * @bfq_quantum: max number of requests dispatched per dispatch round. - * @bfq_fifo_expire: timeout for async/sync requests; when it expires - * requests are served in fifo order. - * @bfq_back_penalty: weight of backward seeks wrt forward ones. - * @bfq_back_max: maximum allowed backward seek. - * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). - * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to - * async queues. 
- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to - * to prevent seeky queues to impose long latencies to well - * behaved ones (this also implies that seeky queues cannot - * receive guarantees in the service domain; after a timeout - * they are charged for the whole allocated budget, to try - * to preserve a behavior reasonably fair among them, but - * without service-domain guarantees). - * @bfq_raising_coeff: Maximum factor by which the weight of a boosted - * queue is multiplied - * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) - * @bfq_raising_rt_max_time: maximum duration for soft real-time processes - * @bfq_raising_min_idle_time: minimum idle period after which weight-raising - * may be reactivated for a queue (in jiffies) - * @bfq_raising_min_inter_arr_async: minimum period between request arrivals - * after which weight-raising may be - * reactivated for an already busy queue - * (in jiffies) - * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, - * sectors per seconds - * @RT_prod: cached value of the product R*T used for computing the maximum - * duration of the weight raising automatically - * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions - * - * All the fields are protected by the @queue lock. - */ -struct bfq_data { - struct request_queue *queue; - - struct bfq_group *root_group; - - struct rb_root rq_pos_tree; - - int busy_queues; - int queued; - int rq_in_driver; - int sync_flight; - - int max_rq_in_driver; - int hw_tag_samples; - int hw_tag; - - int budgets_assigned; - - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct bfq_queue *active_queue; - struct cfq_io_context *active_cic; - - sector_t last_position; - - ktime_t last_budget_start; - ktime_t last_idling_start; - int peak_rate_samples; - u64 peak_rate; - unsigned long bfq_max_budget; - - unsigned int cic_index; - struct list_head cic_list; - struct hlist_head group_list; - struct list_head active_list; - struct list_head idle_list; - - unsigned int bfq_quantum; - unsigned int bfq_fifo_expire[2]; - unsigned int bfq_back_penalty; - unsigned int bfq_back_max; - unsigned int bfq_slice_idle; - u64 bfq_class_idle_last_service; - - unsigned int bfq_user_max_budget; - unsigned int bfq_max_budget_async_rq; - unsigned int bfq_timeout[2]; - - bool low_latency; - - /* parameters of the low_latency heuristics */ - unsigned int bfq_raising_coeff; - unsigned int bfq_raising_max_time; - unsigned int bfq_raising_rt_max_time; - unsigned int bfq_raising_min_idle_time; - unsigned int bfq_raising_min_inter_arr_async; - unsigned int bfq_raising_max_softrt_rate; - u64 RT_prod; - - struct bfq_queue oom_bfqq; -}; - -enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ -}; - -#define BFQ_BFQQ_FNS(name) \ -static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags |= (1 << 
BFQ_BFQQ_FLAG_##name); \ -} \ -static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -{ \ - (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -} \ -static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -{ \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -} - -BFQ_BFQQ_FNS(busy); -BFQ_BFQQ_FNS(wait_request); -BFQ_BFQQ_FNS(must_alloc); -BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); -BFQ_BFQQ_FNS(prio_changed); -BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); -BFQ_BFQQ_FNS(coop); -BFQ_BFQQ_FNS(split_coop); -BFQ_BFQQ_FNS(some_coop_idle); -#undef BFQ_BFQQ_FNS - -/* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) - -#define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) - -/* Expiration reasons. */ -enum bfqq_expiration { - BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -}; - -#ifdef CONFIG_CGROUP_BFQIO -/** - * struct bfq_group - per (device, cgroup) data structure. - * @entity: schedulable entity to insert into the parent group sched_data. - * @sched_data: own sched_data, to contain child entities (they may be - * both bfq_queues and bfq_groups). - * @group_node: node to be inserted into the bfqio_cgroup->group_data - * list of the containing cgroup's bfqio_cgroup. - * @bfqd_node: node to be inserted into the @bfqd->group_list list - * of the groups active on the same device; used for cleanup. - * @bfqd: the bfq_data for the device this group acts upon. - * @async_bfqq: array of async queues for all the tasks belonging to - * the group, one queue per ioprio value per ioprio_class, - * except for the idle class that has only one queue. - * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). - * @my_entity: pointer to @entity, %NULL for the toplevel group; used - * to avoid too many special cases during group creation/migration. - * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level - * entities belonging to the group that are acting on the same device. - * - * Locking works as follows: - * o @group_node is protected by the bfqio_cgroup lock, and is accessed - * via RCU from its readers. - * o @bfqd is protected by the queue lock, RCU is used to access it - * from the readers. - * o All the other fields are protected by the @bfqd queue lock. - */ -struct bfq_group { - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - struct hlist_node group_node; - struct hlist_node bfqd_node; - - void *bfqd; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; - - struct bfq_entity *my_entity; -}; - -/** - * struct bfqio_cgroup - bfq cgroup data structure. - * @css: subsystem state for bfq in the containing cgroup. - * @weight: cgroup weight. - * @ioprio: cgroup ioprio. - * @ioprio_class: cgroup ioprio_class. - * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. - * @group_data: list containing the bfq_group belonging to this cgroup. - * - * @group_data is accessed using RCU, with @lock protecting the updates, - * @ioprio and @ioprio_class are protected by @lock. 
- */ -struct bfqio_cgroup { - struct cgroup_subsys_state css; - - unsigned short weight, ioprio, ioprio_class; - - spinlock_t lock; - struct hlist_head group_data; -}; -#else -struct bfq_group { - struct bfq_sched_data sched_data; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -}; -#endif - -static inline struct bfq_service_tree * -bfq_entity_service_tree(struct bfq_entity *entity) -{ - struct bfq_sched_data *sched_data = entity->sched_data; - unsigned int idx = entity->ioprio_class - 1; - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - - return sched_data->service_tree + idx; -} - -static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, - int is_sync) -{ - return cic->cfqq[!!is_sync]; -} - -static inline void cic_set_bfqq(struct cfq_io_context *cic, - struct bfq_queue *bfqq, int is_sync) -{ - cic->cfqq[!!is_sync] = bfqq; -} - -static inline void call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, - struct cfq_io_context *)) -{ - struct cfq_io_context *cic; - struct hlist_node *n; - - rcu_read_lock(); - hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) - func(ioc, cic); - rcu_read_unlock(); -} - -#define CIC_DEAD_KEY 1ul -#define CIC_DEAD_INDEX_SHIFT 1 - -static inline void *bfqd_dead_key(struct bfq_data *bfqd) -{ - return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); -} - -/** - * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. - * @ptr: a pointer to a bfqd. - * @flags: storage for the flags to be saved. - * - * This function allows cic->key and bfqg->bfqd to be protected by the - * queue lock of the bfqd they reference; the pointer is dereferenced - * under RCU, so the storage for bfqd is assured to be safe as long - * as the RCU read side critical section does not end. After the - * bfqd->queue->queue_lock is taken the pointer is rechecked, to be - * sure that no other writer accessed it. If we raced with a writer, - * the function returns NULL, with the queue unlocked, otherwise it - * returns the dereferenced pointer, with the queue locked. 
- */ -static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, - unsigned long *flags) -{ - struct bfq_data *bfqd; - - rcu_read_lock(); - bfqd = rcu_dereference(*(struct bfq_data **)ptr); - - if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) { - spin_lock_irqsave(bfqd->queue->queue_lock, *flags); - if (*ptr == bfqd) - goto out; - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - - bfqd = NULL; -out: - rcu_read_unlock(); - return bfqd; -} - -static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, - unsigned long *flags) -{ - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -} - -static void bfq_changed_ioprio(struct io_context *ioc, - struct cfq_io_context *cic); -static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_group *bfqg, int is_sync, - struct io_context *ioc, gfp_t gfp_mask); -static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -#endif diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 0504f530f..342eae9b0 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include /* for max_pfn/max_low_pfn */ @@ -17,12 +16,13 @@ */ static struct kmem_cache *iocontext_cachep; -static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) +static void cfq_dtor(struct io_context *ioc) { - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->dtor(ioc); } } @@ -40,9 +40,7 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - - hlist_sched_dtor(ioc, &ioc->cic_list); - hlist_sched_dtor(ioc, &ioc->bfq_cic_list); + cfq_dtor(ioc); rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); @@ -52,14 +50,15 @@ int put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) +static void cfq_exit(struct io_context *ioc) { rcu_read_lock(); - if (!hlist_empty(list)) { + if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, + cic_list); cic->exit(ioc); } rcu_read_unlock(); @@ -75,10 +74,8 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); -if (atomic_dec_and_test(&ioc->nr_tasks)) { - hlist_sched_exit(ioc, &ioc->cic_list); - hlist_sched_exit(ioc, &ioc->bfq_cic_list); - } + if (atomic_dec_and_test(&ioc->nr_tasks)) + cfq_exit(ioc); put_io_context(ioc); } @@ -92,14 +89,12 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) atomic_long_set(&ret->refcount, 1); atomic_set(&ret->nr_tasks, 1); spin_lock_init(&ret->lock); - bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + ret->ioprio_changed = 0; ret->ioprio = 0; ret->last_waited = 0; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); - INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->bfq_cic_list); ret->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) ret->cgroup_changed = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b581793ec..ae21919f1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2919,6 +2919,7 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) static void cfq_ioc_set_ioprio(struct io_context *ioc) { call_for_each_cic(ioc, changed_ioprio); + ioc->ioprio_changed = 0; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -3203,13 +3204,8 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) goto err_free; out: - /* - * test_and_clear_bit() implies a memory barrier, paired with - * the wmb() in fs/ioprio.c, so the value seen for ioprio is the - * new one. - */ - if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED, - ioc->ioprio_changed))) + smp_read_barrier_depends(); + if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); #ifdef CONFIG_CFQ_GROUP_IOSCHED diff --git a/fs/ioprio.c b/fs/ioprio.c index 2fdc2a310..7da2a0650 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -30,7 +30,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) { - int err, i; + int err; struct io_context *ioc; const struct cred *cred = current_cred(), *tcred; @@ -60,15 +60,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) err = -ENOMEM; break; } - smp_wmb(); task->io_context = ioc; } while (1); if (!err) { ioc->ioprio = ioprio; - wmb(); - for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++) - set_bit(i, ioc->ioprio_changed); + ioc->ioprio_changed = 1; } task_unlock(task); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index c96663839..ac663c187 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -64,9 +64,3 @@ SUBSYS(perf) #endif /* */ - -#ifdef CONFIG_CGROUP_BFQIO -SUBSYS(bfqio) -#endif - -/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 5f5357748..b2eee896d 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -3,12 +3,12 @@ #include #include -#include +struct cfq_queue; struct cfq_io_context { void *key; - void *cfqq[2]; + struct cfq_queue *cfqq[2]; struct io_context *ioc; @@ -27,16 +27,6 @@ struct cfq_io_context { struct rcu_head rcu_head; }; -/* - * Indexes into the ioprio_changed bitmap. A bit set indicates that - * the corresponding I/O scheduler needs to see a ioprio update. - */ -enum { - IOC_CFQ_IOPRIO_CHANGED, - IOC_BFQ_IOPRIO_CHANGED, - IOC_IOPRIO_CHANGED_BITS -}; - /* * I/O subsystem state of the associated processes. It is refcounted * and kmalloc'ed. These could be shared between processes. @@ -49,7 +39,7 @@ struct io_context { spinlock_t lock; unsigned short ioprio; - DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + unsigned short ioprio_changed; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) unsigned short cgroup_changed; @@ -63,8 +53,6 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - struct radix_tree_root bfq_radix_root; - struct hlist_head bfq_cic_list; void __rcu *ioc_data; };