ext4 - mballoc块分配机制

这篇具有很好参考价值的文章主要介绍了ext4 - mballoc块分配机制。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

概述

ext4为了尽量避免block管理的碎片化有如此措施：

1.mballoc多块分配器。

buddy算法管理每个block group
采用prellocation机制，氛围per-cpu local preallocation和per inode preallocation
- 小文件和大文件采用不同的策略
- 小文件（具体怎么算小文件可配置）尽量保持在一起，默认应该是512 blocks的一块区域, 采用的是per_cpu locality group，为每个cpu都配置这么一块存放小文件的区域。
- 大文件采用per-inode preallocation方式。
block分配时，会比请求的分配数量更多，多余的空间会放入preallocation space，这样给write多留些空间，避免concurrent write时候碎片化。
计算目标goal phsycial block，尽量保持块分配的连续性。

2.delay allocation。

delay allocation可以尽可能将连续的申请组织成extent，配置mballoc一次分配连续的多个phsycial block，降低cpu使用率/碎片化。

3.data block优先和其inode在同一个block group中

4.磁盘分成128M的block group

5.同一个目录下的inode优先保存期该目录所在的block group（具体源码在哪里尚未找到，不太确认ext4是否实现）

6.defrag反碎片化工具。

ext4_mb_new_blocks

ext4 mballoc执行phsycial block分配的入口点是ext4_mb_new_blocks:


/*
 * Main entry point into mballoc to allocate blocks
 * it tries to use preallocation first, then falls back
 * to usual allocation
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	struct ext4_allocation_context *ac = NULL;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_clstrs = 0;
	u64 seq;

	might_sleep();
	sb = ar->inode->i_sb;
	sbi = EXT4_SB(sb);

	trace_ext4_request_blocks(ar);
    .../*主要是检查是否有足够的空间满足分配*/
      
    
    //创建一个allocation context
	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
	if (!ac) {
		ar->len = 0;
		*errp = -ENOMEM;
		goto out;
	}
    //初始化context
	*errp = ext4_mb_initialize_context(ac, ar);
	if (*errp) {
		ar->len = 0;
		goto out;
	}

	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
	seq = this_cpu_read(discard_pa_seq);
    //优先使用prellcation space分配
	if (!ext4_mb_use_preallocated(ac)) {
		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
        //所谓规范化本质是分配比请求量更大的空间
		ext4_mb_normalize_request(ac, ar);
        
        //初始化ac->pa
		*errp = ext4_mb_pa_alloc(ac);
		if (*errp)
			goto errout;
repeat:
		/* allocate space in core */
        //预分配失败，进入常规的分配逻辑
		*errp = ext4_mb_regular_allocator(ac);
		/*
		 * pa allocated above is added to grp->bb_prealloc_list only
		 * when we were able to allocate some block i.e. when
		 * ac->ac_status == AC_STATUS_FOUND.
		 * And error from above mean ac->ac_status != AC_STATUS_FOUND
		 * So we have to free this pa here itself.
		 */
		if (*errp) {
			ext4_mb_pa_free(ac);
			ext4_discard_allocated_blocks(ac);
			goto errout;
		}
		if (ac->ac_status == AC_STATUS_FOUND &&
			ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
			ext4_mb_pa_free(ac);
	}
	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
		if (*errp) {
			ext4_discard_allocated_blocks(ac);
			goto errout;
		} else {
			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
			goto repeat;
		/*
		 * If block allocation fails then the pa allocated above
		 * needs to be freed here itself.
		 */
		ext4_mb_pa_free(ac);
		*errp = -ENOSPC;
	}
    ...

	return block;
}

ext4_allocation_context结构体


struct ext4_allocation_context {
	struct inode *ac_inode;
	struct super_block *ac_sb;

	/* original request */
	struct ext4_free_extent ac_o_ex;

	/* goal request (normalized ac_o_ex) */
	struct ext4_free_extent ac_g_ex;

	/* the best found extent */
	struct ext4_free_extent ac_b_ex;

	/* copy of the best found extent taken before preallocation efforts */
	struct ext4_free_extent ac_f_ex;

	__u16 ac_groups_scanned;
	__u16 ac_found;
	__u16 ac_tail;
	__u16 ac_buddy;
	__u16 ac_flags;		/* allocation hints */
	__u8 ac_status;
	__u8 ac_criteria;
	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
				 * N > 0, the field stores N, otherwise 0 */
	__u8 ac_op;		/* operation, for history only */
	struct page *ac_bitmap_page;
	struct page *ac_buddy_page;
	struct ext4_prealloc_space *ac_pa;
	struct ext4_locality_group *ac_lg;
};

上面注释写的非常清晰:

ac_o_ex: 原始请求

ac_g_ex：目标请求，可以跟ac_o_ex不同，比如如注释中说明，ac_g_ex是ac_o_ex经过normalized（对应mballoc::ext4_mb_normalize_request函数处理之后即为ac_g_ex)的结果，ac_b_ex：最终的分配结果，因为ac_g_ex未必能被100%满足

ac_f_ex: ac_b_ex的一份拷贝。

ac_2order: 申请物理block数量如果正好是2的N次方，那么ac_2order = N，否则为0

ac_bitmap_page/ac_buddy_page: 跟mballoc相关的bit位信息，参考ext4 mballoc之buddy算法_nginux的博客-CSDN博客

ac_pa: per-inode预分配

ac_lg: per-cpu预分配，给小文件准备的。

ext4_mb_initialize_context函数


static noinline_for_stack int
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	struct super_block *sb = ar->inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	ext4_group_t group;
	unsigned int len;
	ext4_fsblk_t goal;
	ext4_grpblk_t block;

	/* we can't allocate > group size */
	len = ar->len;

	/* just a dirty hack to filter too big requests  */
	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
		len = EXT4_CLUSTERS_PER_GROUP(sb);

	/* start searching from the goal */
	goal = ar->goal;
	if (goal < le32_to_cpu(es->s_first_data_block) ||
			goal >= ext4_blocks_count(es))
		goal = le32_to_cpu(es->s_first_data_block);
	ext4_get_group_no_and_offset(sb, goal, &group, &block);

	/* set up allocation goals */
	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
	ac->ac_status = AC_STATUS_CONTINUE;
	ac->ac_sb = sb;
	ac->ac_inode = ar->inode;
	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
	ac->ac_o_ex.fe_group = group;
	ac->ac_o_ex.fe_start = block;
	ac->ac_o_ex.fe_len = len;
    //可以看到normalized前ac_g_ex跟ac_o_ex相同
	ac->ac_g_ex = ac->ac_o_ex;
	ac->ac_flags = ar->flags;

	/* we have to define context: we'll work with a file or
	 * locality group. this is a policy, actually */
	ext4_mb_group_or_file(ac);

    ...
	return 0;

}

首先完成ac_o_ex和 ac_g_ex的赋值工作；然后，ext4_mb_group_or_file函数决定是一个文件到底是小文件和大文件，如概述中描述，ext4针对这两种文件策略不同。

ext4_mb_group_or_file函数


/*
 * We use locality group preallocation for small size file. The size of the
 * file is determined by the current size or the resulting size after
 * allocation which ever is larger
 *
 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
 */
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int bsbits = ac->ac_sb->s_blocksize_bits;
	loff_t size, isize;

	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
		>> bsbits;

	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
	    !inode_is_open_for_write(ac->ac_inode)) {
		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
		return;
	}
    //s_mb_group_prealloc是给小文分配的per-cpu local group空间大小，如果<=0
    //就设置EXT4_MB_STREAM_ALLOC，不适用小文件分配策略
	if (sbi->s_mb_group_prealloc <= 0) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	/* don't use group allocation for large files */
    //s_mb_stream_request值来自于/sys/fs/ext4/xxx/mb_stream_req,文件大小大于了该值
    //为大文件，否则为小文件
	size = max(size, isize);
	if (size > sbi->s_mb_stream_request) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	BUG_ON(ac->ac_lg != NULL);
	/*
	 * locality group prealloc space are per cpu. The reason for having
	 * per cpu locality group is to reduce the contention between block
	 * request from multiple CPUs.
	 */
	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);

	/* we're going to use group allocation */
    //如果进行到这里，说明是小文件
	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

	/* serialize all allocations in the group */
	mutex_lock(&ac->ac_lg->lg_mutex);
}

ext4_prealloc_space结构体

struct ext4_prealloc_space {
    //如果是per-inode preallocation挂在ext4_inode_info的i_prealloc_list
    //如果是per_cpu locality group预分配空间挂在ext4_locality_group的lg_prealloc_list链表上
	struct list_head	pa_inode_list;

    //预分配空间同时也会挂在ext4_group_info的bb_prealloc_list链表上，
    //用于初始化buddy bitmap的之      前给block bitmap置上对应的已使用标记
	struct list_head	pa_group_list;
	union {
		struct list_head pa_tmp_list;
		struct rcu_head	pa_rcu;
	} u;
	spinlock_t		pa_lock;
	atomic_t		pa_count;
    //预分配空间是否已删除
	unsigned		pa_deleted;
    //起始物理块号
	ext4_fsblk_t		pa_pstart;	/* phys. block */
    //起始逻辑块号（相对于文件）
	ext4_lblk_t		pa_lstart;	/* log. block */
    //预分配空间长度（单位是block)
	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
    //空间的可用长度
	ext4_grpblk_t		pa_free;	/* how many blocks are free */
    //类型，indode or group
	unsigned short		pa_type;	/* pa type. inode or group */
	spinlock_t		*pa_obj_lock;
	struct inode		*pa_inode;	/* hack, for history only */
};

ext4_locality_group结构体


/*
 * Locality group:
 *   we try to group all related changes together
 *   so that writeback can flush/allocate them together as well
 *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
 *   (512). We store prealloc space into the hash based on the pa_free blocks
 *   order value.ie, fls(pa_free)-1;
 */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
	/* for allocator */
	/* to serialize allocates */
	struct mutex		lg_mutex;
	/* list of preallocations */
    // 挂ext4_prealloc_space的链表，按照预分配空间的可用长度进行分组
	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
	spinlock_t		lg_prealloc_lock;
};

ext4_mb_use_preallocated函数

函数判定能否使用preallocation space分配block，优先使用per-inode preallocation预分配空间；如果失败，再判断是否是小文件能使用per-cpu local group preallocation预分配空间；如果任何一个预分配空间分配成功，return true；否者return false代表无法使用预分配空间。


/*
 * search goal blocks in preallocated space
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int order, i;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *pa, *cpa = NULL;
	ext4_fsblk_t goal_block;

	/* only data can be preallocated */
    //linux一切皆文件，只要普通文件才使用预分配，EXT4_MB_HINT_DATA是ext4_ext_map_blocks
    //中根据如下条件设置    if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; 
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return false;

	/* first, try per-file preallocation */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {

		/* all fields in this condition don't change,
		 * so we can skip locking for them */
        //不在这个预分配空间范围内，跳到下一个预分配空间
		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
					       EXT4_C2B(sbi, pa->pa_len)))
			continue;

		/* non-extent files can't have physical blocks past 2^32 */
		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
		     EXT4_MAX_BLOCK_FILE_PHYS))
			continue;

		/* found preallocated blocks, use them */
        //找到了合适的预分配空间
		spin_lock(&pa->pa_lock);
		if (pa->pa_deleted == 0 && pa->pa_free) {
			atomic_inc(&pa->pa_count);
			ext4_mb_use_inode_pa(ac, pa);
			spin_unlock(&pa->pa_lock);
			ac->ac_criteria = 10;
			rcu_read_unlock();
			return true;
		}
		spin_unlock(&pa->pa_lock);
	}
	rcu_read_unlock();
    //走到这里说明per-inode没有分配成功，需要判定能否是小文件走per-cpu local group分配
	/* can we use group allocation? */
	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
		return false;

	/* inode may have no locality group for some reason */
	lg = ac->ac_lg;
	if (lg == NULL)
		return false;
	order  = fls(ac->ac_o_ex.fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;

	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
	/*
	 * search for the prealloc space that is having
	 * minimal distance from the goal block.
	 */
	for (i = order; i < PREALLOC_TB_SIZE; i++) {
		rcu_read_lock();
		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
					pa_inode_list) {
			spin_lock(&pa->pa_lock);
			if (pa->pa_deleted == 0 &&
					pa->pa_free >= ac->ac_o_ex.fe_len) {

				cpa = ext4_mb_check_group_pa(goal_block,
								pa, cpa);
			}
			spin_unlock(&pa->pa_lock);
		}
		rcu_read_unlock();
	}
	if (cpa) {
        //小文件预分配空间分配成功
		ext4_mb_use_group_pa(ac, cpa);
		ac->ac_criteria = 20;
		return true;
	}
	return false;
}