linux内核启动分析（三）

这篇具有很好参考价值的文章主要介绍了linux内核启动分析（三）。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

我们前面看了start_kernel的一些函数，现在我们继续追setup_arch：

9. setup_arch

void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
	//init_mm是init进程（0号进程）的内存描述符
	//初始化内核的mm结构体的代码段、数据段和栈的结束地址
	init_mm.start_code = (unsigned long) _text;
	init_mm.end_code   = (unsigned long) _etext;
	init_mm.end_data   = (unsigned long) _edata;
	init_mm.brk	   = (unsigned long) _end;

	*cmdline_p = boot_command_line;//cmdline_p指向boot启动参数

	/*
	 * If know now we are going to need KPTI then use non-global
	 * mappings from the start, avoiding the cost of rewriting
	 * everything later.
	 */
	//这里根据是否开启kpli(内核页表隔离)决定是否创建非全局页表
	arm64_use_ng_mappings = kaslr_requires_kpti();

	early_fixmap_init();//早期固定映射初始化，搞了个框架，还没有填充pte页表项
	early_ioremap_init();//早期io映射初始化

	setup_machine_fdt(__fdt_pointer);//解析fdt中内存，加入到memblock中

	/*
	 * Initialise the static keys early as they may be enabled by the
	 * cpufeature code and early parameters.
	 */
	jump_label_init();//初始化jump-label子系统，看不懂
	parse_early_param();//解析early options这两个参数

	/*
	 * Unmask asynchronous aborts and fiq after bringing up possible
	 * earlycon. (Report possible System Errors once we can report this
	 * occurred).
	 */
	local_daif_restore(DAIF_PROCCTX_NOIRQ);//恢复daif为不可以IRQ

	/*
	 * TTBR0 is only used for the identity mapping at this stage. Make it
	 * point to zero page to avoid speculatively fetching new entries.
	 */
	cpu_uninstall_idmap();//移除idmap_pg_dir页表

	xen_early_init();//不支持xen，不看
	efi_init();

	if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0)
	     pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!");

	arm64_memblock_init();//初始化memblock，到这里，不能使用的内存都放入reserved了

	paging_init();//切换为细粒度映射，释放粗粒度映射页表内存，memblock初始化完成

	acpi_table_upgrade();//解析acpi表的信息，存放到acpi_initrd_files中

	/* Parse the ACPI tables for possible boot-time configuration */
	acpi_boot_table_init();//解析ACPI表，获得可能的引导时配置

	if (acpi_disabled)
		unflatten_device_tree();//解析设备树信息，创建设备节点树

	bootmem_init();//初始化内存基本数据结构，pg_data_t,、zone、page

	kasan_init();//KASAN初始化，我们没有使用

	request_standard_resources();

	early_ioremap_reset();//设置after_paging_init为1

	if (acpi_disabled)
		psci_dt_init();//设备树中psci初始化
	else
		psci_acpi_init();//acpi种psci初始化

	init_bootcpu_ops();//初始化cpu启动方法集合
	smp_init_cpus();//smp的其他cpu初始化
	smp_build_mpidr_hash();//看不懂

	/* Init percpu seeds for random tags after cpus are set up. */
	kasan_init_tags();//kasan的东西，我们没有用

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
	/*
	 * Make sure init_thread_info.ttbr0 always generates translation
	 * faults in case uaccess_enable() is inadvertently called by the init
	 * thread.
	 */
	init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
#endif

	if (boot_args[1] || boot_args[2] || boot_args[3]) {
		pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
			"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
			"This indicates a broken bootloader or old kernel\n",
			boot_args[1], boot_args[2], boot_args[3]);
	}
}

setup_arch函数主要是处理cpu体系相关架构，我们是arm64平台，这个函数处理arm64的一些初始化，主要包括：

初始化内核的mm结构体的代码段、数据段和栈的结束地址；
调用函数early_fixmap_init进行早期固定映射初始化；
调用函数early_ioremap_init进行早期io映射初始化；
调用函数setup_machine_fdt解析fdt中内存，加入到memblock中；
调用函数parse_early_param/解析early options这两个参数；
调用函数local_daif_restore恢复cpu的daif状态为不可以IRQ；
调用函数cpu_uninstall_idmap移除idmap_pg_dir页表
调用函数efi_init初始化 UEFI 系统中的各个硬件设备以及配置各种环境参数，从而为操作系统的正常启动做好准备
调用函数arm64_memblock_init初始化memblock，
调用函数 paging_init切换为细粒度映射，释放粗粒度映射页表内存
调用函数acpi_table_upgrade解析acpi表的信息，存放到acpi_initrd_files中
调用函数acpi_boot_table_init解析ACPI表，
如果找不到ACPI表，调用函数unflatten_device_tree()解析设备树信息，创建设备节点树
调用函数bootmem_init初始化内存基本数据结构，pg_data_t,、zone、page
如果ACPI表不存在，调用函数psci_dt_init进行设备树中psci初始化；否则调用函数psci_acpi_init进行acpi种psci初始化
调用函数init_bootcpu_ops初始化cpu启动方法集合
调用函数smp_init_cpus进行smp的其他cpu初始化

9.1 kaslr_requires_kpti

bool kaslr_requires_kpti(void)
{
	//如果没有CONFIG_RANDOMIZE_BASE，说明不能支持kpti
	if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
		return false;

	/*
	 * E0PD does a similar job to KPTI so can be used instead
	 * where available.
	 */
	if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
		//读取AA64MMFR2_EL1寄存器
		u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
		//如果寄存器实现了E0PD，就不用KPTI了，因为硬件实现了相关功能
		if (cpuid_feature_extract_unsigned_field(mmfr2,
						ID_AA64MMFR2_E0PD_SHIFT))
			return false;
	}

	/*
	 * Systems affected by Cavium erratum 24756 are incompatible
	 * with KPTI.
	 */
	//如果开启了erratum 24756，就不能开启KPTI，因为不兼容
	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
		extern const struct midr_range cavium_erratum_27456_cpus[];

		if (is_midr_in_range_list(read_cpuid_id(),
					  cavium_erratum_27456_cpus))
			return false;
	}

	return kaslr_offset() > 0;
}

KPTI 是一项安全技术，它将内核页表和用户页表分离，使得用户空间无法直接访问内核空间的数据结构和代码，从而防止用户空间对内核进行非法访问。在某些情况下，KPTI 还可以减轻 Meltdown 和 Spectre 等 CPU 漏洞所带来的安全风险。
kaslr_requires_kpti判断当前系统是否需要启用 Kernel Page Table Isolation (KPTI) 技术来保护内核代码和数据。

首先判断是否启用了 KASLR（Kernel Address Space Layout Randomization）随机化技术。如果已经启用了 KASLR，则需要启用 KPTI 技术来保护内核；否则不需要启用 KPTI 技术。
然后判断是否启用了arm64的v8.5的E0PD(Exclusive Only Point of Drop)技术。如果已经启用了E0PD，读取mmfr2寄存器查看是否支持E0PD，如果支持，则不需要启用 KPTI 技术。
如果开启了erratum 24756，就不能开启KPTI。
如果内核镜像的虚拟地址大于KIMAGE_VADDR，则可以启用 KPTI 技术。

9.2 early_fixmap_init

void __init early_fixmap_init(void)
{
	pgd_t *pgdp;
	p4d_t *p4dp, p4d;
	pud_t *pudp;
	pmd_t *pmdp;
	unsigned long addr = FIXADDR_START;

	pgdp = pgd_offset_k(addr);//找到固定映射起始地址的pgd页表项（偏移后的）
	p4dp = p4d_offset(pgdp, addr);//计算出pgd页表项中存放的物理地址
	p4d = READ_ONCE(*p4dp);//读取页表项中的物理地址存放的值，也就是p4d表起始地址
	//如果使用超过3级页表，并且p4d不是空的
	if (CONFIG_PGTABLE_LEVELS > 3 &&
	    !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
		/*
		 * We only end up here if the kernel mapping and the fixmap
		 * share the top level pgd entry, which should only happen on
		 * 16k/4 levels configurations.
		 */
		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
		pudp = pud_offset_kimg(p4dp, addr);//计算出固定映射起始地址的pud页表项
	} else {
		if (p4d_none(p4d))//如果p4d是空的
			//将bm_pud的物理地址写到pgd全局页目录表中
			__p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
		pudp = fixmap_pud(addr);
	}
	if (pud_none(READ_ONCE(*pudp)))//如果pudp是空的
		//将bm_pmd的物理地址写到pud页目录表中
		__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
	pmdp = fixmap_pmd(addr);//计算出pmd的目录项的物理地址
	//将bm_pte的物理地址写到pmd页表目录表中
	__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);

	/*
	 * The boot-ioremap range spans multiple pmds, for which
	 * we are not prepared:
	 */
	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));

	if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
	     || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
		WARN_ON(1);
		pr_warn("pmdp %p != %p, %p\n",
			pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
			fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
		pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
			fix_to_virt(FIX_BTMAP_BEGIN));
		pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
			fix_to_virt(FIX_BTMAP_END));

		pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
		pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
	}
}

early_fixmap_init函数主要是初始化早期固定映射：

调用函数pgd_offset_k计算出固定映射起始地址的pgd页表项；
调用函数p4d_offset计算出pgd页表项中存放的物理地址，
读取页表项中的物理地址存放的值，也就是p4d页表的起始地址
如果使用超过3级页表，并且p4d不是空的，则调用函数pud_offset_kimg计算出固定映射起始地址的pud页表项；否则调用函数__p4d_populate填充pud页目录项，并且调用函数fixmap_pud计算出固定映射起始地址的pud页表项；
如果pudp是空的，调用函数__pud_populate填充pmd页目录项。
调用函数fixmap_pmd计算出固定映射起始地址的pmd页表项；
调用函数__pmd_populate填充pmd页目录项。

9.3 early_ioremap_init

void __init early_ioremap_init(void)
{
	early_ioremap_setup();
}

void __init early_ioremap_setup(void)
{
	int i;

	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
		if (WARN_ON(prev_map[i]))
			break;

	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
}

9.4 setup_machine_fdt

static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
	int size;
	//为FDT物理计算出虚拟地址，然后填写pte页表项创建映射关系
	void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
	const char *name;

	if (dt_virt)
		memblock_reserve(dt_phys, size);//设备树内存添加到memblock.reserved中

	//扫描设备树的内存信息，把内存加入memblock
	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
		pr_crit("\n"
			"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
			"\nPlease check your bootloader.",
			&dt_phys, dt_virt);

		while (true)
			cpu_relax();
	}

	//修改映射的pte项为只读
	fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);

	name = of_flat_dt_get_machine_name();
	if (!name)
		return;

	pr_info("Machine model: %s\n", name);
	dump_stack_set_arch_desc("%s (DT)", name);
}

setup_machine_fdt函数主要做了一下几件事：

调用函数fixmap_remap_fdt为FDT物理计算出虚拟地址，创建固定映射的关系；
调用函数memblock_reserve把设备树使用的内存添加到memblock的reserved中
调用函数early_init_dt_scan扫描设备树的内存信息，把内存加入memblock
调用函数fixmap_remap_fdt修改映射的pte项为只读

9.5 parse_early_param

void __init parse_early_param(void)
{
	static int done __initdata;
	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;

	if (done)
		return;

	/* All fall through to do_early_param. */
	strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);//拷贝启动参数到tmp_cmdline
	parse_early_options(tmp_cmdline);//解析启动参数的early或者options，如果有运行函数early options
	done = 1;
}

void __init parse_early_options(char *cmdline)
{
	parse_args("early options", cmdline, NULL, 0, 0, 0, NULL,
		   do_early_param);
}

parse_early_param函数首先调用函数strlcpy把启动参数拷贝到tmp_cmdline，然后调用函数parse_early_options解析启动参数，parse_early_options函数也只是直接调用parse_args函数解析参数的。

9.6 cpu_uninstall_idmap

static inline void cpu_uninstall_idmap(void)
{
	struct mm_struct *mm = current->active_mm;

	cpu_set_reserved_ttbr0();//设置TTBR0_EL1为reserved_pg_dir的物理地址
	local_flush_tlb_all();//冲刷TLB
	cpu_set_default_tcr_t0sz();//根据虚拟地址位数设置TCR.t0sz

	if (mm != &init_mm && !system_uses_ttbr0_pan())
		cpu_switch_mm(mm->pgd, mm);
}

cpu_uninstall_idmap主要做了以下几件事：

调用函数cpu_set_reserved_ttbr0设置TTBR0_EL1为reserved_pg_dir的物理地址
调用函数local_flush_tlb_all冲刷TLB
调用函数cpu_set_default_tcr_t0sz根据虚拟地址位数设置TCR.t0sz

9.7 arm64_memblock_init

void __init arm64_memblock_init(void)
{
	const s64 linear_region_size = BIT(vabits_actual - 1);

	/* Handle linux,usable-memory-range property */
	fdt_enforce_memory_region();//扫描usable-memory-range属性的内存

	//删除超出我们支持的物理地址大小的内存
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	//为物理内存的基数选择一个合适的值
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	//删除线性映射覆盖的内存，我们无法用
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * If we are running with a 52-bit kernel VA config on a system that
	 * does not support it, we have to place the available physical
	 * memory in the 48-bit addressable part of the linear region, i.e.,
	 * we have to move it upward. Since memstart_addr represents the
	 * physical address of PAGE_OFFSET, we have to *subtract* from it.
	 */
	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
		memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd to become inaccessible via the linear mapping.
		 * Otherwise, this is a no-op
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			phys_initrd_size = 0;
		} else {
			//移除[phys_initrd_start,phys_initrd_start + phys_initrd_size]
			//再添加回去，从而清除flags，最后放入reserve中
			memblock_remove(base, size); /* clear MEMBLOCK_ flags */
			memblock_add(base, size);
			memblock_reserve(base, size);
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 range = linear_region_size -
			    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

		/*
		 * If the size of the linear region exceeds, by a sufficient
		 * margin, the size of the region that the available physical
		 * memory spans, randomize the linear region as well.
		 */
		if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
			range /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((range * memstart_offset_seed) >> 16);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	//把内核代码段的内存放入reserve
	memblock_reserve(__pa_symbol(_text), _end - _text);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}
	//把设备树中记录的reserved内存放入reserve
	early_init_fdt_scan_reserved_mem();

	reserve_elfcorehdr();//为ELF核心头预留内存

	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
		reserve_crashkernel();//解析设备树，把crashkernel需要的内存放入reserve

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}

arm64_memblock_init函数主要做了以下几件事情：

调用函数fdt_enforce_memory_region扫描设备树中的可用内存，并且添加到menblock中，
调用函数memblock_remove把memblcok中超出linux系统支持范围（linux内核一般配置为支持48位）的物理内存移除；
调用函数memblock_remove把线性映射区的内存移除；
把initrd使用的内存移除后重新添加，目的是清除flage；
把内核代码段的内存放入memblock的reserve中
调用函数early_init_fdt_scan_reserved_mem把设备树中记录的reserved内存放入reserve
调用函数reserve_elfcorehdr为ELF核心头预留内存
调用函数reserve_crashkernel解析设备树，把crashkernel需要的内存放入reserve

9.8 paging_init

void __init paging_init(void)
{
	//获取swapper_pg_dir对应的pgd项
	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));

	map_kernel(pgdp);//为内核创建细粒度映射。
	map_mem(pgdp);//给全部的memblock创建页表映射

	pgd_clear_fixmap();//清空fixmap临时映射
	//切换页表，并将新建立的页表内容替换swappper_pg_dir页表内容
	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
	init_mm.pgd = swapper_pg_dir;//设置mm.pgd

	//释放粗粒度内核页表init_pg_dir
	memblock_free(__pa_symbol(init_pg_dir),
		      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

	memblock_allow_resize();//设置memblock_can_resize为1
}

paging_init主要做了以下几件事：

调用函数pgd_set_fixmap找到swapper_pg_dir的物理地址的pgd，
调用函数map_kernel为内核创建细粒度映射，
调用函数map_mem给全部的memblock创建页表映射，
调用函数pgd_clear_fixmap清空之前的固定映射关系，
调用函数cpu_replace_ttbr1切换页表，并将新建立的页表内容替换为swappper_pg_dir页表内容
修改init_mm.pgd为swapper_pg_dir
调用函数memblock_free释放粗粒度内核页表init_pg_dir

9.9 unflatten_device_tree

void __init unflatten_device_tree(void)
{
	//解析fdt信息，转化为device_node组成的树状结构of_root
	__unflatten_device_tree(initial_boot_params, NULL, &of_root,
				early_init_dt_alloc_memory_arch, false);

	/* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */
	//扫描“aliases”节点的所有属性
	of_alias_scan(early_init_dt_alloc_memory_arch);

	unittest_unflatten_overlay_base();//为ovelay测试创建基本设备树，空函数
}

unflatten_device_tree函数主要做了以下几件事：

调用函数__unflatten_device_tree解析fdt信息，转化为device_node组成的树状结构of_root
调用函数of_alias_scan扫描“aliases”节点的所有属性

9.10 bootmem_init

void __init bootmem_init(void)
{
	unsigned long min, max;

	min = PFN_UP(memblock_start_of_DRAM());
	max = PFN_DOWN(memblock_end_of_DRAM());

	//引导时期的内存读写测试，
	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

	max_pfn = max_low_pfn = max;
	min_low_pfn = min;

	arm64_numa_init();//numa相关的初始化

	/*
	 * must be done after arm64_numa_init() which calls numa_init() to
	 * initialize node_online_map that gets used in hugetlb_cma_reserve()
	 * while allocating required CMA size across online nodes.
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	arm64_hugetlb_cma_reserve();//预留CMA区域
#endif
	//如果启动参数指定了cma_pernuma，为每个numa申请cma内存
	dma_pernuma_cma_reserve();//反正我没用过

	/*
	 * sparse_init() tries to allocate memory from memblock, so must be
	 * done after the fixed reservations
	 */
	//为每个section创建page结构体数组，并为这些struct page结构体创建页表，
	//映射到vmemmap虚拟映射区，同时对每个在线的mem_section进行初始化
	sparse_init();//
	zone_sizes_init(min, max);//初始化每个node, node下的zone, 以及node下的每一个pfn对应的page

	//初始化arm64_dma_phys_limit后保留CMA区域
	dma_contiguous_reserve(arm64_dma_phys_limit);

	/*
	 * request_standard_resources() depends on crashkernel's memory being
	 * reserved, so do it here.
	 */
	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
		reserve_crashkernel();//为崩溃内核预留内存

	memblock_dump_all();//debug的时候输出memblock的信息
}

bootmem_init函数主要做了以下几件事：

调用函数early_memtest进行引导时期的内存读写测试，
调用函数arm64_numa_init进行numa相关的初始化
调用函数arm64_hugetlb_cma_reserve预留CMA区域
调用函数dma_pernuma_cma_reserve判断如果启动参数指定了cma_pernuma，为每个numa申请cma内存
调用函数sparse_init为每个section创建page结构体数组，并为这些struct page结构体创建页表，映射到vmemmap虚拟映射区，同时对每个在线的mem_section进行初始化；
调用函数zone_sizes_init初始化每个node, node下的zone, 以及node下的每一个pfn对应的page
调用函数dma_contiguous_reserve初始化arm64_dma_phys_limit后保留CMA区域
调用函数reserve_crashkernel为崩溃内核预留内存
调用函数memblock_dump_all进行debug，输出memblock的信息

9.11 psci_dt_init

int __init psci_dt_init(void)
{
	struct device_node *np;
	const struct of_device_id *matched_np;
	psci_initcall_t init_fn;
	int ret;

	//找到设备树中的psci节点
	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);

	if (!np || !of_device_is_available(np))//判断节点是否ok
		return -ENODEV;

	init_fn = (psci_initcall_t)matched_np->data;//psci_0_2_init()
	ret = init_fn(np);

	of_node_put(np);//node的kobject的引用计数减一
	return ret;
}

psci_dt_init函数主要做了以下几件事：

调用函数of_find_matching_node_and_match找到设备树中的psci节点
然后判断这节点是否可以使用，如果不可以直接返回找不到
根据psci_of_match中找到的节点调用对应的回调函数，这里是psci_0_2_init函数

9.12 init_bootcpu_ops

static inline void __init init_bootcpu_ops(void)
{
	init_cpu_ops(0);//读取cpu的enable方法并记录在cpu_ops中
}

int __init init_cpu_ops(int cpu)
{
	const char *enable_method = cpu_read_enable_method(cpu);

	if (!enable_method)
		return -ENODEV;

	cpu_ops[cpu] = cpu_get_ops(enable_method);//cpu_psci_ops
	if (!cpu_ops[cpu]) {
		pr_warn("Unsupported enable-method: %s\n", enable_method);
		return -EOPNOTSUPP;
	}

	return 0;
}

init_bootcpu_ops函数首先会定义一个boot_ops结构体，其中包含了一系列启动CPU的函数指针，例如跳转到kernel_entry函数，或者设置CPU寄存器等操作。接着会将该结构体的地址保存到一个全局变量cpu_ops中。

9.13 smp_init_cpus

void __init smp_init_cpus(void)
{
	int i;

	if (acpi_disabled)
		of_parse_and_init_cpus();//初始化设备树中记录的cpu
	else
		acpi_parse_and_init_cpus();//初始化acpi检测到的cpu

	if (cpu_count > nr_cpu_ids)
		pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n",
			cpu_count, nr_cpu_ids);

	if (!bootcpu_valid) {
		pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
		return;
	}

	/*
	 * We need to set the cpu_logical_map entries before enabling
	 * the cpus so that cpu processor description entries (DT cpu nodes
	 * and ACPI MADT entries) can be retrieved by matching the cpu hwid
	 * with entries in cpu_logical_map while initializing the cpus.
	 * If the cpu set-up fails, invalidate the cpu_logical_map entry.
	 */
	//遍历所有cpu
	for (i = 1; i < nr_cpu_ids; i++) {
		if (cpu_logical_map(i) != INVALID_HWID) {//cpu不是INVALID
			if (smp_cpu_setup(i))//cpu设置成功
				set_cpu_logical_map(i, INVALID_HWID);//设置cpu逻辑映射项
		}
	}
}

smp_init_cpus函数主要做了以下几件事：

调用函数of_parse_and_init_cpus初始化设备树中记录的cpu
遍历所有cpu：调用smp_cpu_setup设置cpu，然后调用set_cpu_logical_map填充数组__cpu_logical_map

9.14 smp_build_mpidr_hash

smp_build_mpidr_hash函数是针对arm64多处理器系统中的处理器寻址和调度进行优化的函数，其作用是构建一个哈希表，以加速处理器之间的通信和任务调度。具体来说，这个哈希表会将每个处理器的mpidr（Multiprocessor Affinity Descriptor Register）与其所在的CPU核心ID关联起来。
首先，在for_each_possible_cpu循环中，遍历了所有可能存在的CPU核心，对于每个CPU核心，都使用per_cpu函数获取其cpu_data结构体，然后判断该处理器是否支持ARM64_HAS_CPUID功能。

如果支持的话，就进一步检查该处理器是否属于某个集群。如果属于某个集群的话，会根据其mpidr中的cluster_id来设置相应的临时掩码变量tmp_mask。

如果不属于任何集群的话，就通过mpidr_cpu_id函数从mpidr中提取出该处理器对应的CPU核心ID，然后将该ID和该处理器的pgtable_hash_node结构体通过hash_add函数添加到wait_for_pgtable_refill哈希表中。

最终，smp_build_mpidr_hash函数完成了一个哈希表的构建，其中，所有的处理器都被关联到相应的CPU核心或集群中。这样，在进行处理器之间的通信或任务调度时，就可以快速定位目标处理器所在的CPU核心或集群，大大提高了系统的性能和效率。文章来源地址https://www.toymoban.com/news/detail-418810.html

到了这里，关于linux内核启动分析（三）的文章就介绍完了。如果您还想了解更多内容，请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章，希望大家以后多多支持TOY模板网！