Linux ioremap 的实现
linux, memory, ioremap
在 linux kernel 的代码中,经常看到 ioremap 函数。
其功能是将给定的物理地址映射为虚拟地址。
注意,此处的物理地址并不是真正内存的物理地址,而是cpu上的io memory。
可以参考芯片《Reference Manual》中断 memory map 章节。
本文主要学习 ioremap 是如何实现的。
ioremap 的定义:
#define ioremap(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE)
#define MT_DEVICE 0
#define __arch_ioremap __arm_ioremap
void __iomem *
__arm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype)
{
return __arm_ioremap_caller(phys_addr, size, mtype,
__builtin_return_address(0));
}
void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
unsigned int mtype, void *caller)
{
unsigned long last_addr;
unsigned long offset = phys_addr & ~PAGE_MASK;
unsigned long pfn = __phys_to_pfn(phys_addr);
/*
* Don‘t allow wraparound or zero size
*/
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
return __arm_ioremap_pfn_caller(pfn, offset, size, mtype,
caller);
}
void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
unsigned long offset, size_t size, unsigned int mtype, void *caller)
{
const struct mem_type *type;
int err;
unsigned long addr;
struct vm_struct * area;
/*
* High mappings must be supersection aligned
*/
// 高端内存需要对齐到 supersection
if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SUPERSECTION_MASK))
return NULL;
/*
* Don‘t allow RAM to be mapped - this causes problems with ARMv6+
*/
// map 的不能是 RAM ,只能是 soc 的 io memory
/*
int pfn_valid(unsigned long pfn)
{
return memblock_is_memory(pfn << PAGE_SHIFT);
}
*/
if (WARN_ON(pfn_valid(pfn)))
return NULL;
// get_mem_type 的实现见后文
// 从前文的定义可知, mtype 为 MT_DEVICE
type = get_mem_type(mtype);
if (!type)
return NULL;
/*
* Page align the mapping size, taking account of any offset.
*/
size = PAGE_ALIGN(offset + size);
// get_vm_area_caller 函数的实现见后面
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
return NULL;
addr = (unsigned long)area->addr;
#ifndef CONFIG_SMP
if (DOMAIN_IO == 0 &&
(((cpu_architecture() >= CPU_ARCH_ARMv6) && (get_cr() & CR_XP)) ||
cpu_is_xsc3()) && pfn >= 0x100000 &&
!((__pfn_to_phys(pfn) | size | addr) & ~SUPERSECTION_MASK)) {
area->flags |= VM_ARM_SECTION_MAPPING;
err = remap_area_supersections(addr, pfn, size, type);
} else if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) {
area->flags |= VM_ARM_SECTION_MAPPING;
err = remap_area_sections(addr, pfn, size, type);
} else
#endif
// ioremap_page_range 函数的实现见后文
err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn),
__pgprot(type->prot_pte));
if (err) {
vunmap((void *)addr);
return NULL;
}
flush_cache_vmap(addr, addr + size);
return (void __iomem *) (offset + addr);
}
get_mem_type 函数的实现:
const struct mem_type *get_mem_type(unsigned int type)
{
return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL;
}
mem_types 的定义:
static struct mem_type mem_types[] = {
[MT_DEVICE] = { /* Strongly ordered / ARMv6 shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
L_PTE_SHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
.domain = DOMAIN_IO,
},
[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_DEVICE_CACHED] = { /* ioremap_cached */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_WB,
.domain = DOMAIN_IO,
},
[MT_DEVICE_WC] = { /* ioremap_wc */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_UNCACHED] = {
.prot_pte = PROT_PTE_DEVICE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_IO,
},
[MT_CACHECLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MINICLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
.domain = DOMAIN_KERNEL,
},
[MT_LOW_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_USER,
},
[MT_HIGH_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_USER | L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_USER,
},
[MT_MEMORY] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_ROM] = {
.prot_sect = PMD_TYPE_SECT,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_NONCACHED] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_MT_BUFFERABLE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_DTCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_ITCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_KERNEL,
},
};
get_vm_area_caller 函数的实现:
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
void *caller)
{
/*
* Just any arbitrary offset to the start of the vmalloc VM area: the
* current 8MB value just means that there will be a 8MB "hole" after the
* physical memory until the kernel virtual memory starts. That means that
* any out-of-bounds memory accesses will hopefully be caught.
* The vmalloc() routines leaves a hole of 4kB between each vmalloced
* area for the same reason. ;)
*
* Note that platforms may override VMALLOC_START, but they must provide
* VMALLOC_END. VMALLOC_END defines the (exclusive) limit of this space,
* which may not overlap IO space.
*/
/*
#ifndef VMALLOC_START
#define VMALLOC_OFFSET (8*1024*1024)
// high_memory 在 arch/arm/mm/init.c 文件中的 bootmem_init 函数中赋值,该函数的实现见后文
#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
#endif
*/
/* vmalloc ending address */
#define VMALLOC_END 0xf2000000UL
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, caller);
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
static struct vmap_area *va;
struct vm_struct *area;
BUG_ON(in_interrupt());
/* bits in flags of vmalloc‘s vm_struct below */
// #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
if (flags & VM_IOREMAP) {
int bit = fls(size);
if (bit > IOREMAP_MAX_ORDER)
bit = IOREMAP_MAX_ORDER;
else if (bit < PAGE_SHIFT)
bit = PAGE_SHIFT;
align = 1ul << bit;
}
size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;
/**
* kzalloc_node - allocate zeroed memory from a particular memory node.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
* @node: memory node from which to allocate
*/
// 分配一个 vm_struct 结构体
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
/*
* We always allocate a guard page.
*/
size += PAGE_SIZE;
// start 和 end 分别为 VMALLOC_START 和 VMALLOC_END
// align 为 1
// alloc_vmap_area 函数的注释:
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
*/
// 已经使用的 vm 的信息分别存在各个 vmap_area 结构体中
// 所有的 vmap_area 结构体都在红黑树 vmap_area_root 中
// alloc_vmap_area 函数的主要功能是,查找红黑树 vmap_area_root ,找到 start 和 end 之间满足 size 大小的未使用空间,
// 创建一个 vmap_area 结构体,并用找到的未使用空间信息初始化该结构体,然后将该结构体插入到红黑树 vmap_area_root 中
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
/*
* When this function is called from __vmalloc_node_range,
* we do not add vm_struct to vmlist here to avoid
* accessing uninitialized members of vm_struct such as
* pages and nr_pages fields. They will be set later.
* To distinguish it from others, we use a VM_UNLIST flag.
*/
if (flags & VM_UNLIST)
setup_vmalloc_vm(area, va, flags, caller);
else
// 看前面注释可知,上面的 if 分支只是个特殊情况,我们只分析 else 分支
// insert_vmalloc_vm 函数的实现见后文
insert_vmalloc_vm(area, va, flags, caller);
return area;
}
文件中的 bootmem_init 函数中赋值,该函数的实现:
void __init bootmem_init(void)
{
unsigned long min, max_low, max_high;
max_low = max_high = 0;
// 找到内存的起始地址, 低端内存的最高地址, 高端内存的最高地址
// find_limits 函数实现见后文
find_limits(&min, &max_low, &max_high);
arm_bootmem_init(min, max_low);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
*/
arm_memory_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
arm_bootmem_free(min, max_low, max_high);
// high_memory 为高端内存的起始虚拟地址
high_memory = __va(((phys_addr_t)max_low << PAGE_SHIFT) - 1) + 1;
/*
* This doesn‘t seem to be used by the Linux memory manager any
* more, but is used by ll_rw_block. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*
* Note: max_low_pfn and max_pfn reflect the number of _pages_ in
* the system, not the maximum PFN.
*/
max_low_pfn = max_low - PHYS_PFN_OFFSET;
max_pfn = max_high - PHYS_PFN_OFFSET;
}
find_limits 函数实现:
static void __init find_limits(unsigned long *min, unsigned long *max_low,
unsigned long *max_high)
{
struct meminfo *mi = &meminfo;
int i;
*min = -1UL;
*max_low = *max_high = 0;
for_each_bank (i, mi) {
struct membank *bank = &mi->bank[i];
unsigned long start, end;
start = bank_pfn_start(bank);
end = bank_pfn_end(bank);
if (*min > start)
*min = start;
if (*max_high < end)
*max_high = end;
// 如果是高端内存,就不用更新 max_low 了
// 参考后面的 sanity_check_meminfo 函数
if (bank->highmem)
continue;
if (*max_low < end)
*max_low = end;
}
}
sanity_check_meminfo 函数的实现:
void __init sanity_check_meminfo(void)
{
int i, j, highmem = 0;
for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
struct membank *bank = &meminfo.bank[j];
*bank = meminfo.bank[i];
#ifdef CONFIG_HIGHMEM
// static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
if (__va(bank->start) >= vmalloc_min ||
__va(bank->start) < (void *)PAGE_OFFSET)
highmem = 1;
bank->highmem = highmem;
...
#else
bank->highmem = highmem;
...
}
...
}
回到函数 get_vm_area_caller 的实现。
insert_vmalloc_vm 函数的实现:
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
// vm(vm_struct) 结构体在函数 __get_vm_area_node 中分配
// va(vmap_area) 结构体,在函数 __get_vm_area_node 中通过调用 alloc_vmap_area 分配
setup_vmalloc_vm(vm, va, flags, caller);
insert_vmalloc_vmlist(vm);
}
setup_vmalloc_vm 函数的实现:
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
}
insert_vmalloc_vmlist 函数的实现:
static void insert_vmalloc_vmlist(struct vm_struct *vm)
{
struct vm_struct *tmp, **p;
vm->flags &= ~VM_UNLIST;
write_lock(&vmlist_lock);
// 将 vm_struct 结构体插入的 vmlist 中
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
break;
}
vm->next = *p;
*p = vm;
write_unlock(&vmlist_lock);
}
ioremap_page_range 函数的实现:
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
pgd_t *pgd;
unsigned long start;
unsigned long next;
int err;
BUG_ON(addr >= end);
start = addr;
phys_addr -= addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
flush_cache_vmap(start, end);
return err;
}
static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
pud_t *pud;
unsigned long next;
phys_addr -= addr;
pud = pud_alloc(&init_mm, pgd, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
pmd_t *pmd;
unsigned long next;
phys_addr -= addr;
pmd = pmd_alloc(&init_mm, pud, addr);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
pte_t *pte;
u64 pfn;
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
return 0;
}
上面几个函数的功能,是建立 linux 4级页表。
linux 4级页表参考:
http://larmbr.me/2014/01/19/the-evolution-of-4-level-page-talbe-in-linux/
总价一下。
ioremap中首先做了一些检查,其中一项检查是要处理的物理地址是不是 RAM ,因为 ioremap 只处理 soc 的 io memory ,不处理 RAM 。
分配一个 vm_struct 结构体。
之后分配一个 vmap_area 结构体,并查找红黑树 vmap_area_root 找到合适的 hole 。
然后初始化 vm_struct 结构体和 vmap_area 结构体的一些成员。
最后建立 linux 的4级内存页表。
4级即: PGD -> PUD -> PMD -> PTE
原文:http://blog.csdn.net/njuitjf/article/details/40745227