From 4417fe62013767e15fcede705083e27e93574a65 Mon Sep 17 00:00:00 2001
From: Adrian-Ken Rueegsegger
Date: Mon, 16 Feb 2015 15:20:06 +0100
Subject: [PATCH] hw_x86_64: Implementation of IA-32e paging

IA-32e paging translates 48-bit linear addresses to 52-bit physical
addresses. The translation structures are hierarchical and four levels
deep. The current implementation supports regular 4KB mappings as well
as 1GB and 2MB large-page mappings. Memory typing is not yet
implemented since the encoded type bits depend on the active page
attribute table (PAT)*.

For detailed information refer to Intel SDM Vol. 3A, section 4.5.

* The default PAT after power-up does not allow the encoding of the
  write-combining memory type, see Intel SDM Vol. 3A, section 11.12.4.

* Add common IA-32e paging descriptor type

  The type represents a table entry and encompasses all fields shared
  by the paging-structure entries of all four levels (PML4, PDPT, PD
  and PT).

* Simplify PT entry type by using the common descriptor

  The differing fields are the physical address, the global flag and
  the memory-type flags.

* Simplify directory entry type by using the common descriptor

  Page-directory entries (PDPT and PD) have an additional 'page size'
  field that specifies whether the entry references a next-level paging
  structure or represents a large-page mapping.

* Simplify PML4 entry type by using the common descriptor

  Top-level paging-structure entries (PML4) do not have a 'pat' flag;
  the memory type is specified by the 'pwt' and 'pcd' fields only.

* Implement access-right merging for directory paging entries

  The access rights for translations are determined by the U/S, R/W and
  XD flags. Paging-structure entries that reference other tables must
  provide the superset of rights required by all entries of the
  referenced table. Thus, merge the access rights of new mappings into
  existing directory entries to grant additional rights where needed.

* Add cr3 register definition

  Control register 3 is used to set the current page-directory base
  register.

* Add cr3 variable to the x86_64 Cpu context

  The variable designates the address of the top-level paging
  structure.

* Return the current cr3 value as translation-table base

* Set the context cr3 value on translation-table assignment

* Implement switch to virtual mode in the kernel

  Activate the translation table in the init_virt_kernel function by
  updating the cr3 register.

* Ignore accessed and dirty flags when comparing existing table entries

  These flags can be set by the MMU and must be disregarded.
---
 repos/base-hw/src/core/include/spec/x86/cpu.h |  44 +-
 .../include/spec/x86_64/translation_table.h   | 693 +++++++++++++++++-
 .../src/core/spec/x86_64/mode_transition.s    |   2 +-
 3 files changed, 703 insertions(+), 36 deletions(-)

diff --git a/repos/base-hw/src/core/include/spec/x86/cpu.h b/repos/base-hw/src/core/include/spec/x86/cpu.h
index f828c1f11..67c24458c 100644
--- a/repos/base-hw/src/core/include/spec/x86/cpu.h
+++ b/repos/base-hw/src/core/include/spec/x86/cpu.h
@@ -51,20 +51,56 @@ class Genode::Cpu
 		static constexpr addr_t exception_entry = 0x0; /* XXX */
 		static constexpr addr_t mtc_size        = 1 << 13;
 
+		/**
+		 * Control register 3: Page-directory base register
+		 *
+		 * See Intel SDM Vol. 3A, section 2.5.
+		 */
+		struct Cr3 : Register<64>
+		{
+			struct Pwt : Bitfield<3,1> { };    /* page-level write-through    */
+			struct Pcd : Bitfield<4,1> { };    /* page-level cache disable    */
+			struct Pdb : Bitfield<12, 36> { }; /* page-directory base address */
+
+			static void write(access_t const v) {
+				asm volatile ("mov %0, %%cr3" :: "r" (v) : ); }
+
+			static access_t read()
+			{
+				access_t v;
+				asm volatile ("mov %%cr3, %0" : "=r" (v) :: );
+				return v;
+			}
+
+			/**
+			 * Return initialized value
+			 *
+			 * \param table  base of targeted translation table
+			 */
+			static access_t init(addr_t const table) {
+				return Pdb::masked(table); }
+		};
+
 		/**
 		 * Extend basic CPU state by members relevant for 'base-hw' only
 		 */
 		struct Context : Cpu_state
 		{
+			/**
+			 * Address of top-level paging structure
+			 */
+			addr_t cr3;
+
 			/**
 			 * Return base of assigned translation table
 			 */
-			addr_t translation_table() const { return 0UL; }
+			addr_t translation_table() const { return cr3; }
 
 			/**
 			 * Assign translation-table base 'table'
 			 */
-			void translation_table(addr_t const table) { }
+			void translation_table(addr_t const table) {
+				cr3 = Cr3::init(table); }
 
 			/**
 			 * Assign protection domain
@@ -187,8 +223,8 @@ class Genode::Cpu
 		 * \param process_id  process ID of the kernel address-space
 		 */
 		static void
-		init_virt_kernel(addr_t const table, unsigned const process_id)
-		{ }
+		init_virt_kernel(addr_t const table, unsigned const process_id) {
+			Cr3::write(Cr3::init(table)); }
 
 		inline static void finish_init_phys_kernel() { }
 
diff --git a/repos/base-hw/src/core/include/spec/x86_64/translation_table.h b/repos/base-hw/src/core/include/spec/x86_64/translation_table.h
index 7b0705ea6..9f0b5d003 100644
--- a/repos/base-hw/src/core/include/spec/x86_64/translation_table.h
+++ b/repos/base-hw/src/core/include/spec/x86_64/translation_table.h
@@ -15,58 +15,257 @@
 #ifndef _TRANSLATION_TABLE_H_
 #define _TRANSLATION_TABLE_H_
 
-#include
+/* Genode includes */
+#include
+#include
+#include
 #include
+
+/* base-hw includes */
+#include
 #include
 
 namespace Genode {
+
 	/**
-	 * First level translation table
+	 * IA-32e paging translates 48-bit linear addresses to 52-bit physical
+	 * addresses. Translation structures are hierarchical and four levels
+	 * deep.
+	 *
+	 * For detailed information refer to Intel SDM Vol. 3A, section 4.5.
 	 */
-	class Translation_table;
+
+	enum {
+		SIZE_LOG2_4KB   = 12,
+		SIZE_LOG2_2MB   = 21,
+		SIZE_LOG2_1GB   = 30,
+		SIZE_LOG2_512GB = 39,
+		SIZE_LOG2_256TB = 48,
+	};
+
+	class Level_4_translation_table;
+	class PML4_table;
+
+	/**
+	 * IA-32e page-directory template
+	 *
+	 * Page directories can refer to paging structures of the next higher
+	 * level or directly map page frames by using large-page mappings.
+	 *
+	 * \param PAGE_SIZE_LOG2  virtual address-range size in log2
+	 *                        of a single table entry
+	 * \param SIZE_LOG2       virtual address-range size in log2 of whole table
+	 */
+	template <typename ENTRY, unsigned PAGE_SIZE_LOG2, unsigned SIZE_LOG2>
+	class Page_directory;
+
+	using Level_3_translation_table =
+		Page_directory<Level_4_translation_table, SIZE_LOG2_2MB, SIZE_LOG2_1GB>;
+	using Level_2_translation_table =
+		Page_directory<Level_3_translation_table, SIZE_LOG2_1GB, SIZE_LOG2_512GB>;
+
+	using Translation_table = PML4_table;
+
+	/**
+	 * IA-32e common descriptor
+	 *
+	 * Table entry containing the descriptor fields common to all four
+	 * levels.
+	 */
+	struct Common_descriptor : Register<64>
+	{
+		struct P   : Bitfield<0, 1> { };   /* present          */
+		struct Rw  : Bitfield<1, 1> { };   /* read/write       */
+		struct Us  : Bitfield<2, 1> { };   /* user/supervisor  */
+		struct Pwt : Bitfield<3, 1> { };   /* write-through    */
+		struct Pcd : Bitfield<4, 1> { };   /* cache disable    */
+		struct A   : Bitfield<5, 1> { };   /* accessed         */
+		struct D   : Bitfield<6, 1> { };   /* dirty            */
+		struct Xd  : Bitfield<63, 1> { };  /* execute-disable  */
+
+		static bool present(access_t const v) { return P::get(v); }
+
+		static access_t create(Page_flags const &flags)
+		{
+			return P::bits(1)
+				| Rw::bits(flags.writeable)
+				| Us::bits(!flags.privileged)
+				| Xd::bits(!flags.executable);
+		}
+
+		/**
+		 * Return descriptor value with cleared accessed and dirty flags.
+		 * These flags can be set by the MMU.
+		 */
+		static access_t clear_mmu_flags(access_t value)
+		{
+			A::clear(value);
+			D::clear(value);
+			return value;
+		}
+
+		/**
+		 * Merge access rights of descriptor with given flags
+		 */
+		static void merge_access_rights(access_t &desc,
+		                                Page_flags const &flags)
+		{
+			Rw::set(desc, Rw::get(desc) | flags.writeable);
+			Us::set(desc, Us::get(desc) | !flags.privileged);
+			Xd::set(desc, Xd::get(desc) & !flags.executable);
+		}
+	};
 }
 
-
-class Genode::Translation_table
+class Genode::Level_4_translation_table
 {
-	public:
+	private:
 
-		enum {
-			ALIGNM_LOG2               = 12,
-			MIN_PAGE_SIZE_LOG2        = 12,
-			MAX_COSTS_PER_TRANSLATION = 4*4096
+		static constexpr size_t PAGE_SIZE_LOG2 = SIZE_LOG2_4KB;
+		static constexpr size_t SIZE_LOG2      = SIZE_LOG2_2MB;
+		static constexpr size_t MAX_ENTRIES    = 1 << (SIZE_LOG2-PAGE_SIZE_LOG2);
+		static constexpr size_t PAGE_SIZE      = 1 << PAGE_SIZE_LOG2;
+		static constexpr size_t PAGE_MASK      = ~((1 << PAGE_SIZE_LOG2) - 1);
+
+		class Misaligned {};
+		class Invalid_range {};
+		class Double_insertion {};
+
+		struct Descriptor : Common_descriptor
+		{
+			using Common = Common_descriptor;
+
+			struct Pat : Bitfield<7, 1> { };          /* page attribute table */
+			struct G   : Bitfield<8, 1> { };          /* global               */
+			struct Pa  : Bitfield<12, 36> { };        /* physical address     */
+			struct Mt  : Bitset_3<Pwt, Pcd, Pat> { }; /* memory type          */
+
+			static access_t create(Page_flags const &flags, addr_t const pa)
+			{
+				/* XXX: Set memory type depending on active PAT */
+				return Common::create(flags)
+					| G::bits(flags.global)
+					| Pa::masked(pa);
+			}
 		};
 
-		void * operator new (size_t, void * p) { return p; }
+		typename Descriptor::access_t _entries[MAX_ENTRIES];
 
-		/**
-		 * Constructor
-		 */
-		Translation_table() { }
+		inline bool _aligned(addr_t const a, size_t const alignm_log2) {
+			return a == ((a >> alignm_log2) << alignm_log2); }
 
-		/**
-		 * Maximum virtual offset that can be translated by this table
-		 */
-		static addr_t max_virt_offset()
+		struct Insert_func
 		{
-			PDBG("not implemented");
-			return 0;
+			Page_flags const & flags;
+			Page_slab        * slab;
+
+			Insert_func(Page_flags const & flags,
+			            Page_slab * slab) : flags(flags), slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  Descriptor::access_t &desc)
+			{
+				if ((vo & ~PAGE_MASK) || (pa & ~PAGE_MASK) ||
+				    size < PAGE_SIZE)
+					throw Invalid_range();
+
+				Descriptor::access_t table_entry =
+					Descriptor::create(flags, pa);
+				if (Descriptor::present(desc) &&
+				    Descriptor::clear_mmu_flags(desc) != table_entry)
+					throw Double_insertion();
+
+				desc = table_entry;
+			}
+		};
+
+		struct Remove_func
+		{
+			Page_slab * slab;
+
+			Remove_func(Page_slab * slab) : slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  Descriptor::access_t &desc) {
+				desc = 0; }
+		};
+
+		template <typename FUNC>
+		void _range_op(addr_t vo, addr_t pa, size_t size, FUNC &&func)
+		{
+			for (size_t i = vo >> PAGE_SIZE_LOG2; size > 0;
+			     i = vo >> PAGE_SIZE_LOG2) {
+				addr_t end = (vo + PAGE_SIZE) & PAGE_MASK;
+				size_t sz  = min(size, end-vo);
+
+				func(vo, pa, sz, _entries[i]);
+
+				/* check whether we wrap */
+				if (end < vo) return;
+
+				size = size - sz;
+				vo  += sz;
+				pa  += sz;
+			}
+		}
+
+	public:
+
+		static constexpr size_t MIN_PAGE_SIZE_LOG2 = SIZE_LOG2_4KB;
+		static constexpr size_t ALIGNM_LOG2        = SIZE_LOG2_4KB;
+
+		/**
+		 * IA-32e page table (level 4)
+		 *
+		 * A page table consists of 512 entries that each map a 4KB page
+		 * frame. For further details refer to Intel SDM Vol. 3A, table 4-19.
+		 */
+		Level_4_translation_table()
+		{
+			if (!_aligned((addr_t)this, ALIGNM_LOG2))
+				throw Misaligned();
+
+			memset(&_entries, 0, sizeof(_entries));
+		}
+
+		/**
+		 * Return true if the table does not contain any page mappings
+		 *
+		 * \return  false if an entry is present, true otherwise
+		 */
+		bool empty()
+		{
+			for (unsigned i = 0; i < MAX_ENTRIES; i++)
+				if (Descriptor::present(_entries[i]))
+					return false;
+			return true;
 		}
 
 		/**
 		 * Insert translations into this table
 		 *
-		 * \param vo    offset of virt. transl. region in virt. table region
-		 * \param pa    base of physical backing store
-		 * \param size  size of translated region
-		 * \param f     mapping flags
-		 * \param s     second level page slab allocator
+		 * \param vo     offset of the virtual region represented
+		 *               by the translation within the virtual
+		 *               region represented by this table
+		 * \param pa     base of the physical backing store
+		 * \param size   size of the translated region
+		 * \param flags  mapping flags
+		 * \param slab   second level page slab allocator
 		 */
-		void insert_translation(addr_t vo, addr_t pa, size_t size,
-		                        Page_flags const & f, Page_slab * const s)
+		void insert_translation(addr_t vo,
+		                        addr_t pa,
+		                        size_t size,
+		                        Page_flags const & flags,
+		                        Page_slab * slab)
 		{
-			PDBG("not implemented");
+			this->_range_op(vo, pa, size, Insert_func(flags, slab));
 		}
 
 		/**
@@ -78,8 +277,440 @@
 		 */
 		void remove_translation(addr_t vo, size_t size, Page_slab * slab)
 		{
-			PDBG("not implemented");
+			this->_range_op(vo, 0, size, Remove_func(slab));
 		}
-};
+} __attribute__((aligned(1 << ALIGNM_LOG2)));
+
+
+template <typename ENTRY, unsigned PAGE_SIZE_LOG2, unsigned SIZE_LOG2>
+class Genode::Page_directory
+{
+	private:
+
+		static constexpr size_t MAX_ENTRIES = 1 << (SIZE_LOG2-PAGE_SIZE_LOG2);
+		static constexpr size_t PAGE_SIZE   = 1 << PAGE_SIZE_LOG2;
+		static constexpr size_t PAGE_MASK   = ~((1 << PAGE_SIZE_LOG2) - 1);
+
+		class Misaligned {};
+		class Invalid_range {};
+		class Double_insertion {};
+
+		struct Base_descriptor : Common_descriptor
+		{
+			using Common = Common_descriptor;
+
+			struct Ps : Common::template Bitfield<7, 1> { }; /* page size */
+
+			static bool maps_page(access_t const v) { return Ps::get(v); }
+		};
+
+		struct Page_descriptor : Base_descriptor
+		{
+			using Base = Base_descriptor;
+
+			/**
+			 * Global attribute
+			 */
+			struct G : Base::template Bitfield<8, 1> { };
+
+			/**
+			 * Page attribute table
+			 */
+			struct Pat : Base::template Bitfield<12, 1> { };
+
+			/**
+			 * Physical address
+			 */
+			struct Pa : Base::template Bitfield<PAGE_SIZE_LOG2,
+			                                    48 - PAGE_SIZE_LOG2> { };
+
+			/**
+			 * Memory type
+			 */
+			struct Mt : Base::template Bitset_3<typename Base::Pwt,
+			                                    typename Base::Pcd, Pat> { };
+
+			static typename Base::access_t create(Page_flags const &flags,
+			                                      addr_t const pa)
+			{
+				/* XXX: Set memory type depending on active PAT */
+				return Base::create(flags)
+					| Base::Ps::bits(1)
+					| G::bits(flags.global)
+					| Pa::masked(pa);
+			}
+		};
+
+		struct Table_descriptor : Base_descriptor
+		{
+			using Base = Base_descriptor;
+
+			/**
+			 * Physical address
+			 */
+			struct Pa : Base::template Bitfield<12, 36> { };
+
+			/**
+			 * Memory types
+			 */
+			struct Mt : Base::template Bitset_2<typename Base::Pwt,
+			                                    typename Base::Pcd> { };
+
+			static typename Base::access_t create(Page_flags const &flags,
+			                                      addr_t const pa)
+			{
+				/* XXX: Set memory type depending on active PAT */
+				return Base::create(flags)
+					| Pa::masked(pa);
+			}
+		};
+
+		typename Base_descriptor::access_t _entries[MAX_ENTRIES];
+
+		inline bool _aligned(addr_t const a, size_t const alignm_log2) {
+			return a == ((a >> alignm_log2) << alignm_log2); }
+
+		struct Insert_func
+		{
+			Page_flags const & flags;
+			Page_slab        * slab;
+
+			Insert_func(Page_flags const & flags,
+			            Page_slab * slab) : flags(flags), slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  typename Base_descriptor::access_t &desc)
+			{
+				/* can we insert a large page mapping? */
+				if (!((vo & ~PAGE_MASK) || (pa & ~PAGE_MASK) ||
+				      size < PAGE_SIZE)) {
+					typename Base_descriptor::access_t table_entry =
+						Page_descriptor::create(flags, pa);
+
+					if (Base_descriptor::present(desc) &&
+					    Base_descriptor::clear_mmu_flags(desc) != table_entry)
+						throw Double_insertion();
+
+					desc = table_entry;
+					return;
+				}
+
+				/* we need to use a next level table */
+				ENTRY *table;
+				if (!Base_descriptor::present(desc)) {
+					if (!slab)
+						throw Allocator::Out_of_memory();
+
+					/* create and link next level table */
+					table = new (slab) ENTRY();
+					ENTRY * phys_addr = (ENTRY*) slab->phys_addr(table);
+					desc = (typename Base_descriptor::access_t)
+						Table_descriptor::create(flags,
+						                         (addr_t)(phys_addr ? phys_addr
+						                                            : table));
+				} else if (Base_descriptor::maps_page(desc)) {
+					throw Double_insertion();
+				} else {
+					Base_descriptor::merge_access_rights(desc, flags);
+					ENTRY * phys_addr = (ENTRY*)
+						Table_descriptor::Pa::masked(desc);
+					table = (ENTRY*) slab->virt_addr(phys_addr);
+					table = table ? table : (ENTRY*)phys_addr;
+				}
+
+				/* insert translation */
+				table->insert_translation(vo - (vo & PAGE_MASK),
+				                          pa, size, flags, slab);
+			}
+		};
+
+		struct Remove_func
+		{
+			Page_slab * slab;
+
+			Remove_func(Page_slab * slab) : slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  typename Base_descriptor::access_t &desc)
+			{
+				if (Base_descriptor::present(desc)) {
+					if (Base_descriptor::maps_page(desc)) {
+						desc = 0;
+					} else {
+						/* use allocator to retrieve virt address of table */
+						ENTRY* phys_addr = (ENTRY*)
+							Table_descriptor::Pa::masked(desc);
+						ENTRY* table = (ENTRY*) slab->virt_addr(phys_addr);
+						table = table ? table : (ENTRY*)phys_addr;
+						table->remove_translation(vo - (vo & PAGE_MASK),
+						                          size, slab);
+						if (table->empty()) {
+							destroy(slab, table);
+							desc = 0;
+						}
+					}
+				}
+			}
+		};
+
+		template <typename FUNC>
+		void _range_op(addr_t vo, addr_t pa, size_t size, FUNC &&func)
+		{
+			for (size_t i = vo >> PAGE_SIZE_LOG2; size > 0;
+			     i = vo >> PAGE_SIZE_LOG2) {
+				addr_t end = (vo + PAGE_SIZE) & PAGE_MASK;
+				size_t sz  = min(size, end-vo);
+
+				func(vo, pa, sz, _entries[i]);
+
+				/* check whether we wrap */
+				if (end < vo) return;
+
+				size = size - sz;
+				vo  += sz;
+				pa  += sz;
+			}
+		}
+
+	public:
+
+		static constexpr size_t MIN_PAGE_SIZE_LOG2 = SIZE_LOG2_4KB;
+		static constexpr size_t ALIGNM_LOG2        = SIZE_LOG2_4KB;
+
+		Page_directory()
+		{
+			if (!_aligned((addr_t)this, ALIGNM_LOG2))
+				throw Misaligned();
+
+			memset(&_entries, 0, sizeof(_entries));
+		}
+
+		/**
+		 * Return true if the table does not contain any page mappings
+		 *
+		 * \return  false if an entry is present, true otherwise
+		 */
+		bool empty()
+		{
+			for (unsigned i = 0; i < MAX_ENTRIES; i++)
+				if (Base_descriptor::present(_entries[i]))
+					return false;
+			return true;
+		}
+
+		/**
+		 * Insert translations into this table
+		 *
+		 * \param vo     offset of the virtual region represented
+		 *               by the translation within the virtual
+		 *               region represented by this table
+		 * \param pa     base of the physical backing store
+		 * \param size   size of the translated region
+		 * \param flags  mapping flags
+		 * \param slab   second level page slab allocator
+		 */
+		void insert_translation(addr_t vo,
+		                        addr_t pa,
+		                        size_t size,
+		                        Page_flags const & flags,
+		                        Page_slab * slab)
+		{
+			_range_op(vo, pa, size, Insert_func(flags, slab));
+		}
+
+		/**
+		 * Remove translations that overlap with a given virtual region
+		 *
+		 * \param vo    region offset within the table's virtual region
+		 * \param size  region size
+		 * \param slab  second level page slab allocator
+		 */
+		void remove_translation(addr_t vo, size_t size, Page_slab * slab)
+		{
+			_range_op(vo, 0, size, Remove_func(slab));
+		}
+} __attribute__((aligned(1 << ALIGNM_LOG2)));
+
+
+class Genode::PML4_table
+{
+	private:
+
+		static constexpr size_t PAGE_SIZE_LOG2 = SIZE_LOG2_512GB;
+		static constexpr size_t SIZE_LOG2      = SIZE_LOG2_256TB;
+		static constexpr size_t MAX_ENTRIES    = 512;
+		static constexpr size_t PAGE_SIZE      = 1UL << PAGE_SIZE_LOG2;
+		static constexpr size_t PAGE_MASK      = ~((1UL << PAGE_SIZE_LOG2) - 1);
+
+		class Misaligned {};
+		class Invalid_range {};
+
+		struct Descriptor : Common_descriptor
+		{
+			struct Pa : Bitfield<12, SIZE_LOG2> { }; /* physical address */
+			struct Mt : Bitset_2<Pwt, Pcd> { };      /* memory type      */
+
+			static access_t create(Page_flags const &flags, addr_t const pa)
+			{
+				/* XXX: Set memory type depending on active PAT */
+				return Common_descriptor::create(flags)
+					| Pa::masked(pa);
+			}
+		};
+
+		typename Descriptor::access_t _entries[MAX_ENTRIES];
+
+		inline bool _aligned(addr_t const a, size_t const alignm_log2) {
+			return a == ((a >> alignm_log2) << alignm_log2); }
+
+		using ENTRY = Level_2_translation_table;
+
+		struct Insert_func
+		{
+			Page_flags const & flags;
+			Page_slab        * slab;
+
+			Insert_func(Page_flags const & flags,
+			            Page_slab * slab) : flags(flags), slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  Descriptor::access_t &desc)
+			{
+				/* we need to use a next level table */
+				ENTRY *table;
+				if (!Descriptor::present(desc)) {
+					if (!slab)
+						throw Allocator::Out_of_memory();
+
+					/* create and link next level table */
+					table = new (slab) ENTRY();
+					ENTRY * phys_addr = (ENTRY*) slab->phys_addr(table);
+					desc = Descriptor::create(flags,
+					                          (addr_t)(phys_addr ? phys_addr
+					                                             : table));
+				} else {
+					Descriptor::merge_access_rights(desc, flags);
+					ENTRY * phys_addr = (ENTRY*)
+						Descriptor::Pa::masked(desc);
+					table = (ENTRY*) slab->virt_addr(phys_addr);
+					table = table ? table : (ENTRY*)phys_addr;
+				}
+
+				/* insert translation */
+				table->insert_translation(vo - (vo & PAGE_MASK),
+				                          pa, size, flags, slab);
+			}
+		};
+
+		struct Remove_func
+		{
+			Page_slab * slab;
+
+			Remove_func(Page_slab * slab) : slab(slab) { }
+
+			void operator () (addr_t const vo,
+			                  addr_t const pa,
+			                  size_t const size,
+			                  Descriptor::access_t &desc)
+			{
+				if (Descriptor::present(desc)) {
+					/* use allocator to retrieve virt address of table */
+					ENTRY* phys_addr = (ENTRY*)
+						Descriptor::Pa::masked(desc);
+					ENTRY* table = (ENTRY*) slab->virt_addr(phys_addr);
+					table = table ? table : (ENTRY*)phys_addr;
+					table->remove_translation(vo - (vo & PAGE_MASK), size,
+					                          slab);
+					if (table->empty()) {
+						destroy(slab, table);
+						desc = 0;
+					}
+				}
+			}
+		};
+
+		template <typename FUNC>
+		void _range_op(addr_t vo, addr_t pa, size_t size, FUNC &&func)
+		{
+			for (size_t i = vo >> PAGE_SIZE_LOG2; size > 0;
+			     i = vo >> PAGE_SIZE_LOG2) {
+				addr_t end = (vo + PAGE_SIZE) & PAGE_MASK;
+				size_t sz  = min(size, end-vo);
+
+				func(vo, pa, sz, _entries[i]);
+
+				/* check whether we wrap */
+				if (end < vo) return;
+
+				size = size - sz;
+				vo  += sz;
+				pa  += sz;
+			}
+		}
+
+	public:
+
+		static constexpr size_t MIN_PAGE_SIZE_LOG2 = SIZE_LOG2_4KB;
+		static constexpr size_t ALIGNM_LOG2        = SIZE_LOG2_4KB;
+
+		PML4_table()
+		{
+			if (!_aligned((addr_t)this, ALIGNM_LOG2))
+				throw Misaligned();
+
+			memset(&_entries, 0, sizeof(_entries));
+		}
+
+		/**
+		 * Return true if the table does not contain any page mappings
+		 *
+		 * \return  false if an entry is present, true otherwise
+		 */
+		bool empty()
+		{
+			for (unsigned i = 0; i < MAX_ENTRIES; i++)
+				if (Descriptor::present(_entries[i]))
+					return false;
+			return true;
+		}
+
+		/**
+		 * Insert translations into this table
+		 *
+		 * \param vo     offset of the virtual region represented
+		 *               by the translation within the virtual
+		 *               region represented by this table
+		 * \param pa     base of the physical backing store
+		 * \param size   size of the translated region
+		 * \param flags  mapping flags
+		 * \param slab   second level page slab allocator
+		 */
+		void insert_translation(addr_t vo,
+		                        addr_t pa,
+		                        size_t size,
+		                        Page_flags const & flags,
+		                        Page_slab * slab)
+		{
+			_range_op(vo, pa, size, Insert_func(flags, slab));
+		}
+
+		/**
+		 * Remove translations that overlap with a given virtual region
+		 *
+		 * \param vo    region offset within the table's virtual region
+		 * \param size  region size
+		 * \param slab  second level page slab allocator
+		 */
+		void remove_translation(addr_t vo, size_t size, Page_slab * slab)
+		{
+			_range_op(vo, 0, size, Remove_func(slab));
+		}
+} __attribute__((aligned(1 << ALIGNM_LOG2)));
 
 #endif /* _TRANSLATION_TABLE_H_ */
diff --git a/repos/base-hw/src/core/spec/x86_64/mode_transition.s b/repos/base-hw/src/core/spec/x86_64/mode_transition.s
index f5ba76b11..08c200b53 100644
--- a/repos/base-hw/src/core/spec/x86_64/mode_transition.s
+++ b/repos/base-hw/src/core/spec/x86_64/mode_transition.s
@@ -35,7 +35,7 @@ _mt_master_context_begin:
 
 	/* space must be at least as large as 'Cpu_state' */
-	.space 20*8
+	.space 21*8
 
 	.global _mt_master_context_end
 	_mt_master_context_end:
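
As background for the address arithmetic used above (the '_range_op' index
computation 'vo >> PAGE_SIZE_LOG2' and the SIZE_LOG2_* constants), the
following standalone C++ sketch shows how a 48-bit linear address splits into
the four 9-bit table indices and the page offset. It is not part of the patch;
the sample address and all identifiers are illustrative.

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint64_t const vaddr = 0x00007f1234567abcULL;

        /* each level selects one of 512 entries via 9 index bits */
        unsigned const pml4_idx = (vaddr >> 39) & 0x1ff;  /* SIZE_LOG2_512GB     */
        unsigned const pdpt_idx = (vaddr >> 30) & 0x1ff;  /* SIZE_LOG2_1GB       */
        unsigned const pd_idx   = (vaddr >> 21) & 0x1ff;  /* SIZE_LOG2_2MB       */
        unsigned const pt_idx   = (vaddr >> 12) & 0x1ff;  /* SIZE_LOG2_4KB       */
        unsigned const offset   =  vaddr        & 0xfff;  /* offset in 4KB frame */

        printf("PML4 %u, PDPT %u, PD %u, PT %u, offset 0x%x\n",
               pml4_idx, pdpt_idx, pd_idx, pt_idx, offset);
        return 0;
    }

Each level therefore holds 512 entries, which matches
MAX_ENTRIES = 1 << (SIZE_LOG2 - PAGE_SIZE_LOG2) in the tables above.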
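The access-right merging introduced by the patch can also be illustrated in
isolation: the R/W and U/S bits of a directory entry become the OR of the
rights of all mappings beneath it, while XD may remain set only if every
mapping beneath is non-executable. The sketch below mirrors the logic of
Common_descriptor::merge_access_rights() with plain bools; it is not part of
the patch and its identifiers are chosen only for illustration.

    #include <cassert>

    /* simplified stand-ins for the patch's Page_flags and descriptor bits */
    struct Flags    { bool writeable, privileged, executable; };
    struct Dir_bits { bool rw, us, xd; };

    void merge(Dir_bits &d, Flags const &f)
    {
        d.rw = d.rw || f.writeable;    /* grant write if any mapping writes        */
        d.us = d.us || !f.privileged;  /* grant user access if any mapping is user */
        d.xd = d.xd && !f.executable;  /* keep XD only if no mapping executes      */
    }

    int main()
    {
        Dir_bits dir { false, false, true };        /* read-only, kernel-only, XD */

        merge(dir, Flags { true,  true,  false });  /* writable kernel data       */
        merge(dir, Flags { false, false, true  });  /* user-executable text       */

        assert(dir.rw && dir.us && !dir.xd);
        return 0;
    }

This is the reason Insert_func calls merge_access_rights() on an
already-present table reference before descending into the referenced table.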