/* prepage.c: A pre-page system call for Linux.

    Written 1997-1998 by Donald Becker.

    This file implements prefetching of virtual memory pages for the
    Linux OS.  Pre-fetching of pages is a flexible version of
    asynchronous I/O.  A list of address/extent tuples specifies virtual
    memory pages to be resolved.  The kernel makes a best-effort attempt
    to bring those pages into physical memory.

    The author may be reached as becker@CESDIS.gsfc.nasa.gov, or C/O
    Center of Excellence in Space Data and Information Sciences
    Code 930.5, Goddard Space Flight Center, Greenbelt MD 20771

    Errata: this implementation only uses the base address, and not the
    range specification of the tuple.
*/

static const char *version =
    "prepage.c:v1.00 1/30/98 becker@cesdis.gsfc.nasa.gov\n";

/*
  Theory of Operation

  Hey, how much simpler can it be?  We accept a list of address/extent
  tuples, and start reading in the VM pages associated with each tuple.

  This is a best-effort attempt.  Failed I/O attempts are later resolved
  by the normal page fault mechanism, so we have the flexibility to have
  a direct, high-performance code path.

  The semantics of prepaging an invalid region are similar to reading that
  region: they may, but are not assured to, generate a SIGSEGV or a SIGBUS
  (the latter for a mmap()ed shared page).  The alternate semantics of
  ignoring prepaging of invalid regions are equally valid, but are not
  deemed useful.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/pgtable.h>

/* Variables that can be set when loading the module: */
/* Turn on debugging information. */
static int debug = 1;
/* The system call number we attempt to install ourselves as. */
static int syscall_num = 165;

struct prepage_tuple {
    caddr_t addr;
    size_t extent;
};

static void prefetch_region(struct task_struct *tsk,
                            struct vm_area_struct *vma,
                            unsigned long address, size_t extent,
                            int write_access);

asmlinkage int sys_prepage(int magic, struct prepage_tuple *tlist, int arg3)
{
    struct task_struct *tsk = current;
    struct mm_struct *mm = tsk->mm;
    struct vm_area_struct *vma;
    int error, i = 3;           /* At most two tuples are walked per call. */

    if (debug)
        printk("Prepage request entry, args are %8.8x %8.8x %8.8x.\n",
               magic, (int)tlist, arg3);
    while (--i > 0) {
        unsigned long address;
        size_t extent;

        if ((error = verify_area(VERIFY_READ, tlist,
                                 sizeof(struct prepage_tuple))))
            return error;
        address = (unsigned long) get_user(&tlist->addr);
        if (address == 0)       /* A zero address terminates the list. */
            break;
        extent = get_user(&tlist->extent);
        if (debug)
            printk("Prepage request at %8.8lx for %d bytes.\n",
                   address, extent);
        vma = find_vma_intersection(mm, address, address + extent);
        /* do_no_page efficiently checks for pages that already exist. */
        if (vma && (vma->vm_flags & (VM_READ | VM_EXEC)))
            prefetch_region(tsk, vma, address, extent, 0);
        tlist++;
    }
    return 0;
}
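/*
 * A minimal sketch of a userspace caller, under a few assumptions: the
 * module is installed at the default sys_call_table slot (165), and the
 * C library's generic syscall() wrapper is available.  The "magic" and
 * third arguments are currently ignored by sys_prepage(), the tuple list
 * is terminated by a zero address, and this version walks at most two
 * tuples per call.  The prefetch_buffer() helper name below is only for
 * illustration.
 *
 *	#include <unistd.h>
 *	#include <sys/types.h>
 *
 *	struct prepage_tuple { caddr_t addr; size_t extent; };
 *
 *	int prefetch_buffer(void *buf, size_t len)
 *	{
 *		struct prepage_tuple tuples[2];
 *
 *		tuples[0].addr = (caddr_t) buf;
 *		tuples[0].extent = len;
 *		tuples[1].addr = 0;
 *		tuples[1].extent = 0;
 *		return syscall(165, 0, tuples, 0);
 *	}
 */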
/* Don't ask... */
/* See /usr/src/linux/arch/i386/kernel/head.S for values. */
const unsigned long empty_bad_page = 0x3000;
const unsigned long empty_bad_page_table = 0x4000;

pte_t * __bad_pagetable(void)
{
    __asm__ __volatile__("cld ; rep ; stosl":
                         :"a" (pte_val(BAD_PAGE)),
                          "D" ((long) empty_bad_page_table),
                          "c" (PAGE_SIZE/4)
                         :"di","cx");
    return (pte_t *) empty_bad_page_table;
}

pte_t __bad_page(void)
{
    __asm__ __volatile__("cld ; rep ; stosl":
                         :"a" (0),
                          "D" ((long) empty_bad_page),
                          "c" (PAGE_SIZE/4)
                         :"di","cx");
    return pte_mkdirty(mk_pte((unsigned long) empty_bad_page, PAGE_SHARED));
}

/* Implement functionality similar to do_no_page().  We could check enough
   error conditions to use it directly, but the point is moot as it's not
   exported for module use.  This simplifies the semantics for invalid
   accesses anyway...
*/
/*
 * prefetch_region() tries to create a new page mapping.  It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 */
static void prefetch_region(struct task_struct *tsk,
                            struct vm_area_struct *vma,
                            unsigned long address, size_t extent,
                            int write_access)
{
    pgd_t *pgd;
    pmd_t *pmd;
    pte_t *page_table;
    pte_t entry;
    unsigned long page;

    pgd = pgd_offset(tsk->mm, address);
    pmd = pmd_alloc(pgd, address);
    if (!pmd)
        goto no_memory;
    page_table = pte_alloc(pmd, address);
    if (!page_table)
        goto no_memory;
    entry = *page_table;
    if (pte_present(entry))
        goto is_present;
    if (!pte_none(entry))
        goto swap_page;         /* Currently being swapped out. */
    address &= PAGE_MASK;

    if (!vma->vm_ops || !vma->vm_ops->nopage)
        goto anonymous_page;

    /*
     * The third argument is "no_share", which tells the low-level code
     * to copy, not share the page even if sharing is possible.  It's
     * essentially an early COW detection.
     */
    page = vma->vm_ops->nopage(vma, address,
                               (vma->vm_flags & VM_SHARED) ? 0 : write_access);
    if (!page)
        goto sigbus;
    ++tsk->maj_flt;             /* Increment major fault counter, */
    ++vma->vm_mm->rss;          /* and the resident set size. */

    /*
     * This silly early PAGE_DIRTY setting removes a race
     * due to the bad i386 page protection.  But it's valid
     * for other architectures too.
     *
     * Note that if write_access is true, we either now have
     * an exclusive copy of the page, or this is a shared mapping,
     * so we can make it writable and dirty to avoid having to
     * handle that later.
     */
    flush_page_to_ram(page);    /* No-op on cache-coherent Intel. */
    entry = mk_pte(page, vma->vm_page_prot);
    if (write_access) {
        entry = pte_mkwrite(pte_mkdirty(entry));
    } else if (mem_map[MAP_NR(page)].count > 1 &&
               !(vma->vm_flags & VM_SHARED))
        entry = pte_wrprotect(entry);
    if (!pte_none(*page_table))
        free_page(pte_page(entry));
    else
        set_pte(page_table, entry);
    /* No need to invalidate: a not-present page shouldn't be cached. */
    return;

anonymous_page:
    /* We don't actually allocate anonymous pages. */
    return;

swap_page:
    /* We get here if the page is being swapped out.  We handle this case
       only if it's easy to do so. */
#ifndef MODULE
    do_swap_page(tsk, vma, address, page_table, entry, write_access);
#else
    if (!vma->vm_ops || !vma->vm_ops->swapin) {
        /* We don't retrieve pages from anonymous swap yet. */
#ifdef notdef
        swap_in(tsk, vma, page_table, pte_val(entry), write_access);
        flush_page_to_ram(pte_page(*page_table));
#endif
    } else {
        pte_t page = vma->vm_ops->swapin(vma,
                                         address - vma->vm_start + vma->vm_offset,
                                         pte_val(entry));
        if (pte_val(*page_table) != pte_val(entry)) {
            free_page(pte_page(page));
        } else {
            if (mem_map[MAP_NR(pte_page(page))].count > 1 &&
                !(vma->vm_flags & VM_SHARED))
                page = pte_wrprotect(page);
            ++vma->vm_mm->rss;
            ++tsk->maj_flt;
            flush_page_to_ram(pte_page(page));
            set_pte(page_table, page);
        }
    }
#endif
    return;

sigbus:
no_memory:
is_present:
    return;
}

extern int sys_call_table[];

#ifdef MODULE
int init_module(void)
{
    printk(version);
    if (sys_call_table[syscall_num]) {
        printk("The requested sys_call_table slot %d is already used!\n",
               syscall_num);
        return 1;
    }
    sys_call_table[syscall_num] = (int)sys_prepage;
    return 0;
}

void cleanup_module(void)
{
    sys_call_table[syscall_num] = 0;
}
#endif  /* MODULE */

/*
 * Local variables:
 *  compile-command: "gcc -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c prepage.c"
 *  c-indent-level: 4
 *  tab-width: 4
 * End:
 */
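/*
 * A rough build-and-load sequence, as a sketch: the compile command mirrors
 * the one in the local-variables block above, and loading with the
 * modutils insmod/rmmod tools assumes root privileges and a kernel built
 * with module support.
 *
 *	gcc -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c prepage.c
 *	insmod prepage.o
 *	(run a program that issues the prepage call)
 *	rmmod prepage
 */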