--- linux-2.4.2/include/linux/sched.h Sun Feb 25 17:37:13 2001 +++ lk/include/linux/sched.h Sun Feb 25 23:08:02 2001 @@ -26,6 +26,7 @@ #include #include #include +#include /* * cloning flags: --- linux-2.4.2/include/linux/low-latency.h Thu Jan 1 00:00:00 1970 +++ lk/include/linux/low-latency.h Sun Feb 25 23:08:02 2001 @@ -0,0 +1,107 @@ +/* + * include/linux/low-latency.h + * + * Andrew Morton + */ + +#ifndef LOW_LATENCY_H_INCLUDED +#define LOW_LATENCY_H_INCLUDED + +#if defined(CONFIG_LOLAT) +#define LOWLATENCY_NEEDED 1 +#else +#define LOWLATENCY_NEEDED 0 +#endif + +#if LOWLATENCY_NEEDED + +#include /* For ____cacheline_aligned */ + +#ifdef CONFIG_LOLAT_SYSCTL +extern struct low_latency_enable_struct { + int yep; +} ____cacheline_aligned __enable_lowlatency; +#define enable_lowlatency __enable_lowlatency.yep + +#else +#define enable_lowlatency 1 +#endif + +/* + * Set this non-zero to generate low-latency instrumentation + */ +#define LOWLATENCY_DEBUG 0 + +/* + * Set this non-zero for robustness testing + */ +#define LOWLATENCY_ALWAYS_SCHEDULE 0 + +#if LOWLATENCY_DEBUG + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() ((enable_lowlatency == 2) || (enable_lowlatency && current->need_resched)) +#else +#define conditional_schedule_needed() (enable_lowlatency && current->need_resched) +#endif + +struct lolat_stats_t { + unsigned long count; + int visited; + const char *file; + int line; + struct lolat_stats_t *next; +}; + +void set_running_and_schedule(struct lolat_stats_t *stats); + +#define unconditional_schedule() \ + do { \ + static struct lolat_stats_t stats = { \ + file: __FILE__, \ + line: __LINE__, \ + }; \ + set_running_and_schedule(&stats); \ + } while (0) + +extern void show_lolat_stats(void); + +#else /* LOWLATENCY_DEBUG */ + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() 1 +#else +#define conditional_schedule_needed() (current->need_resched) +#endif + +void set_running_and_schedule(void); +#define unconditional_schedule() set_running_and_schedule() + +#endif /* LOWLATENCY_DEBUG */ + +#define conditional_schedule() \ + do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ + } while (0) + +#define DEFINE_RESCHED_COUNT int resched_count = 0 +#define TEST_RESCHED_COUNT(n) (enable_lowlatency && (++resched_count > (n))) +#define RESET_RESCHED_COUNT() resched_count = 0 +extern int ll_copy_to_user(void *to_user, const void *from, unsigned long len); + +#else /* LOWLATENCY_NEEDED */ + +#define conditional_schedule_needed() 0 +#define conditional_schedule() +#define unconditional_schedule() + +#define DEFINE_RESCHED_COUNT +#define TEST_RESCHED_COUNT(n) 0 +#define RESET_RESCHED_COUNT() +#define ll_copy_to_user(to_user, from, len) copy_to_user((to_user), (from), (len)) + +#endif /* LOWLATENCY_NEEDED */ + +#endif /* LOW_LATENCY_H_INCLUDED */ + --- linux-2.4.2/include/linux/mm.h Sun Feb 25 17:37:13 2001 +++ lk/include/linux/mm.h Sun Feb 25 23:08:02 2001 @@ -254,6 +254,10 @@ #define NOPAGE_SIGBUS (NULL) #define NOPAGE_OOM ((struct page *) (-1)) +/* Actions for zap_page_range() */ +#define ZPR_FLUSH_CACHE 1 /* Do flush_cache_range() prior to releasing pages */ +#define ZPR_FLUSH_TLB 2 /* Do flush_tlb_range() after releasing pages */ +#define ZPR_COND_RESCHED 4 /* Do a conditional_schedule() occasionally */ /* * Various page->flags bits: @@ -389,7 +393,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long 
address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); --- linux-2.4.2/include/linux/sysctl.h Tue Jan 30 18:24:55 2001 +++ lk/include/linux/sysctl.h Sun Feb 25 23:08:02 2001 @@ -117,6 +117,7 @@ KERN_OVERFLOWGID=47, /* int: overflow GID */ KERN_SHMPATH=48, /* string: path to shm fs */ KERN_HOTPLUG=49, /* string: path to hotplug policy agent */ + KERN_LOWLATENCY=50, /* int: enable low latency scheduling */ }; --- linux-2.4.2/kernel/exit.c Sun Feb 25 17:37:14 2001 +++ lk/kernel/exit.c Sun Feb 25 23:08:02 2001 @@ -193,6 +193,7 @@ } i++; set >>= 1; + conditional_schedule(); /* sys_exit, many files open */ } } } --- linux-2.4.2/kernel/module.c Sun Feb 25 17:37:14 2001 +++ lk/kernel/module.c Sun Feb 25 23:08:02 2001 @@ -1180,6 +1180,7 @@ continue; for (i = mod->nsyms, sym = mod->syms; i > 0; --i, ++sym) { + conditional_schedule(); p = buf + len; if (*mod->name) { len += sprintf(p, "%0*lx %s\t[%s]\n", --- linux-2.4.2/kernel/sched.c Sun Feb 25 17:37:14 2001 +++ lk/kernel/sched.c Sun Feb 25 23:08:02 2001 @@ -280,6 +280,17 @@ if (tsk->processor != this_cpu) smp_send_reschedule(tsk->processor); } +#if LOWLATENCY_NEEDED + if (enable_lowlatency && (p->policy & (SCHED_OTHER|SCHED_FIFO))) { + struct task_struct *t; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + t = cpu_curr(cpu); + if (t != tsk) + t->need_resched = 1; + } + } +#endif return; @@ -1231,3 +1242,74 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + +#if LOWLATENCY_NEEDED +#if LOWLATENCY_DEBUG + +static struct lolat_stats_t *lolat_stats_head; +static spinlock_t lolat_stats_lock = SPIN_LOCK_UNLOCKED; + +void set_running_and_schedule(struct lolat_stats_t *stats) +{ + spin_lock(&lolat_stats_lock); + if (stats->visited == 0) { + stats->visited = 1; + stats->next = lolat_stats_head; + lolat_stats_head = stats; + } + stats->count++; + spin_unlock(&lolat_stats_lock); + + if (current->state != TASK_RUNNING) + set_current_state(TASK_RUNNING); + schedule(); +} + +void show_lolat_stats(void) +{ + struct lolat_stats_t *stats = lolat_stats_head; + + printk("Low latency scheduling stats:\n"); + while (stats) { + printk("%s:%d: %lu\n", stats->file, stats->line, stats->count); + stats->count = 0; + stats = stats->next; + } +} + +#else /* LOWLATENCY_DEBUG */ + +void set_running_and_schedule() +{ + if (current->state != TASK_RUNNING) + __set_current_state(TASK_RUNNING); + schedule(); +} + +#endif /* LOWLATENCY_DEBUG */ + +int ll_copy_to_user(void *to_user, const void *from, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_to_user(to_user, from, n_to_copy); + if (remainder) + return remainder + len; + to_user = ((char *)to_user) + n_to_copy; + from = ((char *)from) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +#ifdef CONFIG_LOLAT_SYSCTL +struct low_latency_enable_struct __enable_lowlatency = { 0, }; +#endif + +#endif /* LOWLATENCY_NEEDED */ + --- linux-2.4.2/kernel/ksyms.c Sun Feb 25 17:37:14 2001 +++ lk/kernel/ksyms.c Sun Feb 25 23:08:02 2001 @@ -433,6 +433,13 @@ EXPORT_SYMBOL(do_gettimeofday); 
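/*
 * Illustrative sketch, not part of the patch: how a long-running loop is
 * expected to use the primitives declared in low-latency.h and implemented
 * in kernel/sched.c above.  my_long_loop(), struct my_item and
 * process_item() are invented names used only for this example.
 */
#include <linux/sched.h>
#include <linux/low-latency.h>

struct my_item { int data; };			/* stand-in payload */
static void process_item(struct my_item *item);	/* hypothetical worker */

static void my_long_loop(struct my_item *items, int nr_items)
{
	int i;
	DEFINE_RESCHED_COUNT;		/* expands to nothing if !CONFIG_LOLAT */

	for (i = 0; i < nr_items; i++) {
		process_item(&items[i]);	/* hypothetical per-item work */

		/* Only poll current->need_resched every 16 items. */
		if (TEST_RESCHED_COUNT(16)) {
			RESET_RESCHED_COUNT();
			conditional_schedule();	/* no spinlocks held here */
		}
	}
}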
EXPORT_SYMBOL(do_settimeofday); +#if LOWLATENCY_NEEDED +EXPORT_SYMBOL(set_running_and_schedule); +#ifdef CONFIG_LOLAT_SYSCTL +EXPORT_SYMBOL(__enable_lowlatency); +#endif +#endif + #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif --- linux-2.4.2/kernel/sysctl.c Sun Feb 25 17:37:14 2001 +++ lk/kernel/sysctl.c Sun Feb 25 23:08:44 2001 @@ -249,6 +249,10 @@ {KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug", &sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec}, #endif +#ifdef CONFIG_LOLAT_SYSCTL + {KERN_LOWLATENCY, "lowlatency", &enable_lowlatency, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif {0} }; --- linux-2.4.2/mm/filemap.c Sun Feb 25 17:37:14 2001 +++ lk/mm/filemap.c Sun Feb 25 23:08:02 2001 @@ -275,6 +275,7 @@ unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); repeat: + conditional_schedule(); /* unlink large files */ spin_lock(&pagecache_lock); if (truncate_list_pages(&mapping->clean_pages, start, &partial)) goto repeat; @@ -366,6 +367,7 @@ page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */ lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -414,8 +416,8 @@ { int (*writepage)(struct page *) = mapping->a_ops->writepage; + conditional_schedule(); /* sys_msync() */ spin_lock(&pagecache_lock); - while (!list_empty(&mapping->dirty_pages)) { struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); @@ -428,6 +430,8 @@ page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() */ + lock_page(page); if (PageDirty(page)) { @@ -451,6 +455,8 @@ */ void filemap_fdatawait(struct address_space * mapping) { + DEFINE_RESCHED_COUNT; +restart: spin_lock(&pagecache_lock); while (!list_empty(&mapping->locked_pages)) { @@ -459,6 +465,17 @@ list_del(&page->list); list_add(&page->list, &mapping->clean_pages); + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + page_cache_get(page); + spin_unlock(&pagecache_lock); + unconditional_schedule(); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -1063,6 +1080,8 @@ struct page *page, **hash; unsigned long end_index, nr; + conditional_schedule(); /* sys_read() */ + end_index = inode->i_size >> PAGE_CACHE_SHIFT; if (index > end_index) break; @@ -1618,6 +1637,12 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + + if (conditional_schedule_needed()) { + spin_unlock(&vma->vm_mm->page_table_lock); + unconditional_schedule(); /* syncing large mapped files */ + spin_lock(&vma->vm_mm->page_table_lock); + } return error; } @@ -2022,9 +2047,8 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - flush_cache_range(vma->vm_mm, start, end); - zap_page_range(vma->vm_mm, start, end - start); - flush_tlb_range(vma->vm_mm, start, end); + zap_page_range(vma->vm_mm, start, end - start, + ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB|ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */ return 0; } @@ -2496,6 +2520,7 @@ char *kaddr; int deactivate = 1; + conditional_schedule(); /* sys_write() */ /* * Try to find the page in the cache. If it isn't there, * allocate a free page. 
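/*
 * A sketch, not part of the patch: it restates the lock-break pattern used
 * by the filemap.c hunks above (e.g. filemap_fdatawait()): pin the page,
 * drop the spinlock, reschedule, then restart the walk because the list may
 * have changed while we slept.  walk_locked_pages() is an invented name and
 * the header/locking details are assumptions based on 2.4.
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/low-latency.h>

static void walk_locked_pages(struct address_space *mapping)
{
restart:
	spin_lock(&pagecache_lock);
	while (!list_empty(&mapping->locked_pages)) {
		struct page *page = list_entry(mapping->locked_pages.next,
					       struct page, list);

		if (conditional_schedule_needed()) {
			page_cache_get(page);		/* pin across the sleep */
			spin_unlock(&pagecache_lock);
			unconditional_schedule();
			page_cache_release(page);
			goto restart;			/* list may have changed */
		}

		/* ...per-page work done under pagecache_lock... */
		list_del(&page->list);
		list_add(&page->list, &mapping->clean_pages);
	}
	spin_unlock(&pagecache_lock);
}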
--- linux-2.4.2/mm/memory.c Sun Feb 25 17:37:14 2001 +++ lk/mm/memory.c Sun Feb 25 23:08:02 2001 @@ -244,6 +244,7 @@ goto out_unlock; src_pte++; dst_pte++; + conditional_schedule(); /* sys_fork(), with a large shm seg */ } while ((unsigned long)src_pte & PTE_TABLE_MASK); spin_unlock(&src->page_table_lock); @@ -354,7 +355,7 @@ /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +static void do_zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { pgd_t * dir; unsigned long end = address + size; @@ -388,6 +389,25 @@ mm->rss = 0; } +#define MAX_ZAP_BYTES 256*PAGE_SIZE + +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + if (actions & ZPR_COND_RESCHED && chunk > MAX_ZAP_BYTES) + chunk = MAX_ZAP_BYTES; + if (actions & ZPR_FLUSH_CACHE) + flush_cache_range(mm, address, address + chunk); + do_zap_page_range(mm, address, chunk); + if (actions & ZPR_FLUSH_TLB) + flush_tlb_range(mm, address, address + chunk); + if (actions & ZPR_COND_RESCHED) + conditional_schedule(); + address += chunk; + size -= chunk; + } +} /* * Do a quick page-table lookup for a single page. @@ -655,6 +675,7 @@ forget_pte(oldpage); address += PAGE_SIZE; pte++; + conditional_schedule(); /* mmap(/dev/zero) */ } while (address && (address < end)); } @@ -699,6 +720,7 @@ break; address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; + conditional_schedule(); /* mmap(/dev/zero) */ } while (address && (address < end)); flush_tlb_range(current->mm, beg, end); return error; @@ -908,9 +930,7 @@ /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - flush_cache_range(mm, start, end); - zap_page_range(mm, start, len); - flush_tlb_range(mm, start, end); + zap_page_range(mm, start, len, ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB); continue; } @@ -923,9 +943,7 @@ /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - flush_cache_range(mm, start, end); - zap_page_range(mm, start, len); - flush_tlb_range(mm, start, end); + zap_page_range(mm, start, len, ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB); } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -1228,6 +1246,7 @@ if (addr >= end) BUG(); do { + conditional_schedule(); /* Pinning down many physical pages (kiobufs, mlockall) */ if (handle_mm_fault(mm, vma, addr, write) < 0) return -1; addr += PAGE_SIZE; --- linux-2.4.2/mm/mmap.c Sun Feb 25 17:37:14 2001 +++ lk/mm/mmap.c Sun Feb 25 23:08:02 2001 @@ -373,9 +373,8 @@ vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ - flush_cache_range(mm, vma->vm_start, vma->vm_end); - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); - flush_tlb_range(mm, vma->vm_start, vma->vm_end); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, + ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -749,10 +748,8 @@ } remove_shared_vm_struct(mpnt); mm->map_count--; - - flush_cache_range(mm, st, end); - zap_page_range(mm, st, size); - flush_tlb_range(mm, st, end); + zap_page_range(mm, st, size, + ZPR_FLUSH_CACHE|ZPR_FLUSH_TLB|ZPR_COND_RESCHED); /* sys_munmap() */ /* * Fix the mapping, and free the old area if it wasn't reused. 
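/*
 * Usage sketch, not part of the patch: how callers use the reworked
 * zap_page_range() above.  The 'actions' bitmask replaces the open-coded
 * flush_cache_range()/zap/flush_tlb_range() sequence, and ZPR_COND_RESCHED
 * may only be passed when no spinlocks are held, because the chunked loop
 * calls conditional_schedule().  unmap_example() is an invented wrapper.
 */
#include <linux/mm.h>

static void unmap_example(struct mm_struct *mm,
			  unsigned long start, unsigned long end)
{
	/*
	 * Old style:
	 *	flush_cache_range(mm, start, end);
	 *	zap_page_range(mm, start, end - start);
	 *	flush_tlb_range(mm, start, end);
	 */
	zap_page_range(mm, start, end - start,
		       ZPR_FLUSH_CACHE | ZPR_FLUSH_TLB | ZPR_COND_RESCHED);
}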
@@ -907,7 +904,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_COND_RESCHED); /* sys_exit() */ if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); --- linux-2.4.2/mm/mremap.c Sat Dec 30 09:07:24 2000 +++ lk/mm/mremap.c Sun Feb 25 23:08:02 2001 @@ -118,8 +118,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); - flush_tlb_range(mm, new_addr, new_addr + len); + zap_page_range(mm, new_addr, len, ZPR_FLUSH_TLB); return -1; } --- linux-2.4.2/mm/vmscan.c Tue Jan 16 07:36:49 2001 +++ lk/mm/vmscan.c Sun Feb 25 23:08:02 2001 @@ -133,6 +133,7 @@ { pte_t * pte; unsigned long pmd_end; + DEFINE_RESCHED_COUNT; if (pmd_none(*dir)) return count; @@ -156,6 +157,11 @@ try_to_swap_out(mm, vma, address, pte, page); if (!--count) break; + if (TEST_RESCHED_COUNT(16)) { + if (conditional_schedule_needed()) + return 0; + RESET_RESCHED_COUNT(); + } } } address += PAGE_SIZE; @@ -188,6 +194,8 @@ count = swap_out_pmd(mm, vma, pmd, address, end, count); if (!count) break; + if (conditional_schedule_needed()) + return 0; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -212,6 +220,8 @@ count = swap_out_pgd(mm, vma, pgdir, address, end, count); if (!count) break; + if (conditional_schedule_needed()) + return 0; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -231,6 +241,7 @@ * Find the proper vm-area after freezing the vma chain * and ptes. */ +continue_scan: spin_lock(&mm->page_table_lock); address = mm->swap_address; vma = find_vma(mm, address); @@ -242,6 +253,12 @@ count = swap_out_vma(mm, vma, address, count); if (!count) goto out_unlock; + if (conditional_schedule_needed()) { /* Scanning a large vma */ + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); + /* Continue from where we left off */ + goto continue_scan; + } vma = vma->vm_next; if (!vma) break; @@ -420,6 +437,7 @@ int can_get_io_locks; struct list_head * page_lru; struct page * page; + DEFINE_RESCHED_COUNT; /* * We can only grab the IO locks (eg. for flushing dirty @@ -436,6 +454,18 @@ maxscan = nr_inactive_dirty_pages; while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && maxscan-- > 0) { + + /* Scanning this list can take a long time */ + if (TEST_RESCHED_COUNT(2)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&pagemap_lru_lock); + unconditional_schedule(); + spin_lock(&pagemap_lru_lock); + continue; + } + } + page = list_entry(page_lru, struct page, lru); /* Wrong page on list?! (list corruption, should not happen) */ @@ -489,6 +519,8 @@ page_cache_get(page); spin_unlock(&pagemap_lru_lock); + conditional_schedule(); + writepage(page); page_cache_release(page); @@ -526,6 +558,8 @@ else wait = 0; /* No IO */ + conditional_schedule(); + /* Try to free the page buffers. */ clearedbuf = try_to_free_buffers(page, wait); @@ -637,12 +671,24 @@ struct page * page; int maxscan, page_active = 0; int ret = 0; + DEFINE_RESCHED_COUNT; /* Take the lock while messing with the list... 
*/ spin_lock(&pagemap_lru_lock); maxscan = nr_active_pages >> priority; while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { page = list_entry(page_lru, struct page, lru); + + if (TEST_RESCHED_COUNT(10)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&pagemap_lru_lock); + unconditional_schedule(); + spin_lock(&pagemap_lru_lock); + maxscan++; + continue; + } + } /* Wrong page on list?! (list corruption, should not happen) */ if (!PageActive(page)) { --- linux-2.4.2/mm/slab.c Sun Jan 28 07:05:11 2001 +++ lk/mm/slab.c Sun Feb 25 23:08:02 2001 @@ -922,6 +922,7 @@ spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + conditional_schedule(); /* Can take 30 milliseconds */ spin_lock_irq(&cachep->spinlock); } ret = !list_empty(&cachep->slabs); @@ -1816,6 +1817,7 @@ */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + conditional_schedule(); /* try_to_free_pages() */ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); --- linux-2.4.2/mm/swapfile.c Sun Feb 25 17:37:14 2001 +++ lk/mm/swapfile.c Sun Feb 25 23:08:02 2001 @@ -509,7 +509,7 @@ len += sprintf(buf + len, "partition\t"); usedswap = 0; - for (j = 0; j < ptr->max; ++j) + for (j = 0; j < ptr->max; ++j) { switch (ptr->swap_map[j]) { case SWAP_MAP_BAD: case 0: @@ -517,6 +517,8 @@ default: usedswap++; } + conditional_schedule(); + } len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), usedswap << (PAGE_SHIFT - 10), ptr->prio); } @@ -802,6 +804,7 @@ if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) continue; for (j = 0; j < swap_info[i].max; ++j) { + conditional_schedule(); /* Scanning large swap maps */ switch (swap_info[i].swap_map[j]) { case SWAP_MAP_BAD: continue; --- linux-2.4.2/drivers/char/mem.c Sun Feb 25 17:37:03 2001 +++ lk/drivers/char/mem.c Sun Feb 25 23:08:02 2001 @@ -367,8 +367,7 @@ if (count > size) count = size; - flush_cache_range(mm, addr, addr + count); - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, ZPR_FLUSH_CACHE); zeromap_page_range(addr, count, PAGE_COPY); flush_tlb_range(mm, addr, addr + count); --- linux-2.4.2/drivers/char/random.c Sun Feb 25 17:37:03 2001 +++ lk/drivers/char/random.c Sun Feb 25 23:08:02 2001 @@ -1320,6 +1320,11 @@ buf += i; ret += i; add_timer_randomness(&extract_timer_state, nbytes); +#if LOWLATENCY_NEEDED + /* This can happen in softirq's, but that's what we want */ + if (conditional_schedule_needed()) + break; +#endif } /* Wipe data just returned from memory */ --- linux-2.4.2/fs/buffer.c Sun Feb 25 17:37:11 2001 +++ lk/fs/buffer.c Sun Feb 25 23:08:02 2001 @@ -188,6 +188,7 @@ * there to be dirty buffers on any of the other lists. */ repeat: + conditional_schedule(); /* syncing with many dirty buffers */ spin_lock(&lru_list_lock); bh = lru_list[BUF_DIRTY]; if (!bh) @@ -251,6 +252,15 @@ break; if (dev && bh->b_dev != dev) continue; + + if (conditional_schedule_needed()) { + atomic_inc(&bh->b_count); + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + atomic_dec(&bh->b_count); + } + if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. 
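/*
 * A sketch, not part of the patch: it abstracts the back-out pattern used by
 * the vmscan.c hunks above.  The inner page-table walkers (swap_out_pmd/pgd/
 * vma) cannot drop mm->page_table_lock themselves, so they return early when
 * a reschedule is wanted and the outermost caller drops the lock, schedules,
 * and resumes the scan.  walk_inner() and walk_outer() are invented names.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/low-latency.h>

/* Returns 0 when the caller should back out and reschedule. */
static int walk_inner(struct mm_struct *mm, unsigned long *addr,
		      unsigned long end, int batch)
{
	while (*addr < end && batch--) {
		/* ...examine the pte at *addr, maybe unmap it... */
		*addr += PAGE_SIZE;
		if (conditional_schedule_needed())
			return 0;
	}
	return 1;
}

static void walk_outer(struct mm_struct *mm, unsigned long start,
		       unsigned long end)
{
	unsigned long addr = start;

	while (addr < end) {
		spin_lock(&mm->page_table_lock);
		if (!walk_inner(mm, &addr, end, 64)) {
			/* Reschedule wanted: drop the spinlock before sleeping. */
			spin_unlock(&mm->page_table_lock);
			unconditional_schedule();
			continue;	/* addr records where we stopped */
		}
		spin_unlock(&mm->page_table_lock);
	}
}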
@@ -863,7 +873,8 @@ struct buffer_head *bh; struct inode tmp; int err = 0, err2; - + DEFINE_RESCHED_COUNT; + INIT_LIST_HEAD(&tmp.i_dirty_buffers); spin_lock(&lru_list_lock); @@ -884,8 +895,18 @@ spin_lock(&lru_list_lock); } } + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); /* Syncing many dirty buffers */ + spin_lock(&lru_list_lock); + } + } } + RESET_RESCHED_COUNT(); + while (!list_empty(&tmp.i_dirty_buffers)) { bh = BH_ENTRY(tmp.i_dirty_buffers.prev); remove_inode_queue(bh); @@ -895,6 +916,10 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + conditional_schedule(); + } spin_lock(&lru_list_lock); } @@ -924,14 +949,23 @@ struct buffer_head *bh; struct list_head *list; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - repeat: - for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); @@ -939,7 +973,6 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -2549,8 +2582,10 @@ { struct buffer_head * bh, *next; int flushed = 0, i; + DEFINE_RESCHED_COUNT; restart: + conditional_schedule(); spin_lock(&lru_list_lock); bh = lru_list[BUF_DIRTY]; if (!bh) @@ -2583,8 +2618,6 @@ ll_rw_block(WRITE, 1, &bh); atomic_dec(&bh->b_count); - if (current->need_resched) - schedule(); goto restart; } out_unlock: --- linux-2.4.2/fs/dcache.c Sun Feb 25 17:37:11 2001 +++ lk/fs/dcache.c Sun Feb 25 23:08:02 2001 @@ -324,11 +324,23 @@ void prune_dcache(int count) { + DEFINE_RESCHED_COUNT; + +redo: spin_lock(&dcache_lock); for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_RESCHED_COUNT(100)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&dcache_lock); + unconditional_schedule(); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -485,6 +497,7 @@ struct dentry *this_parent = parent; struct list_head *next; int found = 0; + DEFINE_RESCHED_COUNT; spin_lock(&dcache_lock); repeat: @@ -499,6 +512,13 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + + if (TEST_RESCHED_COUNT(500) && found > 10) { + if (conditional_schedule_needed()) /* Typically sys_rmdir() */ + goto out; + RESET_RESCHED_COUNT(); + } + /* * Descend a level if the d_subdirs list is non-empty. 
*/ @@ -523,6 +543,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -538,8 +559,10 @@ { int found; - while ((found = select_parent(parent)) != 0) + while ((found = select_parent(parent)) != 0) { prune_dcache(found); + conditional_schedule(); /* Typically sys_rmdir() */ + } } /* --- linux-2.4.2/fs/inode.c Sun Feb 25 17:37:11 2001 +++ lk/fs/inode.c Sun Feb 25 23:08:02 2001 @@ -226,6 +226,8 @@ filemap_fdatawait(inode->i_mapping); + conditional_schedule(); /* sync_old_buffers */ + spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; wake_up(&inode->i_wait); @@ -403,6 +405,7 @@ while ((inode_entry = head->next) != head) { + conditional_schedule(); list_del(inode_entry); inode = list_entry(inode_entry, struct inode, i_list); @@ -431,6 +434,15 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + if (conditional_schedule_needed()) { + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); + unconditional_schedule(); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); + } + if (inode->i_sb != sb) continue; invalidate_inode_buffers(inode); @@ -505,11 +517,12 @@ struct list_head *entry, *freeable = &list; int count = 0; struct inode * inode; + DEFINE_RESCHED_COUNT; spin_lock(&inode_lock); /* go simple and safe syncing everything before starting */ sync_all_inodes(); - +rescan: entry = inode_unused.prev; while (entry != &inode_unused) { @@ -531,6 +544,16 @@ count++; if (!--goal) break; + + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&inode_lock); + unconditional_schedule(); + spin_lock(&inode_lock); + goto rescan; + } + } } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); --- linux-2.4.2/fs/ext2/inode.c Sat Dec 30 09:36:44 2000 +++ lk/fs/ext2/inode.c Sun Feb 25 23:08:02 2001 @@ -785,8 +785,13 @@ int blocks = inode->i_sb->s_blocksize / 512; unsigned long block_to_free = 0, count = 0; unsigned long nr; + DEFINE_RESCHED_COUNT; for ( ; p < q ; p++) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + conditional_schedule(); + } nr = le32_to_cpu(*p); if (nr) { *p = 0; @@ -835,6 +840,7 @@ if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); for ( ; p < q ; p++) { + conditional_schedule(); /* Deleting large files */ nr = le32_to_cpu(*p); if (!nr) continue; --- linux-2.4.2/fs/ext2/namei.c Sun Feb 25 17:37:11 2001 +++ lk/fs/ext2/namei.c Sun Feb 25 23:08:02 2001 @@ -91,6 +91,8 @@ struct ext2_dir_entry_2 * de; char * dlimit; + conditional_schedule(); /* Searching large directories */ + if ((block % NAMEI_RA_BLOCKS) == 0 && toread) { ll_rw_block (READ, toread, bh_read); toread = 0; @@ -227,6 +229,7 @@ offset = 0; de = (struct ext2_dir_entry_2 *) bh->b_data; while (1) { + conditional_schedule(); /* Adding to a large directory */ if ((char *)de >= sb->s_blocksize + bh->b_data) { brelse (bh); bh = NULL; --- linux-2.4.2/fs/proc/array.c Wed Nov 15 06:22:36 2000 +++ lk/fs/proc/array.c Sun Feb 25 23:08:02 2001 @@ -415,9 +415,11 @@ if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t page = *pte; + pte_t page; struct page *ptpage; + conditional_schedule(); /* For `top' and `ps' */ + page = *pte; address += PAGE_SIZE; pte++; if (pte_none(page)) --- linux-2.4.2/fs/proc/generic.c Tue Dec 12 08:45:42 2000 +++ lk/fs/proc/generic.c Sun Feb 25 23:08:02 2001 @@ -98,6 +98,8 @@ retval = n; break; } + + conditional_schedule(); /* Some /proc files are large */ /* This is a hack to allow mangling of file pos independent * of actual bytes read. 
Simply place the data at page, --- linux-2.4.2/net/core/iovec.c Sun Feb 25 17:37:14 2001 +++ lk/net/core/iovec.c Sun Feb 25 23:08:02 2001 @@ -88,7 +88,7 @@ if(iov->iov_len) { int copy = min(iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) + if (ll_copy_to_user(iov->iov_base, kdata, copy)) goto out; kdata+=copy; len-=copy; --- linux-2.4.2/net/ipv4/tcp_minisocks.c Wed Nov 29 16:53:45 2000 +++ lk/net/ipv4/tcp_minisocks.c Sun Feb 25 23:08:02 2001 @@ -434,6 +434,9 @@ { struct tcp_tw_bucket *tw; int killed = 0; +#if LOWLATENCY_NEEDED + int max_killed = 0; +#endif /* NOTE: compare this to previous version where lock * was released after detaching chain. It was racy, @@ -447,6 +450,13 @@ goto out; while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { +#if LOWLATENCY_NEEDED + /* This loop takes ~6 usecs per iteration. */ + if (killed > 100) { + max_killed = 1; + break; + } +#endif tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; tw->pprev_death = NULL; spin_unlock(&tw_death_lock); @@ -457,12 +467,24 @@ killed++; spin_lock(&tw_death_lock); + + } + +#if LOWLATENCY_NEEDED + if (max_killed) { /* More to do: do it soon */ + mod_timer(&tcp_tw_timer, jiffies+2); + tcp_tw_count -= killed; + } + else +#endif + { + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - if ((tcp_tw_count -= killed) != 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); net_statistics[smp_processor_id()*2].TimeWaited += killed; out: spin_unlock(&tw_death_lock); --- linux-2.4.2/arch/i386/kernel/entry.S Thu Nov 9 12:09:50 2000 +++ lk/arch/i386/kernel/entry.S Sun Feb 25 23:08:02 2001 @@ -215,21 +215,27 @@ jne handle_softirq ret_with_reschedule: - cmpl $0,need_resched(%ebx) - jne reschedule - cmpl $0,sigpending(%ebx) - jne signal_return + cli + movl need_resched(%ebx),%eax + orl sigpending(%ebx),%eax + jne signal_or_resched restore_all: RESTORE_ALL ALIGN -signal_return: +signal_or_resched: + cmpl $0,need_resched(%ebx) + jne reschedule + # Must be a pending signal sti # we can get here from an interrupt handler testl $(VM_MASK),EFLAGS(%esp) movl %esp,%eax jne v86_signal_return xorl %edx,%edx call SYMBOL_NAME(do_signal) + cli + cmpl $0,need_resched(%ebx) + jne reschedule jmp restore_all ALIGN @@ -285,6 +291,7 @@ ALIGN reschedule: + sti call SYMBOL_NAME(schedule) # test jmp ret_from_sys_call --- linux-2.4.2/arch/i386/config.in Tue Jan 9 08:27:56 2001 +++ lk/arch/i386/config.in Sun Feb 25 23:08:02 2001 @@ -26,6 +26,9 @@ mainmenu_option next_comment comment 'Processor type and features' +bool 'Low latency scheduling' CONFIG_LOLAT +dep_bool 'Control low latency with sysctl' CONFIG_LOLAT_SYSCTL $CONFIG_LOLAT + choice 'Processor family' \ "386 CONFIG_M386 \ 486 CONFIG_M486 \ --- linux-2.4.2/include/linux/reiserfs_fs.h Sun Feb 25 17:37:13 2001 +++ lk/include/linux/reiserfs_fs.h Sun Feb 25 23:08:02 2001 @@ -1161,8 +1161,8 @@ #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) -#define fs_changed(gen,s) (gen != get_generation (s)) - +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s) ({conditional_schedule(); __fs_changed(gen,s);}) /***************************************************************************/ /* 
FIXATE NODES */ --- linux-2.4.2/fs/reiserfs/bitmap.c Tue Jan 16 10:31:19 2001 +++ lk/fs/reiserfs/bitmap.c Sun Feb 25 23:08:02 2001 @@ -423,14 +423,20 @@ #ifdef CONFIG_REISERFS_CHECK if (buffer_locked (SB_AP_BITMAP (s)[i]) || is_reusable (s, search_start, 0) == 0) +#if LOWLATENCY_NEEDED + reiserfs_warning("vs-4140: reiserfs_new_blocknrs: bitmap block is locked or bad block number found"); +#else reiserfs_panic (s, "vs-4140: reiserfs_new_blocknrs: bitmap block is locked or bad block number found"); #endif +#endif /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { - reiserfs_warning("vs-4150: reiserfs_new_blocknrs, block not free"); +#ifndef LOWLATENCY_NEEDED + reiserfs_warning("vs-4150: reiserfs_new_blocknrs, block not free\n"); +#endif reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; amount_needed++ ; continue ; --- linux-2.4.2/fs/reiserfs/buffer2.c Tue Jan 16 10:31:19 2001 +++ lk/fs/reiserfs/buffer2.c Sun Feb 25 23:08:02 2001 @@ -73,7 +73,9 @@ struct buffer_head * reiserfs_bread (kdev_t n_dev, int n_block, int n_size) { - return bread (n_dev, n_block, n_size); + struct buffer_head *ret = bread (n_dev, n_block, n_size); + conditional_schedule(); + return ret; } /* This function looks for a buffer which contains a given block. If --- linux-2.4.2/fs/reiserfs/journal.c Tue Jan 16 10:31:19 2001 +++ lk/fs/reiserfs/journal.c Sun Feb 25 23:08:02 2001 @@ -581,6 +581,7 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; } @@ -710,6 +711,7 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + conditional_schedule(); count++ ; atomic_dec(&(tbh->b_count)) ; /* once for our get_hash */ } @@ -837,6 +839,7 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { reiserfs_panic(p_s_sb, "journal-712: buffer write failed\n") ; } @@ -2076,6 +2079,7 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2213,6 +2217,7 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2649,6 +2654,7 @@ } #endif wait_on_buffer(bh) ; + conditional_schedule(); } retry_count++ ; } @@ -3085,6 +3091,7 @@ /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + conditional_schedule(); /* getblk can sleep, so... */ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; --- linux-2.4.2/fs/reiserfs/stree.c Tue Jan 16 10:31:19 2001 +++ lk/fs/reiserfs/stree.c Sun Feb 25 23:08:02 2001 @@ -694,6 +694,8 @@ int n_repeat_counter = 0; #endif + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. 
 */
@@ -1174,6 +1176,8 @@
     for ( n_retry = 0, n_counter = *p_n_removed;
           n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) {
 
+      conditional_schedule();
+
       if (item_moved (&s_ih, p_s_path)) {
           need_research = 1 ;
           break;
@@ -1191,6 +1195,16 @@
       }
       /* Search for the buffer in cache. */
       p_s_un_bh = get_hash_table(p_s_sb->s_dev, *p_n_unfm_pointer, n_blk_size);
+
+      /* AKPM: this is not _really_ needed. It takes us from 2,000 usecs to 500 */
+      if (p_s_un_bh && conditional_schedule_needed()) {
+        unconditional_schedule();
+        if ( item_moved (&s_ih, p_s_path) ) {
+          need_research = 1;
+          brelse(p_s_un_bh) ;
+          break ;
+        }
+      }
 
       if (p_s_un_bh && buffer_locked(p_s_un_bh)) {
         __wait_on_buffer(p_s_un_bh) ;
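/*
 * A closing sketch, not part of the patch: the reiserfs_fs.h change above
 * folds a preemption point into fs_changed(), so every existing
 * revalidation site becomes a reschedule opportunity, while __fs_changed()
 * keeps the old check-only behaviour for code that must not sleep.
 * reiserfs_example() and the redo comment are illustrative only.
 */
#include <linux/reiserfs_fs.h>
#include <linux/low-latency.h>

static void reiserfs_example(struct super_block *s)
{
	int gen = get_generation(s);

	/* ...long-running tree work, no spinlocks held... */

	if (fs_changed(gen, s)) {	/* now also does conditional_schedule() */
		/*
		 * The tree may have been rebalanced while we slept:
		 * release the search path and repeat the lookup.
		 */
	}
}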