You are on page 1of 189

/* * linux/kernel/fork.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'fork.

c' contains the help-routines for the 'fork' system call * (see also entry.S and others). * Fork is rather simple, once you get the hang of it, but the memory * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> #include <linux/khugepaged.h> #include <asm/pgtable.h>

#include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <trace/events/sched.h> /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ int max_threads; /* tunable limit on nr_threads */

DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) { return lockdep_is_held(&tasklist_lock); } EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); #endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { int cpu; int total = 0; for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR # define alloc_task_struct_node(node) \ kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) # define free_task_struct(tsk) \ kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } #endif /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */ struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct thread_info *ti, int account) { struct zone *zone = page_zone(virt_to_page(ti)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { if (atomic_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); if (!profile_handoff_task(tsk)) free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. */ #ifndef arch_task_cache_init #define arch_task_cache_init() #endif void __init fork_init(unsigned long mempages) {

#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif /* do the arch specific task caches init */ arch_task_cache_init(); /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half * of memory. */ max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); /* * we need to allow at least 20 threads to boot a system */ if (max_threads < 20) max_threads = 20; init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; } int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; return 0; } static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; } err = arch_dup_task_struct(tsk, orig); if (err) goto out; tsk->stack = ti; setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */

#ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); #endif /* * One for us, one for whoever does the "release_task()" (usually * parent) */ atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; account_kernel_stack(ti, 1); return tsk; out: free_thread_info(ti); free_task_struct(tsk); return NULL; } #ifdef CONFIG_MMU static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); /* * Not linked in yet - no deadlock potential: */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; retval = khugepaged_fork(mm, oldmm); if (retval) goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; }

charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } /* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */ if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; pprev = &tmp->vm_next; tmp->vm_prev = prev; prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; } /* a new mm has just been created */

arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_anon_vma_fork: mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto out; } static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; } static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } #else #define dup_mmap(mm, oldmm) (0) #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() #define free_mm(mm) (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) (kmem_cache_free(mm_cachep, (mm)))

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { default_dump_filter = (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & MMF_DUMP_FILTER_MASK; return 1; } __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); INIT_HLIST_HEAD(&mm->ioctx_list); #endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;

mm->core_state = NULL; mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; mmu_notifier_mm_init(mm); return mm; } free_mm(mm); return NULL; } /* * Allocate and initialize an mm_struct. */ struct mm_struct *mm_alloc(void) { struct mm_struct *mm; mm = allocate_mm(); if (!mm) return NULL; memset(mm, 0, sizeof(*mm)); mm_init_cpumask(mm); return mm_init(mm, current); } /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON(mm->pmd_huge_pte); #endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. */ void mmput(struct mm_struct *mm) { might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock);

} put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); } } EXPORT_SYMBOL_GPL(mmput); /* * We added or removed a vma mapping the executable. The vmas are only mapped * during exec and are not mapped with the mmap system call. * Callers must hold down_write() on the mm's mmap_sem for these */ void added_exe_file_vma(struct mm_struct *mm) { mm->num_exe_file_vmas++; } void removed_exe_file_vma(struct mm_struct *mm) { mm->num_exe_file_vmas--; if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { fput(mm->exe_file); mm->exe_file = NULL; } } void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) get_file(new_exe_file); if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; /* We need mmap_sem to protect against races with removal of * VM_EXECUTABLE vmas */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) get_file(exe_file); up_read(&mm->mmap_sem); return exe_file; } static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) { /* It's safe to write the exe_file pointer without exe_file_lock because * this is called during fork when the task is not yet in /proc */ newmm->exe_file = get_mm_exe_file(oldmm); } /** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. */ struct mm_struct *get_task_mm(struct task_struct *task)

{ struct mm_struct *mm; task_lock(task); mm = task->mm; if (mm) { if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); } task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. * * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* notify parent sleeping on vfork() */ if (vfork_done) { tsk->vfork_done = NULL; complete(vfork_done); } /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck.

*/ put_user(0, tsk->clear_child_tid); sys_futex(tsk->clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0); } tsk->clear_child_tid = NULL; } } /* * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; if (!oldmm) return NULL; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) goto fail_nocontext; dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); if (err) goto free_pt; mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mmput(mm); fail_nomem: return NULL; fail_nocontext: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() */ mm_free_pgd(mm);

free_mm(mm); return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; goto good_mm; } retval = -ENOMEM; mm = dup_mm(tsk); if (!mm) goto fail_nomem; good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; tsk->mm = mm; tsk->active_mm = mm; return 0; fail_nomem: return retval; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; }

static int copy_files(unsigned long clone_flags, struct task_struct *tsk) { struct files_struct *oldf, *newf; int error = 0; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) goto out; if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; } newf = dup_fd(oldf, &error); if (!newf) goto out; tsk->files = newf; error = 0; out: return error; } static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; if (!ioc) return 0; /* * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { tsk->io_context = ioc_task_link(ioc); if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, -1); if (unlikely(!tsk->io_context)) return -ENOMEM; tsk->io_context->ioprio = ioc->ioprio; } #endif return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { atomic_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; }

void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) kmem_cache_free(sighand_cachep, sighand); } /* * Initialize POSIX timer handling for a thread group. */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { unsigned long cpu_limit; /* Thread group counters. */ thread_group_cputime_init(sig); cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; } /* The timer lists. */ INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); } static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; if (clone_flags & CLONE_THREAD) return 0; sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); tty_audit_fork(sig); sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS init_rwsem(&sig->threadgroup_fork_lock); #endif sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min;

mutex_init(&sig->cred_guard_mutex); return 0; } static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; return task_pid_vnr(current); } static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES plist_head_init(&p->pi_waiters); p->pi_blocked_on = NULL; #endif } #ifdef CONFIG_MM_OWNER void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } #endif /* CONFIG_MM_OWNER */ /* * Initialize POSIX timer handling for a single task. */ static void posix_cpu_timers_init(struct task_struct *tsk) { tsk->cputime_expires.prof_exp = cputime_zero; tsk->cputime_expires.virt_exp = cputime_zero; tsk->cputime_expires.sched_exp = 0; INIT_LIST_HEAD(&tsk->cpu_timers[0]); INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); } /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval;

struct task_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count;

if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0;

p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; } p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /*

* Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); cgroup_callbacks_done = 1; /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(&current->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to

* it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { current->signal->nr_threads++; atomic_inc(&current->signal->live); atomic_inc(&current->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __this_cpu_inc(process_counts); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo:

exit_sem(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: free_task(p); fork_out: return ERR_PTR(retval); } noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; } static inline void init_idle_pids(struct pid_link *links) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { INIT_HLIST_NODE(&links[type].node); /* not really needed */ links[type].pid = &init_struct_pid; } } struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); } return task; } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ long do_fork(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { struct task_struct *p; int trace = 0; long nr;

/* * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ if (clone_flags & CLONE_NEWUSER) { if (clone_flags & CLONE_THREAD) return -EINVAL; /* hopefully this check will go away when userns support is * complete */ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || !capable(CAP_SETGID)) return -EPERM; } /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { struct completion vfork; trace_sched_process_fork(current, p); nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); } audit_finish_fork(p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that * hasn't finished SIGSTOP raising yet. Now we clear it * and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event(trace, nr);

if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); } return nr; } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); init_waitqueue_head(&sighand->signalfd_wqh); } void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| SLAB_NOTRACK, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change * this to *only* allocate as much of it as required by the * maximum number of CPU's we can ever have. The cpumask_allocation * is at the end of the structure, exactly for that reason. */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); nsproxy_cache_init(); } /* * Check constraints on flags passed to the unshare system call. */ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND * needs to unshare vm. */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {

/* FIXME: get_task_mm() increments ->mm_users */ if (atomic_read(&current->mm->mm_users) > 1) return -EINVAL; } return 0; } /* * Unshare the filesystem structure if it is being shared */ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; if (!(unshare_flags & CLONE_FS) || !fs) return 0; /* don't need lock here; in the worst case we'll do useless copy */ if (fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); if (!*new_fsp) return -ENOMEM; return 0; } /* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { *new_fdp = dup_fd(fd, &error); if (!*new_fdp) return error; } return 0; } /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by do_fork() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; /* * If unsharing namespace, must also unshare filesystem information. */

if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); if (err) goto bad_unshare_cleanup_fd; if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). */ exit_sem(current); } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); new_nsproxy = NULL; } task_lock(current); if (new_fs) { fs = current->fs; spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; spin_unlock(&fs->lock); } if (new_fd) { fd = current->files; current->files = new_fd; new_fd = fd; } task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); bad_unshare_out: return err; } /*

* * * */

Helper to unshare the files of the current task. We don't want to expose copy_files internals to the exec layer of the kernel.

int unshare_files(struct files_struct **displaced) { struct task_struct *task = current; struct files_struct *copy = NULL; int error; error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) { *displaced = NULL; return error; } *displaced = task->files; task_lock(task); task->files = copy; task_unlock(task); return 0; }

/* * crash_dump.c - Memory preserving reboot related code. * * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * Copyright (C) IBM Corporation, 2004. All rights reserved */ #include <linux/errno.h> #include <linux/crash_dump.h> #include <linux/io.h> #include <asm/uaccess.h> /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied * @buf: target memory address for the copy; this can be in kernel address * space or user address space (see @userbuf) * @csize: number of bytes to copy * @offset: offset in bytes into the page (based on pfn) to begin the copy * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * * Copy a page from "oldmem". For this page, there is no pte mapped * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf) { void *vaddr; if (!csize) return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); if (userbuf) { if (copy_to_user(buf, (vaddr + offset), csize)) { iounmap(vaddr); return -EFAULT; } } else memcpy(buf, (vaddr + offset), csize); iounmap(vaddr); return csize; }

/* * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or * user-supplied information to configure own IP address and routes. * * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz> * * Derived from network configuration code in fs/nfs/nfsroot.c, * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. * * BOOTP rewritten to construct and analyse packets itself instead * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--; * -- MJ, December 1998 * * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic" * initialization scheme. * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999 * * DHCP support added. To users this looks like a whole separate * protocol, but we know it's just a bag on the side of BOOTP. * -- Chip Salzenberg <chip@valinux.com>, May 2000 * * Ported DHCP support from 2.2.16 to 2.4.0-test4 * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000 * * Merged changes from 2.2.19 into 2.4.3 * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001 * * Multiple Nameservers in /proc/net/pnp * -- Josef Siemes <jsiemes@web.de>, Aug 2002 */ #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/init.h> #include <linux/utsname.h> #include <linux/in.h> #include <linux/if.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/socket.h> #include <linux/route.h> #include <linux/udp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/major.h> #include <linux/root_dev.h> #include <linux/delay.h> #include <linux/nfs_fs.h> #include <linux/slab.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> #include <net/route.h> #include <asm/uaccess.h> #include <net/checksum.h> #include <asm/processor.h> /* Define this to allow debugging output */ #undef IPCONFIG_DEBUG

#ifdef IPCONFIG_DEBUG #define DBG(x) printk x #else #define DBG(x) do { } while(0) #endif #if defined(CONFIG_IP_PNP_DHCP) #define IPCONFIG_DHCP #endif #if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP) #define IPCONFIG_BOOTP #endif #if defined(CONFIG_IP_PNP_RARP) #define IPCONFIG_RARP #endif #if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP) #define IPCONFIG_DYNAMIC #endif /* Define the friendly delay before and after opening net devices */ #define CONF_POST_OPEN 10 /* After opening: 10 msecs */ #define CONF_CARRIER_TIMEOUT120000 /* Wait for carrier timeout */ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ #define CONF_SEND_RETRIES 6 /* Send six requests per open */ #define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ #define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ #define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ #define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ #define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ #define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers - '3' from resolv.h */ #define NONE cpu_to_be32(INADDR_NONE) #define ANY cpu_to_be32(INADDR_ANY) /* * Public IP configuration */ /* This is used by platforms which might be able to set the ipconfig * variables using firmware environment vars. If this is set, it will * ignore such firmware variables. */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ static int ic_enable __initdata = 0; /* Protocol choice */ int ic_proto_enabled __initdata = 0 #ifdef IPCONFIG_BOOTP | IC_BOOTP #endif #ifdef CONFIG_IP_PNP_DHCP | IC_USE_DHCP #endif #ifdef IPCONFIG_RARP | IC_RARP #endif ; static int ic_host_name_set __initdata = 0; __be32 ic_myaddr = NONE; static __be32 ic_netmask = NONE; __be32 ic_gateway = NONE; __be32 ic_servaddr = NONE; /* Host name set by us? */ /* IP config enabled? */

/* My IP address */ /* Netmask for local subnet */ /* Gateway IP address */ /* Boot server IP address */

__be32 root_server_addr = NONE; u8 root_server_path[256] = { 0, }; u32 ic_dev_xid;

/* Address of NFS server */ /* Path to mount as root */

/* Device under configuration */

/* vendor class identifier */ static char vendor_class_identifier[253] __initdata; /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ /* * Private state. */ /* Name of user-selected boot device */ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ static int ic_proto_have_if __initdata = 0; /* MTU for boot device */ static int ic_dev_mtu __initdata = 0; #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ #endif #ifdef IPCONFIG_DHCP static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ #endif /* * */

Network devices

struct ic_device { struct ic_device *next; struct net_device *dev; unsigned short flags; short able; __be32 xid; }; static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct net_device *ic_dev __initdata = NULL; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { if (dev->flags & IFF_LOOPBACK) return false; return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && strncmp(dev->name, "dummy", 5)); } static int __init ic_open_devs(void) { struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; unsigned long start; last = &ic_first_dev; rtnl_lock();

/* bring loopback device up first */ for_each_netdev(&init_net, dev) { if (!(dev->flags & IFF_LOOPBACK)) continue; if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); } for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { int able = 0; if (dev->mtu >= 364) able |= IC_BOOTP; else printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev>name, dev->mtu); if (!(dev->flags & IFF_NOARP)) able |= IC_RARP; able &= ic_proto_enabled; if (ic_proto_enabled && !able) continue; oflags = dev->flags; if (dev_change_flags(dev, oflags | IFF_UP) < 0) { printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); continue; } if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { rtnl_unlock(); return -ENOMEM; } d->dev = dev; *last = d; last = &d->next; d->flags = oflags; d->able = able; if (able & IC_BOOTP) get_random_bytes(&d->xid, sizeof(__be32)); else d->xid = 0; ic_proto_have_if |= able; DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", dev->name, able, d->xid)); } } /* wait for a carrier on at least one device */ start = jiffies; while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { for_each_netdev(&init_net, dev) if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) goto have_carrier; msleep(1); } have_carrier: rtnl_unlock(); *last = NULL; if (!ic_first_dev) { if (user_dev_name[0]) printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); else printk(KERN_ERR "IP-Config: No network devices available.\n"); return -ENODEV; } return 0; }

static void __init ic_close_devs(void) { struct ic_device *d, *next; struct net_device *dev; rtnl_lock(); next = ic_first_dev; while ((d = next)) { next = d->next; dev = d->dev; if (dev != ic_dev) { DBG(("IP-Config: Downing %s\n", dev->name)); dev_change_flags(dev, d->flags); } kfree(d); } rtnl_unlock(); } /* * */ Interface to various network functions.

static inline void set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = addr; sin->sin_port = port; } static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg) { int res; mm_segment_t oldfs = get_fs(); set_fs(get_ds()); res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg); set_fs(oldfs); return res; } static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) { int res; mm_segment_t oldfs = get_fs(); set_fs(get_ds()); res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg); set_fs(oldfs); return res; } static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) { int res; mm_segment_t oldfs = get_fs(); set_fs(get_ds()); res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); set_fs(oldfs); return res; } /* * */ Set up interface addresses and routes.

static int __init ic_setup_if(void) {

struct ifreq ir; struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; int err; memset(&ir, 0, sizeof(ir)); strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); set_sockaddr(sin, ic_myaddr, 0); if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); return -1; } set_sockaddr(sin, ic_netmask, 0); if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); return -1; } set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); return -1; } /* Handle the case where we need non-standard MTU on the boot link (a network * using jumbo frames, for instance). If we can't set the mtu, don't error * out, we'll try to muddle along. */ if (ic_dev_mtu != 0) { strcpy(ir.ifr_name, ic_dev->name); ir.ifr_mtu = ic_dev_mtu; if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", ic_dev_mtu, err); } return 0; } static int __init ic_setup_routes(void) { /* No need to setup device routes, only the default route... */ if (ic_gateway != NONE) { struct rtentry rm; int err; memset(&rm, 0, sizeof(rm)); if ((ic_gateway ^ ic_myaddr) & ic_netmask) { printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); return -1; } set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); rm.rt_flags = RTF_UP | RTF_GATEWAY; if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); return -1; } } return 0; } /* * */ Fill in default values for all missing parameters.

static int __init ic_defaults(void) { /* * At this point we have no userspace running so need not * claim locks on system_utsname

*/ if (!ic_host_name_set) sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr); if (root_server_addr == NONE) root_server_addr = ic_servaddr; if (ic_netmask == NONE) { if (IN_CLASSA(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSA_NET); else if (IN_CLASSB(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSB_NET); else if (IN_CLASSC(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSC_NET); else { printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", &ic_myaddr); return -1; } printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); } return 0; } /* * */ RARP support.

#ifdef IPCONFIG_RARP static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { .type = cpu_to_be16(ETH_P_RARP), .func = ic_rarp_recv, }; static inline void __init ic_rarp_init(void) { dev_add_pack(&rarp_packet_type); } static inline void __init ic_rarp_cleanup(void) { dev_remove_pack(&rarp_packet_type); } /* * Process received RARP packet. */ static int __init ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *rarp; unsigned char *rarp_ptr; __be32 sip, tip; unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; if (!net_eq(dev_net(dev), &init_net)) goto drop; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; if (!pskb_may_pull(skb, sizeof(struct arphdr))) goto drop;

/* Basic sanity checks can be done without the lock. */ rarp = (struct arphdr *)skb_transport_header(skb); /* If this test doesn't pass, it's not IP, or we should * ignore it anyway. */ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) goto drop; /* If it's not a RARP reply, delete it. */ if (rarp->ar_op != htons(ARPOP_RREPLY)) goto drop; /* If it's not Ethernet, delete it. */ if (rarp->ar_pro != htons(ETH_P_IP)) goto drop; if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto drop; /* OK, it is all there and looks valid, process... */ rarp = (struct arphdr *)skb_transport_header(skb); rarp_ptr = (unsigned char *) (rarp + 1); /* One reply at a time, please. */ spin_lock(&ic_recv_lock); /* If we already have a reply, just drop the packet */ if (ic_got_reply) goto drop_unlock; /* Find the ic_device that the packet arrived on */ d = ic_first_dev; while (d && d->dev != dev) d = d->next; if (!d) goto drop_unlock; /* should never happen */ /* Extract variable-width fields */ sha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&sip, rarp_ptr, 4); rarp_ptr += 4; tha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&tip, rarp_ptr, 4); /* Discard packets which are not meant for us. */ if (memcmp(tha, dev->dev_addr, dev->addr_len)) goto drop_unlock; /* Discard packets which are not from specified server. */ if (ic_servaddr != NONE && ic_servaddr != sip) goto drop_unlock; /* We have a winner! */ ic_dev = dev; if (ic_myaddr == NONE) ic_myaddr = tip; ic_servaddr = sip; ic_got_reply = IC_RARP; drop_unlock: /* Show's over. Nothing to see here. */ spin_unlock(&ic_recv_lock); drop: /* Throw the packet out. */ kfree_skb(skb); return 0;

} /* * Send RARP request packet over a single interface. */ static void __init ic_rarp_send_if(struct ic_device *d) { struct net_device *dev = d->dev; arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, dev->dev_addr, dev->dev_addr); } #endif /* * */ DHCP/BOOTP support.

#ifdef IPCONFIG_BOOTP struct bootp_pkt { /* BOOTP packet format */ struct iphdr iph; /* IP header */ struct udphdr udph;/* UDP header */ u8 op; /* 1=request, 2=reply */ u8 htype; /* HW address type */ u8 hlen; /* HW address length */ u8 hops; /* Used only by gateways */ __be32 xid; /* Transaction ID */ __be16 secs; /* Seconds since we started */ __be16 flags; /* Just what it says */ __be32 client_ip; /* Client's IP address if known */ __be32 your_ip; /* Assigned IP address */ __be32 server_ip; /* (Next, e.g. NFS) Server's IP address */ __be32 relay_ip; /* IP address of BOOTP relay */ u8 hw_addr[16]; /* Client's HW address */ u8 serv_name[64]; /* Server host name */ u8 boot_file[128]; /* Name of boot file */ u8 exten[312]; /* DHCP options / BOOTP vendor extensions */ }; /* packet ops */ #define BOOTP_REQUEST 1 #define BOOTP_REPLY 2 /* DHCP message types */ #define DHCPDISCOVER #define DHCPOFFER #define DHCPREQUEST #define DHCPDECLINE #define DHCPACK #define DHCPNAK #define DHCPRELEASE #define DHCPINFORM 1 2 3 4 5 6 7 8

static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { .type = cpu_to_be16(ETH_P_IP), .func = ic_bootp_recv, }; /* * Initialize DHCP/BOOTP extension fields in the request. */ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; #ifdef IPCONFIG_DHCP

static void __init ic_dhcp_init_options(u8 *options) { u8 mt = ((ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST); u8 *e = options; int len; #ifdef IPCONFIG_DEBUG printk("DHCP: Sending message type %d\n", mt); #endif memcpy(e, ic_bootp_cookie, 4); e += 4; *e++ = 53; *e++ = 1; *e++ = mt; /* RFC1048 Magic Cookie */

/* DHCP message type */

if (mt == DHCPREQUEST) { *e++ = 54; /* Server ID (IP address) */ *e++ = 4; memcpy(e, &ic_servaddr, 4); e += 4; *e++ = 50; /* Requested IP address */ *e++ = 4; memcpy(e, &ic_myaddr, 4); e += 4; } /* always? */ { static const u8 ic_req_params[] = { 1, /* Subnet mask */ 3, /* Default gateway */ 6, /* DNS server */ 12, /* Host name */ 15, /* Domain name */ 17, /* Boot path */ 26, /* MTU */ 40, /* NIS domain name */ }; *e++ = 55; /* Parameter request list */ *e++ = sizeof(ic_req_params); memcpy(e, ic_req_params, sizeof(ic_req_params)); e += sizeof(ic_req_params); if (ic_host_name_set) { *e++ = 12; /* host-name */ len = strlen(utsname()->nodename); *e++ = len; memcpy(e, utsname()->nodename, len); e += len; } if (*vendor_class_identifier) { printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", vendor_class_identifier); *e++ = 60; /* Class-identifier */ len = strlen(vendor_class_identifier); *e++ = len; memcpy(e, vendor_class_identifier, len); e += len; } } *e++ = 255; } /* End of the list */

#endif /* IPCONFIG_DHCP */ static void __init ic_bootp_init_ext(u8 *e) { memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ e += 4; *e++ = 1; /* Subnet mask request */ *e++ = 4; e += 4; *e++ = 3; /* Default gateway request */ *e++ = 4; e += 4; *e++ = 5; /* Name server request */ *e++ = 8; e += 8; *e++ = 12; /* Host name request */ *e++ = 32; e += 32; *e++ = 40; /* NIS Domain name request */ *e++ = 32; e += 32; *e++ = 17; /* Boot path */ *e++ = 40; e += 40; *e++ = 57; *e++ = 2; *e++ = 1; *e++ = 150; *e++ = 255; } /* * Initialize the DHCP/BOOTP mechanism. */ static inline void __init ic_bootp_init(void) { int i; for (i = 0; i < CONF_NAMESERVERS_MAX; i++) ic_nameservers[i] = NONE; dev_add_pack(&bootp_packet_type); } /* * DHCP/BOOTP cleanup. */ static inline void __init ic_bootp_cleanup(void) { dev_remove_pack(&bootp_packet_type); } /* * Send DHCP/BOOTP request to single interface. */ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff) { struct net_device *dev = d->dev; struct sk_buff *skb; struct bootp_pkt *b; struct iphdr *h; /* Allocate packet */ skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, GFP_KERNEL); /* set extension buffer size for reply */ /* 128+236+8+20+14, see dhcpd sources */ /* End of the list */

if (!skb) return; skb_reserve(skb, LL_RESERVED_SPACE(dev)); b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); memset(b, 0, sizeof(struct bootp_pkt)); /* Construct IP header */ skb_reset_network_header(skb); h = ip_hdr(skb); h->version = 4; h->ihl = 5; h->tot_len = htons(sizeof(struct bootp_pkt)); h->frag_off = htons(IP_DF); h->ttl = 64; h->protocol = IPPROTO_UDP; h->daddr = htonl(INADDR_BROADCAST); h->check = ip_fast_csum((unsigned char *) h, h->ihl); /* Construct UDP header */ b->udph.source = htons(68); b->udph.dest = htons(67); b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ /* Construct DHCP/BOOTP header */ b->op = BOOTP_REQUEST; if (dev->type < 256) /* check for false types */ b->htype = dev->type; else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ b->htype = ARPHRD_IEEE802; else if (dev->type == ARPHRD_FDDI) b->htype = ARPHRD_ETHER; else { printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); b->htype = dev->type; /* can cause undefined behavior */ } /* server_ip and your_ip address are both already zero per RFC2131 */ b->hlen = dev->addr_len; memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); b->secs = htons(jiffies_diff / HZ); b->xid = d->xid; /* add DHCP options or BOOTP extensions */ #ifdef IPCONFIG_DHCP if (ic_proto_enabled & IC_USE_DHCP) ic_dhcp_init_options(b->exten); else #endif ic_bootp_init_ext(b->exten); /* Chain packet down the line... */ skb->dev = dev; skb->protocol = htons(ETH_P_IP); if (dev_hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0 || dev_queue_xmit(skb) < 0) printk("E"); } /* * Copy BOOTP-supplied string if not already set. */ static int __init ic_bootp_string(char *dest, char *src, int len, int max) { if (!len) return 0; if (len > max-1) len = max-1;

memcpy(dest, src, len); dest[len] = '\0'; return 1; } /* * Process BOOTP extensions. */ static void __init ic_do_bootp_ext(u8 *ext) { u8 servers; int i; u16 mtu; #ifdef IPCONFIG_DEBUG u8 *c; printk("DHCP/BOOTP: Got extension %d:",*ext); for (c=ext+2; c<ext+2+ext[1]; c++) printk(" %02x", *c); printk("\n"); #endif switch (*ext++) { case 1: /* Subnet mask */ if (ic_netmask == NONE) memcpy(&ic_netmask, ext+1, 4); break; case 3: /* Default gateway */ if (ic_gateway == NONE) memcpy(&ic_gateway, ext+1, 4); break; case 6: /* DNS server */ servers= *ext/4; if (servers > CONF_NAMESERVERS_MAX) servers = CONF_NAMESERVERS_MAX; for (i = 0; i < servers; i++) { if (ic_nameservers[i] == NONE) memcpy(&ic_nameservers[i], ext+1+4*i, 4); } break; case 12: /* Host name */ ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); ic_host_name_set = 1; break; case 15: /* Domain name (DNS) */ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); break; case 17: /* Root path */ if (!root_server_path[0]) ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; case 26: /* Interface MTU */ memcpy(&mtu, ext+1, sizeof(mtu)); ic_dev_mtu = ntohs(mtu); break; case 40: /* NIS Domain name (_not_ DNS) */ ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; } } /* * Receive BOOTP reply. */

static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct bootp_pkt *b; struct iphdr *h; struct ic_device *d; int len, ext_len; if (!net_eq(dev_net(dev), &init_net)) goto drop; /* Perform verifications before taking the lock. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; if (!pskb_may_pull(skb, sizeof(struct iphdr) + sizeof(struct udphdr))) goto drop; b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) goto drop; /* Fragments are not supported */ if (ip_is_fragment(h)) { if (net_ratelimit()) printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " "reply.\n"); goto drop; } if (skb->len < ntohs(h->tot_len)) goto drop; if (ip_fast_csum((char *) h, h->ihl)) goto drop; if (b->udph.source != htons(67) || b->udph.dest != htons(68)) goto drop; if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) goto drop; len = ntohs(b->udph.len) - sizeof(struct udphdr); ext_len = len - (sizeof(*b) sizeof(struct iphdr) sizeof(struct udphdr) sizeof(b->exten)); if (ext_len < 0) goto drop; /* Ok the front looks good, make sure we can get at the rest. */ if (!pskb_may_pull(skb, skb->len)) goto drop; b = (struct bootp_pkt *)skb_network_header(skb); h = &b->iph; /* One reply at a time, please. */ spin_lock(&ic_recv_lock); /* If we already have a reply, just drop the packet */ if (ic_got_reply) goto drop_unlock;

/* Find the ic_device that the packet arrived on */ d = ic_first_dev; while (d && d->dev != dev) d = d->next; if (!d) goto drop_unlock; /* should never happen */ /* Is it a reply to our BOOTP request? */ if (b->op != BOOTP_REPLY || b->xid != d->xid) { if (net_ratelimit()) printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " "op[%x] xid[%x]\n", b->op, b->xid); goto drop_unlock; } /* Is it a reply for the device we are configuring? */ if (b->xid != ic_dev_xid) { if (net_ratelimit()) printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n"); goto drop_unlock; } /* Parse extensions */ if (ext_len >= 4 && !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ u8 *end = (u8 *) b + ntohs(b->iph.tot_len); u8 *ext; #ifdef IPCONFIG_DHCP if (ic_proto_enabled & IC_USE_DHCP) { __be32 server_id = NONE; int mt = 0; ext = &b->exten[4]; while (ext < end && *ext != 0xff) { u8 *opt = ext++; if (*opt == 0) /* Padding */ continue; ext += *ext + 1; if (ext >= end) break; switch (*opt) { case 53: /* Message type */ if (opt[1]) mt = opt[2]; break; case 54: /* Server ID (IP address) */ if (opt[1] >= 4) memcpy(&server_id, opt + 2, 4); break; } } #ifdef IPCONFIG_DEBUG printk("DHCP: Got message type %d\n", mt); #endif switch (mt) { case DHCPOFFER: /* While in the process of accepting one offer, * ignore all others. */ if (ic_myaddr != NONE) goto drop_unlock; /* Let's accept that offer. */ ic_myaddr = b->your_ip; ic_servaddr = server_id;

#ifdef IPCONFIG_DEBUG printk("DHCP: Offered address %pI4 by server %pI4\n", &ic_myaddr, &ic_servaddr); #endif /* The DHCP indicated server address takes * precedence over the bootp header one if * they are different. */ if ((server_id != NONE) && (b->server_ip != server_id)) b->server_ip = ic_servaddr; break; case DHCPACK: if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0) goto drop_unlock; /* Yeah! */ break; default: /* Urque. Forget it*/ ic_myaddr = NONE; ic_servaddr = NONE; goto drop_unlock; } ic_dhcp_msgtype = mt; } #endif /* IPCONFIG_DHCP */ ext = &b->exten[4]; while (ext < end && *ext != 0xff) { u8 *opt = ext++; if (*opt == 0) /* Padding */ continue; ext += *ext + 1; if (ext < end) ic_do_bootp_ext(opt); } } /* We have a winner! */ ic_dev = dev; ic_myaddr = b->your_ip; ic_servaddr = b->server_ip; if (ic_gateway == NONE && b->relay_ip) ic_gateway = b->relay_ip; if (ic_nameservers[0] == NONE) ic_nameservers[0] = ic_servaddr; ic_got_reply = IC_BOOTP; drop_unlock: /* Show's over. Nothing to see here. */ spin_unlock(&ic_recv_lock); drop: /* Throw the packet out. */ kfree_skb(skb); return 0; } #endif /* *

Dynamic IP configuration -- DHCP, BOOTP, RARP.

*/ #ifdef IPCONFIG_DYNAMIC static int __init ic_dynamic(void) { int retries; struct ic_device *d; unsigned long start_jiffies, timeout, jiff; int do_bootp = ic_proto_have_if & IC_BOOTP; int do_rarp = ic_proto_have_if & IC_RARP; /* * If none of DHCP/BOOTP/RARP was selected, return with an error. * This routine gets only called when some pieces of information * are missing, and without DHCP/BOOTP/RARP we are unable to get it. */ if (!ic_proto_enabled) { printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); return -1; } #ifdef IPCONFIG_BOOTP if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); #endif #ifdef IPCONFIG_RARP if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) printk(KERN_ERR "RARP: No suitable device found.\n"); #endif if (!ic_proto_have_if) /* Error message already printed */ return -1; /* * Setup protocols */ #ifdef IPCONFIG_BOOTP if (do_bootp) ic_bootp_init(); #endif #ifdef IPCONFIG_RARP if (do_rarp) ic_rarp_init(); #endif /* * Send requests and wait, until we get an answer. This loop * seems to be a terrible waste of CPU time, but actually there is * only one process running at all, so we don't need to use any * scheduler functions. * [Actually we could now, but the nothing else running note still * applies.. - AC] */ printk(KERN_NOTICE "Sending %s%s%s requests .", do_bootp ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", (do_bootp && do_rarp) ? " and " : "", do_rarp ? "RARP" : ""); start_jiffies = jiffies; d = ic_first_dev; retries = CONF_SEND_RETRIES; get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); for (;;) { /* Track the device we are configuring */ ic_dev_xid = d->xid;

#ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); #endif #ifdef IPCONFIG_RARP if (do_rarp && (d->able & IC_RARP)) ic_rarp_send_if(d); #endif jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); while (time_before(jiffies, jiff) && !ic_got_reply) schedule_timeout_uninterruptible(1); #ifdef IPCONFIG_DHCP /* DHCP isn't done until we get a DHCPACK. */ if ((ic_got_reply & IC_BOOTP) && (ic_proto_enabled & IC_USE_DHCP) && ic_dhcp_msgtype != DHCPACK) { ic_got_reply = 0; printk(KERN_CONT ","); continue; } #endif /* IPCONFIG_DHCP */ if (ic_got_reply) { printk(KERN_CONT " OK\n"); break; } if ((d = d->next)) continue; if (! --retries) { printk(KERN_CONT " timed out!\n"); break; } d = ic_first_dev; timeout = timeout CONF_TIMEOUT_MULT; if (timeout > CONF_TIMEOUT_MAX) timeout = CONF_TIMEOUT_MAX; printk(KERN_CONT "."); } #ifdef IPCONFIG_BOOTP if (do_bootp) ic_bootp_cleanup(); #endif #ifdef IPCONFIG_RARP if (do_rarp) ic_rarp_cleanup(); #endif if (!ic_got_reply) { ic_myaddr = NONE; return -1; } printk("IP-Config: Got %s answer from %pI4, ", ((ic_got_reply & IC_RARP) ? "RARP" : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), &ic_servaddr); printk(KERN_CONT "my address is %pI4\n", &ic_myaddr); return 0; } #endif /* IPCONFIG_DYNAMIC */

#ifdef CONFIG_PROC_FS static int pnp_seq_show(struct seq_file *seq, void *v) { int i; if (ic_proto_used & IC_PROTO) seq_printf(seq, "#PROTO: %s\n", (ic_proto_used & IC_RARP) ? "RARP" : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP"); else seq_puts(seq, "#MANUAL\n"); if (ic_domain[0]) seq_printf(seq, "domain %s\n", ic_domain); for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) seq_printf(seq, "nameserver %pI4\n", &ic_nameservers[i]); } if (ic_servaddr != NONE) seq_printf(seq, "bootserver %pI4\n", &ic_servaddr); return 0; } static int pnp_seq_open(struct inode *indoe, struct file *file) { return single_open(file, pnp_seq_show, NULL); } static const struct file_operations pnp_seq_fops = { .owner = THIS_MODULE, .open = pnp_seq_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_PROC_FS */ /* * Extract IP address from the parameter string if needed. Note that we * need to have root_server_addr set _before_ IPConfig gets called as it * can override it. */ __be32 __init root_nfs_parse_addr(char *name) { __be32 addr; int octets = 0; char *cp, *cq; cp = cq = name; while (octets < 4) { while (*cp >= '0' && *cp <= '9') cp++; if (cp == cq || cp - cq > 3) break; if (*cp == '.' || octets == 3) octets++; if (octets < 4) cp++; cq = cp; } if (octets == 4 && (*cp == ':' || *cp == '\0')) { if (*cp == ':') *cp++ = '\0'; addr = in_aton(name); memmove(name, cp, strlen(cp) + 1); } else

addr = NONE; return addr; } #define DEVICE_WAIT_MAX static int __init wait_for_devices(void) { int i; for (i = 0; i < DEVICE_WAIT_MAX; i++) { struct net_device *dev; int found = 0; rtnl_lock(); for_each_netdev(&init_net, dev) { if (ic_is_init_dev(dev)) { found = 1; break; } } rtnl_unlock(); if (found) return 0; ssleep(1); } return -ENODEV; } /* * */ IP Autoconfig dispatcher. 12 /* 12 seconds */

static int __init ip_auto_config(void) { __be32 addr; #ifdef IPCONFIG_DYNAMIC int retries = CONF_OPEN_RETRIES; #endif int err; #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) return 0; DBG(("IP-Config: Entered.\n")); #ifdef IPCONFIG_DYNAMIC try_try_again: #endif /* Wait for devices to appear */ err = wait_for_devices(); if (err) return err; /* Setup all network devices */ err = ic_open_devs(); if (err) return err; /* Give drivers a chance to settle */ msleep(CONF_POST_OPEN); /* * If the config information is insufficient (e.g., our IP address or * IP address of the boot server is missing or we have multiple network * interfaces and no default was set), use BOOTP or RARP to get the

* missing values. */ if (ic_myaddr == NONE || #ifdef CONFIG_ROOT_NFS (root_server_addr == NONE && ic_servaddr == NONE && ROOT_DEV == Root_NFS) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC if (ic_dynamic() < 0) { ic_close_devs(); /* * I don't know why, but sometimes the * eepro100 driver (at least) gets upset and * doesn't work the first time it's opened. * But then if you close it and reopen it, it * works just fine. So we need to try that at * least once before giving up. * * Also, if the root will be NFS-mounted, we * have nowhere to go if DHCP fails. So we * just have to keep trying forever. * * -- Chip */ #ifdef CONFIG_ROOT_NFS if (ROOT_DEV == Root_NFS) { printk(KERN_ERR "IP-Config: Retrying forever (NFS root)...\n"); goto try_try_again; } #endif if (--retries) { printk(KERN_ERR "IP-Config: Reopening network devices...\n"); goto try_try_again; } /* Oh, well. At least we tried. */ printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); return -1; } #else /* !DYNAMIC */ printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); ic_close_devs(); return -1; #endif /* IPCONFIG_DYNAMIC */ } else { /* Device selected manually or only one device -> use it */ ic_dev = ic_first_dev->dev; } addr = root_nfs_parse_addr(root_server_path); if (root_server_addr == NONE) root_server_addr = addr; /* * Use defaults wherever applicable. */ if (ic_defaults() < 0) return -1; /* * Close all network devices except the device we've * autoconfigured and set up routes. */ ic_close_devs();

if (ic_setup_if() < 0 || ic_setup_routes() < 0) return -1; /* * Record which protocol was actually used. */ #ifdef IPCONFIG_DYNAMIC ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP); #endif #ifndef IPCONFIG_SILENT /* * Clue in the operator. */ printk("IP-Config: Complete:\n"); printk(" device=%s", ic_dev->name); printk(KERN_CONT ", addr=%pI4", &ic_myaddr); printk(KERN_CONT ", mask=%pI4", &ic_netmask); printk(KERN_CONT ", gw=%pI4", &ic_gateway); printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s", utsname()->nodename, ic_domain, utsname()->domainname); printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr); printk(KERN_CONT ", rootserver=%pI4", &root_server_addr); printk(KERN_CONT ", rootpath=%s", root_server_path); if (ic_dev_mtu) printk(KERN_CONT ", mtu=%d", ic_dev_mtu); printk(KERN_CONT "\n"); #endif /* !SILENT */ return 0; } late_initcall(ip_auto_config); /* * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt. */ static int __init ic_proto_name(char *name) { if (!strcmp(name, "on") || !strcmp(name, "any")) { return 1; } if (!strcmp(name, "off") || !strcmp(name, "none")) { return 0; } #ifdef CONFIG_IP_PNP_DHCP else if (!strcmp(name, "dhcp")) { ic_proto_enabled &= ~IC_RARP; return 1; } #endif #ifdef CONFIG_IP_PNP_BOOTP else if (!strcmp(name, "bootp")) { ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP); return 1; } #endif #ifdef CONFIG_IP_PNP_RARP else if (!strcmp(name, "rarp")) { ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP); return 1; } #endif #ifdef IPCONFIG_DYNAMIC else if (!strcmp(name, "both")) { ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */ return 1; }

#endif return 0; } static int __init ip_auto_config_setup(char *addrs) { char *cp, *ip, *dp; int num = 0; ic_set_manually = 1; ic_enable = 1; /* * If any dhcp, bootp etc options are set, leave autoconfig on * and skip the below static IP processing. */ if (ic_proto_name(addrs)) return 1; /* If no static IP is given, turn off autoconfig and bail. */ if (*addrs == 0 || strcmp(addrs, "off") == 0 || strcmp(addrs, "none") == 0) { ic_enable = 0; return 1; } /* Parse string for static IP assignment. */ ip = addrs; while (ip && *ip) { if ((cp = strchr(ip, ':'))) *cp++ = '\0'; if (strlen(ip) > 0) { DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); switch (num) { case 0: if ((ic_myaddr = in_aton(ip)) == ANY) ic_myaddr = NONE; break; case 1: if ((ic_servaddr = in_aton(ip)) == ANY) ic_servaddr = NONE; break; case 2: if ((ic_gateway = in_aton(ip)) == ANY) ic_gateway = NONE; break; case 3: if ((ic_netmask = in_aton(ip)) == ANY) ic_netmask = NONE; break; case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; strlcpy(utsname()->domainname, dp, sizeof(utsname()->domainname)); } strlcpy(utsname()->nodename, ip, sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: strlcpy(user_dev_name, ip, sizeof(user_dev_name)); break; case 6: if (ic_proto_name(ip) == 0 && ic_myaddr == NONE) { ic_enable = 0; } break;

} } ip = cp; num++; } return 1; } static int __init nfsaddrs_config_setup(char *addrs) { return ip_auto_config_setup(addrs); } static int __init vendor_class_identifier_setup(char *addrs) { if (strlcpy(vendor_class_identifier, addrs, sizeof(vendor_class_identifier)) >= sizeof(vendor_class_identifier)) printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", vendor_class_identifier); return 1; } __setup("ip=", ip_auto_config_setup); __setup("nfsaddrs=", nfsaddrs_config_setup); __setup("dhcpclass=", vendor_class_identifier_setup);

/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox: Verify area fixes. * Alan Cox: cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox: Added BSD route gw semantics * Alan Cox: Super /proc >4K * Alan Cox: MTU in route table * Alan Cox: MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox: Routing cache support. * Alan Cox: Removed compatibility cruft. * Alan Cox: RTF_REJECT support. * Alan Cox: TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox: Use __u32 properly * Alan Cox: Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox: Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right.

* * * * * * * * * * * * * * * * * * * * * * */

Bjorn Ekwall Alan Cox: Pavel Krauz Mike McLagan Alexey Kuznetsov : Andi Kleen Vitaly E. Lavrov : Vitaly E. Lavrov : Tobias Ringstrom : Vladimir V. Ivanov : Marc Boucher Robert Olsson Arnaldo C. Melo Eric Dumazet Ilia Sotnikov Ilia Sotnikov

: Kerneld route support. Multicast fixed (I hope) : Limited broadcast fixed : Routing by source End of old history. Split to fib.c and route.c and rewritten from scratch. : Load-limit warning messages. Transparent proxy revived after year coma. Race condition in ip_route_input_slow. Uninitialized res.type in ip_route_output_slow. IP rule info (flowid) is really useful. : routing by fwmark : Added rt_cache statistics : Convert proc stuff to seq_file : hashed spinlocks and rt_check_expire() fixes. : Ignore TOS on PMTUD and Redirect : Removed TOS from hash calculations

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> #include <linux/slab.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif

#include <net/atmclip.h> #include <net/secure_seq.h> #define RT_FL_TOS(oldflp4) \ ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) #define IP_MAX_MTU 0xFFF0

#define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; static int ip_rt_gc_timeout __read_mostly static int ip_rt_gc_min_interval __read_mostly static int ip_rt_redirect_number __read_mostly static int ip_rt_redirect_load __read_mostly static int ip_rt_redirect_silence __read_mostly static int ip_rt_error_cost __read_mostly static int ip_rt_error_burst __read_mostly static int ip_rt_gc_elasticity __read_mostly static int ip_rt_mtu_expires __read_mostly static int ip_rt_min_pmtu __read_mostly static int ip_rt_min_advmss __read_mostly static int rt_chain_length_max __read_mostly /* * */ Interface to generic destination cache. = RT_GC_TIMEOUT; = HZ / 2; = 9; = HZ / 50; = ((HZ / 50) << (9 + 1)); = HZ; = 5 * HZ; = 8; = 10 * 60 * HZ; = 512 + 20 + 20; = 256; = 20;

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); static unsigned int ipv4_default_mtu(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) { } static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer; u32 *p = NULL; if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { u32 *old_p = __DST_METRICS_PTR(old); unsigned long prev, new; p = peer->metrics; if (inet_metrics_new(peer)) memcpy(p, old_p, sizeof(u32) * RTAX_MAX); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { p = __DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else { if (rt->fi) { fib_info_put(rt->fi);

rt->fi = NULL; } } } return p; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .gc = rt_garbage_collect, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .default_mtu = ipv4_default_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, }; #define ECN_OR_COST(class) TC_PRIO_##class

const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; /* * Route cache. */ /* The locking scheme is rather straight forward: * * 1) Read-Copy Update protects the buckets of the central route hash. * 2) Only writers remove entries, and they hold the lock * as they look at rtable reference counts. * 3) Only readers acquire references to rtable entries, * they do so with atomic increments and with the * lock held. */ struct rt_hash_bucket { struct rtable __rcu *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ defined(CONFIG_PROVE_LOCKING) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks

* The size of this table is a power of two and depends on the number of CPUS. * (on lockdep we have a quite big spinlock_t, so keep the size down there) */ #ifdef CONFIG_LOCKDEP # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 # define RT_HASH_LOCK_SZ 4096 # elif NR_CPUS >= 16 # define RT_HASH_LOCK_SZ 2048 # elif NR_CPUS >= 8 # define RT_HASH_LOCK_SZ 1024 # elif NR_CPUS >= 4 # define RT_HASH_LOCK_SZ 512 # else # define RT_HASH_LOCK_SZ 256 # endif #endif static spinlock_t *rt_hash_locks; # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] static __init void rt_hash_lock_init(void) { int i; rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); for (i = 0; i < RT_HASH_LOCK_SZ; i++) spin_lock_init(&rt_hash_locks[i]); } #else # define rt_hash_lock_addr(slot) NULL static inline void rt_hash_lock_init(void) { } #endif static struct rt_hash_bucket *rt_hash_table __read_mostly; static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, int genid) { return jhash_3words((__force u32)daddr, (__force u32)saddr, idx, genid) & rt_hash_mask; } static inline int rt_genid(struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { struct seq_net_private p; int bucket; int genid; }; static struct rtable *rt_cache_get_first(struct seq_file *seq)

{ struct rt_cache_iter_state *st = seq->private; struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { if (dev_net(r->dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; r = rcu_dereference_bh(r->dst.rt_next); } rcu_read_unlock_bh(); } return r; } static struct rtable *__rt_cache_get_next(struct seq_file *seq, struct rtable *r) { struct rt_cache_iter_state *st = seq->private; r = rcu_dereference_bh(r->dst.rt_next); while (!r) { rcu_read_unlock_bh(); do { if (--st->bucket < 0) return NULL; } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } return r; } static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) { struct rt_cache_iter_state *st = seq->private; while ((r = __rt_cache_get_next(seq, r)) != NULL) { if (dev_net(r->dst.dev) != seq_file_net(seq)) continue; if (r->rt_genid == st->genid) break; } return r; } static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) { struct rtable *r = rt_cache_get_first(seq); if (r) while (pos && (r = rt_cache_get_next(seq, r))) --pos; return pos ? NULL : r; } static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { struct rt_cache_iter_state *st = seq->private; if (*pos) return rt_cache_get_idx(seq, *pos - 1); st->genid = rt_genid(seq_file_net(seq)); return SEQ_START_TOKEN; }

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct rtable *r; if (v == SEQ_START_TOKEN) r = rt_cache_get_first(seq); else r = rt_cache_get_next(seq, v); ++*pos; return r; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { if (v && v != SEQ_START_TOKEN) rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); else { struct rtable *r = v; struct neighbour *n; int len; n = dst_get_neighbour(&r->dst); seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->dst.dev ? r->dst.dev->name : "*", (__force u32)r->rt_dst, (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->dst.__refcnt), r->dst.__use, 0, (__force u32)r->rt_src, dst_metric_advmss(&r->dst) + 40, dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), r->rt_key_tos, -1, (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0, r->rt_spec_dst, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static int rt_cache_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &rt_cache_seq_ops, sizeof(struct rt_cache_iter_state)); } static const struct file_operations rt_cache_seq_fops = { .owner = THIS_MODULE, .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek,

.release = seq_release_net, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", dst_entries_get_slow(&ipv4_dst_ops), st->in_hit, st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, st->out_hit, st->out_slow_tot, st->out_slow_mc, st->gc_total, st->gc_ignored, st->gc_goal_miss, st->gc_dst_overflow, st->in_hlist_search, st->out_hlist_search

); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; static int rt_cpu_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &rt_cpu_seq_ops); } static const struct file_operations rt_cpu_seq_fops = { .owner = THIS_MODULE, .open = rt_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } static int rt_acct_proc_open(struct inode *inode, struct file *file) { return single_open(file, rt_acct_proc_show, NULL); } static const struct file_operations rt_acct_proc_fops = { .owner = THIS_MODULE, .open = rt_acct_proc_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, &rt_cache_seq_fops);

if (!pde) goto err1; pde = proc_create("rt_cache", S_IRUGO, net->proc_net_stat, &rt_cpu_seq_fops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline void rt_free(struct rtable *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline void rt_drop(struct rtable *rt) { ip_rt_put(rt); call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline int rt_fast_clean(struct rtable *rth) { /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && rt_is_input_route(rth) && rth->dst.rt_next; }

static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || (rth->peer && rth->peer->pmtu_expires); } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) { unsigned long age; int ret = 0; if (atomic_read(&rth->dst.__refcnt)) goto out; age = jiffies - rth->dst.lastuse; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) goto out; ret = 1; return ret;

out: }

/* Bits of score are: * 31: very valuable * 30: not quite useless * 29..0: usage counter */ static inline u32 rt_score(struct rtable *rt) { u32 score = jiffies - rt->dst.lastuse; score = ~score & ~(3<<30); if (rt_valuable(rt)) score |= (1<<31); if (rt_is_output_route(rt) || !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) score |= (1<<30); return score; } static inline bool rt_caching(const struct net *net) { return net->ipv4.current_rt_cache_rebuild_count <= net->ipv4.sysctl_rt_cache_rebuild_count; } static inline bool compare_hash_inputs(const struct rtable *rt1, const struct rtable *rt2) { return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); } static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) { return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_mark ^ rt2->rt_mark) | (rt1->rt_key_tos ^ rt2->rt_key_tos) | (rt1->rt_route_iif ^ rt2->rt_route_iif) | (rt1->rt_oif ^ rt2->rt_oif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));

} static inline int rt_is_expired(struct rtable *rth) { return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } /* * Perform a full scan of hash table and free all entries. * Can be called by a softirq or a process. * In the later case, we want to be reschedule if necessary */ static void rt_do_flush(struct net *net, int process_context) { unsigned int i; struct rtable *rth, *next; for (i = 0; i <= rt_hash_mask; i++) { struct rtable __rcu **pprev; struct rtable *list; if (process_context && need_resched()) cond_resched(); rth = rcu_access_pointer(rt_hash_table[i].chain); if (!rth) continue; spin_lock_bh(rt_hash_lock_addr(i)); list = NULL; pprev = &rt_hash_table[i].chain; rth = rcu_dereference_protected(*pprev, lockdep_is_held(rt_hash_lock_addr(i))); while (rth) { next = rcu_dereference_protected(rth->dst.rt_next, lockdep_is_held(rt_hash_lock_addr(i))); if (!net || net_eq(dev_net(rth->dst.dev), net)) { rcu_assign_pointer(*pprev, next); rcu_assign_pointer(rth->dst.rt_next, list); list = rth; } else { pprev = &rth->dst.rt_next; } rth = next; } spin_unlock_bh(rt_hash_lock_addr(i)); for (; list; list = next) { next = rcu_dereference_protected(list->dst.rt_next, 1); rt_free(list); } } } /* * While freeing expired entries, we compute average chain length * and standard deviation, using fixed-point arithmetic. * This to have an estimation of rt_chain_length_max * rt_chain_length_max = max(elasticity, AVG + 4*SD) * We use 3 bits for frational part, and 29 (or 61) for magnitude. */ #define FRACT_BITS 3 #define ONE (1UL << FRACT_BITS) /*

* Given a hash chain and an item in this hash chain, * find if a previous entry has the same hash_inputs * (but differs on tos, mark or oif) * Returns 0 if an alias is found. * Returns ONE if rth has no alias before itself. */ static int has_noalias(const struct rtable *head, const struct rtable *rth) { const struct rtable *aux = head; while (aux != rth) { if (compare_hash_inputs(aux, rth)) return 0; aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() * many times (2^24) without giving recent rt_genid. * Jenkins hash is strong enough that litle changes of rt_genid are OK. */ static void rt_cache_invalidate(struct net *net) { unsigned char shuffle; get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } /* * delay < 0 : invalidate cache (fast : entries will be deleted later) * delay >= 0 : invalidate & flush cache (can be long) */ void rt_cache_flush(struct net *net, int delay) { rt_cache_invalidate(net); if (delay >= 0) rt_do_flush(net, !in_softirq()); } /* Flush previous cache invalidated entries from the cache */ void rt_cache_flush_batch(struct net *net) { rt_do_flush(net, !in_softirq()); } static void rt_emergency_hash_rebuild(struct net *net) { if (net_ratelimit()) printk(KERN_WARNING "Route hash chain too long!\n"); rt_cache_invalidate(net); } /* Short description of GC goals. We want to build algorithm, which will keep routing cache at some equilibrium point, when number of aged off entries is kept approximately equal to newly generated ones. Current expiration strength is variable "expire". We try to adjust it dynamically, so that if networking is idle expires is large enough to keep enough of warm entries, and when load increases it reduces to limit cache size. */ static int rt_garbage_collect(struct dst_ops *ops)

{ static unsigned long expire = RT_GC_TIMEOUT; static unsigned long last_gc; static int rover; static int equilibrium; struct rtable *rth; struct rtable __rcu **rthp; unsigned long now = jiffies; int goal; int entries = dst_entries_get_fast(&ipv4_dst_ops); /* * Garbage collection is pretty expensive, * do not make it too frequently. */ RT_CACHE_STAT_INC(gc_total); if (now - last_gc < ip_rt_gc_min_interval && entries < ip_rt_max_size) { RT_CACHE_STAT_INC(gc_ignored); goto out; } entries = dst_entries_get_slow(&ipv4_dst_ops); /* Calculate number of entries, which we want to expire now. */ goal = entries - (ip_rt_gc_elasticity << rt_hash_log); if (goal <= 0) { if (equilibrium < ipv4_dst_ops.gc_thresh) equilibrium = ipv4_dst_ops.gc_thresh; goal = entries - equilibrium; if (goal > 0) { equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); goal = entries - equilibrium; } } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); equilibrium = entries - goal; } if (now - last_gc >= ip_rt_gc_min_interval) last_gc = now; if (goal <= 0) { equilibrium += goal; goto work_done; } do { int i, k; for (i = rt_hash_mask, k = rover; i >= 0; i--) { unsigned long tmo = expire; k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = rcu_dereference_protected(*rthp, lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->dst.rt_next; continue; } *rthp = rth->dst.rt_next; rt_free(rth);

goal--; } spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } rover = k; if (goal <= 0) goto work_done; /* Goal is not achieved. We stop process if: - if expire reduced to zero. Otherwise, expire is halfed. - if table is not full. - if we are called from interrupt. - jiffies check is just fallback/debug loop breaker. We will not spin here for long time in any case. */ RT_CACHE_STAT_INC(gc_goal_miss); if (expire == 0) break; expire >>= 1; if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; } while (!in_softirq() && time_before_eq(jiffies, now)); if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) goto out; if (net_ratelimit()) printk(KERN_WARNING "dst cache overflow\n"); RT_CACHE_STAT_INC(gc_dst_overflow); return 1; work_done: expire += ip_rt_gc_min_interval; if (expire > ip_rt_gc_timeout || dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; out: return 0; } /* * Returns number of entries in a hash chain that have different hash_inputs */ static int slow_chain_length(const struct rtable *head) { int length = 0; const struct rtable *rth = head; while (rth) { length += has_noalias(head, rth); rth = rcu_dereference_protected(rth->dst.rt_next, 1); } return length >> FRACT_BITS; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neigh_table *tbl = &arp_tbl; static const __be32 inaddr_any = 0; struct net_device *dev = dst->dev; const __be32 *pkey = daddr;

struct neighbour *n; #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) if (dev->type == ARPHRD_ATM) tbl = clip_tbl_hook; #endif if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) pkey = &inaddr_any; n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); if (n) return n; return neigh_create(tbl, pkey, dev); } static int rt_bind_neighbour(struct rtable *rt) { struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); if (IS_ERR(n)) return PTR_ERR(n); dst_set_neighbour(&rt->dst, n); return 0; } static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, struct sk_buff *skb, int ifindex) { struct rtable *rth, *cand; struct rtable __rcu **rthp, **candp; unsigned long now; u32 min_score; int chain_length; int attempts = !in_softirq(); restart: chain_length = 0; min_score = ~(u32)0; cand = NULL; candp = NULL; now = jiffies; if (!rt_caching(dev_net(rt->dst.dev))) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The * caller hold the sole reference to the cache entry, and * it will be released when the caller is done with it. * If we drop it here, the callers have no way to resolve routes * when we're not caching. Instead, just point *rp at rt, so * the caller gets a single use out of the route * Note that we do rt_free on this new route entry, so that * once its refcount hits zero, we are still able to reap it * (Thanks Alexey) * Note: To avoid expensive rcu stuff for this uncached dst, * we set DST_NOCACHE so that dst_release() can free dst without * waiting a grace period. */ rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { int err = rt_bind_neighbour(rt); if (err) { if (net_ratelimit()) printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); ip_rt_put(rt); return ERR_PTR(err); } }

goto skip_hashing; } rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = rcu_dereference_protected(*rthp, lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; rt_free(rth); continue; } if (compare_keys(rth, rt) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->dst.rt_next; /* * Since lookup is lockfree, the deletion * must be visible to another weakly ordered CPU before * the insertion at the start of the hash chain. */ rcu_assign_pointer(rth->dst.rt_next, rt_hash_table[hash].chain); /* * Since lookup is lockfree, the update writes * must be ordered for consistency on SMP. */ rcu_assign_pointer(rt_hash_table[hash].chain, rth); dst_use(&rth->dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); if (skb) skb_dst_set(skb, &rth->dst); return rth; } if (!atomic_read(&rth->dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { cand = rth; candp = rthp; min_score = score; } } chain_length++; rthp = &rth->dst.rt_next; } if (cand) { /* ip_rt_gc_elasticity used to be average length of chain * length, when exceeded gc becomes really aggressive. * * The second limit is less certain. At the moment it allows * only 2 entries per bucket. We will see. */ if (chain_length > ip_rt_gc_elasticity) { *candp = cand->dst.rt_next; rt_free(cand); } } else { if (chain_length > rt_chain_length_max && slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { struct net *net = dev_net(rt->dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count;

if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", rt->dst.dev->name, num); } rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, ifindex, rt_genid(net)); goto restart; } } /* Try to bind route to arp only if it is output route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { int err = rt_bind_neighbour(rt); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); return ERR_PTR(err); } /* Neighbour tables are full and nothing can be released. Try to shrink route cache, it is most likely it holds some neighbour records. */ if (attempts-- > 0) { int saved_elasticity = ip_rt_gc_elasticity; int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; rt_garbage_collect(&ipv4_dst_ops); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; goto restart; } if (net_ratelimit()) printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); return ERR_PTR(-ENOBUFS); } } rt->dst.rt_next = rt_hash_table[hash].chain; /* * Since lookup is lockfree, we must make sure * previous writes to rt are committed to memory * before making rt visible to other CPUS. */ rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); skip_hashing: if (skb) skb_dst_set(skb, &rt->dst); return rt; } static atomic_t __rt_peer_genid = ATOMIC_INIT(0); static u32 rt_peer_genid(void) { return atomic_read(&__rt_peer_genid);

} void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) { struct inet_peer *peer; peer = inet_getpeer_v4(daddr, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); else rt->rt_peer_genid = rt_peer_genid(); } /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. * Random ID selection looks a bit dangerous because we have no chances to * select ID being unique in a reasonable period of time. * But broken packet identifier may be better than no packet at all. */ static void ip_select_fb_ident(struct iphdr *iph) { static DEFINE_SPINLOCK(ip_fb_id_lock); static u32 ip_fallback_id; u32 salt; spin_lock_bh(&ip_fb_id_lock); salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); iph->id = htons(salt & 0xFFFF); ip_fallback_id = salt; spin_unlock_bh(&ip_fb_id_lock); } void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) { struct rtable *rt = (struct rtable *) dst; if (rt) { if (rt->peer == NULL) rt_bind_peer(rt, rt->rt_dst, 1); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. */ if (rt->peer) { iph->id = htons(inet_getid(rt->peer, more)); return; } } else printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", __builtin_return_address(0)); ip_select_fb_ident(iph); } EXPORT_SYMBOL(__ip_select_ident); static void rt_del(unsigned hash, struct rtable *rt) { struct rtable __rcu **rthp; struct rtable *aux; rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); while ((aux = rcu_dereference_protected(*rthp, lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (aux == rt || rt_is_expired(aux)) { *rthp = aux->dst.rt_next; rt_free(aux);

continue; } rthp = &aux->dst.rt_next; } spin_unlock_bh(rt_hash_lock_addr(hash)); } static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) { struct rtable *rt = (struct rtable *) dst; __be32 orig_gw = rt->rt_gateway; struct neighbour *n, *old_n; dst_confirm(&rt->dst); rt->rt_gateway = peer->redirect_learned.a4; n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); if (IS_ERR(n)) return PTR_ERR(n); old_n = xchg(&rt->dst._neighbour, n); if (old_n) neigh_release(old_n); if (!n || !(n->nud_state & NUD_VALID)) { if (n) neigh_event_send(n, NULL); rt->rt_gateway = orig_gw; return -EAGAIN; } else { rt->rt_flags |= RTCF_REDIRECTED; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } return 0; } /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct inet_peer *peer; struct net *net; if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } for (s = 0; s < 2; s++) { for (i = 0; i < 2; i++) { unsigned int hash; struct rtable __rcu **rthp; struct rtable *rt;

hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); rthp = &rt_hash_table[hash].chain; while ((rt = rcu_dereference(*rthp)) != NULL) { rthp = &rt->dst.rt_next; if (rt->rt_key_dst != daddr || rt->rt_key_src != skeys[s] || rt->rt_oif != ikeys[i] || rt_is_input_route(rt) || rt_is_expired(rt) || !net_eq(dev_net(rt->dst.dev), net) || rt->dst.error || rt->dst.dev != dev || rt->rt_gateway != old_gw) continue; if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { if (peer->redirect_learned.a4 != new_gw) { peer->redirect_learned.a4 = new_gw; atomic_inc(&__rt_peer_genid); } check_peer_redir(&rt->dst, peer); } } } } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); #endif ; } static bool peer_pmtu_expired(struct inet_peer *peer) { unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); return orig && time_after_eq(jiffies, orig) && cmpxchg(&peer->pmtu_expires, orig, 0) == orig; } static bool peer_pmtu_cleaned(struct inet_peer *peer) { unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); return orig && cmpxchg(&peer->pmtu_expires, orig, 0) == orig; } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; struct dst_entry *ret = dst; if (rt) { if (dst->obsolete > 0) {

ip_rt_put(rt); ret = NULL; } else if (rt->rt_flags & RTCF_REDIRECTED) { unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, rt->rt_oif, rt_genid(dev_net(dst->dev))); rt_del(hash, rt); ret = NULL; } else if (rt->peer && peer_pmtu_expired(rt->peer)) { dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); } } return ret; } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; int log_martians; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->rate_tokens >= ip_rt_redirect_number) { peer->rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent

* redirect. */ if (peer->rate_tokens == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); peer->rate_last = jiffies; ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &ip_hdr(skb)->saddr, rt->rt_iif, &rt->rt_dst, &rt->rt_gateway); #endif } } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct inet_peer *peer; unsigned long now; bool send; int code; switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; IP_INC_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: } /* * The last two values are not from the RFC but kfree_skb(skb); return 0;

* */

are needed for AMPRnet AX.25 paths.

static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static inline unsigned short guess_mtu(unsigned short old_mtu) { int i; for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) if (old_mtu > mtu_plateau[i]) return mtu_plateau[i]; return 68; } unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { unsigned short old_mtu = ntohs(iph->tot_len); unsigned short est_mtu = 0; struct inet_peer *peer; peer = inet_getpeer_v4(iph->daddr, 1); if (peer) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 derived systems incorrectly adjust * tot_len by the IP header length, and report * a zero MTU in the ICMP message. */ if (mtu == 0 && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { unsigned long pmtu_expires; pmtu_expires = jiffies + ip_rt_mtu_expires; if (!pmtu_expires) pmtu_expires = 1UL; est_mtu = mtu; peer->pmtu_learned = mtu; peer->pmtu_expires = pmtu_expires; atomic_inc(&__rt_peer_genid); } inet_putpeer(peer); } return est_mtu ? : new_mtu; } static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) { unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); if (!expires) return; if (time_before(jiffies, expires)) { u32 orig_dst_mtu = dst_mtu(dst); if (peer->pmtu_learned < orig_dst_mtu) { if (!peer->pmtu_orig) peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);

dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); } } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); } static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer; dst_confirm(dst); if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; if (!pmtu_expires || mtu < peer->pmtu_learned) { pmtu_expires = jiffies + ip_rt_mtu_expires; if (!pmtu_expires) pmtu_expires = 1UL; peer->pmtu_learned = mtu; peer->pmtu_expires = pmtu_expires; atomic_inc(&__rt_peer_genid); rt->rt_peer_genid = rt_peer_genid(); } check_peer_pmtu(dst, peer); } } static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; if (rt_is_expired(rt)) return NULL; if (rt->rt_peer_genid != rt_peer_genid()) { struct inet_peer *peer; if (!rt->peer) rt_bind_peer(rt, rt->rt_dst, 0); peer = rt->peer; if (peer) { check_peer_pmtu(dst, peer); if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { if (check_peer_redir(dst, peer)) return NULL; } } rt->rt_peer_genid = rt_peer_genid(); } return dst; } static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer;

if (rt->fi) { fib_info_put(rt->fi); rt->fi = NULL; } if (peer) { rt->peer = NULL; inet_putpeer(peer); } } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); } static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* We do not cache source address of outgoing interface, because it is used only by IP RR, TS and SRR options, so that it out of fast path. BTW remember: "addr" is allowed to be not aligned in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct flowi4 fl4; struct iphdr *iph; iph = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = iph->daddr; fl4.saddr = iph->saddr; fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = rt->dst.dev->ifindex; fl4.flowi4_iif = skb->dev->ifindex; fl4.flowi4_mark = skb->mark; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); rcu_read_unlock();

} memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (advmss == 0) { advmss = max_t(unsigned int, dst->dev->mtu - 40, ip_rt_min_advmss); if (advmss > 65535 - 40) advmss = 65535 - 40; } return advmss; } static unsigned int ipv4_default_mtu(const struct dst_entry *dst) { unsigned int mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { const struct rtable *rt = (const struct rtable *) dst; if (rt->rt_gateway != rt->rt_dst && mtu > 576) mtu = 576; } if (mtu > IP_MAX_MTU) mtu = IP_MAX_MTU; return mtu; } static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, struct fib_info *fi) { struct inet_peer *peer; int create = 0; /* If a peer entry exists for this destination, we must hook * it up in order to get at cached metrics. */ if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) create = 1; rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); if (peer) { rt->rt_peer_genid = rt_peer_genid(); if (inet_metrics_new(peer)) memcpy(peer->metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX); dst_init_metrics(&rt->dst, peer->metrics, false); check_peer_pmtu(&rt->dst, peer); if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; rt->rt_flags |= RTCF_REDIRECTED; }

} else { if (fi->fib_metrics != (u32 *) dst_default_metrics) { rt->fi = fi; atomic_inc(&fi->fib_clntref); } dst_init_metrics(&rt->dst, fi->fib_metrics, true); } } static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { struct dst_entry *dst = &rt->dst; if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } if (dst_mtu(dst) > IP_MAX_MTU) dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif } static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm) { return dst_alloc(&ipv4_dst_ops, dev, 1, -1, DST_HOST | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { unsigned int hash; struct rtable *rth; __be32 spec_dst; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; int err; /* Primary sanity checks. */ if (in_dev == NULL) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) goto e_inval; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval;

spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, &itag); if (err < 0) goto e_err; } rth = rt_dst_alloc(init_net.loopback_dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_key_dst = daddr; rth->rt_key_src = saddr; rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_peer_genid = 0; rth->peer = NULL; rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); rth = rt_intern_hash(hash, rth, skb, dev->ifindex); return IS_ERR(rth) ? PTR_ERR(rth) : 0; e_nobufs: return -ENOBUFS; e_inval: return -EINVAL; e_err: return err; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header.

*/ printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; const unsigned char *p = skb_mac_header(skb); printk(KERN_WARNING "ll header: "); for (i = 0; i < dev->hard_header_len; i++, p++) { printk("%02x", *p); if (i < (dev->hard_header_len - 1)) printk(":"); } printk("\n"); } } #endif } /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { struct rtable *rth; int err; struct in_device *out_dev; unsigned int flags = 0; __be32 spec_dst; u32 itag; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input" \ "_slow(). Please, report\n"); return -EINVAL; } err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } if (err) flags |= RTCF_DIRECTSRC; if (out_dev == in_dev && err && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL;

goto cleanup; } } rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } rth->rt_key_dst = daddr; rth->rt_key_src = saddr; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_peer_genid = 0; rth->peer = NULL; rth->fi = NULL; rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); *result = rth; err = 0; cleanup: return err; } static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { struct rtable* rth = NULL; int err; unsigned hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) fib_select_multipath(res); #endif /* create a routing cache entry */ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); if (err) return err; /* put it into the cache */ hash = rt_hash(daddr, saddr, fl4->flowi4_iif, rt_genid(dev_net(rth->dst.dev))); rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); if (IS_ERR(rth)) return PTR_ERR(rth); return 0; }

/* * * * * * * * * */

NOTE. We drop all the packets that has local source addresses, because every properly looped back packet must have correct destination already attached by output routine. Such approach solves two big problems: 1. Not simplex devices are handled properly. 2. IP spoofing attempts are filtered with 100% of guarantee. called with rcu_read_lock()

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flowi4 fl4; unsigned flags = 0; u32 itag = 0; struct rtable * rth; unsigned hash; __be32 spec_dst; int err = -EINVAL; struct net * net = dev_net(dev); /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected by fib_lookup. */ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_loopback(saddr)) goto martian_source; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) goto martian_source; if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) goto martian_destination; /* * Now we are ready to route packet. */ fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; } RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST)

goto brd_input; if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, dev, &spec_dst, &itag); if (err < 0) goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; if (res.type != RTN_UNICAST) goto martian_destination; out: err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); return err;

brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, &itag); if (err < 0) goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: rth = rt_dst_alloc(net->loopback_dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_key_dst = daddr; rth->rt_key_src = saddr; rth->rt_genid = rt_genid(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_peer_genid = 0;

rth->peer = NULL; rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); err = 0; if (IS_ERR(rth)) err = PTR_ERR(rth); goto out; no_route: RT_CACHE_STAT_INC(in_no_route); spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif e_hostunreach: err = -EHOSTUNREACH; goto out; e_inval: err = -EINVAL; goto out; e_nobufs: err = -ENOBUFS; goto out; martian_source: err = -EINVAL; martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, bool noref) { struct rtable * rth; unsigned hash; int iif = dev->ifindex; struct net *net; int res; net = dev_net(dev); rcu_read_lock(); if (!rt_caching(net)) goto skip_cache; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net));

for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | (rth->rt_route_iif ^ iif) | (rth->rt_key_tos ^ tos)) == 0 && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { dst_use_noref(&rth->dst, jiffies); skb_dst_set_noref(skb, &rth->dst); } else { dst_use(&rth->dst, jiffies); skb_dst_set(skb, &rth->dst); } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); return 0; } RT_CACHE_STAT_INC(in_hlist_search); } skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting network acquires a lot of useless route cache entries, sort of SDR messages from all the world. Now we try to get rid of them. Really, provided software IP multicast filter is organized reasonably (at least, hashed), it does not result in a slowdown comparing with route cache reject entries. Note, that multicast routers are not affected, because route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { int our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ){ int res = ip_route_input_mc(skb, daddr, saddr, tos, dev, our); rcu_read_unlock(); return res; } } rcu_read_unlock(); return -EINVAL; } res = ip_route_input_slow(skb, daddr, saddr, tos, dev); rcu_read_unlock(); return res; } EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, __be32 orig_daddr, __be32 orig_saddr, int orig_oif, struct net_device *dev_out, unsigned int flags)

{ struct fib_info *fi = res->fi; u32 tos = RT_FL_TOS(fl4); struct in_device *in_dev; u16 type = res->type; struct rtable *rth; if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) type = RTN_MULTICAST; else if (ipv4_is_zeronet(fl4->daddr)) return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->dst.output = ip_output; rth->rt_key_dst = orig_daddr; rth->rt_key_src = orig_saddr; rth->rt_genid = rt_genid(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; rth->rt_key_tos = tos; rth->rt_dst = fl4->daddr; rth->rt_src = fl4->saddr; rth->rt_route_iif = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; rth->rt_mark = fl4->flowi4_mark; rth->rt_gateway = fl4->daddr; rth->rt_spec_dst= fl4->saddr; rth->rt_peer_genid = 0; rth->peer = NULL; rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { rth->dst.input = ip_local_deliver;

rth->rt_spec_dst = fl4->daddr; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } } #endif } rt_set_nexthop(rth, fl4, res, fi, type, 0); return rth; } /* * Major route resolver routine. * called with rcu_read_lock(); */ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) { struct net_device *dev_out = NULL; u32 tos = RT_FL_TOS(fl4); unsigned int flags = 0; struct fib_result res; struct rtable *rth; __be32 orig_daddr; __be32 orig_saddr; int orig_oif; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif orig_daddr = fl4->daddr; orig_saddr = fl4->saddr; orig_oif = fl4->flowi4_oif; fl4->flowi4_iif = net->loopback_dev->ifindex; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); rcu_read_lock(); if (fl4->saddr) { rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: 1. ip_dev_find(net, saddr) can return wrong iface, if saddr is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */

if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (dev_out == NULL) goto out; /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. This hack is not just for fun, it allows vic,vat and friends to work. They bind socket to loopback, set ttl to zero and expect that it will work. From the viewpoint of routing cache they are broken, because we are not allowed to build multicast path with loopback source addr (look, routing cache cannot know, that ttl is zero, so that packet will not leave this host and route is valid). Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr)) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = net->loopback_dev->ifindex;

res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } if (fib_lookup(net, fl4, &res)) { res.fi = NULL; if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. WHY? DW. Because we are allowed to send to iface even if it has NO routes and NO assigned addresses. When oif is specified, routing tables are looked up with only one purpose: to catch if destination is gatewayed, rather than direct. Moreover, if MSG_DONTROUTE is set, we send packet, ignoring both routing tables and ifaddr state. --ANK We could make it even if oif is unknown, likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(-ENETUNREACH); goto out; } if (res.type == RTN_LOCAL) { if (!fl4->saddr) { if (res.fi->fib_prefsrc) fl4->saddr = res.fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) fib_select_multipath(&res); else #endif if (!res.prefixlen && res.table->tb_num_default > 1 && res.type == RTN_UNICAST && !fl4->flowi4_oif) fib_select_default(&res); if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; make_route: rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, dev_out, flags);

if (!IS_ERR(rth)) { unsigned int hash; hash = rt_hash(orig_daddr, orig_saddr, orig_oif, rt_genid(dev_net(dev_out))); rth = rt_intern_hash(hash, rth, NULL, orig_oif); } out: rcu_read_unlock(); return rth; } struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) { struct rtable *rth; unsigned int hash; if (!rt_caching(net)) goto slow_output; hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; rth = rcu_dereference_bh(rth->dst.rt_next)) { if (rth->rt_key_dst == flp4->daddr && rth->rt_key_src == flp4->saddr && rt_is_output_route(rth) && rth->rt_oif == flp4->flowi4_oif && rth->rt_mark == flp4->flowi4_mark && !((rth->rt_key_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); if (!flp4->saddr) flp4->saddr = rth->rt_src; if (!flp4->daddr) flp4->daddr = rth->rt_dst; return rth; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); slow_output: return ip_route_output_slow(net, flp4); } EXPORT_SYMBOL_GPL(__ip_route_output_key); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { return NULL; } static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) { return 0; } static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) {

return NULL; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .default_mtu = ipv4_blackhole_default_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, .cow_metrics = ipv4_rt_blackhole_cow_metrics, .neigh_lookup = ipv4_neigh_lookup, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); struct rtable *ort = (struct rtable *) dst_orig; if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard; dst_copy_metrics(new, &ort->dst); new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); rt->rt_key_dst = ort->rt_key_dst; rt->rt_key_src = ort->rt_key_src; rt->rt_key_tos = ort->rt_key_tos; rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; rt->rt_mark = ort->rt_mark; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; rt->rt_spec_dst = ort->rt_spec_dst; rt->peer = ort->peer; if (rt->peer) atomic_inc(&rt->peer->refcnt); rt->fi = ort->fi; if (rt->fi) atomic_inc(&rt->fi->fib_clntref); dst_free(new); } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt;

if (flp4->flowi4_proto) rt = (struct rtable *) xfrm_lookup(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; const struct inet_peer *peer = rt->peer; u32 id = 0, ts = 0, tsage = 0, error; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); if (nlh == NULL) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = rt->rt_key_tos; r->rtm_table = RT_TABLE_MAIN; NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); if (rt->rt_key_src) { r->rtm_src_len = 32; NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt_is_input_route(rt)) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); else if (rt->rt_src != rt->rt_key_src) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); if (rt->rt_dst != rt->rt_gateway) NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; if (rt->rt_mark) NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; if (peer) { inet_peer_refcheck(rt->peer); id = atomic_read(&peer->ip_id_count) & 0xffff; if (peer->tcp_ts_stamp) { ts = peer->tcp_ts;

tsage = get_seconds() - peer->tcp_ts_stamp; } expires = ACCESS_ONCE(peer->pmtu_expires); if (expires) { if (time_before(jiffies, expires)) expires -= jiffies; else expires = 0; } } if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE __be32 dst = rt->rt_dst; if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, rt->rt_src, rt->rt_dst, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) return 0; goto nla_put_failure; } else { if (err == -EMSGSIZE) goto nla_put_failure; error = err; } } } else #endif NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); } if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; __be32 dst = 0; __be32 src = 0; u32 iif; int err; int mark; struct sk_buff *skb; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) goto errout; rtm = nlmsg_data(nlh); skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) { err = -ENOBUFS; goto errout;

} /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; if (iif) { struct net_device *dev; dev = __dev_get_by_index(net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; } skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { struct flowi4 fl4 = { .daddr = dst, .saddr = src, .flowi4_tos = rtm->rtm_tos, .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, .flowi4_mark = mark, }; rt = ip_route_output_key(net, &fl4); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); } if (err) goto errout_free; skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); errout: return err; errout_free: kfree_skb(skb); goto errout;

} int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtable *rt; int h, s_h; int idx, s_idx; struct net *net; net = sock_net(skb->sk); s_h = cb->args[0]; if (s_h < 0) s_h = 0; s_idx = idx = cb->args[1]; for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { if (!rt_hash_table[h].chain) continue; rcu_read_lock_bh(); for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) continue; skb_dst_set_noref(skb, &rt->dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { skb_dst_drop(skb); rcu_read_unlock_bh(); goto done; } skb_dst_drop(skb); } rcu_read_unlock_bh(); } done: cb->args[0] = h; cb->args[1] = idx; return skb->len; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev), 0); } #ifdef CONFIG_SYSCTL static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { int flush_delay; ctl_table ctl; struct net *net; memcpy(&ctl, __ctl, sizeof(ctl)); ctl.data = &flush_delay; proc_dointvec(&ctl, write, buffer, lenp, ppos); net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); return 0; } return -EINVAL; }

static ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data = "error_burst", = &ip_rt_error_burst, = "error_cost", = &ip_rt_error_cost, = sizeof(int), = 0644, = proc_dointvec, = "redirect_silence", = &ip_rt_redirect_silence, = sizeof(int), = 0644, = proc_dointvec, = "redirect_number", = &ip_rt_redirect_number, = sizeof(int), = 0644, = proc_dointvec, = "redirect_load", = &ip_rt_redirect_load, = sizeof(int), = 0644, = proc_dointvec, = "gc_timeout", = &ip_rt_gc_timeout, = sizeof(int), = 0644, = proc_dointvec_jiffies, = "gc_min_interval_ms", = &ip_rt_gc_min_interval, = sizeof(int), = 0644, = proc_dointvec_ms_jiffies, = "gc_min_interval", = &ip_rt_gc_min_interval, = sizeof(int), = 0644, = proc_dointvec_jiffies,

.maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, { .procname .data .maxlen .mode .proc_handler }, {} }; static struct ctl_table empty[1];

= sizeof(int), = 0644, = proc_dointvec, = "gc_elasticity", = &ip_rt_gc_elasticity, = sizeof(int), = 0644, = proc_dointvec, = "mtu_expires", = &ip_rt_mtu_expires, = sizeof(int), = 0644, = proc_dointvec_jiffies, = "min_pmtu", = &ip_rt_min_pmtu, = sizeof(int), = 0644, = proc_dointvec, = "min_adv_mss", = &ip_rt_min_advmss, = sizeof(int), = 0644, = proc_dointvec,

static struct ctl_table ipv4_skeleton[] = { { .procname = "route", .mode = 0555, .child = ipv4_route_table}, { .procname = "neigh", .mode = 0555, .child = empty}, {} }; static __net_initdata struct ctl_path ipv4_path[] = { { .procname = "net", }, { .procname = "ipv4", }, { }, }; static struct ctl_table ipv4_route_flush_table[] = { { .procname = "flush", .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { }, }; static __net_initdata struct ctl_path ipv4_route_path[] = { { .procname = "net", }, { .procname = "ipv4", }, { .procname = "route", }, { }, }; static __net_init int sysctl_route_net_init(struct net *net)

{ struct ctl_table *tbl; tbl = ipv4_route_flush_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_table(net, ipv4_route_path, tbl); if (net->ipv4.route_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_flush_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_flush_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) { if (!str) return 0; rhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) {

int rc = 0; #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); rt_hash_table = (struct rt_hash_bucket *) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), rhash_entries, (totalram_pages >= 128 * 1024) ? 15 : 17, 0, &rt_hash_log, &rt_hash_mask, rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(ip_rt_max_size); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&rt_genid_ops); return rc; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_sysctl_paths(ipv4_path, ipv4_skeleton); } #endif

/* * INET *

An implementation of the TCP/IP protocol suite for the LINUX operating system. INET is implemented using the BSD Socket

* interface as the means of communication with the user level. * * "Ping" sockets * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), * Pavel Kankovsky (for Linux 2.4.32) * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. * */ #include <asm/system.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> static struct ping_table ping_table; static u16 ping_port_rover; static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) { int res = (num + net_hash_mix(net)) & mask; pr_debug("hash(%d) = %d\n", num, res); return res; } static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } static int ping_v4_get_port(struct sock *sk, unsigned short ident) { struct hlist_nulls_node *node; struct hlist_nulls_head *hlist; struct inet_sock *isk, *isk2; struct sock *sk2 = NULL;

isk = inet_sk(sk); write_lock_bh(&ping_table.lock); if (ident == 0) { u32 i; u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) result++; /* avoid zero */ hlist = ping_hashslot(&ping_table, sock_net(sk), result); ping_portaddr_for_each_entry(sk2, node, hlist) { isk2 = inet_sk(sk2); if (isk2->inet_num == result) goto next_port; } /* found */ ping_port_rover = ident = result; break; next_port: ; } if (i >= (1L << 16)) goto fail; } else { hlist = ping_hashslot(&ping_table, sock_net(sk), ident); ping_portaddr_for_each_entry(sk2, node, hlist) { isk2 = inet_sk(sk2); if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } pr_debug("found port/ident = %d\n", ident); isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sock_hold(sk); hlist_nulls_add_head(&sk->sk_nulls_node, hlist); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } write_unlock_bh(&ping_table.lock); return 0; fail: write_unlock_bh(&ping_table.lock); return 1; } static void ping_v4_hash(struct sock *sk) { pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ } static void ping_v4_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); if (sk_hashed(sk)) { write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); sock_put(sk); isk->inet_num = isk->inet_sport = 0;

sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&ping_table.lock); } } static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, u16 ident, int dif) { struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", (int)ident, (unsigned long)daddr, dif); read_lock_bh(&ping_table.lock); ping_portaddr_for_each_entry(sk, hnode, hslot) { isk = inet_sk(sk); pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, sk->sk_bound_dev_if); pr_debug("iterate\n"); if (isk->inet_num != ident) continue; if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) continue; if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; sock_hold(sk); goto exit; } sk = NULL; exit: read_unlock_bh(&ping_table.lock); return sk; } static void inet_get_ping_group_range_net(struct net *net, gid_t *low, gid_t *high) { gid_t *data = net->ipv4.sysctl_ping_group_range; unsigned seq; do { seq = read_seqbegin(&sysctl_local_ports.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&sysctl_local_ports.lock, seq)); } static int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); gid_t group = current_egid(); gid_t range[2]; struct group_info *group_info = get_current_groups(); int i, j, count = group_info->ngroups; inet_get_ping_group_range_net(net, range, range+1); if (range[0] <= group && group <= range[1]) return 0; for (i = 0; i < group_info->nblocks; i++) {

int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { group = group_info->blocks[i][j]; if (range[0] <= group && group <= range[1]) return 0; } count -= cp_count; } return -EACCES; } static void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); sk_common_release(sk); } /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *isk = inet_sk(sk); unsigned short snum; int chk_addr_ret; int err; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); if (addr->sin_addr.s_addr == INADDR_ANY) chk_addr_ret = RTN_LOCAL; if ((sysctl_ip_nonlocal_bind == 0 && isk->freebind == 0 && isk->transparent == 0 && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; snum = ntohs(addr->sin_port); if (ping_v4_get_port(sk, snum) != 0) { isk->inet_saddr = isk->inet_rcv_saddr = 0; goto out; } pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", (int)isk->inet_num, (unsigned long) isk->inet_rcv_saddr,

(int)sk->sk_bound_dev_if); err = 0; if (isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int type, int code) { if (type == ICMP_ECHO && code == 0) return 1; return 0; } /* * This routine is called by the ICMP module when it gets some * sort of error condition. */ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void ping_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); struct inet_sock *inet_sock; int type = icmph->type; int code = icmph->code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; /* We assume the packet has already been checked by icmp_unreach */ if (!ping_supported(icmph->type, icmph->code)) return; pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); sk = ping_v4_lookup(net, iph->daddr, iph->saddr, ntohs(icmph->un.echo.id), skb->dev->ifindex); if (sk == NULL) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } pr_debug("err on socket %p\n", sk); err = 0; harderr = 0; inet_sock = inet_sk(sk); switch (type) { default: case ICMP_TIME_EXCEEDED:

err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: /* This is not a real error but ping wants to see it. * Report it with some fake errno. */ err = EREMOTEIO; break; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ err = EREMOTEIO; break; } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ if (!inet_sock->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { ip_icmp_error(sk, skb, err, 0 /* no remote port */, info, (u8 *)icmph); } sk->sk_err = err; sk->sk_error_report(sk); out: sock_put(sk); } /* * */ Copy and checksum an ICMP Echo packet from user space into a buffer.

struct pingfakehdr { struct icmphdr icmph; struct iovec *iov; u32 wcheck; }; static int ping_getfrag(void *from, char * to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = (struct pingfakehdr *)from; if (offset == 0) { if (fraglen < sizeof(struct icmphdr)) BUG(); if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), pfh->iov, 0, fraglen - sizeof(struct icmphdr), &pfh->wcheck))

return -EFAULT; return 0; } if (offset < sizeof(struct icmphdr)) BUG(); if (csum_partial_copy_fromiovecend (to, pfh->iov, offset - sizeof(struct icmphdr), fraglen, &pfh->wcheck)) return -EFAULT; return 0; } static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); skb->ip_summed = CHECKSUM_NONE; return ip_push_pending_frames(sk, fl4); } static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; struct ip_options_data opt_copy; int free = 0; u32 saddr, daddr, faddr; u8 tos; int err; pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); if (len > 0xFFFF) return -EMSGSIZE; /* * */ Check the flags.

/* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* * * */ Fetch the ICMP header provided by the userland. iovec is modified!

if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, sizeof(struct icmphdr))) return -EFAULT; if (!ping_supported(user_icmph.type, user_icmph.code)) return -EINVAL; /* * */ Get and verify the address.

if (msg->msg_name) { struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EINVAL; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; /* no remote port */ } ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; ipc.tx_flags = 0; err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) return err; if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; faddr = ipc.opt->opt.faddr; } tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { tos |= RTO_ONLINK; } if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; } flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);

security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); pfh.icmph.type = user_icmph.type; /* already checked */ pfh.icmph.code = user_icmph.code; /* ditto */ pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.iov = msg->msg_iov; pfh.wcheck = 0; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else err = ping_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: ip_rt_put(rt); if (free) kfree(ipc.opt); if (!err) { icmp_out_count(sock_net(sk), user_icmph.type); return len; } return err; do_confirm: dst_confirm(&rt->dst); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; int copied, err; pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); if (flags & MSG_OOB) goto out;

if (addr_len) *addr_len = sizeof(*sin); if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* Don't bother checking the checksum */ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (isk->cmsg_flags) ip_cmsg_recv(msg, skb); err = copied; done: skb_free_datagram(sk, skb); out: pr_debug("ping_recvmsg -> %d\n", err); return err; } static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); pr_debug("ping_queue_rcv_skb -> failed\n"); return -1; } return 0; } /* * */

All we need to do is get the socket.

void ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); struct iphdr *iph = ip_hdr(skb); struct icmphdr *icmph = icmp_hdr(skb); u32 saddr = iph->saddr; u32 daddr = iph->daddr; /* We assume the packet has already been checked by icmp_rcv */

pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), skb->dev->ifindex); if (sk != NULL) { pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); sock_put(sk); return; } pr_debug("no socket, dropping\n"); /* We're called from icmp_rcv(). kfree_skb() is done there. */ } struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .hash = ping_v4_hash, .unhash =ping_v4_unhash, .get_port = ping_v4_get_port, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_nulls_node *node; struct hlist_nulls_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_nulls_empty(hslot)) continue; sk_nulls_for_each(sk, node, hslot) { if (net_eq(sock_net(sk), net)) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private;

struct net *net = seq_file_net(seq); do { sk = sk_nulls_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } static void *ping_seq_start(struct seq_file *seq, loff_t *pos) { struct ping_iter_state *state = seq->private; state->bucket = 0; read_lock_bh(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } static void ping_seq_stop(struct seq_file *seq, void *v) { read_unlock_bh(&ping_table.lock); } static void ping_format_sock(struct sock *sp, struct seq_file *f, int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops), len); } static int ping_seq_show(struct seq_file *seq, void *v) {

if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; int len; ping_format_sock(v, seq, state->bucket, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } static const struct seq_operations ping_seq_ops = { .show = ping_seq_show, .start = ping_seq_start, .next = ping_seq_next, .stop = ping_seq_stop, }; static int ping_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ping_seq_ops, sizeof(struct ping_iter_state)); } static const struct file_operations ping_seq_fops = { .open = ping_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; static int ping_proc_register(struct net *net) { struct proc_dir_entry *p; int rc = 0; p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); if (!p) rc = -ENOMEM; return rc; } static void ping_proc_unregister(struct net *net) { proc_net_remove(net, "icmp"); } static int __net_init ping_proc_init_net(struct net *net) { return ping_proc_register(net); } static void __net_exit ping_proc_exit_net(struct net *net) { ping_proc_unregister(net); } static struct pernet_operations ping_net_ops = { .init = ping_proc_init_net, .exit = ping_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_net_ops);

} void ping_proc_exit(void) { unregister_pernet_subsys(&ping_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); rwlock_init(&ping_table.lock); }

/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * IPv4 specific functions * * * code split from: * linux/ipv4/tcp.c * linux/ipv4/tcp_input.c * linux/ipv4/tcp_output.c * * See tcp.c for author information * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ /* * Changes: * David S. Miller : New socket lookup architecture. * This code is dedicated to John Dyson. * David S. Miller : Change semantics of established hash, * half is devoted to TIME_WAIT sockets * and the rest go in the other half. * Andi Kleen : Add support for syncookies and fixed * some bugs: ip options weren't passed to * the TCP layer, missed a check for an * ACK bit. * Andi Kleen : Implemented fast path mtu discovery. * Fixed many serious bugs in the * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. * Vitaly E. Lavrov : Transparent proxy revived after year * coma. * Andi Kleen : Fix new listen. * Andi Kleen : Fix accept error reporting. * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time.

*/ #include <linux/bottom_half.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/random.h> #include <linux/cache.h> #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/netdma.h> #include <net/secure_seq.h> #include <linux/inet.h> #include <linux/ipv6.h> #include <linux/stddef.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/crypto.h> #include <linux/scatterlist.h> int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; EXPORT_SYMBOL(sysctl_tcp_low_latency); #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); #else static inline struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) { return NULL; } #endif struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); /* With PAWS, it is safe from the viewpoint

of data integrity. Even without PAWS it is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. Actually, the idea is close to VJ's one, only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && (twp == NULL || (sysctl_tcp_tw_reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sktw); return 1; } return 0; } EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk, true); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt);

return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; inet->inet_rcv_saddr = inet->inet_saddr; if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); /* * VJ's idea. We save last timestamp seen from * the destination in peer table, when entering state * TIME-WAIT * and initialize rx_opt.ts_recent from it, * when trying new connection. */ if (peer) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } } inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port);

inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; if (err) goto failure; return 0; failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; return err; } EXPORT_SYMBOL(tcp_v4_connect); /* * This routine does path mtu discovery as defined in RFC1191. */ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through * unfragmented). */ if (sk->sk_state == TCP_LISTEN) return; /* We don't check in the destentry if pmtu discovery is forbidden * on this route. We just assume that no packet_to_big packets * are send back when pmtu discovery is not active. * There is a small race when the user changes this flag in the * route, but I think that's acceptable. */ if ((dst = __sk_dst_check(sk, 0)) == NULL) return; dst->ops->update_pmtu(dst, mtu); /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) sk->sk_err_soft = EMSGSIZE; mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's * clear that the old packet has been * dropped. This is the new "fast" path mtu * discovery. */ tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ }

/* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; __u32 seq; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); if (icmp_skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); return; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto out; } icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out;

} switch (type) { case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ if (!sock_owned_by_user(sk)) do_pmtu_discovery(sk, iph, info); goto out; } err = icmp_err_convert[code].errno; /* check if icmp_skb allows revert of backoff * (see draft-zimmermann-tcp-lcd) */ if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff) break; if (sock_owned_by_user(sk)) break; icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); skb = tcp_write_queue_head(sk); BUG_ON(!skb); remaining = icsk->icsk_rto - min(icsk->icsk_rto, tcp_time_stamp - TCP_SKB_CB(skb)->when); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { /* RTO revert clocked out retransmission. * Will retransmit now */ tcp_retransmit_timer(sk); } break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: if (sock_owned_by_user(sk)) goto out; req = inet_csk_search_req(sk, &prev, th->dest, iph->daddr, iph->saddr); if (!req) goto out;

/* ICMPs are not backlogged, hence we cannot get an established socket here. */ WARN_ON(req->sk); if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. */ if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); tcp_done(sk); } else { sk->sk_err_soft = err; } goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows to consider as hard errors * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note, that in modern internet, where routing is unreliable * and in each dark corner broken firewalls sit, sending random * errors ordered by their masters even this two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs) * * Now we are in compliance with RFCs. * --ANK (980905) */ inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } out: bh_unlock_sock(sk); sock_put(sk); } static void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) {

th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum)); } } /* This routine computes an IPv4 TCP checksum. */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) { const struct inet_sock *inet = inet_sk(sk); __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } EXPORT_SYMBOL(tcp_v4_send_check); int tcp_v4_gso_send_check(struct sk_buff *skb) { const struct iphdr *iph; struct tcphdr *th; if (!pskb_may_pull(skb, sizeof(*th))) return -EINVAL; iph = ip_hdr(skb); th = tcp_hdr(skb); th->check = 0; skb->ip_summed = CHECKSUM_PARTIAL; __tcp_v4_send_check(skb, iph->saddr, iph->daddr); return 0; } /* * * * * * * * * * * */ This routine will send an RST to the other tcp. Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) for reset. Answer: if a packet caused RST, it is not for a socket existing in our system, if it is matched to a socket, it is just duplicate segment or bug in other side's TCP. So that we build reply only basing on parameters arrived with segment. Exception: precedence violation. We do not implement it in any case.

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; #ifdef CONFIG_TCP_MD5SIG __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; #endif } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) return; if (skb_rtable(skb)->rt_type != RTN_LOCAL)

return; /* Swap the send and the receive. */ memset(&rep, 0, sizeof(rep)); rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = sizeof(struct tcphdr) / 4; rep.th.rst = 1; if (th->ack) { rep.th.seq = th->ack_seq; } else { rep.th.ack = 1; rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2)); } memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* Update length and the length the header thinks exists */ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len / 4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); } /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states outside socket context is ugly, certainly. What can I do? */ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) #ifdef CONFIG_TCP_MD5SIG + (TCPOLEN_MD5SIG_ALIGNED >> 2) #endif ]; } rep;

struct ip_reply_arg arg; struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); if (ts) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); rep.opt[1] = htonl(tcp_time_stamp); rep.opt[2] = htonl(ts); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } /* Swap the send and the receive. */ rep.th.dest = th->source; rep.th.source = th->dest; rep.th.doff = arg.iov[0].iov_len / 4; rep.th.seq = htonl(seq); rep.th.ack_seq = htonl(ack); rep.th.ack = 1; rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG if (key) { int offset = (ts) ? 3 : 0; rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; rep.th.doff = arg.iov[0].iov_len/4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], key, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &rep.th); } #endif arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; if (oif) arg.bound_dev_if = oif; arg.tos = tos; ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, tw->tw_tos );

inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, 0, tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } /* * Send a SYN-ACK after having received a SYN. * This still operates on a request_sock only, not on a big * socket. */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff * skb; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, ireq->opt); err = net_xmit_eval(err); } dst_release(dst); return err; } static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); return tcp_v4_send_synack(sk, NULL, req, rvp); } /* * IPv4 request_sock destructor. */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); } /* * Return 1 if a syncookie should be sent */ int tcp_syn_flood_action(struct sock *sk, const struct sk_buff *skb, const char *proto) {

const char *msg = "Dropping request"; int want_cookie = 0; struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; if (!lopt->synflood_warned) { lopt->synflood_warned = 1; pr_info("%s: Possible SYN flooding on port %d. %s. " " Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); } return want_cookie; } EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. */ static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) { const struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt) { if (ip_options_echo(&dopt->opt, skb)) { kfree(dopt); dopt = NULL; } } } return dopt; } #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of * IP address->MD5 Key. * We need to maintain these in the sk structure. */ /* Find the Key structure for an address. */ static struct tcp_md5sig_key * tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) { struct tcp_sock *tp = tcp_sk(sk); int i; if (!tp->md5sig_info || !tp->md5sig_info->entries4) return NULL; for (i = 0; i < tp->md5sig_info->entries4; i++) { if (tp->md5sig_info->keys4[i].addr == addr) return &tp->md5sig_info->keys4[i].base; } return NULL;

} struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, struct sock *addr_sk) { return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); } EXPORT_SYMBOL(tcp_v4_md5_lookup); static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, struct request_sock *req) { return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr); } /* This can be called on a newly created socket, from other files */ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, u8 *newkey, u8 newkeylen) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp4_md5sig_key *keys; key = tcp_v4_md5_do_lookup(sk, addr); if (key) { /* Pre-existing entry - just update that one. */ kfree(key->key); key->key = newkey; key->keylen = newkeylen; } else { struct tcp_md5sig_info *md5sig; if (!tp->md5sig_info) { tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC); if (!tp->md5sig_info) { kfree(newkey); return -ENOMEM; } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } md5sig = tp->md5sig_info; if (md5sig->entries4 == 0 && tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } if (md5sig->alloced4 == md5sig->entries4) { keys = kmalloc((sizeof(*keys) * (md5sig->entries4 + 1)), GFP_ATOMIC); if (!keys) { kfree(newkey); if (md5sig->entries4 == 0) tcp_free_md5sig_pool(); return -ENOMEM; } if (md5sig->entries4) memcpy(keys, md5sig->keys4, sizeof(*keys) * md5sig->entries4); /* Free old key list, and reference new one */ kfree(md5sig->keys4); md5sig->keys4 = keys; md5sig->alloced4++; } md5sig->entries4++;

md5sig->keys4[md5sig->entries4 - 1].addr = addr; md5sig->keys4[md5sig->entries4 - 1].base.key = newkey; md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen; } return 0; } EXPORT_SYMBOL(tcp_v4_md5_do_add); static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, u8 *newkey, u8 newkeylen) { return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, newkey, newkeylen); } int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) { struct tcp_sock *tp = tcp_sk(sk); int i; for (i = 0; i < tp->md5sig_info->entries4; i++) { if (tp->md5sig_info->keys4[i].addr == addr) { /* Free the key */ kfree(tp->md5sig_info->keys4[i].base.key); tp->md5sig_info->entries4--; if (tp->md5sig_info->entries4 == 0) { kfree(tp->md5sig_info->keys4); tp->md5sig_info->keys4 = NULL; tp->md5sig_info->alloced4 = 0; tcp_free_md5sig_pool(); } else if (tp->md5sig_info->entries4 != i) { /* Need to do some manipulation */ memmove(&tp->md5sig_info->keys4[i], &tp->md5sig_info->keys4[i+1], (tp->md5sig_info->entries4 - i) * sizeof(struct tcp4_md5sig_key)); } return 0; } } return -ENOENT; } EXPORT_SYMBOL(tcp_v4_md5_do_del); static void tcp_v4_clear_md5_list(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); /* Free each key, then the set of key keys, * the crypto element, and then decrement our * hold on the last resort crypto. */ if (tp->md5sig_info->entries4) { int i; for (i = 0; i < tp->md5sig_info->entries4; i++) kfree(tp->md5sig_info->keys4[i].base.key); tp->md5sig_info->entries4 = 0; tcp_free_md5sig_pool(); } if (tp->md5sig_info->keys4) { kfree(tp->md5sig_info->keys4); tp->md5sig_info->keys4 = NULL; tp->md5sig_info->alloced4 = 0; } } static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, int optlen) {

struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; u8 *newkey; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_user(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin->sin_family != AF_INET) return -EINVAL; if (!cmd.tcpm_key || !cmd.tcpm_keylen) { if (!tcp_sk(sk)->md5sig_info) return -ENOENT; return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (!tcp_sk(sk)->md5sig_info) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *p; p = kzalloc(sizeof(*p), sk->sk_allocation); if (!p) return -EINVAL; tp->md5sig_info = p; sk_nocaps_add(sk, NETIF_F_GSO_MASK); } newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); if (!newkey) return -ENOMEM; return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, newkey, cmd.tcpm_keylen); } static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, __be32 daddr, __be32 saddr, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; bp = &hp->md5_blk.ip4; /* * 1. the TCP pseudo-header (in the order: source IP address, * destination IP address, zero-padded protocol number, and * segment length) */ bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); sg_init_one(&sg, bp, sizeof(*bp)); return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); } static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct tcp_md5sig_pool *hp; struct hash_desc *desc;

hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; desc = &hp->md5_desc; if (crypto_hash_init(desc)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_header(hp, th)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; if (crypto_hash_final(desc, md5_hash)) goto clear_hash; tcp_put_md5sig_pool(); return 0; clear_hash: tcp_put_md5sig_pool(); clear_hash_noput: memset(md5_hash, 0, 16); return 1; } int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; struct hash_desc *desc; const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; if (sk) { saddr = inet_sk(sk)->inet_saddr; daddr = inet_sk(sk)->inet_daddr; } else if (req) { saddr = inet_rsk(req)->loc_addr; daddr = inet_rsk(req)->rmt_addr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; } hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; desc = &hp->md5_desc; if (crypto_hash_init(desc)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) goto clear_hash; if (tcp_md5_hash_header(hp, th)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; if (crypto_hash_final(desc, md5_hash)) goto clear_hash; tcp_put_md5sig_pool(); return 0; clear_hash:

tcp_put_md5sig_pool(); clear_hash_noput: memset(md5_hash, 0, 16); return 1; } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives * so we want to be efficient. * We have 3 drop cases: * o No MD5 hash and one expected. * o MD5 hash and we're not expecting one. * o MD5 hash and its wrong. */ const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); int genhash; unsigned char newhash[16]; hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) return 0; if (hash_expected && !hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return 1; } if (!hash_expected && hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return 1; } /* Okay, so this is hash_expected and hash_location * so we need to calculate the checksum. */ genhash = tcp_v4_md5_hash_skb(newhash, hash_expected, NULL, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" : ""); } return 1; } return 0; } #endif struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), .rtx_syn_ack = tcp_v4_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout,

}; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, }; #endif int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; int want_cookie = 0; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); if (!want_cookie) goto drop; } /* Accept backlog is full. If we have already queued enough * of warm entries in syn queue, drop request. It is better than * clogging syn queue with openreqs with exponentially increasing * timeout. */ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) goto drop; #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && !tp->rx_opt.cookie_out_never && (sysctl_tcp_cookie_size > 0 || (tp->cookie_values != NULL && tp->cookie_values->cookie_desired > 0))) { u8 *c; u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) goto drop_and_release;

/* Secret recipe starts with IP addresses */ *mess++ ^= (__force u32)daddr; *mess++ ^= (__force u32)saddr; /* plus variable length Initiator Cookie */ c = (u8 *)mess; while (l-- > 0) *c++ ^= *hash_location++; want_cookie = 0; /* not our kind of cookie */ tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { /* redundant indications, but ensure initialization. */ tmp_ext.cookie_out_never = 1; /* true */ tmp_ext.cookie_plus = 0; } else { goto drop_and_release; } tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, tcp_hdr(skb)); if (want_cookie) { isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { struct inet_peer *peer = NULL; struct flowi4 fl4; /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before * accepting new connection request. * * If "isn" is not zero, this request hit alive * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && fl4.daddr == saddr && (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } /* Kill the following clause, if you dislike this way. */

else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. * It means that we continue to communicate * to destinations, already remembered * to the moment of synflood. */ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", &saddr, ntohs(tcp_hdr(skb)->source)); goto drop_and_release; } isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; if (tcp_v4_send_synack(sk, dst, req, (struct request_values *)&tmp_ext) || want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop_and_release: dst_release(dst); drop_and_free: reqsk_free(req); drop: return 0; } EXPORT_SYMBOL(tcp_v4_conn_request); /* * The three way handshake has completed - we got a valid synack * now create the new socket. */ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_request_sock *ireq; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif struct ip_options_rcu *inet_opt; if (sk_acceptq_is_full(sk)) goto exit_overflow; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); newinet->inet_daddr = ireq->rmt_addr;

newinet->inet_rcv_saddr = ireq->loc_addr; newinet->inet_saddr = ireq->loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; sk_setup_caps(newsk, dst); tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); if (tcp_rsk(req)->snt_synack) tcp_valid_rtt_meas(newsk, tcp_time_stamp - tcp_rsk(req)->snt_synack); newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); if (key != NULL) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get * memory, then we end up not copying the key * across. Shucks. */ char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); if (newkey != NULL) tcp_v4_md5_do_add(newsk, newinet->inet_daddr, newkey, key->keylen); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; __inet_hash_nolisten(newsk, NULL); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: tcp_clear_xmit_timers(newsk); bh_unlock_sock(newsk); sock_put(newsk); goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)

{ struct tcphdr *th = tcp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { bh_lock_sock(nsk); return nsk; } inet_twsk_put(inet_twsk(nsk)); return NULL; } #ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); #endif return sk; } static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } } skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, IPPROTO_TCP, 0); if (skb->len <= 76) { return __skb_checksum_complete(skb); } return 0; } /* The socket must have it's spinlock held when we get * here. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; #ifdef CONFIG_TCP_MD5SIG /* * We really want to reject the packet as early as possible * if: * o We're expecting an MD5'd packet and this is no MD5 tcp option * o There is an MD5 option and we're not expecting one */

if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard; #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } return 0; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } return 0; reset: tcp_v4_send_reset(rsk, skb); discard: kfree_skb(skb); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, * but you have been warned. */ return 0; csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); /* * */ From tcp_input.c

int tcp_v4_rcv(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it;

/* Count it even if it's bad */ TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) goto bad_packet; th = tcp_hdr(skb); iph = ip_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset(skb); if (sk_filter(sk, skb)) goto discard_and_relse; skb->dev = NULL; bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); else #endif { if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } } else if (unlikely(sk_add_backlog(sk, skb))) { bh_unlock_sock(sk); NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);

goto discard_and_relse; } bh_unlock_sock(sk); sock_put(sk); return ret; no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } discard_it: /* Discard frame. */ kfree_skb(skb); return 0; discard_and_relse: sock_put(sk); goto discard_it; do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { inet_twsk_put(inet_twsk(sk)); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(net, TCP_MIB_INERRS); inet_twsk_put(inet_twsk(sk)); goto discard_it; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, inet_iif(skb)); if (sk2) { inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; } /* Fall through to ACK */ } case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; case TCP_TW_RST: goto no_tcp_socket; case TCP_TW_SUCCESS:; } goto discard_it; } struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) { struct rtable *rt = (struct rtable *) __sk_dst_get(sk); struct inet_sock *inet = inet_sk(sk); struct inet_peer *peer; if (!rt ||

inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { peer = inet_getpeer_v4(inet->inet_daddr, 1); *release_it = true; } else { if (!rt->peer) rt_bind_peer(rt, inet->inet_daddr, 1); peer = rt->peer; *release_it = false; } return peer; } EXPORT_SYMBOL(tcp_v4_get_peer); void *tcp_v4_tw_get_peer(struct sock *sk) { const struct inet_timewait_sock *tw = inet_twsk(sk); return inet_getpeer_v4(tw->tw_daddr, 1); } EXPORT_SYMBOL(tcp_v4_tw_get_peer); static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, .twsk_getpeer = tcp_v4_tw_get_peer, }; const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif }; EXPORT_SYMBOL(ipv4_specific); #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_add = tcp_v4_md5_add_func, .md5_parse = tcp_v4_parse_md5_keys, }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp);

icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tp->snd_cwnd = TCP_INIT_CWND; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_af_ops = &ipv4_specific; icsk->icsk_sync_mss = tcp_sync_mss; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv4_specific; #endif /* TCP Cookie Transactions */ if (sysctl_tcp_cookie_size > 0) { /* Default, cookies without s_data_payload. */ tp->cookie_values = kzalloc(sizeof(*tp->cookie_values), sk->sk_allocation); if (tp->cookie_values != NULL) kref_init(&tp->cookie_values->kref); } /* Presumed zeroed, in order of appearance: * cookie_in_always, cookie_out_never, * s_data_constant, s_data_in, s_data_out */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); percpu_counter_inc(&tcp_sockets_allocated); local_bh_enable(); return 0; } void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */

if (tp->md5sig_info) { tcp_v4_clear_md5_list(sk); kfree(tp->md5sig_info); tp->md5sig_info = NULL; } #endif #ifdef CONFIG_NET_DMA /* Cleans up our sk_async_wait_queue */ __skb_queue_purge(&sk->sk_async_wait_queue); #endif /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); /* * If sendmsg cached page exists, toss it. */ if (sk->sk_sndmsg_page) { __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } /* TCP Cookie Transactions */ if (tp->cookie_values != NULL) { kref_put(&tp->cookie_values->kref, tcp_cookie_values_release); tp->cookie_values = NULL; } percpu_counter_dec(&tcp_sockets_allocated); } EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : list_entry(head->first, struct inet_timewait_sock, tw_node); } static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) { return !is_a_nulls(tw->tw_node.next) ? hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; } /* * Get next listener socket follow cur. If cur is NULL, get first socket * starting from bucket given in st->bucket; when st->bucket is zero the * very first socket in the hash table is returned. */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct inet_connection_sock *icsk; struct hlist_nulls_node *node; struct sock *sk = cur; struct inet_listen_hashbucket *ilb; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); if (!sk) { ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock);

sk = sk_nulls_head(&ilb->head); st->offset = 0; goto get_sk; } ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; ++st->offset; if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; icsk = inet_csk(st->syn_wait_sk); req = req->dl_next; while (1) { while (req) { if (req->rsk_ops->family == st->family) { cur = req; goto out; } req = req->dl_next; } if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) break; get_req: req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { icsk = inet_csk(sk); read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_nulls_next(sk); } get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == st->family) { cur = sk; goto out; } icsk = inet_csk(sk); read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; st->state = TCP_SEQ_STATE_OPENREQ; st->sbucket = 0; goto get_req; } read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) { ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); sk = sk_nulls_head(&ilb->head); goto get_sk; } cur = NULL; out: return cur; }

static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; st->offset = 0; rc = listening_get_next(seq, NULL); while (rc && *pos) { rc = listening_get_next(seq, rc); --*pos; } return rc; } static inline int empty_bucket(struct tcp_iter_state *st) { return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); } /* * Get first established socket starting from bucket given in st->bucket. * If st->bucket is zero, the very first socket in the hash is returned. */ static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(st)) continue; spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family || !net_eq(sock_net(sk), net)) { continue; } rc = sk; goto out; } st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket].twchain) { if (tw->tw_family != st->family || !net_eq(twsk_net(tw), net)) { continue; } rc = tw; goto out; } spin_unlock_bh(lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: return rc; } static void *established_get_next(struct seq_file *seq, void *cur)

{ struct sock *sk = cur; struct inet_timewait_sock *tw; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); ++st->num; ++st->offset; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; tw = tw_next(tw); get_tw: while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { tw = tw_next(tw); } if (tw) { cur = tw; goto out; } spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ st->offset = 0; while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; if (st->bucket > tcp_hashinfo.ehash_mask) return NULL; spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); } else sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) goto found; } st->state = TCP_SEQ_STATE_TIME_WAIT; tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); goto get_tw; found: cur = sk; out: return cur; } static void *established_get_idx(struct seq_file *seq, loff_t pos) { struct tcp_iter_state *st = seq->private; void *rc; st->bucket = 0; rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); --pos; } return rc; } static void *tcp_get_idx(struct seq_file *seq, loff_t pos) { void *rc; struct tcp_iter_state *st = seq->private;

st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); } return rc; } static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); while (offset-- && rc) rc = listening_get_next(seq, rc); if (rc) break; st->bucket = 0; /* Fallthrough */ case TCP_SEQ_STATE_ESTABLISHED: case TCP_SEQ_STATE_TIME_WAIT: st->state = TCP_SEQ_STATE_ESTABLISHED; if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); while (offset-- && rc) rc = established_get_next(seq, rc); } st->num = orig_num; return rc; } static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; void *rc; if (*pos && *pos == st->last_pos) { rc = tcp_seek_last_pos(seq); if (rc) goto out; } st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; st->bucket = 0; st->offset = 0; rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; out: st->last_pos = *pos; return rc; } static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)

{ struct tcp_iter_state *st = seq->private; void *rc = NULL; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } switch (st->state) { case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; st->offset = 0; rc = established_get_first(seq); } break; case TCP_SEQ_STATE_ESTABLISHED: case TCP_SEQ_STATE_TIME_WAIT: rc = established_get_next(seq, v); break; } out: ++*pos; st->last_pos = *pos; return rc; } static void tcp_seq_stop(struct seq_file *seq, void *v) { struct tcp_iter_state *st = seq->private; switch (st->state) { case TCP_SEQ_STATE_OPENREQ: if (v) { struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); break; case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); break; } } int tcp_seq_open(struct inode *inode, struct file *file) { struct tcp_seq_afinfo *afinfo = PDE(inode)->data; struct tcp_iter_state *s; int err; err = seq_open_net(inode, file, &afinfo->seq_ops, sizeof(struct tcp_iter_state)); if (err < 0) return err; s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; s->last_pos = 0; return 0; } EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) { int rc = 0; struct proc_dir_entry *p; afinfo->seq_ops.start afinfo->seq_ops.next afinfo->seq_ops.stop = tcp_seq_start; = tcp_seq_next; = tcp_seq_stop;

p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; return rc; } EXPORT_SYMBOL(tcp_proc_register); void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { proc_net_remove(net, afinfo->name); } EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct sock *sk, const struct request_sock *req, struct seq_file *f, int i, int uid, int *len) { const struct inet_request_sock *ireq = inet_rsk(req); int ttd = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", i, ireq->loc_addr, ntohs(inet_sk(sk)->inet_sport), ireq->rmt_addr, ntohs(ireq->rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->retrans, uid, 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->sk_refcnt), req, len); } static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) { int timer_active; unsigned long timer_expires; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sk->sk_timer)) {

timer_active timer_expires } else {

= 2; = sk->sk_timer.expires;

timer_active = 0; timer_expires = jiffies; } if (sk->sk_state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else /* * because we dont lock socket, we might find a transient negative value */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sk), icsk->icsk_probes_out, sock_i_ino(sk), atomic_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, len); } static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i, int *len) { __be32 dest, src; __u16 destp, srcp; int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; dest = tw->tw_daddr; src = tw->tw_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw, len); } #define TMPSZ 150 static int tcp4_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; int len; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-*s\n", TMPSZ - 1, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode"); goto out;

} st = seq->private; switch (st->state) { case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: get_tcp4_sock(v, seq, st->num, &len); break; case TCP_SEQ_STATE_OPENREQ: get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); break; case TCP_SEQ_STATE_TIME_WAIT: get_timewait4_sock(v, seq, st->num, &len); break; } seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); out: return 0; } static const struct file_operations tcp_afinfo_seq_fops = { .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net }; static struct tcp_seq_afinfo tcp4_seq_afinfo = { .name = "tcp", .family = AF_INET, .seq_fops = &tcp_afinfo_seq_fops, .seq_ops = { .show = tcp4_seq_show, }, }; static int __net_init tcp4_proc_init_net(struct net *net) { return tcp_proc_register(net, &tcp4_seq_afinfo); } static void __net_exit tcp4_proc_exit_net(struct net *net) { tcp_proc_unregister(net, &tcp4_seq_afinfo); } static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, }; int __init tcp4_proc_init(void) { return register_pernet_subsys(&tcp4_net_ops); } void tcp4_proc_exit(void) { unregister_pernet_subsys(&tcp4_net_ops); } #endif /* CONFIG_PROC_FS */ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { const struct iphdr *iph = skb_gro_network_header(skb); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,

skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } /* fall through */ case CHECKSUM_NONE: NAPI_GRO_CB(skb)->flush = 1; return NULL; } return tcp_gro_receive(head, skb); } int tcp4_gro_complete(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), iph->saddr, iph->daddr, 0); skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; return tcp_gro_complete(skb); } struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_DESTROY_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif }; EXPORT_SYMBOL(tcp_prot); static int __net_init tcp_sk_init(struct net *net)

{ return inet_ctl_sock_create(&net->ipv4.tcp_sock, PF_INET, SOCK_RAW, IPPROTO_TCP, net); } static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, }; void __init tcp_v4_init(void) { inet_hashinfo_init(&tcp_hashinfo); if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); }

/* * linux/init/main.c * * Copyright (C) 1991, 1992 Linus Torvalds * * GK 2/5/95 - Changed to support mounting root fs via NFS * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ #include <linux/types.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/stackprotector.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/initrd.h> #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/tty.h> #include <linux/percpu.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/kernel_stat.h> #include <linux/start_kernel.h> #include <linux/security.h> #include <linux/smp.h> #include <linux/profile.h> #include <linux/rcupdate.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> #include <linux/writeback.h> #include <linux/cpu.h>

#include <linux/cpuset.h> #include <linux/cgroup.h> #include <linux/efi.h> #include <linux/tick.h> #include <linux/interrupt.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/unistd.h> #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> #include <linux/buffer_head.h> #include <linux/page_cgroup.h> #include <linux/debug_locks.h> #include <linux/debugobjects.h> #include <linux/lockdep.h> #include <linux/kmemleak.h> #include <linux/pid_namespace.h> #include <linux/device.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/signal.h> #include <linux/idr.h> #include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/async.h> #include <linux/kmemcheck.h> #include <linux/sfi.h> #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/io.h> #include <asm/bugs.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/cacheflush.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> #endif static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); extern void prio_tree_init(void); extern void radix_tree_init(void); extern void free_initmem(void); #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif #ifdef CONFIG_TC extern void tc_init(void); #endif /* * Debug helper: via this flag we know that we are in 'early bootup code' * where only the boot processor is running with IRQ disabled. This means * two things - IRQ must not be enabled before the flag is cleared and some * operations which are not allowed with IRQ disabled are allowed while the * flag is set. */ bool early_boot_irqs_disabled __read_mostly; enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state);

/* * Boot command-line arguments */ #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT extern void time_init(void); /* Default late time init is NULL. archs can override this later. */ void (*__initdata late_time_init)(void); extern void softirq_init(void); /* Untouched command line saved by arch-specific code. */ char __initdata boot_command_line[COMMAND_LINE_SIZE]; /* Untouched saved command line (eg. for /proc) */ char *saved_command_line; /* Command line for parameter parsing */ static char *static_command_line; static char *execute_command; static char *ramdisk_execute_command; /* * If set, this is an indication to the drivers that reset the underlying * device before going ahead with the initialization otherwise driver might * rely on the BIOS and skip the reset operation. * * This is useful if kernel is booting in an unreliable environment. * For ex. kdump situaiton where previous kernel has crashed, BIOS has been * skipped and devices will be in unknown state. */ unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); static int __init set_reset_devices(char *str) { reset_devices = 1; return 1; } __setup("reset_devices", set_reset_devices); static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; static int __init obsolete_checksetup(char *line) { const struct obs_kernel_param *p; int had_early_param = 0; p = __setup_start; do { int n = strlen(p->str); if (parameqn(line, p->str, n)) { if (p->early) { /* Already done in parse_early_param? * (Needs exact match on param part). * Keep iterating, as we can have early * params and __setups of same names 8( */ if (line[n] == '\0' || line[n] == '=') had_early_param = 1; } else if (!p->setup_func) { printk(KERN_WARNING "Parameter %s is obsolete," " ignored\n", p->str); return 1; } else if (p->setup_func(line + n)) return 1;

} p++; } while (p < __setup_end); return had_early_param; } /* * This should be approx 2 Bo*oMips to start (note initial shift), and will * still work even if initially too large, it will just take slightly longer */ unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { console_loglevel = 10; return 0; } static int __init quiet_kernel(char *str) { console_loglevel = 4; return 0; } early_param("debug", debug_kernel); early_param("quiet", quiet_kernel); static int __init loglevel(char *str) { int newlevel; /* * Only update loglevel value when a correct setting was passed, * to prevent blind crashes (when loglevel being set to 0) that * are quite hard to debug */ if (get_option(&str, &newlevel)) { console_loglevel = newlevel; return 0; } return -EINVAL; } early_param("loglevel", loglevel); /* * Unknown boot options get handed to init, unless they look like * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val) { /* Change NUL term back to "=", to make "param" the whole string. */ if (val) { /* param=val or param="val"? */ if (val == param+strlen(param)+1) val[-1] = '='; else if (val == param+strlen(param)+2) { val[-2] = '='; memmove(val-1, val, strlen(val)+1); val--; } else BUG(); } /* Handle obsolete-style parameters */ if (obsolete_checksetup(param))

return 0; /* Unused module parameter. */ if (strchr(param, '.') && (!val || strchr(param, '.') < val)) return 0; if (panic_later) return 0; if (val) { /* Environment option */ unsigned int i; for (i = 0; envp_init[i]; i++) { if (i == MAX_INIT_ENVS) { panic_later = "Too many boot env vars at `%s'"; panic_param = param; } if (!strncmp(param, envp_init[i], val - param)) break; } envp_init[i] = param; } else { /* Command line option */ unsigned int i; for (i = 0; argv_init[i]; i++) { if (i == MAX_INIT_ARGS) { panic_later = "Too many boot init vars at `%s'"; panic_param = param; } } argv_init[i] = param; } return 0; } #ifdef CONFIG_DEBUG_PAGEALLOC int __read_mostly debug_pagealloc_enabled = 0; #endif static int __init init_setup(char *str) { unsigned int i; execute_command = str; /* * In case LILO is going to boot us with default command line, * it prepends "auto" before the whole cmdline which makes * the shell think it should execute a script with such name. * So we ignore all arguments entered _before_ init=... [MJ] */ for (i = 1; i < MAX_INIT_ARGS; i++) argv_init[i] = NULL; return 1; } __setup("init=", init_setup); static int __init rdinit_setup(char *str) { unsigned int i; ramdisk_execute_command = str; /* See "auto" comment in init_setup */ for (i = 1; i < MAX_INIT_ARGS; i++) argv_init[i] = NULL; return 1; } __setup("rdinit=", rdinit_setup); #ifndef CONFIG_SMP static const unsigned int setup_max_cpus = NR_CPUS;

#ifdef CONFIG_X86_LOCAL_APIC static void __init smp_init(void) { APIC_init_uniprocessor(); } #else #define smp_init() do { } while (0) #endif static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } #endif /* * We need to store the untouched command line for future reference. * We also need to store the touched command line since the parameter * parsing is performed in place, and we should allow a component to * store reference of name/value for future reference. */ static void __init setup_command_line(char *command_line) { saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); static_command_line = alloc_bootmem(strlen (command_line)+1); strcpy (saved_command_line, boot_command_line); strcpy (static_command_line, command_line); } /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to * be reaped by free_initmem before the root thread has proceeded to * cpu_idle. * * gcc-3.4 accidentally inlines this function, so use noinline. */ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __init_refok rest_init(void) { int pid; rcu_scheduler_starting(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); complete(&kthreadd_done); /* * The boot idle thread must execute schedule() * at least once to get things moving: */ init_idle_bootup_task(current); preempt_enable_no_resched(); schedule(); /* Call into cpu_idle with preempt disabled */ preempt_disable(); cpu_idle(); } /* Check for early params. */

static int __init do_early_param(char *param, char *val) { const struct obs_kernel_param *p; for (p = __setup_start; p < __setup_end; p++) { if ((p->early && parameq(param, p->str)) || (strcmp(param, "console") == 0 && strcmp(p->str, "earlycon") == 0) ){ if (p->setup_func(val) != 0) printk(KERN_WARNING "Malformed early option '%s'\n", param); } } /* We accept everything at this stage. */ return 0; } void __init parse_early_options(char *cmdline) { parse_args("early options", cmdline, NULL, 0, do_early_param); } /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { static __initdata int done = 0; static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; if (done) return; /* All fall through to do_early_param. */ strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_early_options(tmp_cmdline); done = 1; } /* * */ Activate the first processor.

static void __init boot_cpu_init(void) { int cpu = smp_processor_id(); /* Mark the boot cpu "present", "online" etc for SMP and UP case */ set_cpu_online(cpu, true); set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); } void __init __weak smp_setup_processor_id(void) { } void __init __weak thread_info_cache_init(void) { } /* * Set up kernel memory allocators */ static void __init mm_init(void) { /* * page_cgroup requires countinous pages as memmap * and it's bigger than MAX_ORDER unless SPARSEMEM. */ page_cgroup_init_flatmem();

mem_init(); kmem_cache_init(); percpu_init_late(); pgtable_cache_init(); vmalloc_init(); } asmlinkage void __init start_kernel(void) { char * command_line; extern const struct kernel_param __start___param[], __stop___param[]; smp_setup_processor_id(); /* * Need to run as early as possible, to initialize the * lockdep hash: */ lockdep_init(); debug_objects_early_init(); /* * Set up the the initial canary ASAP: */ boot_init_stack_canary(); cgroup_init_early(); local_irq_disable(); early_boot_irqs_disabled = true; /* * Interrupts are still disabled. Do necessary setups, then * enable them */ tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); jump_label_init(); /* * These use large bootmem allocations and must precede * kmem_cache_init() */ setup_log_buf(0); pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); mm_init(); /*

* Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ preempt_disable(); if (!irqs_disabled()) { printk(KERN_WARNING "start_kernel(): bug: interrupts were " "enabled *very* early, fixing it\n"); local_irq_disable(); } idr_init_cache(); perf_event_init(); rcu_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); prio_tree_init(); init_timers(); hrtimers_init(); softirq_init(); timekeeping_init(); time_init(); profile_init(); call_function_init(); if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " "enabled early\n"); early_boot_irqs_disabled = false; local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ gfp_allowed_mask = __GFP_BITS_MASK; kmem_cache_init_late(); /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. */ console_init(); if (panic_later) panic(panic_later, panic_param); lockdep_info(); /* * Need to run this when irqs are enabled, because it wants * to self-test [hard/soft]-irqs on/off lock inversion bugs * too: */ locking_selftest(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " "disabling it.\n", page_to_pfn(virt_to_page((void *)initrd_start)), min_low_pfn); initrd_start = 0; } #endif page_cgroup_init();

enable_debug_pagealloc(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); #endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); proc_caches_init(); buffer_init(); key_init(); security_init(); dbg_late_init(); vfs_caches_init(totalram_pages); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif cgroup_init(); cpuset_init(); taskstats_init_early(); delayacct_init(); check_bugs(); acpi_early_init(); /* before LAPIC and SMP init */ sfi_init_late(); ftrace_init(); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } /* Call all constructor functions linked into the kernel. */ static void __init do_ctors(void) { #ifdef CONFIG_CONSTRUCTORS ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; for (; fn < (ctor_fn_t *) __ctors_end; fn++) (*fn)(); #endif } int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; static int __init_or_module do_one_initcall_debug(initcall_t fn) { ktime_t calltime, delta, rettime; unsigned long long duration; int ret; printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); calltime = ktime_get();

ret = fn(); rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", fn, ret, duration); return ret; } int __init_or_module do_one_initcall(initcall_t fn) { int count = preempt_count(); int ret; if (initcall_debug) ret = do_one_initcall_debug(fn); else ret = fn(); msgbuf[0] = 0; if (ret && ret != -ENODEV && initcall_debug) sprintf(msgbuf, "error code %d ", ret); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); preempt_count() = count; } if (irqs_disabled()) { strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); local_irq_enable(); } if (msgbuf[0]) { printk("initcall %pF returned with %s\n", fn, msgbuf); } return ret; } extern initcall_t __initcall_start[], __initcall_end[], __early_initcall_end[]; static void __init do_initcalls(void) { initcall_t *fn; for (fn = __early_initcall_end; fn < __initcall_end; fn++) do_one_initcall(*fn); } /* * Ok, the machine is now initialized. None of the devices * have been touched yet, but the CPU subsystem is up and * running, and memory and process management works. * * Now we can finally start doing some real work.. */ static void __init do_basic_setup(void) { cpuset_init_smp(); usermodehelper_init(); shmem_init(); driver_init(); init_irq_proc(); do_ctors(); usermodehelper_enable(); do_initcalls(); }

static void __init do_pre_smp_initcalls(void) { initcall_t *fn; for (fn = __initcall_start; fn < __early_initcall_end; fn++) do_one_initcall(*fn); } static void run_init_process(const char *init_filename) { argv_init[0] = init_filename; kernel_execve(init_filename, argv_init, envp_init); } /* This is a non __init function. Force it to be noinline otherwise gcc * makes it inline to init() and it becomes part of init.text section */ static noinline int init_post(void) { /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); mark_rodata_ro(); system_state = SYSTEM_RUNNING; numa_default_policy(); current->signal->flags |= SIGNAL_UNKILLABLE; if (ramdisk_execute_command) { run_init_process(ramdisk_execute_command); printk(KERN_WARNING "Failed to execute %s\n", ramdisk_execute_command); } /* * We try each of these until one succeeds. * * The Bourne shell can be used instead of init if we are * trying to recover a really broken machine. */ if (execute_command) { run_init_process(execute_command); printk(KERN_WARNING "Failed to execute %s. Attempting " "defaults...\n", execute_command); } run_init_process("/sbin/init"); run_init_process("/etc/init"); run_init_process("/bin/init"); run_init_process("/bin/sh"); panic("No init found. Try passing init= option to kernel. " "See Linux Documentation/init.txt for guidance."); } static int __init kernel_init(void * unused) { /* * Wait until kthreadd is all set-up. */ wait_for_completion(&kthreadd_done); /* * init can allocate pages on any node */ set_mems_allowed(node_states[N_HIGH_MEMORY]); /* * init can run on any cpu. */ set_cpus_allowed_ptr(current, cpu_all_mask);

cad_pid = task_pid(current); smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); lockup_detector_init(); smp_init(); sched_init_smp(); do_basic_setup(); /* Open the /dev/console on the rootfs, this should never fail */ if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) printk(KERN_WARNING "Warning: unable to open an initial console.\n"); (void) sys_dup(0); (void) sys_dup(0); /* * check if there is an early userspace init. If yes, let it do all * the work */ if (!ramdisk_execute_command) ramdisk_execute_command = "/init"; if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { ramdisk_execute_command = NULL; prepare_namespace(); } /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ init_post(); return 0; }

/* * Cryptographic API. * * AES Cipher Algorithm. * * Based on Brian Gladman's code. * * Linux developers: * Alexander Kjeldaas <astor@fast.no> * Herbert Valerio Riedel <hvr@hvrlab.org> * Kyle McMartin <kyle@debian.org> * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * --------------------------------------------------------------------------* Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. * All rights reserved. * * LICENSE TERMS * * The free distribution and use of this software in both source and binary * form is allowed (with or without changes) provided that: *

* 1. distributions of this source code include the above copyright * notice, this list of conditions and the following disclaimer; * * 2. distributions in binary form include the above copyright * notice, this list of conditions and the following disclaimer * in the documentation and/or other associated materials; * * 3. the copyright holder's name is not used to endorse products * built using this software without specific written permission. * * ALTERNATIVELY, provided that this notice is retained in full, this product * may be distributed under the terms of the GNU General Public License (GPL), * in which case the provisions of the GPL apply INSTEAD OF those given above. * * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------*/ #include <crypto/aes.h> #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/crypto.h> #include <asm/byteorder.h> static inline u8 byte(const u32 x, const unsigned n) { return x >> (n << 3); } static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 }; const u32 crypto_ft_tab[4][256] = { { 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec, 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb, 0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b, 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c, 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83, 0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a, 0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f, 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df, 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea, 0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b, 0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413, 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1, 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6, 0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85, 0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511, 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe, 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b, 0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1, 0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142,

0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf, 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3, 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e, 0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6, 0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b, 0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428, 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad, 0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8, 0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2, 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda, 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949, 0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810, 0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697, 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e, 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f, 0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c, 0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27, 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122, 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433, 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5, 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0, 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c, }, { 0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154, 0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a, 0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b, 0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea, 0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b, 0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a, 0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f, 0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908, 0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f, 0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e, 0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5, 0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d, 0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f, 0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e, 0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb, 0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce, 0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397, 0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c, 0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed, 0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b, 0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a, 0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16, 0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194, 0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81, 0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3, 0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a, 0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104, 0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263, 0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d, 0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f, 0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39, 0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47, 0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695,

0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f, 0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83, 0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c, 0xdedea779, 0x5e5ebce2, 0x0b0b161d, 0xdbdbad76, 0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e, 0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4, 0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6, 0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b, 0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7, 0x8d8d018c, 0xd5d5b164, 0x4e4e9cd2, 0xa9a949e0, 0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25, 0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018, 0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72, 0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751, 0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21, 0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85, 0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa, 0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12, 0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0, 0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9, 0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233, 0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7, 0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a, 0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8, 0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a, }, { 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5, 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76, 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0, 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf, 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0, 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26, 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc, 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1, 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15, 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3, 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a, 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2, 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75, 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a, 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0, 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3, 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784, 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced, 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b, 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39, 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf, 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb, 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485, 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f, 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8, 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f, 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5, 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321, 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2, 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec, 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917, 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d, 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573, 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc, 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388, 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14, 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db, 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a,

0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c, 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662, 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79, 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d, 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9, 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea, 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808, 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e, 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6, 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f, 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a, 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66, 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e, 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9, 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e, 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311, 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794, 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf, 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868, 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16, }, { 0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5, 0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676, 0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0, 0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf, 0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0, 0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626, 0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc, 0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1, 0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515, 0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3, 0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a, 0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2, 0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575, 0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a, 0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0, 0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3, 0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484, 0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded, 0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b, 0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939, 0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf, 0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb, 0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585, 0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f, 0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8, 0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f, 0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5, 0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121, 0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2, 0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec, 0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717, 0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d, 0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373, 0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc, 0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888, 0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414, 0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb, 0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a, 0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c, 0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262, 0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979, 0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d, 0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9,

0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea, 0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808, 0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e, 0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6, 0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f, 0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a, 0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666, 0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e, 0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9, 0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e, 0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111, 0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494, 0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf, 0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868, 0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616, } }; const u32 crypto_fl_tab[4][256] = { { 0x00000063, 0x0000007c, 0x00000077, 0x0000007b, 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5, 0x00000030, 0x00000001, 0x00000067, 0x0000002b, 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076, 0x000000ca, 0x00000082, 0x000000c9, 0x0000007d, 0x000000fa, 0x00000059, 0x00000047, 0x000000f0, 0x000000ad, 0x000000d4, 0x000000a2, 0x000000af, 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0, 0x000000b7, 0x000000fd, 0x00000093, 0x00000026, 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc, 0x00000034, 0x000000a5, 0x000000e5, 0x000000f1, 0x00000071, 0x000000d8, 0x00000031, 0x00000015, 0x00000004, 0x000000c7, 0x00000023, 0x000000c3, 0x00000018, 0x00000096, 0x00000005, 0x0000009a, 0x00000007, 0x00000012, 0x00000080, 0x000000e2, 0x000000eb, 0x00000027, 0x000000b2, 0x00000075, 0x00000009, 0x00000083, 0x0000002c, 0x0000001a, 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0, 0x00000052, 0x0000003b, 0x000000d6, 0x000000b3, 0x00000029, 0x000000e3, 0x0000002f, 0x00000084, 0x00000053, 0x000000d1, 0x00000000, 0x000000ed, 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b, 0x0000006a, 0x000000cb, 0x000000be, 0x00000039, 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf, 0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb, 0x00000043, 0x0000004d, 0x00000033, 0x00000085, 0x00000045, 0x000000f9, 0x00000002, 0x0000007f, 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8, 0x00000051, 0x000000a3, 0x00000040, 0x0000008f, 0x00000092, 0x0000009d, 0x00000038, 0x000000f5, 0x000000bc, 0x000000b6, 0x000000da, 0x00000021, 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2, 0x000000cd, 0x0000000c, 0x00000013, 0x000000ec, 0x0000005f, 0x00000097, 0x00000044, 0x00000017, 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d, 0x00000064, 0x0000005d, 0x00000019, 0x00000073, 0x00000060, 0x00000081, 0x0000004f, 0x000000dc, 0x00000022, 0x0000002a, 0x00000090, 0x00000088, 0x00000046, 0x000000ee, 0x000000b8, 0x00000014, 0x000000de, 0x0000005e, 0x0000000b, 0x000000db, 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a, 0x00000049, 0x00000006, 0x00000024, 0x0000005c, 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062, 0x00000091, 0x00000095, 0x000000e4, 0x00000079, 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d, 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9, 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea,

0x00000065, 0x0000007a, 0x000000ae, 0x00000008, 0x000000ba, 0x00000078, 0x00000025, 0x0000002e, 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6, 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f, 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a, 0x00000070, 0x0000003e, 0x000000b5, 0x00000066, 0x00000048, 0x00000003, 0x000000f6, 0x0000000e, 0x00000061, 0x00000035, 0x00000057, 0x000000b9, 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e, 0x000000e1, 0x000000f8, 0x00000098, 0x00000011, 0x00000069, 0x000000d9, 0x0000008e, 0x00000094, 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9, 0x000000ce, 0x00000055, 0x00000028, 0x000000df, 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d, 0x000000bf, 0x000000e6, 0x00000042, 0x00000068, 0x00000041, 0x00000099, 0x0000002d, 0x0000000f, 0x000000b0, 0x00000054, 0x000000bb, 0x00000016, }, { 0x00006300, 0x00007c00, 0x00007700, 0x00007b00, 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500, 0x00003000, 0x00000100, 0x00006700, 0x00002b00, 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600, 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00, 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000, 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00, 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000, 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600, 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00, 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100, 0x00007100, 0x0000d800, 0x00003100, 0x00001500, 0x00000400, 0x0000c700, 0x00002300, 0x0000c300, 0x00001800, 0x00009600, 0x00000500, 0x00009a00, 0x00000700, 0x00001200, 0x00008000, 0x0000e200, 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500, 0x00000900, 0x00008300, 0x00002c00, 0x00001a00, 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000, 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300, 0x00002900, 0x0000e300, 0x00002f00, 0x00008400, 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00, 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00, 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900, 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00, 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00, 0x00004300, 0x00004d00, 0x00003300, 0x00008500, 0x00004500, 0x0000f900, 0x00000200, 0x00007f00, 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800, 0x00005100, 0x0000a300, 0x00004000, 0x00008f00, 0x00009200, 0x00009d00, 0x00003800, 0x0000f500, 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100, 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200, 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00, 0x00005f00, 0x00009700, 0x00004400, 0x00001700, 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00, 0x00006400, 0x00005d00, 0x00001900, 0x00007300, 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00, 0x00002200, 0x00002a00, 0x00009000, 0x00008800, 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400, 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00, 0x0000e000, 0x00003200, 0x00003a00, 0x00000a00, 0x00004900, 0x00000600, 0x00002400, 0x00005c00, 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200, 0x00009100, 0x00009500, 0x0000e400, 0x00007900, 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00, 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900, 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00, 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800, 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00, 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600, 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00, 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,

0x00007000, 0x00003e00, 0x0000b500, 0x00006600, 0x00004800, 0x00000300, 0x0000f600, 0x00000e00, 0x00006100, 0x00003500, 0x00005700, 0x0000b900, 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00, 0x0000e100, 0x0000f800, 0x00009800, 0x00001100, 0x00006900, 0x0000d900, 0x00008e00, 0x00009400, 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900, 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00, 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00, 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800, 0x00004100, 0x00009900, 0x00002d00, 0x00000f00, 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600, }, { 0x00630000, 0x007c0000, 0x00770000, 0x007b0000, 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000, 0x00300000, 0x00010000, 0x00670000, 0x002b0000, 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000, 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000, 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000, 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000, 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000, 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000, 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000, 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000, 0x00710000, 0x00d80000, 0x00310000, 0x00150000, 0x00040000, 0x00c70000, 0x00230000, 0x00c30000, 0x00180000, 0x00960000, 0x00050000, 0x009a0000, 0x00070000, 0x00120000, 0x00800000, 0x00e20000, 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000, 0x00090000, 0x00830000, 0x002c0000, 0x001a0000, 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000, 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000, 0x00290000, 0x00e30000, 0x002f0000, 0x00840000, 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000, 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000, 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000, 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000, 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000, 0x00430000, 0x004d0000, 0x00330000, 0x00850000, 0x00450000, 0x00f90000, 0x00020000, 0x007f0000, 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000, 0x00510000, 0x00a30000, 0x00400000, 0x008f0000, 0x00920000, 0x009d0000, 0x00380000, 0x00f50000, 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000, 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000, 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000, 0x005f0000, 0x00970000, 0x00440000, 0x00170000, 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000, 0x00640000, 0x005d0000, 0x00190000, 0x00730000, 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000, 0x00220000, 0x002a0000, 0x00900000, 0x00880000, 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000, 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000, 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000, 0x00490000, 0x00060000, 0x00240000, 0x005c0000, 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000, 0x00910000, 0x00950000, 0x00e40000, 0x00790000, 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000, 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000, 0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000, 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000, 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000, 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000, 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000, 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000, 0x00700000, 0x003e0000, 0x00b50000, 0x00660000, 0x00480000, 0x00030000, 0x00f60000, 0x000e0000, 0x00610000, 0x00350000, 0x00570000, 0x00b90000, 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000, 0x00e10000, 0x00f80000, 0x00980000, 0x00110000,

0x00690000, 0x00d90000, 0x008e0000, 0x00940000, 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000, 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000, 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000, 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000, 0x00410000, 0x00990000, 0x002d0000, 0x000f0000, 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000, }, { 0x63000000, 0x7c000000, 0x77000000, 0x7b000000, 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000, 0x30000000, 0x01000000, 0x67000000, 0x2b000000, 0xfe000000, 0xd7000000, 0xab000000, 0x76000000, 0xca000000, 0x82000000, 0xc9000000, 0x7d000000, 0xfa000000, 0x59000000, 0x47000000, 0xf0000000, 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000, 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000, 0xb7000000, 0xfd000000, 0x93000000, 0x26000000, 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000, 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000, 0x71000000, 0xd8000000, 0x31000000, 0x15000000, 0x04000000, 0xc7000000, 0x23000000, 0xc3000000, 0x18000000, 0x96000000, 0x05000000, 0x9a000000, 0x07000000, 0x12000000, 0x80000000, 0xe2000000, 0xeb000000, 0x27000000, 0xb2000000, 0x75000000, 0x09000000, 0x83000000, 0x2c000000, 0x1a000000, 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000, 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000, 0x29000000, 0xe3000000, 0x2f000000, 0x84000000, 0x53000000, 0xd1000000, 0x00000000, 0xed000000, 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000, 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000, 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000, 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000, 0x43000000, 0x4d000000, 0x33000000, 0x85000000, 0x45000000, 0xf9000000, 0x02000000, 0x7f000000, 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000, 0x51000000, 0xa3000000, 0x40000000, 0x8f000000, 0x92000000, 0x9d000000, 0x38000000, 0xf5000000, 0xbc000000, 0xb6000000, 0xda000000, 0x21000000, 0x10000000, 0xff000000, 0xf3000000, 0xd2000000, 0xcd000000, 0x0c000000, 0x13000000, 0xec000000, 0x5f000000, 0x97000000, 0x44000000, 0x17000000, 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000, 0x64000000, 0x5d000000, 0x19000000, 0x73000000, 0x60000000, 0x81000000, 0x4f000000, 0xdc000000, 0x22000000, 0x2a000000, 0x90000000, 0x88000000, 0x46000000, 0xee000000, 0xb8000000, 0x14000000, 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000, 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000, 0x49000000, 0x06000000, 0x24000000, 0x5c000000, 0xc2000000, 0xd3000000, 0xac000000, 0x62000000, 0x91000000, 0x95000000, 0xe4000000, 0x79000000, 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000, 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000, 0x6c000000, 0x56000000, 0xf4000000, 0xea000000, 0x65000000, 0x7a000000, 0xae000000, 0x08000000, 0xba000000, 0x78000000, 0x25000000, 0x2e000000, 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000, 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000, 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000, 0x70000000, 0x3e000000, 0xb5000000, 0x66000000, 0x48000000, 0x03000000, 0xf6000000, 0x0e000000, 0x61000000, 0x35000000, 0x57000000, 0xb9000000, 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000, 0xe1000000, 0xf8000000, 0x98000000, 0x11000000, 0x69000000, 0xd9000000, 0x8e000000, 0x94000000, 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000, 0xce000000, 0x55000000, 0x28000000, 0xdf000000, 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000, 0xbf000000, 0xe6000000, 0x42000000, 0x68000000,

0x41000000, 0x99000000, 0x2d000000, 0x0f000000, 0xb0000000, 0x54000000, 0xbb000000, 0x16000000, } }; const u32 crypto_it_tab[4][256] = { { 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b, 0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e, 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d, 0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9, 0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566, 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed, 0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4, 0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd, 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060, 0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879, 0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c, 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624, 0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c, 0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14, 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b, 0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684, 0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177, 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322, 0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f, 0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382, 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb, 0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef, 0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235, 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117, 0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546, 0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d, 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a, 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478, 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff, 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664,

0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0, }, { 0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93, 0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f, 0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6, 0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44, 0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4, 0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994, 0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a, 0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c, 0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a, 0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51, 0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff, 0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db, 0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e, 0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a, 0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16, 0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8, 0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34, 0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420, 0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0, 0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef, 0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4, 0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5, 0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b, 0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6, 0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0, 0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f, 0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f, 0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13, 0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c, 0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886, 0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41, 0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042, }, { 0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303, 0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c,

0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3, 0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9, 0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8, 0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a, 0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b, 0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab, 0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82, 0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe, 0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110, 0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15, 0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee, 0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72, 0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e, 0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a, 0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9, 0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e, 0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011, 0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3, 0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90, 0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf, 0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af, 0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb, 0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8, 0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066, 0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6, 0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51, 0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347, 0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1, 0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db, 0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195, 0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257, }, { 0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3, 0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362, 0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3, 0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9,

0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace, 0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08, 0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55, 0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216, 0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6, 0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e, 0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550, 0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8, 0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a, 0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36, 0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12, 0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e, 0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb, 0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6, 0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1, 0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033, 0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad, 0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3, 0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b, 0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815, 0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2, 0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691, 0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165, 0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6, 0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147, 0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44, 0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d, 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, } }; const u32 crypto_il_tab[4][256] = { { 0x00000052, 0x00000009, 0x0000006a, 0x000000d5, 0x00000030, 0x00000036, 0x000000a5, 0x00000038, 0x000000bf, 0x00000040, 0x000000a3, 0x0000009e, 0x00000081, 0x000000f3, 0x000000d7, 0x000000fb, 0x0000007c, 0x000000e3, 0x00000039, 0x00000082, 0x0000009b, 0x0000002f, 0x000000ff, 0x00000087, 0x00000034, 0x0000008e, 0x00000043, 0x00000044, 0x000000c4, 0x000000de, 0x000000e9, 0x000000cb, 0x00000054, 0x0000007b, 0x00000094, 0x00000032,

0x000000a6, 0x000000c2, 0x00000023, 0x0000003d, 0x000000ee, 0x0000004c, 0x00000095, 0x0000000b, 0x00000042, 0x000000fa, 0x000000c3, 0x0000004e, 0x00000008, 0x0000002e, 0x000000a1, 0x00000066, 0x00000028, 0x000000d9, 0x00000024, 0x000000b2, 0x00000076, 0x0000005b, 0x000000a2, 0x00000049, 0x0000006d, 0x0000008b, 0x000000d1, 0x00000025, 0x00000072, 0x000000f8, 0x000000f6, 0x00000064, 0x00000086, 0x00000068, 0x00000098, 0x00000016, 0x000000d4, 0x000000a4, 0x0000005c, 0x000000cc, 0x0000005d, 0x00000065, 0x000000b6, 0x00000092, 0x0000006c, 0x00000070, 0x00000048, 0x00000050, 0x000000fd, 0x000000ed, 0x000000b9, 0x000000da, 0x0000005e, 0x00000015, 0x00000046, 0x00000057, 0x000000a7, 0x0000008d, 0x0000009d, 0x00000084, 0x00000090, 0x000000d8, 0x000000ab, 0x00000000, 0x0000008c, 0x000000bc, 0x000000d3, 0x0000000a, 0x000000f7, 0x000000e4, 0x00000058, 0x00000005, 0x000000b8, 0x000000b3, 0x00000045, 0x00000006, 0x000000d0, 0x0000002c, 0x0000001e, 0x0000008f, 0x000000ca, 0x0000003f, 0x0000000f, 0x00000002, 0x000000c1, 0x000000af, 0x000000bd, 0x00000003, 0x00000001, 0x00000013, 0x0000008a, 0x0000006b, 0x0000003a, 0x00000091, 0x00000011, 0x00000041, 0x0000004f, 0x00000067, 0x000000dc, 0x000000ea, 0x00000097, 0x000000f2, 0x000000cf, 0x000000ce, 0x000000f0, 0x000000b4, 0x000000e6, 0x00000073, 0x00000096, 0x000000ac, 0x00000074, 0x00000022, 0x000000e7, 0x000000ad, 0x00000035, 0x00000085, 0x000000e2, 0x000000f9, 0x00000037, 0x000000e8, 0x0000001c, 0x00000075, 0x000000df, 0x0000006e, 0x00000047, 0x000000f1, 0x0000001a, 0x00000071, 0x0000001d, 0x00000029, 0x000000c5, 0x00000089, 0x0000006f, 0x000000b7, 0x00000062, 0x0000000e, 0x000000aa, 0x00000018, 0x000000be, 0x0000001b, 0x000000fc, 0x00000056, 0x0000003e, 0x0000004b, 0x000000c6, 0x000000d2, 0x00000079, 0x00000020, 0x0000009a, 0x000000db, 0x000000c0, 0x000000fe, 0x00000078, 0x000000cd, 0x0000005a, 0x000000f4, 0x0000001f, 0x000000dd, 0x000000a8, 0x00000033, 0x00000088, 0x00000007, 0x000000c7, 0x00000031, 0x000000b1, 0x00000012, 0x00000010, 0x00000059, 0x00000027, 0x00000080, 0x000000ec, 0x0000005f, 0x00000060, 0x00000051, 0x0000007f, 0x000000a9, 0x00000019, 0x000000b5, 0x0000004a, 0x0000000d, 0x0000002d, 0x000000e5, 0x0000007a, 0x0000009f, 0x00000093, 0x000000c9, 0x0000009c, 0x000000ef, 0x000000a0, 0x000000e0, 0x0000003b, 0x0000004d, 0x000000ae, 0x0000002a, 0x000000f5, 0x000000b0, 0x000000c8, 0x000000eb, 0x000000bb, 0x0000003c, 0x00000083, 0x00000053, 0x00000099, 0x00000061, 0x00000017, 0x0000002b, 0x00000004, 0x0000007e, 0x000000ba, 0x00000077, 0x000000d6, 0x00000026, 0x000000e1, 0x00000069, 0x00000014, 0x00000063, 0x00000055, 0x00000021, 0x0000000c, 0x0000007d, }, { 0x00005200, 0x00000900, 0x00006a00, 0x0000d500, 0x00003000, 0x00003600, 0x0000a500, 0x00003800, 0x0000bf00, 0x00004000, 0x0000a300, 0x00009e00, 0x00008100, 0x0000f300, 0x0000d700, 0x0000fb00, 0x00007c00, 0x0000e300, 0x00003900, 0x00008200, 0x00009b00, 0x00002f00, 0x0000ff00, 0x00008700, 0x00003400, 0x00008e00, 0x00004300, 0x00004400, 0x0000c400, 0x0000de00, 0x0000e900, 0x0000cb00, 0x00005400, 0x00007b00, 0x00009400, 0x00003200, 0x0000a600, 0x0000c200, 0x00002300, 0x00003d00, 0x0000ee00, 0x00004c00, 0x00009500, 0x00000b00, 0x00004200, 0x0000fa00, 0x0000c300, 0x00004e00, 0x00000800, 0x00002e00, 0x0000a100, 0x00006600, 0x00002800, 0x0000d900, 0x00002400, 0x0000b200,

0x00007600, 0x00005b00, 0x0000a200, 0x00004900, 0x00006d00, 0x00008b00, 0x0000d100, 0x00002500, 0x00007200, 0x0000f800, 0x0000f600, 0x00006400, 0x00008600, 0x00006800, 0x00009800, 0x00001600, 0x0000d400, 0x0000a400, 0x00005c00, 0x0000cc00, 0x00005d00, 0x00006500, 0x0000b600, 0x00009200, 0x00006c00, 0x00007000, 0x00004800, 0x00005000, 0x0000fd00, 0x0000ed00, 0x0000b900, 0x0000da00, 0x00005e00, 0x00001500, 0x00004600, 0x00005700, 0x0000a700, 0x00008d00, 0x00009d00, 0x00008400, 0x00009000, 0x0000d800, 0x0000ab00, 0x00000000, 0x00008c00, 0x0000bc00, 0x0000d300, 0x00000a00, 0x0000f700, 0x0000e400, 0x00005800, 0x00000500, 0x0000b800, 0x0000b300, 0x00004500, 0x00000600, 0x0000d000, 0x00002c00, 0x00001e00, 0x00008f00, 0x0000ca00, 0x00003f00, 0x00000f00, 0x00000200, 0x0000c100, 0x0000af00, 0x0000bd00, 0x00000300, 0x00000100, 0x00001300, 0x00008a00, 0x00006b00, 0x00003a00, 0x00009100, 0x00001100, 0x00004100, 0x00004f00, 0x00006700, 0x0000dc00, 0x0000ea00, 0x00009700, 0x0000f200, 0x0000cf00, 0x0000ce00, 0x0000f000, 0x0000b400, 0x0000e600, 0x00007300, 0x00009600, 0x0000ac00, 0x00007400, 0x00002200, 0x0000e700, 0x0000ad00, 0x00003500, 0x00008500, 0x0000e200, 0x0000f900, 0x00003700, 0x0000e800, 0x00001c00, 0x00007500, 0x0000df00, 0x00006e00, 0x00004700, 0x0000f100, 0x00001a00, 0x00007100, 0x00001d00, 0x00002900, 0x0000c500, 0x00008900, 0x00006f00, 0x0000b700, 0x00006200, 0x00000e00, 0x0000aa00, 0x00001800, 0x0000be00, 0x00001b00, 0x0000fc00, 0x00005600, 0x00003e00, 0x00004b00, 0x0000c600, 0x0000d200, 0x00007900, 0x00002000, 0x00009a00, 0x0000db00, 0x0000c000, 0x0000fe00, 0x00007800, 0x0000cd00, 0x00005a00, 0x0000f400, 0x00001f00, 0x0000dd00, 0x0000a800, 0x00003300, 0x00008800, 0x00000700, 0x0000c700, 0x00003100, 0x0000b100, 0x00001200, 0x00001000, 0x00005900, 0x00002700, 0x00008000, 0x0000ec00, 0x00005f00, 0x00006000, 0x00005100, 0x00007f00, 0x0000a900, 0x00001900, 0x0000b500, 0x00004a00, 0x00000d00, 0x00002d00, 0x0000e500, 0x00007a00, 0x00009f00, 0x00009300, 0x0000c900, 0x00009c00, 0x0000ef00, 0x0000a000, 0x0000e000, 0x00003b00, 0x00004d00, 0x0000ae00, 0x00002a00, 0x0000f500, 0x0000b000, 0x0000c800, 0x0000eb00, 0x0000bb00, 0x00003c00, 0x00008300, 0x00005300, 0x00009900, 0x00006100, 0x00001700, 0x00002b00, 0x00000400, 0x00007e00, 0x0000ba00, 0x00007700, 0x0000d600, 0x00002600, 0x0000e100, 0x00006900, 0x00001400, 0x00006300, 0x00005500, 0x00002100, 0x00000c00, 0x00007d00, }, { 0x00520000, 0x00090000, 0x006a0000, 0x00d50000, 0x00300000, 0x00360000, 0x00a50000, 0x00380000, 0x00bf0000, 0x00400000, 0x00a30000, 0x009e0000, 0x00810000, 0x00f30000, 0x00d70000, 0x00fb0000, 0x007c0000, 0x00e30000, 0x00390000, 0x00820000, 0x009b0000, 0x002f0000, 0x00ff0000, 0x00870000, 0x00340000, 0x008e0000, 0x00430000, 0x00440000, 0x00c40000, 0x00de0000, 0x00e90000, 0x00cb0000, 0x00540000, 0x007b0000, 0x00940000, 0x00320000, 0x00a60000, 0x00c20000, 0x00230000, 0x003d0000, 0x00ee0000, 0x004c0000, 0x00950000, 0x000b0000, 0x00420000, 0x00fa0000, 0x00c30000, 0x004e0000, 0x00080000, 0x002e0000, 0x00a10000, 0x00660000, 0x00280000, 0x00d90000, 0x00240000, 0x00b20000, 0x00760000, 0x005b0000, 0x00a20000, 0x00490000, 0x006d0000, 0x008b0000, 0x00d10000, 0x00250000, 0x00720000, 0x00f80000, 0x00f60000, 0x00640000, 0x00860000, 0x00680000, 0x00980000, 0x00160000, 0x00d40000, 0x00a40000, 0x005c0000, 0x00cc0000,

0x005d0000, 0x00650000, 0x00b60000, 0x00920000, 0x006c0000, 0x00700000, 0x00480000, 0x00500000, 0x00fd0000, 0x00ed0000, 0x00b90000, 0x00da0000, 0x005e0000, 0x00150000, 0x00460000, 0x00570000, 0x00a70000, 0x008d0000, 0x009d0000, 0x00840000, 0x00900000, 0x00d80000, 0x00ab0000, 0x00000000, 0x008c0000, 0x00bc0000, 0x00d30000, 0x000a0000, 0x00f70000, 0x00e40000, 0x00580000, 0x00050000, 0x00b80000, 0x00b30000, 0x00450000, 0x00060000, 0x00d00000, 0x002c0000, 0x001e0000, 0x008f0000, 0x00ca0000, 0x003f0000, 0x000f0000, 0x00020000, 0x00c10000, 0x00af0000, 0x00bd0000, 0x00030000, 0x00010000, 0x00130000, 0x008a0000, 0x006b0000, 0x003a0000, 0x00910000, 0x00110000, 0x00410000, 0x004f0000, 0x00670000, 0x00dc0000, 0x00ea0000, 0x00970000, 0x00f20000, 0x00cf0000, 0x00ce0000, 0x00f00000, 0x00b40000, 0x00e60000, 0x00730000, 0x00960000, 0x00ac0000, 0x00740000, 0x00220000, 0x00e70000, 0x00ad0000, 0x00350000, 0x00850000, 0x00e20000, 0x00f90000, 0x00370000, 0x00e80000, 0x001c0000, 0x00750000, 0x00df0000, 0x006e0000, 0x00470000, 0x00f10000, 0x001a0000, 0x00710000, 0x001d0000, 0x00290000, 0x00c50000, 0x00890000, 0x006f0000, 0x00b70000, 0x00620000, 0x000e0000, 0x00aa0000, 0x00180000, 0x00be0000, 0x001b0000, 0x00fc0000, 0x00560000, 0x003e0000, 0x004b0000, 0x00c60000, 0x00d20000, 0x00790000, 0x00200000, 0x009a0000, 0x00db0000, 0x00c00000, 0x00fe0000, 0x00780000, 0x00cd0000, 0x005a0000, 0x00f40000, 0x001f0000, 0x00dd0000, 0x00a80000, 0x00330000, 0x00880000, 0x00070000, 0x00c70000, 0x00310000, 0x00b10000, 0x00120000, 0x00100000, 0x00590000, 0x00270000, 0x00800000, 0x00ec0000, 0x005f0000, 0x00600000, 0x00510000, 0x007f0000, 0x00a90000, 0x00190000, 0x00b50000, 0x004a0000, 0x000d0000, 0x002d0000, 0x00e50000, 0x007a0000, 0x009f0000, 0x00930000, 0x00c90000, 0x009c0000, 0x00ef0000, 0x00a00000, 0x00e00000, 0x003b0000, 0x004d0000, 0x00ae0000, 0x002a0000, 0x00f50000, 0x00b00000, 0x00c80000, 0x00eb0000, 0x00bb0000, 0x003c0000, 0x00830000, 0x00530000, 0x00990000, 0x00610000, 0x00170000, 0x002b0000, 0x00040000, 0x007e0000, 0x00ba0000, 0x00770000, 0x00d60000, 0x00260000, 0x00e10000, 0x00690000, 0x00140000, 0x00630000, 0x00550000, 0x00210000, 0x000c0000, 0x007d0000, }, { 0x52000000, 0x09000000, 0x6a000000, 0xd5000000, 0x30000000, 0x36000000, 0xa5000000, 0x38000000, 0xbf000000, 0x40000000, 0xa3000000, 0x9e000000, 0x81000000, 0xf3000000, 0xd7000000, 0xfb000000, 0x7c000000, 0xe3000000, 0x39000000, 0x82000000, 0x9b000000, 0x2f000000, 0xff000000, 0x87000000, 0x34000000, 0x8e000000, 0x43000000, 0x44000000, 0xc4000000, 0xde000000, 0xe9000000, 0xcb000000, 0x54000000, 0x7b000000, 0x94000000, 0x32000000, 0xa6000000, 0xc2000000, 0x23000000, 0x3d000000, 0xee000000, 0x4c000000, 0x95000000, 0x0b000000, 0x42000000, 0xfa000000, 0xc3000000, 0x4e000000, 0x08000000, 0x2e000000, 0xa1000000, 0x66000000, 0x28000000, 0xd9000000, 0x24000000, 0xb2000000, 0x76000000, 0x5b000000, 0xa2000000, 0x49000000, 0x6d000000, 0x8b000000, 0xd1000000, 0x25000000, 0x72000000, 0xf8000000, 0xf6000000, 0x64000000, 0x86000000, 0x68000000, 0x98000000, 0x16000000, 0xd4000000, 0xa4000000, 0x5c000000, 0xcc000000, 0x5d000000, 0x65000000, 0xb6000000, 0x92000000, 0x6c000000, 0x70000000, 0x48000000, 0x50000000, 0xfd000000, 0xed000000, 0xb9000000, 0xda000000, 0x5e000000, 0x15000000, 0x46000000, 0x57000000, 0xa7000000, 0x8d000000, 0x9d000000, 0x84000000,

0x90000000, 0xd8000000, 0xab000000, 0x00000000, 0x8c000000, 0xbc000000, 0xd3000000, 0x0a000000, 0xf7000000, 0xe4000000, 0x58000000, 0x05000000, 0xb8000000, 0xb3000000, 0x45000000, 0x06000000, 0xd0000000, 0x2c000000, 0x1e000000, 0x8f000000, 0xca000000, 0x3f000000, 0x0f000000, 0x02000000, 0xc1000000, 0xaf000000, 0xbd000000, 0x03000000, 0x01000000, 0x13000000, 0x8a000000, 0x6b000000, 0x3a000000, 0x91000000, 0x11000000, 0x41000000, 0x4f000000, 0x67000000, 0xdc000000, 0xea000000, 0x97000000, 0xf2000000, 0xcf000000, 0xce000000, 0xf0000000, 0xb4000000, 0xe6000000, 0x73000000, 0x96000000, 0xac000000, 0x74000000, 0x22000000, 0xe7000000, 0xad000000, 0x35000000, 0x85000000, 0xe2000000, 0xf9000000, 0x37000000, 0xe8000000, 0x1c000000, 0x75000000, 0xdf000000, 0x6e000000, 0x47000000, 0xf1000000, 0x1a000000, 0x71000000, 0x1d000000, 0x29000000, 0xc5000000, 0x89000000, 0x6f000000, 0xb7000000, 0x62000000, 0x0e000000, 0xaa000000, 0x18000000, 0xbe000000, 0x1b000000, 0xfc000000, 0x56000000, 0x3e000000, 0x4b000000, 0xc6000000, 0xd2000000, 0x79000000, 0x20000000, 0x9a000000, 0xdb000000, 0xc0000000, 0xfe000000, 0x78000000, 0xcd000000, 0x5a000000, 0xf4000000, 0x1f000000, 0xdd000000, 0xa8000000, 0x33000000, 0x88000000, 0x07000000, 0xc7000000, 0x31000000, 0xb1000000, 0x12000000, 0x10000000, 0x59000000, 0x27000000, 0x80000000, 0xec000000, 0x5f000000, 0x60000000, 0x51000000, 0x7f000000, 0xa9000000, 0x19000000, 0xb5000000, 0x4a000000, 0x0d000000, 0x2d000000, 0xe5000000, 0x7a000000, 0x9f000000, 0x93000000, 0xc9000000, 0x9c000000, 0xef000000, 0xa0000000, 0xe0000000, 0x3b000000, 0x4d000000, 0xae000000, 0x2a000000, 0xf5000000, 0xb0000000, 0xc8000000, 0xeb000000, 0xbb000000, 0x3c000000, 0x83000000, 0x53000000, 0x99000000, 0x61000000, 0x17000000, 0x2b000000, 0x04000000, 0x7e000000, 0xba000000, 0x77000000, 0xd6000000, 0x26000000, 0xe1000000, 0x69000000, 0x14000000, 0x63000000, 0x55000000, 0x21000000, 0x0c000000, 0x7d000000, } }; EXPORT_SYMBOL_GPL(crypto_ft_tab); EXPORT_SYMBOL_GPL(crypto_fl_tab); EXPORT_SYMBOL_GPL(crypto_it_tab); EXPORT_SYMBOL_GPL(crypto_il_tab); /* initialise the key schedule from the user supplied key */ #define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) #define imix_col(y, x) do { u = star_x(x); v = star_x(u); w = star_x(v); t = w ^ (x); (y) = u ^ v ^ w; (y) ^= ror32(u ^ t, 8) ^ \ ror32(v ^ t, 16) ^ \ ror32(t, 24); } while (0) #define ls_box(x) \ crypto_fl_tab[0][byte(x, 0)] ^ crypto_fl_tab[1][byte(x, 1)] ^ crypto_fl_tab[2][byte(x, 2)] ^ crypto_fl_tab[3][byte(x, 3)] #define loop4(i) do { \ \ \ \ \ \ \ \

\ \ \

t = ror32(t, 8); \ t = ls_box(t) ^ rco_tab[i]; \ t ^= ctx->key_enc[4 * i]; ctx->key_enc[4 * i + 4] = t; t ^= ctx->key_enc[4 * i + 1]; ctx->key_enc[4 * i + 5] = t; t ^= ctx->key_enc[4 * i + 2]; ctx->key_enc[4 * i + 6] = t; t ^= ctx->key_enc[4 * i + 3]; ctx->key_enc[4 * i + 7] = t; } while (0) #define loop6(i) do { \ t = ror32(t, 8); \ t = ls_box(t) ^ rco_tab[i]; \ t ^= ctx->key_enc[6 * i]; ctx->key_enc[6 * i + 6] = t; t ^= ctx->key_enc[6 * i + 1]; ctx->key_enc[6 * i + 7] = t; t ^= ctx->key_enc[6 * i + 2]; ctx->key_enc[6 * i + 8] = t; t ^= ctx->key_enc[6 * i + 3]; ctx->key_enc[6 * i + 9] = t; t ^= ctx->key_enc[6 * i + 4]; ctx->key_enc[6 * i + 10] = t; t ^= ctx->key_enc[6 * i + 5]; ctx->key_enc[6 * i + 11] = t; } while (0) #define loop8tophalf(i) do { t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; t ^= ctx->key_enc[8 * i]; ctx->key_enc[8 * i + 8] = t; t ^= ctx->key_enc[8 * i + 1]; ctx->key_enc[8 * i + 9] = t; t ^= ctx->key_enc[8 * i + 2]; ctx->key_enc[8 * i + 10] = t; t ^= ctx->key_enc[8 * i + 3]; ctx->key_enc[8 * i + 11] = t; } while (0)

\ \ \ \ \ \ \ \

\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \

#define loop8(i) do { loop8tophalf(i); t = ctx->key_enc[8 * i + 4] ^ ls_box(t); ctx->key_enc[8 * i + 12] = t; t ^= ctx->key_enc[8 * i + 5]; ctx->key_enc[8 * i + 13] = t; t ^= ctx->key_enc[8 * i + 6]; ctx->key_enc[8 * i + 14] = t; t ^= ctx->key_enc[8 * i + 7]; ctx->key_enc[8 * i + 15] = t; } while (0)

/** * crypto_aes_expand_key - Expands the AES key as described in FIPS-197 * @ctx: The location where the computed key will be stored. * @in_key: The supplied key. * @key_len: The length of the supplied key. * * Returns 0 on success. The function fails only if an invalid key size (or * pointer) is supplied. * The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes * key schedule plus a 16 bytes key which is used before the first round). * The decryption key is prepared for the "Equivalent Inverse Cipher" as * described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is * for the initial combination, the second slot for the first round and so on. */ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len)

{ const __le32 *key = (const __le32 *)in_key; u32 i, t, u, v, w, j; if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && key_len != AES_KEYSIZE_256) return -EINVAL; ctx->key_length = key_len; ctx->key_dec[key_len + 24] = ctx->key_enc[0] = le32_to_cpu(key[0]); ctx->key_dec[key_len + 25] = ctx->key_enc[1] = le32_to_cpu(key[1]); ctx->key_dec[key_len + 26] = ctx->key_enc[2] = le32_to_cpu(key[2]); ctx->key_dec[key_len + 27] = ctx->key_enc[3] = le32_to_cpu(key[3]); switch (key_len) { case AES_KEYSIZE_128: t = ctx->key_enc[3]; for (i = 0; i < 10; ++i) loop4(i); break; case AES_KEYSIZE_192: ctx->key_enc[4] = le32_to_cpu(key[4]); t = ctx->key_enc[5] = le32_to_cpu(key[5]); for (i = 0; i < 8; ++i) loop6(i); break; case AES_KEYSIZE_256: ctx->key_enc[4] = le32_to_cpu(key[4]); ctx->key_enc[5] = le32_to_cpu(key[5]); ctx->key_enc[6] = le32_to_cpu(key[6]); t = ctx->key_enc[7] = le32_to_cpu(key[7]); for (i = 0; i < 6; ++i) loop8(i); loop8tophalf(i); break; } ctx->key_dec[0] = ctx->key_enc[key_len + 24]; ctx->key_dec[1] = ctx->key_enc[key_len + 25]; ctx->key_dec[2] = ctx->key_enc[key_len + 26]; ctx->key_dec[3] = ctx->key_enc[key_len + 27]; for (i = 4; i < key_len + 24; ++i) { j = key_len + 24 - (i & ~3) + (i & 3); imix_col(ctx->key_dec[j], ctx->key_enc[i]); } return 0; } EXPORT_SYMBOL_GPL(crypto_aes_expand_key); /** * crypto_aes_set_key - Set the AES key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * * Returns 0 on success, on failure the %CRYPTO_TFM_RES_BAD_KEY_LEN flag in tfm * is set. The function uses crypto_aes_expand_key() to expand the key. * &crypto_aes_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). */ int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; int ret;

ret = crypto_aes_expand_key(ctx, in_key, key_len); if (!ret) return 0; *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } EXPORT_SYMBOL_GPL(crypto_aes_set_key); /* encrypt a block of text */ #define f_rn(bo, bi, n, k) do { \ bo[n] = crypto_ft_tab[0][byte(bi[n], 0)] ^ crypto_ft_tab[1][byte(bi[(n + 1) & 3], 1)] ^ crypto_ft_tab[2][byte(bi[(n + 2) & 3], 2)] ^ crypto_ft_tab[3][byte(bi[(n + 3) & 3], 3)] ^ *(k + n); } while (0) #define f_nround(bo, bi, k) f_rn(bo, bi, 0, k); f_rn(bo, bi, 1, k); f_rn(bo, bi, 2, k); f_rn(bo, bi, 3, k); k += 4; } while (0) do {\ \ \ \ \ \ \ \ \ \

#define f_rl(bo, bi, n, k) do { \ bo[n] = crypto_fl_tab[0][byte(bi[n], 0)] ^ crypto_fl_tab[1][byte(bi[(n + 1) & 3], 1)] ^ crypto_fl_tab[2][byte(bi[(n + 2) & 3], 2)] ^ crypto_fl_tab[3][byte(bi[(n + 3) & 3], 3)] ^ *(k + n); } while (0) #define f_lround(bo, bi, k) f_rl(bo, bi, 0, k); f_rl(bo, bi, 1, k); f_rl(bo, bi, 2, k); f_rl(bo, bi, 3, k); } while (0) do {\ \ \ \ \

\ \ \ \

static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); const __le32 *src = (const __le32 *)in; __le32 *dst = (__le32 *)out; u32 b0[4], b1[4]; const u32 *kp = ctx->key_enc + 4; const int key_len = ctx->key_length; b0[0] = le32_to_cpu(src[0]) ^ ctx->key_enc[0]; b0[1] = le32_to_cpu(src[1]) ^ ctx->key_enc[1]; b0[2] = le32_to_cpu(src[2]) ^ ctx->key_enc[2]; b0[3] = le32_to_cpu(src[3]) ^ ctx->key_enc[3]; if (key_len > 24) { f_nround(b1, b0, kp); f_nround(b0, b1, kp); } if (key_len > 16) { f_nround(b1, b0, kp); f_nround(b0, b1, kp); } f_nround(b1, b0, kp); f_nround(b0, b1, kp); f_nround(b1, b0, kp); f_nround(b0, b1, kp); f_nround(b1, b0, kp);

f_nround(b0, b1, kp); f_nround(b1, b0, kp); f_nround(b0, b1, kp); f_nround(b1, b0, kp); f_lround(b0, b1, kp); dst[0] = cpu_to_le32(b0[0]); dst[1] = cpu_to_le32(b0[1]); dst[2] = cpu_to_le32(b0[2]); dst[3] = cpu_to_le32(b0[3]); } /* decrypt a block of text */ #define i_rn(bo, bi, n, k) do { \ bo[n] = crypto_it_tab[0][byte(bi[n], 0)] ^ crypto_it_tab[1][byte(bi[(n + 3) & 3], 1)] ^ crypto_it_tab[2][byte(bi[(n + 2) & 3], 2)] ^ crypto_it_tab[3][byte(bi[(n + 1) & 3], 3)] ^ *(k + n); } while (0) #define i_nround(bo, bi, k) i_rn(bo, bi, 0, k); i_rn(bo, bi, 1, k); i_rn(bo, bi, 2, k); i_rn(bo, bi, 3, k); k += 4; } while (0) do {\ \ \ \ \ \ \ \ \ \

#define i_rl(bo, bi, n, k) do { \ bo[n] = crypto_il_tab[0][byte(bi[n], 0)] ^ crypto_il_tab[1][byte(bi[(n + 3) & 3], 1)] ^ crypto_il_tab[2][byte(bi[(n + 2) & 3], 2)] ^ crypto_il_tab[3][byte(bi[(n + 1) & 3], 3)] ^ *(k + n); } while (0) #define i_lround(bo, bi, k) i_rl(bo, bi, 0, k); i_rl(bo, bi, 1, k); i_rl(bo, bi, 2, k); i_rl(bo, bi, 3, k); } while (0) do {\ \ \ \ \

\ \ \ \

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); const __le32 *src = (const __le32 *)in; __le32 *dst = (__le32 *)out; u32 b0[4], b1[4]; const int key_len = ctx->key_length; const u32 *kp = ctx->key_dec + 4; b0[0] = le32_to_cpu(src[0]) ^ b0[1] = le32_to_cpu(src[1]) ^ b0[2] = le32_to_cpu(src[2]) ^ b0[3] = le32_to_cpu(src[3]) ^ ctx->key_dec[0]; ctx->key_dec[1]; ctx->key_dec[2]; ctx->key_dec[3];

if (key_len > 24) { i_nround(b1, b0, kp); i_nround(b0, b1, kp); } if (key_len > 16) { i_nround(b1, b0, kp); i_nround(b0, b1, kp); } i_nround(b1, b0, kp); i_nround(b0, b1, kp); i_nround(b1, b0, kp);

i_nround(b0, b1, kp); i_nround(b1, b0, kp); i_nround(b0, b1, kp); i_nround(b1, b0, kp); i_nround(b0, b1, kp); i_nround(b1, b0, kp); i_lround(b0, b1, kp); dst[0] = cpu_to_le32(b0[0]); dst[1] = cpu_to_le32(b0[1]); dst[2] = cpu_to_le32(b0[2]); dst[3] = cpu_to_le32(b0[3]); } static struct crypto_alg aes_alg = { .cra_name = "aes", .cra_driver_name = "aes-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_alignmask = 3, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, .cia_setkey = crypto_aes_set_key, .cia_encrypt = aes_encrypt, .cia_decrypt = aes_decrypt } } }; static int __init aes_init(void) { return crypto_register_alg(&aes_alg); } static void __exit aes_fini(void) { crypto_unregister_alg(&aes_alg); } module_init(aes_init); module_exit(aes_fini); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("aes");

Linux kernel release 3.x <http://kernel.org/> These are the release notes for Linux version 3. Read them carefully, as they tell you what this is all about, explain how to install the kernel, and what to do if something goes wrong. WHAT IS LINUX? Linux is a clone of the operating system Unix, written from scratch by Linus Torvalds with assistance from a loosely-knit team of hackers across the Net. It aims towards POSIX and Single UNIX Specification compliance. It has all the features you would expect in a modern fully-fledged Unix, including true multitasking, virtual memory, shared libraries, demand loading, shared copy-on-write executables, proper memory management, and multistack networking including IPv4 and IPv6.

It is distributed under the GNU General Public License - see the accompanying COPYING file for more details. ON WHAT HARDWARE DOES IT RUN? Although originally developed first for 32-bit x86-based PCs (386 or higher), today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell, IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS, Xtensa, Tilera TILE, AVR32 and Renesas M32R architectures. Linux is easily portable to most general-purpose 32- or 64-bit architectures as long as they have a paged memory management unit (PMMU) and a port of the GNU C compiler (gcc) (part of The GNU Compiler Collection, GCC). Linux has also been ported to a number of architectures without a PMMU, although functionality is then obviously somewhat limited. Linux has also been ported to itself. You can now run the kernel as a userspace application - this is called UserMode Linux (UML). DOCUMENTATION: - There is a lot of documentation available both in electronic form on the Internet and in books, both Linux-specific and pertaining to general UNIX questions. I'd recommend looking into the documentation subdirectories on any Linux FTP site for the LDP (Linux Documentation Project) books. This README is not meant to be documentation on the system: there are much better sources available. - There are various README files in the Documentation/ subdirectory: these typically contain kernel-specific installation notes for some drivers for example. See Documentation/00-INDEX for a list of what is contained in each file. Please read the Changes file, as it contains information about the problems, which may result by upgrading your kernel. - The Documentation/DocBook/ subdirectory contains several guides for kernel developers and users. These guides can be rendered in a number of formats: PostScript (.ps), PDF, HTML, & man-pages, among others. After installation, "make psdocs", "make pdfdocs", "make htmldocs", or "make mandocs" will render the documentation in the requested format. INSTALLING the kernel source: - If you install the full sources, put the kernel tarball in a directory where you have permissions (eg. your home directory) and unpack it: gzip -cd linux-3.X.tar.gz | tar xvf or bzip2 -dc linux-3.X.tar.bz2 | tar xvf Replace "XX" with the version number of the latest kernel. Do NOT use the /usr/src/linux area! This area has a (usually incomplete) set of kernel headers that are used by the library header files. They should match the library, and not get messed up by whatever the kernel-du-jour happens to be. - You can also upgrade between 3.x releases by patching. Patches are distributed in the traditional gzip and the newer bzip2 format. To install by patching, get all the newer patch files, enter the top level directory of the kernel source (linux-3.x) and execute: gzip -cd ../patch-3.x.gz | patch -p1 or bzip2 -dc ../patch-3.x.bz2 | patch -p1

(repeat xx for all versions bigger than the version of your current source tree, _in_order_) and you should be ok. You may want to remove the backup files (xxx~ or xxx.orig), and make sure that there are no failed patches (xxx# or xxx.rej). If there are, either you or me has made a mistake. Unlike patches for the 3.x kernels, patches for the 3.x.y kernels (also known as the -stable kernels) are not incremental but instead apply directly to the base 3.x kernel. Please read Documentation/applying-patches.txt for more information. Alternatively, the script patch-kernel can be used to automate this process. It determines the current kernel version and applies any patches found. linux/scripts/patch-kernel linux The first argument in the command above is the location of the kernel source. Patches are applied from the current directory, but an alternative directory can be specified as the second argument. - If you are upgrading between releases using the stable series patches (for example, patch-3.x.y), note that these "dot-releases" are not incremental and must be applied to the 3.x base tree. For example, if your base kernel is 3.0 and you want to apply the 3.0.3 patch, you do not and indeed must not first apply the 3.0.1 and 3.0.2 patches. Similarly, if you are running kernel version 3.0.2 and want to jump to 3.0.3, you must first reverse the 3.0.2 patch (that is, patch -R) _before_ applying the 3.0.3 patch. You can read more on this in Documentation/applying-patches.txt - Make sure you have no stale .o files and dependencies lying around: cd linux make mrproper You should now have the sources correctly installed. SOFTWARE REQUIREMENTS Compiling and running the 3.x kernels requires up-to-date versions of various software packages. Consult Documentation/Changes for the minimum version numbers required and how to get updates for these packages. Beware that using excessively old versions of these packages can cause indirect errors that are very difficult to track down, so don't assume that you can just update packages when obvious problems arise during build or operation. BUILD directory for the kernel: When compiling the kernel all output files will per default be stored together with the kernel source code. Using the option "make O=output/dir" allow you to specify an alternate place for the output files (including .config). Example: kernel source code: /usr/src/linux-3.N build directory: /home/name/build/kernel To configure and build the kernel use: cd /usr/src/linux-3.N make O=/home/name/build/kernel menuconfig make O=/home/name/build/kernel sudo make O=/home/name/build/kernel modules_install install Please note: If the 'O=output/dir' option is used then it must be used for all invocations of make.

CONFIGURING the kernel: Do not skip this step even if you are only upgrading one minor version. New configuration options are added in each release, and odd problems will turn up if the configuration files are not set up as expected. If you want to carry your existing configuration to a new version with minimal work, use "make oldconfig", which will only ask you for the answers to new questions. - Alternate configuration commands are: "make config" Plain text interface. "make menuconfig" Text based color menus, radiolists & dialogs. "make nconfig" Enhanced text based color menus. "make xconfig" X windows (Qt) based configuration tool. "make gconfig" X windows (Gtk) based configuration tool. "make oldconfig" Default all questions based on the contents of your existing ./.config file and asking about new config symbols. "make silentoldconfig" Like above, but avoids cluttering the screen with questions already answered. Additionally updates the dependencies. "make defconfig" Create a ./.config file by using the default symbol values from either arch/$ARCH/defconfig or arch/$ARCH/configs/${PLATFORM}_defconfig, depending on the architecture. "make ${PLATFORM}_defconfig" Create a ./.config file by using the default symbol values from arch/$ARCH/configs/${PLATFORM}_defconfig. Use "make help" to get a list of all available platforms of your architecture. "make allyesconfig" Create a ./.config file by setting symbol values to 'y' as much as possible. "make allmodconfig" Create a ./.config file by setting symbol values to 'm' as much as possible. "make allnoconfig" Create a ./.config file by setting symbol values to 'n' as much as possible. "make randconfig" Create a ./.config file by setting symbol values to random values. You can find more information on using the Linux kernel config tools in Documentation/kbuild/kconfig.txt. NOTES on "make config": - having unnecessary drivers will make the kernel bigger, and can under some circumstances lead to problems: probing for a nonexistent controller card may confuse your other controllers - compiling the kernel with "Processor type" set higher than 386 will result in a kernel that does NOT work on a 386. The kernel will detect this on bootup, and give up. - A kernel with math-emulation compiled in will still use the coprocessor if one is present: the math emulation will just never get used in that case. The kernel will be slightly larger, but will work on different machines regardless of whether they have a math coprocessor or not. - the "kernel hacking" configuration details usually result in a bigger or slower kernel (or both), and can even make the kernel less stable by configuring some routines to actively try to break bad code to find kernel problems (kmalloc()). Thus you should probably answer 'n' to the questions for "development", "experimental", or "debugging" features. COMPILING the kernel: - Make sure you have at least gcc 3.2 available. For more information, refer to Documentation/Changes.

Please note that you can still run a.out user programs with this kernel. - Do a "make" to create a compressed kernel image. It is also possible to do "make install" if you have lilo installed to suit the kernel makefiles, but you may want to check your particular lilo setup first. To do the actual install you have to be root, but none of the normal build should require that. Don't take the name of root in vain. - If you configured any of the parts of the kernel as `modules', you will also have to do "make modules_install". - Verbose kernel compile/build output: Normally the kernel build system runs in a fairly quiet mode (but not totally silent). However, sometimes you or other kernel developers need to see compile, link, or other commands exactly as they are executed. For this, use "verbose" build mode. This is done by inserting "V=1" in the "make" command. E.g.: make V=1 all To have the build system also tell the reason for the rebuild of each target, use "V=2". The default is "V=0". - Keep a backup kernel handy in case something goes wrong. This is especially true for the development releases, since each new release contains new code which has not been debugged. Make sure you keep a backup of the modules corresponding to that kernel, as well. If you are installing a new kernel with the same version number as your working kernel, make a backup of your modules directory before you do a "make modules_install". Alternatively, before compiling, use the kernel config option "LOCALVERSION" to append a unique suffix to the regular kernel version. LOCALVERSION can be set in the "General Setup" menu. - In order to boot your new kernel, you'll need to copy the kernel image (e.g. .../linux/arch/i386/boot/bzImage after compilation) to the place where your regular bootable kernel is found. - Booting a kernel directly from a floppy without the assistance of a bootloader such as LILO, is no longer supported. If you boot Linux from the hard drive, chances are you use LILO which uses the kernel image as specified in the file /etc/lilo.conf. The kernel image file is usually /vmlinuz, /boot/vmlinuz, /bzImage or /boot/bzImage. To use the new kernel, save a copy of the old image and copy the new image over the old one. Then, you MUST RERUN LILO to update the loading map!! If you don't, you won't be able to boot the new kernel image. Reinstalling LILO is usually a matter of running /sbin/lilo. You may wish to edit /etc/lilo.conf to specify an entry for your old kernel image (say, /vmlinux.old) in case the new one does not work. See the LILO docs for more information. After reinstalling LILO, you should be all set. Shutdown the system, reboot, and enjoy! If you ever need to change the default root device, video mode, ramdisk size, etc. in the kernel image, use the 'rdev' program (or alternatively the LILO boot options when appropriate). No need to recompile the kernel to change these parameters. - Reboot with the new kernel and enjoy. IF SOMETHING GOES WRONG: - If you have problems that seem to be due to kernel bugs, please check the file MAINTAINERS to see if there is a particular person associated

with the part of the kernel that you are having trouble with. If there isn't anyone listed there, then the second best thing is to mail them to me (torvalds@linux-foundation.org), and possibly to any other relevant mailing-list or to the newsgroup. - In all bug-reports, *please* tell what kernel you are talking about, how to duplicate the problem, and what your setup is (use your common sense). If the problem is new, tell me so, and if the problem is old, please try to tell me when you first noticed it. - If the bug results in a message like unable to handle kernel paging request at address C0000010 Oops: 0002 EIP: 0010:XXXXXXXX eax: xxxxxxxx ebx: xxxxxxxx ecx: xxxxxxxx edx: xxxxxxxx esi: xxxxxxxx edi: xxxxxxxx ebp: xxxxxxxx ds: xxxx es: xxxx fs: xxxx gs: xxxx Pid: xx, process nr: xx xx xx xx xx xx xx xx xx xx xx or similar kernel debugging information on your screen or in your system log, please duplicate it *exactly*. The dump may look incomprehensible to you, but it does contain information that may help debugging the problem. The text above the dump is also important: it tells something about why the kernel dumped code (in the above example it's due to a bad kernel pointer). More information on making sense of the dump is in Documentation/oops-tracing.txt - If you compiled the kernel with CONFIG_KALLSYMS you can send the dump as is, otherwise you will have to use the "ksymoops" program to make sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred). This utility can be downloaded from ftp://ftp.<country>.kernel.org/pub/linux/utils/kernel/ksymoops/ . Alternately you can do the dump lookup by hand: - In debugging dumps like the above, it helps enormously if you can look up what the EIP value means. The hex value as such doesn't help me or anybody else very much: it will depend on your particular kernel setup. What you should do is take the hex value from the EIP line (ignore the "0010:"), and look it up in the kernel namelist to see which kernel function contains the offending address. To find out the kernel function name, you'll need to find the system binary associated with the kernel that exhibited the symptom. This is the file 'linux/vmlinux'. To extract the namelist and match it against the EIP from the kernel crash, do: nm vmlinux | sort | less This will give you a list of kernel addresses sorted in ascending order, from which it is simple to find the function that contains the offending address. Note that the address given by the kernel debugging messages will not necessarily match exactly with the function addresses (in fact, that is very unlikely), so you can't just 'grep' the list: the list will, however, give you the starting point of each kernel function, so by looking for the function that has a starting address lower than the one you are searching for but is followed by a function with a higher address you will find the one you want. In fact, it may be a good idea to include a bit of "context" in your problem report, giving a few lines around the interesting one. If you for some reason cannot do the above (you have a pre-compiled kernel image or similar), telling me as much about your setup as possible will help. Please read the REPORTING-BUGS document for details. - Alternately, you can use gdb on a running kernel. (read-only; i.e. you cannot change values or set break points.) To do this, first compile the kernel with -g; edit arch/i386/Makefile appropriately, then do a "make

clean". You'll also need to enable CONFIG_PROC_FS (via "make config"). After you've rebooted with the new kernel, do "gdb vmlinux /proc/kcore". You can now use all the usual gdb commands. The command to look up the point where your system crashed is "l *0xXXXXXXXX". (Replace the XXXes with the EIP value.) gdb'ing a non-running kernel currently fails because gdb (wrongly) disregards the starting offset for which the kernel is compiled.

You might also like