From f3720830189356bed9247951ae4d7a1a6347ff10 Mon Sep 17 00:00:00 2001
From: Alexandre SIMON
Date: Fri, 1 Feb 2013 15:31:54 +0100
Subject: printk: fix buffer overflow when calling log_prefix function from call_console_drivers

This patch corrects a buffer overflow in kernels from 3.0 to 3.4 when calling the log_prefix() function from call_console_drivers().

This bug existed in previous releases but was revealed by commit 162a7e7500f9664636e649ba59defe541b7c2c60 (2.6.39 => 3.0), which changed how memory for the early printk buffer is allocated (use of memblock_alloc). It disappears with commit 7ff9554bb578ba02166071d2d487b7fc7d860d62 (3.4 => 3.5), which refactors printk buffer management.

In log_prefix(), the access to "p[0]", "p[1]", "p[2]" or "simple_strtoul(&p[1], &endp, 10)" may cause a buffer overflow because this function is called from call_console_drivers() with "&LOG_BUF(cur_index)", where the index must be masked so that it does not exceed the buffer's boundary.

The trick is to prepare, in call_console_drivers(), a buffer with the necessary data (the PRI field of the syslog message) that can be safely evaluated in log_prefix().

This patch can be applied to the stable kernel branches 3.0.y, 3.2.y and 3.4.y.

Without this patch, one can freeze a server by running this loop from a shell:

 $ export DUMMY=`cat /dev/urandom | tr -dc '12345AZERTYUIOPQSDFGHJKLMWXCVBNazertyuiopqsdfghjklmwxcvbn' | head -c255`
 $ while true do ; echo $DUMMY > /dev/kmsg ; done

Whether the server actually freezes depends on where memblock_alloc places the printk buffer: if the overflow lands inside another kernel allocation the problem may not be revealed, otherwise the server may hang.

Signed-off-by: Alexandre SIMON
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/syslog.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/syslog.h b/include/linux/syslog.h
index 3891139..ce4c665 100644
--- a/include/linux/syslog.h
+++ b/include/linux/syslog.h
@@ -47,6 +47,12 @@
 #define SYSLOG_FROM_CALL 0
 #define SYSLOG_FROM_FILE 1
+/*
+ * Syslog priority (PRI) maximum length in char : '<[0-9]{1,3}>'
+ * See RFC5424 for details
+*/
+#define SYSLOG_PRI_MAX_LENGTH 5
+
 int do_syslog(int type, char __user *buf, int count, bool from_file);
 #endif /* _LINUX_SYSLOG_H */
-- cgit v1.1

From ed5ac19078a65a66008f9bef0037b56828349b5b Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Mon, 8 Oct 2012 16:29:24 -0700
Subject: mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule

commit 21a92735f660eaecf69a6f2e777f18463760ec32 upstream.

With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_{start,end}() or mmu_notifier_invalidate_page() would not be allowed to call schedule() as that could potentially allow a modification to the mmu_notifier structure while it is currently being used.

Since srcu allocates 4 machine words per instance per cpu, we may end up with memory exhaustion if we use srcu per mm. So all mms share a global srcu. Note that during large mmu_notifier activity exit & unregister paths might hang for longer periods, but it is tolerable for current mmu_notifier clients.

Signed-off-by: Sagi Grimberg
Signed-off-by: Andrea Arcangeli
Cc: Peter Zijlstra
Cc: Haggai Eran
Cc: "Paul E.
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mmu_notifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 1d1b1e1..ee2baf0 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -4,6 +4,7 @@ #include #include #include +#include struct mmu_notifier; struct mmu_notifier_ops; -- cgit v1.1 From d631d0d60c31bed15002ac656cbbfba3d6ce99b5 Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Thu, 21 Feb 2013 01:55:50 +0000 Subject: ALSA: usb: Fix Processing Unit Descriptor parsers commit b531f81b0d70ffbe8d70500512483227cc532608 upstream. Commit 99fc86450c439039d2ef88d06b222fd51a779176 "ALSA: usb-mixer: parse descriptors with structs" introduced a set of useful parsers for descriptors. Unfortunately the parses for the Processing Unit Descriptor came with a very subtle bug... Functions uac_processing_unit_iProcessing() and uac_processing_unit_specific() were indexing the baSourceID array forgetting the fields before the iProcessing and process-specific descriptors. The problem was observed with Sound Blaster Extigy mixer, where nNrModes in Up/Down-mix Processing Unit Descriptor was accessed at offset 10 of the descriptor (value 0) instead of offset 15 (value 7). In result the resulting control had interesting limit values: Simple mixer control 'Channel Routing Mode Select',0 Capabilities: volume volume-joined penum Playback channels: Mono Capture channels: Mono Limits: 0 - -1 Mono: -1 [100%] Fixed by starting from the bmControls, which was calculated correctly, instead of baSourceID. Now the mentioned control is fine: Simple mixer control 'Channel Routing Mode Select',0 Capabilities: volume volume-joined penum Playback channels: Mono Capture channels: Mono Limits: 0 - 6 Mono: 0 [0%] Signed-off-by: Pawel Moll Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/audio.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h index a54b825..6f8b026 100644 --- a/include/linux/usb/audio.h +++ b/include/linux/usb/audio.h @@ -384,14 +384,16 @@ static inline __u8 uac_processing_unit_iProcessing(struct uac_processing_unit_de int protocol) { __u8 control_size = uac_processing_unit_bControlSize(desc, protocol); - return desc->baSourceID[desc->bNrInPins + control_size]; + return *(uac_processing_unit_bmControls(desc, protocol) + + control_size); } static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_descriptor *desc, int protocol) { __u8 control_size = uac_processing_unit_bControlSize(desc, protocol); - return &desc->baSourceID[desc->bNrInPins + control_size + 1]; + return uac_processing_unit_bmControls(desc, protocol) + + control_size + 1; } /* 4.5.2 Class-Specific AS Interface Descriptor */ -- cgit v1.1 From 8eac4364548b8f53476602969a2fba65d029d8b7 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 17 Nov 2011 16:42:19 -0500 Subject: NLS: improve UTF8 -> UTF16 string conversion routine commit 0720a06a7518c9d0c0125bd5d1f3b6264c55c3dd upstream. The utf8s_to_utf16s conversion routine needs to be improved. Unlike its utf16s_to_utf8s sibling, it doesn't accept arguments specifying the maximum length of the output buffer or the endianness of its 16-bit output. 
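For illustration only (not part of the patch), here is a minimal userspace sketch of a conversion routine that has the two missing knobs, an explicit output limit and an explicit output byte order. The demo_* names are invented, only the Basic Multilingual Plane is handled, and invalid sequences are simply rejected; the kernel routine is more complete:

#include <stdio.h>

enum demo_utf16_endian { DEMO_UTF16_LE, DEMO_UTF16_BE };

/* Decode one UTF-8 sequence (BMP only, no surrogate pairs); returns bytes consumed or -1. */
static int demo_decode_utf8(const unsigned char *s, int len, unsigned int *cp)
{
	if (len >= 1 && s[0] < 0x80) {
		*cp = s[0];
		return 1;
	}
	if (len >= 2 && (s[0] & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80) {
		*cp = ((s[0] & 0x1fu) << 6) | (s[1] & 0x3fu);
		return 2;
	}
	if (len >= 3 && (s[0] & 0xf0) == 0xe0 &&
	    (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
		*cp = ((s[0] & 0x0fu) << 12) | ((s[1] & 0x3fu) << 6) | (s[2] & 0x3fu);
		return 3;
	}
	return -1;	/* reject anything else, including 4-byte sequences */
}

/*
 * Emit at most max_units UTF-16 code units (2 bytes each) in the requested
 * byte order, stopping early instead of overflowing the output buffer.
 */
static int demo_utf8s_to_utf16s(const unsigned char *s, int len,
				enum demo_utf16_endian endian,
				unsigned char *out, int max_units)
{
	int units = 0;

	while (len > 0 && units < max_units) {
		unsigned int cp;
		int used = demo_decode_utf8(s, len, &cp);

		if (used < 0)
			return -1;
		out[2 * units]     = (endian == DEMO_UTF16_BE) ? (cp >> 8) : (cp & 0xff);
		out[2 * units + 1] = (endian == DEMO_UTF16_BE) ? (cp & 0xff) : (cp >> 8);
		units++;
		s += used;
		len -= used;
	}
	return units;
}

int main(void)
{
	unsigned char buf[8];	/* deliberately small: room for only 4 code units */
	int n = demo_utf8s_to_utf16s((const unsigned char *)"h\xc3\xa9llo", 6,
				     DEMO_UTF16_LE, buf, 4);

	printf("emitted %d UTF-16 code units\n", n);
	return 0;
}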
This patch (as1501) adds the two missing arguments, and adjusts the only two places in the kernel where the function is called. A follow-on patch will add a third caller that does utilize the new capabilities. The two conversion routines are still annoyingly inconsistent in the way they handle invalid byte combinations. But that's a subject for a different patch. Signed-off-by: Alan Stern CC: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- include/linux/nls.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nls.h b/include/linux/nls.h index d47beef..5dc635f 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -43,7 +43,7 @@ enum utf16_endian { UTF16_BIG_ENDIAN }; -/* nls.c */ +/* nls_base.c */ extern int register_nls(struct nls_table *); extern int unregister_nls(struct nls_table *); extern struct nls_table *load_nls(char *); @@ -52,7 +52,8 @@ extern struct nls_table *load_nls_default(void); extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); -extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); +extern int utf8s_to_utf16s(const u8 *s, int len, + enum utf16_endian endian, wchar_t *pwcs, int maxlen); extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u8 *s, int maxlen); -- cgit v1.1 From abd9120040d5f427b950561277f2846b0a80be44 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 25 Jan 2013 10:28:15 +1000 Subject: fb: rework locking to fix lock ordering on takeover commit 50e244cc793d511b86adea24972f3a7264cae114 upstream. Adjust the console layer to allow a take over call where the caller already holds the locks. Make the fb layer lock in order. This is partly a band aid, the fb layer is terminally confused about the locking rules it uses for its notifiers it seems. [akpm@linux-foundation.org: remove stray non-ascii char, tidy comment] [akpm@linux-foundation.org: export do_take_over_console()] [airlied: cleanup another non-ascii char] Signed-off-by: Alan Cox Cc: Florian Tobias Schandinat Cc: Stephen Rothwell Cc: Jiri Kosina Tested-by: Sedat Dilek Reviewed-by: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 7453cfd..49b1061 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -78,6 +78,7 @@ int con_is_bound(const struct consw *csw); int register_con_driver(const struct consw *csw, int first, int last); int unregister_con_driver(const struct consw *csw); int take_over_console(const struct consw *sw, int first, int last, int deflt); +int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_HW_CONSOLE int con_debug_enter(struct vc_data *vc); -- cgit v1.1 From 49a656f8337670ffc66f28235f371767f5d25f42 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 25 Jan 2013 10:28:18 +1000 Subject: fb: Yet another band-aid for fixing lockdep mess commit e93a9a868792ad71cdd09d75e5a02d8067473c4e upstream. I've still got lockdep warnings even after Alan's patch, and it seems that yet more band aids are required to paper over similar paths for unbind_con_driver() and unregister_con_driver(). After this hack, lockdep warnings are finally gone. 
Signed-off-by: Takashi Iwai Cc: Alan Cox Cc: Florian Tobias Schandinat Cc: Jiri Kosina Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 1 + include/linux/vt_kern.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 49b1061..6ae6a15 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -77,6 +77,7 @@ extern const struct consw prom_con; /* SPARC PROM console */ int con_is_bound(const struct consw *csw); int register_con_driver(const struct consw *csw, int first, int last); int unregister_con_driver(const struct consw *csw); +int do_unregister_con_driver(const struct consw *csw); int take_over_console(const struct consw *sw, int first, int last, int deflt); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index 4d05e14..90538b4 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -131,6 +131,8 @@ void vt_event_post(unsigned int event, unsigned int old, unsigned int new); int vt_waitactive(int n); void change_console(struct vc_data *new_vc); void reset_vc(struct vc_data *vc); +extern int do_unbind_con_driver(const struct consw *csw, int first, int last, + int deflt); extern int unbind_con_driver(const struct consw *csw, int first, int last, int deflt); int vty_init(const struct file_operations *console_fops); -- cgit v1.1 From ae593067dbed83010fee8ad59bab7948f3d3601f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Feb 2013 12:18:52 +0000 Subject: ipv6: use a stronger hash for tcp [ Upstream commit 08dcdbf6a7b9d14c2302c5bd0c5390ddf122f664 ] It looks like its possible to open thousands of TCP IPv6 sessions on a server, all landing in a single slot of TCP hash table. Incoming packets have to lookup sockets in a very long list. We should hash all bits from foreign IPv6 addresses, using a salt and hash mix, not a simple XOR. inet6_ehashfn() can also separately use the ports, instead of xoring them. Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet6_hashtables.h | 8 ++++---- include/net/inet_sock.h | 1 + include/net/ipv6.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index e46674d..f9ce2fa 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -28,16 +28,16 @@ struct inet_hashinfo; -/* I have no idea if this is a good hash for v6 or not. 
-DaveM */ static inline unsigned int inet6_ehashfn(struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { - u32 ports = (lport ^ (__force u16)fport); + u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words((__force u32)laddr->s6_addr32[3], - (__force u32)faddr->s6_addr32[3], - ports, inet_ehash_secret + net_hash_mix(net)); + ipv6_addr_jhash(faddr), + ports, + inet_ehash_secret + net_hash_mix(net)); } static inline int inet6_sk_ehashfn(const struct sock *sk) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 14dd9c7..26490b3 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -199,6 +199,7 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, extern int inet_sk_rebuild_header(struct sock *sk); extern u32 inet_ehash_secret; +extern u32 ipv6_hash_secret; extern void build_ehash_secret(void); static inline unsigned int inet_ehashfn(struct net *net, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c39121f..879aadf 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -386,6 +387,17 @@ struct ip6_create_arg { void ip6_frag_init(struct inet_frag_queue *q, void *a); int ip6_frag_match(struct inet_frag_queue *q, void *a); +/* more secured version of ipv6_addr_hash() */ +static inline u32 ipv6_addr_jhash(const struct in6_addr *a) +{ + u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; + + return jhash_3words(v, + (__force u32)a->s6_addr32[2], + (__force u32)a->s6_addr32[3], + ipv6_hash_secret); +} + static inline int ipv6_addr_any(const struct in6_addr *a) { return (a->s6_addr32[0] | a->s6_addr32[1] | -- cgit v1.1 From 1adbb5db21a1e08537061607abc9b0a9e7e12848 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 24 Jan 2013 23:24:56 -0500 Subject: quota: autoload the quota_v2 module for QFMT_VFS_V1 quota format commit c3ad83d9efdfe6a86efd44945a781f00c879b7b4 upstream. Otherwise, ext4 file systems with the quota featured enable will get a very confusing "No such process" error message if the quota code is built as a module and the quota_v2 module has not been loaded. Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino Acked-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- include/linux/quota.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/quota.h b/include/linux/quota.h index 9a85412..a6dd995 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -413,6 +413,7 @@ struct quota_module_name { #define INIT_QUOTA_MODULE_NAMES {\ {QFMT_VFS_OLD, "quota_v1"},\ {QFMT_VFS_V0, "quota_v2"},\ + {QFMT_VFS_V1, "quota_v2"},\ {0, NULL}} #else -- cgit v1.1 From 964b12560e1d50f31bc1cc0ac662d52bdbdb6f40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Feb 2013 14:56:51 +0100 Subject: ptrace: introduce signal_wake_up_state() and ptrace_signal_wake_up() Upstream commit 910ffdb18a6408e14febbb6e4b6840fd2c928c82. Cleanup and preparation for the next change. signal_wake_up(resume => true) is overused. None of ptrace/jctl callers actually want to wakeup a TASK_WAKEKILL task, but they can't specify the necessary mask. Turn signal_wake_up() into signal_wake_up_state(state), reintroduce signal_wake_up() as a trivial helper, and add ptrace_signal_wake_up() which adds __TASK_TRACED. This way ptrace_signal_wake_up() can work "inside" ptrace_request() even if the tracee doesn't have the TASK_WAKEKILL bit set. 
Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dae42e7..d728bab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2564,7 +2564,16 @@ static inline void thread_group_cputime_init(struct signal_struct *sig) extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); -extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} /* * Wrappers for p->thread_info->cpu access. No-op on UP. -- cgit v1.1 From 43f514598ad16af1c6dd08dd429631eaafef7849 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 4 Feb 2013 19:39:52 +0000 Subject: unbreak automounter support on 64-bit kernel with 32-bit userspace (v2) commit 4f4ffc3a5398ef9bdbb32db04756d7d34e356fcf upstream. automount-support is broken on the parisc architecture, because the existing #if list does not include a check for defined(__hppa__). The HPPA (parisc) architecture is similiar to other 64bit Linux targets where we have to define autofs_wqt_t (which is passed back and forth to user space) as int type which has a size of 32bit across 32 and 64bit kernels. During the discussion on the mailing list, H. Peter Anvin suggested to invert the #if list since only specific platforms (specifically those who do not have a 32bit userspace, like IA64 and Alpha) should have autofs_wqt_t as unsigned long type. This suggestion is probably the best way to go, since Arm64 (and maybe others?) seems to have a non-working automounter. So in the long run even for other new upcoming architectures this inverted check seem to be the best solution, since it will not require them to change this #if again (unless they are 64bit only). Signed-off-by: Helge Deller Acked-by: H. Peter Anvin Acked-by: Ian Kent Acked-by: Catalin Marinas CC: James Bottomley CC: Rolf Eike Beer Signed-off-by: Greg Kroah-Hartman --- include/linux/auto_fs.h | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index da64e15..6cdabb4 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -31,25 +31,16 @@ #define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION /* - * Architectures where both 32- and 64-bit binaries can be executed - * on 64-bit kernels need this. This keeps the structure format - * uniform, and makes sure the wait_queue_token isn't too big to be - * passed back down to the kernel. - * - * This assumes that on these architectures: - * mode 32 bit 64 bit - * ------------------------- - * int 32 bit 32 bit - * long 32 bit 64 bit - * - * If so, 32-bit user-space code should be backwards compatible. + * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed + * back to the kernel via ioctl from userspace. 
On architectures where 32- and + * 64-bit userspace binaries can be executed it's important that the size of + * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we + * do not break the binary ABI interface by changing the structure size. */ - -#if defined(__sparc__) || defined(__mips__) || defined(__x86_64__) \ - || defined(__powerpc__) || defined(__s390__) -typedef unsigned int autofs_wqt_t; -#else +#if defined(__ia64__) || defined(__alpha__) /* pure 64bit architectures */ typedef unsigned long autofs_wqt_t; +#else +typedef unsigned int autofs_wqt_t; #endif /* Packet types */ -- cgit v1.1 From 87a42f27adef5e88b8907edbc168de1380e7129e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 15 Mar 2013 14:26:07 +0100 Subject: perf,x86: fix kernel crash with PEBS/BTS after suspend/resume commit 1d9d8639c063caf6efc2447f5f26aa637f844ff6 upstream. This patch fixes a kernel crash when using precise sampling (PEBS) after a suspend/resume. Turns out the CPU notifier code is not invoked on CPU0 (BP). Therefore, the DS_AREA (used by PEBS) is not restored properly by the kernel and keeps it power-on/resume value of 0 causing any PEBS measurement to crash when running on CPU0. The workaround is to add a hook in the actual resume code to restore the DS Area MSR value. It is invoked for all CPUS. So for all but CPU0, the DS_AREA will be restored twice but this is harmless. Reported-by: Linus Torvalds Signed-off-by: Stephane Eranian Signed-off-by: Linus Torvalds Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index effadd6..038ad4a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1153,6 +1153,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_task_tick(void); +extern void perf_restore_debug_store(void); #else static inline void perf_event_task_sched_in(struct task_struct *task) { } @@ -1187,6 +1188,7 @@ static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline void perf_event_task_tick(void) { } +static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) -- cgit v1.1 From 68e0bbe8b7781877de7dc96d620a4ce6af8807f9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 17 Mar 2013 15:49:10 -0700 Subject: perf,x86: fix link failure for non-Intel configs commit 6c4d3bc99b3341067775efd4d9d13cc8e655fd7c upstream. Commit 1d9d8639c063 ("perf,x86: fix kernel crash with PEBS/BTS after suspend/resume") introduces a link failure since perf_restore_debug_store() is only defined for CONFIG_CPU_SUP_INTEL: arch/x86/power/built-in.o: In function `restore_processor_state': (.text+0x45c): undefined reference to `perf_restore_debug_store' Fix it by defining the dummy function appropriately. 
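The hunks that follow (see the next commit's diff) implement a common header idiom: declare the real function only when every required config option is set, and give every other configuration an inline no-op so call sites compile and link unchanged. A renamed sketch of that shape, where CONFIG_DEMO_* and demo_restore_state() are invented names rather than kernel symbols:

/* demo_feature.h */
#ifndef DEMO_FEATURE_H
#define DEMO_FEATURE_H

#if defined(CONFIG_DEMO_CORE) && defined(CONFIG_DEMO_VENDOR)
/* The real implementation lives in a file built only for this configuration. */
extern void demo_restore_state(void);
#else
/* Everyone else gets a no-op stub, so callers need no #ifdef of their own. */
static inline void demo_restore_state(void) { }
#endif

#endif /* DEMO_FEATURE_H */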
Signed-off-by: David Rientjes
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/perf_event.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 038ad4a..67b9fbc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1153,7 +1153,6 @@ extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_task_tick(void);
-extern void perf_restore_debug_store(void);
#else
static inline void perf_event_task_sched_in(struct task_struct *task) { }
@@ -1188,6 +1187,11 @@ static inline void perf_swevent_put_recursion_context(int rctx) { }
static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event) { }
static inline void perf_event_task_tick(void) { }
+#endif
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+extern void perf_restore_debug_store(void);
+#else
static inline void perf_restore_debug_store(void) { }
#endif
-- cgit v1.1

From 1b92d599fe0f704d9981063d39339fea1f9bd092 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev"
Date: Wed, 13 Mar 2013 00:24:15 +0000
Subject: ipv4: fix definition of FIB_TABLE_HASHSZ

[ Upstream commit 5b9e12dbf92b441b37136ea71dac59f05f2673a9 ]

A long time ago, the commit

  commit 93456b6d7753def8760b423ac6b986eb9d5a4a95
  Author: Denis V. Lunev
  Date: Thu Jan 10 03:23:38 2008 -0800

    [IPV4]: Unify access to the routing tables.

gave the definition of FIB_TABLE_HASHSZ the wrong dependency: it should depend upon CONFIG_IP_MULTIPLE_TABLES (as it did in the original code), but it was made to depend on CONFIG_IP_ROUTE_MULTIPATH.

This patch returns the situation to the original state.

The problem was spotted by Tingwei Liu.

Signed-off-by: Denis V. Lunev
CC: Tingwei Liu
CC: Alexey Kuznetsov
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 include/net/ip_fib.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 10422ef..2124004 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -129,18 +129,16 @@ struct fib_result_nl {
};
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel])
-
-#define FIB_TABLE_HASHSZ 2
-
#else /* CONFIG_IP_ROUTE_MULTIPATH */
-
#define FIB_RES_NH(res) ((res).fi->fib_nh[0])
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+#ifdef CONFIG_IP_MULTIPLE_TABLES
#define FIB_TABLE_HASHSZ 256
-
-#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+#else
+#define FIB_TABLE_HASHSZ 2
+#endif
extern __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
-- cgit v1.1

From 7b7a1b8b3bd1742ca5ab259e741da0070e936db0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Fri, 15 Mar 2013 11:32:30 +0000
Subject: inet: limit length of fragment queue hash table bucket lists

[ Upstream commit 5a3da1fe9561828d0ca7eca664b16ec2b9bf0055 ]

This patch introduces a constant limit on the fragment queue hash table bucket list lengths. Currently the limit of 128 is chosen somewhat arbitrarily and just ensures that we can fill up the fragment cache with empty packets up to the default ip_frag_high_thresh limits. It should just protect against list iteration eating considerable amounts of cpu.

If we reach the maximum length in one hash bucket, a warning is printed.
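As an aside, the effect of such a cap can be seen in a small self-contained sketch (invented demo_* names, plain userspace C, not the kernel code): the bucket walk simply gives up once the chain is implausibly long, so a single lookup cannot be made arbitrarily expensive.

#include <stdio.h>
#include <stddef.h>

#define DEMO_MAXDEPTH 128	/* cap on how far one hash bucket is walked */

struct demo_frag {
	unsigned int key;
	struct demo_frag *next;
};

/* Walk one bucket, but stop (and report overflow) once the chain gets too long. */
static struct demo_frag *demo_bucket_find(struct demo_frag *bucket,
					  unsigned int key, int *overflow)
{
	struct demo_frag *f;
	int depth = 0;

	*overflow = 0;
	for (f = bucket; f != NULL; f = f->next) {
		if (f->key == key)
			return f;
		if (++depth >= DEMO_MAXDEPTH) {
			*overflow = 1;	/* the caller prints a warning here */
			return NULL;
		}
	}
	return NULL;
}

int main(void)
{
	struct demo_frag c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	int overflow;

	printf("found=%d overflow=%d\n",
	       demo_bucket_find(&a, 3, &overflow) != NULL, overflow);
	return 0;
}

In the kernel patch the limit check and the warning sit at the callers of inet_frag_find(), as described next.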
This is implemented on the caller side of inet_frag_find to distinguish between the different users of inet_fragment.c. I dropped the out of memory warning in the ipv4 fragment lookup path, because we already get a warning by the slab allocator. Cc: Eric Dumazet Cc: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_frag.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 16ff29a..b289bd2 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -33,6 +33,13 @@ struct inet_frag_queue { #define INETFRAGS_HASHSZ 64 +/* averaged: + * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / + * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or + * struct frag_queue)) + */ +#define INETFRAGS_MAXDEPTH 128 + struct inet_frags { struct hlist_head hash[INETFRAGS_HASHSZ]; rwlock_t lock; @@ -64,6 +71,8 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) __releases(&f->lock); +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) { -- cgit v1.1 From ea8d2d19ad17ceafc883b86e448a405cf7808927 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Dec 2012 16:03:20 -0800 Subject: exec: use -ELOOP for max recursion depth commit d740269867021faf4ce38a449353d2b986c34a67 upstream. To avoid an explosion of request_module calls on a chain of abusive scripts, fail maximum recursion with -ELOOP instead of -ENOEXEC. As soon as maximum recursion depth is hit, the error will fail all the way back up the chain, aborting immediately. This also has the side-effect of stopping the user's shell from attempting to reexecute the top-level file as a shell script. As seen in the dash source: if (cmd != path_bshell && errno == ENOEXEC) { *argv-- = cmd; *argv = cmd = path_bshell; goto repeat; } The above logic was designed for running scripts automatically that lacked the "#!" header, not to re-try failed recursion. On a legitimate -ENOEXEC, things continue to behave as the shell expects. Additionally, when tracking recursion, the binfmt handlers should not be involved. The recursion being tracked is the depth of calls through search_binary_handler(), so that function should be exclusively responsible for tracking the depth. 
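A toy model of that rule, in plain userspace C with invented names (it is not the kernel's binfmt machinery), shows why keeping the counter in the single resolution loop is enough: an abusive interpreter chain hits the depth limit and fails with -ELOOP, which a shell will not retry, while a normal binary is unaffected.

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define DEMO_MAX_DEPTH 4

/* Pretend "#!" lookup: which interpreter, if any, does this file request? */
static const char *demo_interpreter_of(const char *path)
{
	/* a deliberately abusive chain: the script keeps naming itself */
	return strncmp(path, "evil", 4) == 0 ? "evil" : NULL;
}

/* The depth counter lives here, in the one resolution loop, not in the handlers. */
static int demo_search_binary_handler(const char *path)
{
	int depth;

	for (depth = 0; ; depth++) {
		const char *interp = demo_interpreter_of(path);

		if (interp == NULL)
			return 0;		/* a real binary: execute it */
		if (depth >= DEMO_MAX_DEPTH)
			return -ELOOP;		/* fail the whole chain at once */
		path = interp;
	}
}

int main(void)
{
	printf("abusive chain -> %d (-ELOOP is %d)\n",
	       demo_search_binary_handler("evil"), -ELOOP);
	printf("normal binary -> %d\n", demo_search_binary_handler("ls"));
	return 0;
}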
Signed-off-by: Kees Cook Cc: halfdog Cc: P J P Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- include/linux/binfmts.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 384e37f..718eb0b 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -67,8 +67,6 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) -#define BINPRM_MAX_RECURSION 4 - /* Function parameter for binfmt->coredump */ struct coredump_params { long signr; -- cgit v1.1 From 1c190534db77a019936d95a1826a55bf34d7ed23 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 25 Nov 2012 22:24:19 -0500 Subject: signal: Define __ARCH_HAS_SA_RESTORER so we know whether to clear sa_restorer Vaguely based on upstream commit 574c4866e33d 'consolidate kernel-side struct sigaction declarations'. flush_signal_handlers() needs to know whether sigaction::sa_restorer is defined, not whether SA_RESTORER is defined. Define the __ARCH_HAS_SA_RESTORER macro to indicate this. Signed-off-by: Ben Hutchings Cc: Al Viro Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/signal.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/asm-generic/signal.h b/include/asm-generic/signal.h index 555c0ae..743f7a5 100644 --- a/include/asm-generic/signal.h +++ b/include/asm-generic/signal.h @@ -99,6 +99,10 @@ typedef unsigned long old_sigset_t; #include +#ifdef SA_RESTORER +#define __ARCH_HAS_SA_RESTORER +#endif + struct sigaction { __sighandler_t sa_handler; unsigned long sa_flags; -- cgit v1.1 From c938c22b48302eeb9a6f3cc83f223f37d98ba6f7 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 19 Mar 2013 12:36:52 +0100 Subject: NFSv4: include bitmap in nfsv4 get acl data commit bf118a342f10dafe44b14451a1392c3254629a1f upstream. The NFSv4 bitmap size is unbounded: a server can return an arbitrary sized bitmap in an FATTR4_WORD0_ACL request. Replace using the nfs4_fattr_bitmap_maxsz as a guess to the maximum bitmask returned by a server with the inclusion of the bitmap (xdr length plus bitmasks) and the acl data xdr length to the (cached) acl page data. This is a general solution to commit e5012d1f "NFSv4.1: update nfs4_fattr_bitmap_maxsz" and fixes hitting a BUG_ON in xdr_shrink_bufhead when getting ACLs. Fix a bug in decode_getacl that returned -EINVAL on ACLs > page when getxattr was called with a NULL buffer, preventing ACL > PAGE_SIZE from being retrieved. 
Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/nfs_xdr.h | 5 +++++ include/linux/sunrpc/xdr.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0012fc3..2b25494 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -588,11 +588,16 @@ struct nfs_getaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; + struct page * acl_scratch; struct nfs4_sequence_args seq_args; }; +/* getxattr ACL interface flags */ +#define NFS4_ACL_LEN_REQUEST 0x0001 /* zero length getxattr buffer */ struct nfs_getaclres { size_t acl_len; + size_t acl_data_offset; + int acl_flags; struct nfs4_sequence_res seq_res; }; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a20970e..af70af3 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -191,6 +191,8 @@ extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); +extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, + size_t len); /* * Provide some simple tools for XDR buffer overflow-checking etc. -- cgit v1.1 From 01b140abad66f022ff6dff7cc1307b07281035fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Mar 2013 12:36:53 +0100 Subject: NFSv4: Fix an Oops in the NFSv4 getacl code commit 331818f1c468a24e581aedcbe52af799366a9dfe upstream. Commit bf118a342f10dafe44b14451a1392c3254629a1f (NFSv4: include bitmap in nfsv4 get acl data) introduces the 'acl_scratch' page for the case where we may need to decode multi-page data. However it fails to take into account the fact that the variable may be NULL (for the case where we're not doing multi-page decode), and it also attaches it to the encoding xdr_stream rather than the decoding one. The immediate result is an Oops in nfs4_xdr_enc_getacl due to the call to page_address() with a NULL page pointer. Signed-off-by: Trond Myklebust Cc: Andy Adamson Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2b25494..9733df5 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -588,7 +588,6 @@ struct nfs_getaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct page * acl_scratch; struct nfs4_sequence_args seq_args; }; @@ -598,6 +597,7 @@ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; int acl_flags; + struct page * acl_scratch; struct nfs4_sequence_res seq_res; }; -- cgit v1.1 From 8868daebc1b6240d07d5c6428f8bc8631b2bed42 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 Mar 2013 12:36:55 +0100 Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings commit 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e upstream. If some vcpus are created before KVM_CREATE_IRQCHIP, then irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading to potential NULL pointer dereferences. Fix by: - ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called - ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP This is somewhat long winded because vcpu->arch.apic is created without kvm->lock held. 
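Stripped of the KVM details, the fix is an ordering rule enforced under a lock: late irqchip creation is refused once any vcpu exists, and any vcpu created after the irqchip must be compatible with it. A small pthread-based sketch of that rule, using invented demo_* types rather than the real KVM structures:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>
#include <pthread.h>

/* Invented stand-in for the VM state; not the KVM structures. */
struct demo_vm {
	pthread_mutex_t lock;
	int nr_vcpus;
	bool irqchip_in_kernel;
};

static int demo_create_irqchip(struct demo_vm *vm)
{
	int ret = 0;

	pthread_mutex_lock(&vm->lock);
	if (vm->nr_vcpus > 0)
		ret = -EINVAL;		/* too late: vcpus already exist */
	else
		vm->irqchip_in_kernel = true;
	pthread_mutex_unlock(&vm->lock);
	return ret;
}

static int demo_create_vcpu(struct demo_vm *vm, bool vcpu_has_apic)
{
	int ret = 0;

	pthread_mutex_lock(&vm->lock);
	if (vm->irqchip_in_kernel && !vcpu_has_apic)
		ret = -EINVAL;		/* vcpu is not compatible with the irqchip */
	else
		vm->nr_vcpus++;
	pthread_mutex_unlock(&vm->lock);
	return ret;
}

int main(void)
{
	struct demo_vm vm = { PTHREAD_MUTEX_INITIALIZER, 0, false };

	printf("irqchip first: %d\n", demo_create_irqchip(&vm));
	printf("vcpu without apic: %d\n", demo_create_vcpu(&vm, false));
	printf("vcpu with apic: %d\n", demo_create_vcpu(&vm, true));
	return 0;
}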
Based on earlier patch by Michael Ellerman. Signed-off-by: Michael Ellerman Signed-off-by: Avi Kivity Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/kvm_host.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82d5476..8663a26 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -736,6 +736,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } + +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); + +#else + +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } + #endif #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT -- cgit v1.1 From 0cbf0cbd285ef39202743ecfd62b4fe2dcdc81fd Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Mon, 1 Apr 2013 14:50:40 -0400 Subject: thermal: shorten too long mcast group name [ Upstream commits 73214f5d9f33b79918b1f7babddd5c8af28dd23d and f1e79e208076ffe7bad97158275f1c572c04f5c7, the latter adds an assertion to genetlink to prevent this from happening again in the future. ] The original name is too long. Signed-off-by: Masatake YAMATO Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d3ec89f..6b762c6 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -130,7 +130,7 @@ struct thermal_zone_device { /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" #define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" enum events { THERMAL_AUX0, -- cgit v1.1 From 878315cbf04dde3f93bf796f1835ae8d07604ba7 Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Mon, 18 Mar 2013 10:30:43 +0800 Subject: libata: Use integer return value for atapi_command_packet_set commit d8668fcb0b257d9fdcfbe5c172a99b8d85e1cd82 upstream. The function returns type of ATAPI drives so it should return integer value. The commit 4dce8ba94c7 (libata: Use 'bool' return value for ata_id_XXX) since v2.6.39 changed the type of return value from int to bool, the change would cause all of the ATAPI class drives to be treated as TYPE_TAPE and the max_sectors of the drives to be set to 65535 because of the commit f8d8e5799b7(libata: increase 128 KB / cmd limit for ATAPI tape drives), for the function would return true for all ATAPI class drives and the TYPE_TAPE is defined as 0x01. Signed-off-by: Shan Hai Signed-off-by: Jeff Garzik Signed-off-by: Greg Kroah-Hartman --- include/linux/ata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ata.h b/include/linux/ata.h index 32df2b6..5856c9e 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -937,7 +937,7 @@ static inline int atapi_cdb_len(const u16 *dev_id) } } -static inline bool atapi_command_packet_set(const u16 *dev_id) +static inline int atapi_command_packet_set(const u16 *dev_id) { return (dev_id[ATA_ID_CONFIG] >> 8) & 0x1f; } -- cgit v1.1 From c55f9197cfc3d968c64ed4e7762214c04090426e Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Mon, 18 Mar 2013 10:30:44 +0800 Subject: libata: Set max sector to 65535 for Slimtype DVD A DS8A8SH drive commit a32450e127fc6e5ca6d958ceb3cfea4d30a00846 upstream. 
The Slimtype DVD A DS8A8SH drive locks up when max sector is smaller than 65535, and the blow backtrace is observed on locking up: INFO: task flush-8:32:1130 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. flush-8:32 D ffffffff8180cf60 0 1130 2 0x00000000 ffff880273aef618 0000000000000046 0000000000000005 ffff880273aee000 ffff880273aee000 ffff880273aeffd8 ffff880273aee010 ffff880273aee000 ffff880273aeffd8 ffff880273aee000 ffff88026e842ea0 ffff880274a10000 Call Trace: [] schedule+0x5d/0x70 [] io_schedule+0x8c/0xd0 [] get_request+0x731/0x7d0 [] ? cfq_allow_merge+0x50/0x90 [] ? wake_up_bit+0x40/0x40 [] ? bio_attempt_back_merge+0x33/0x110 [] blk_queue_bio+0x23a/0x3f0 [] generic_make_request+0xc6/0x120 [] submit_bio+0x138/0x160 [] ? bio_alloc_bioset+0x96/0x120 [] submit_bh+0x1f1/0x220 [] __block_write_full_page+0x228/0x340 [] ? attach_nobh_buffers+0xc0/0xc0 [] ? I_BDEV+0x10/0x10 [] ? I_BDEV+0x10/0x10 [] block_write_full_page_endio+0xe6/0x100 [] block_write_full_page+0x15/0x20 [] blkdev_writepage+0x18/0x20 [] __writepage+0x17/0x40 [] write_cache_pages+0x34a/0x4a0 [] ? set_page_dirty+0x70/0x70 [] generic_writepages+0x51/0x80 [] do_writepages+0x20/0x50 [] __writeback_single_inode+0xa6/0x2b0 [] writeback_sb_inodes+0x311/0x4d0 [] __writeback_inodes_wb+0x86/0xd0 [] wb_writeback+0x1a3/0x330 [] ? _raw_spin_lock_irqsave+0x3f/0x50 [] ? get_nr_inodes+0x52/0x70 [] wb_do_writeback+0x1dc/0x260 [] ? schedule_timeout+0x204/0x240 [] bdi_writeback_thread+0x102/0x2b0 [] ? wb_do_writeback+0x260/0x260 [] kthread+0xc0/0xd0 [] ? kthread_worker_fn+0x1b0/0x1b0 [] ret_from_fork+0x7c/0xb0 [] ? kthread_worker_fn+0x1b0/0x1b0 The above trace was triggered by "dd if=/dev/zero of=/dev/sr0 bs=2048 count=32768" It was previously working by accident, since another bug introduced by 4dce8ba94c7 (libata: Use 'bool' return value for ata_id_XXX) caused all drives to use maxsect=65535. Signed-off-by: Shan Hai Signed-off-by: Jeff Garzik Signed-off-by: Greg Kroah-Hartman --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5a9926b..1cf324e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -382,6 +382,7 @@ enum { ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */ ATA_HORKAGE_DUMP_ID = (1 << 16), /* dump IDENTIFY data */ + ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.1 From 17229e4f8ef6a7cb514d7d4d67197cd6a8b06eca Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Apr 2013 10:48:33 -0700 Subject: spinlocks and preemption points need to be at least compiler barriers commit 386afc91144b36b42117b0092893f15bc8798a80 upstream. In UP and non-preempt respectively, the spinlocks and preemption disable/enable points are stubbed out entirely, because there is no regular code that can ever hit the kind of concurrency they are meant to protect against. However, while there is no regular code that can cause scheduling, we _do_ end up having some exceptional (literally!) code that can do so, and that we need to make sure does not ever get moved into the critical region by the compiler. 
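Concretely, a compiler barrier in the GCC style is an empty asm statement with a memory clobber: it emits no instructions, but forbids the compiler from caching memory values across it or from moving memory accesses past it. A hedged sketch with invented demo_ names (the kernel's own macros differ in detail):

/* Emits no code, but pins surrounding loads and stores in place. */
#define demo_barrier() __asm__ __volatile__("" : : : "memory")

/*
 * On a non-preemptible uniprocessor build these expand to no instructions at
 * all, yet they must still be compiler barriers so that something like an
 * inlined get_user() cannot be scheduled into the "critical" region.
 */
#define demo_preempt_disable()	demo_barrier()
#define demo_preempt_enable()	demo_barrier()

static int demo_protected_counter;

static void demo_bump(void)
{
	demo_preempt_disable();
	demo_protected_counter++;	/* stays between the two barriers */
	demo_preempt_enable();
}

int main(void)
{
	demo_bump();
	return demo_protected_counter - 1;
}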
In particular, get_user() and put_user() is generally implemented as inline asm statements (even if the inline asm may then make a call instruction to call out-of-line), and can obviously cause a page fault and IO as a result. If that inline asm has been scheduled into the middle of a preemption-safe (or spinlock-protected) code region, we obviously lose. Now, admittedly this is *very* unlikely to actually ever happen, and we've not seen examples of actual bugs related to this. But partly exactly because it's so hard to trigger and the resulting bug is so subtle, we should be extra careful to get this right. So make sure that even when preemption is disabled, and we don't have to generate any actual *code* to explicitly tell the system that we are in a preemption-disabled region, we need to at least tell the compiler not to move things around the critical region. This patch grew out of the same discussion that caused commits 79e5f05edcbf ("ARC: Add implicit compiler barrier to raw_local_irq* functions") and 3e2e0d2c222b ("tile: comment assumption about __insn_mtspr for ") to come about. Note for stable: use discretion when/if applying this. As mentioned, this bug may never have actually bitten anybody, and gcc may never have done the required code motion for it to possibly ever trigger in practice. Signed-off-by: Linus Torvalds Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Greg Kroah-Hartman --- include/linux/preempt.h | 22 ++++++++++++++-------- include/linux/spinlock_up.h | 29 ++++++++++++++++++----------- 2 files changed, 32 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9..d309dcb 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -82,14 +82,20 @@ do { \ #else -#define preempt_disable() do { } while (0) -#define preempt_enable_no_resched() do { } while (0) -#define preempt_enable() do { } while (0) -#define preempt_check_resched() do { } while (0) - -#define preempt_disable_notrace() do { } while (0) -#define preempt_enable_no_resched_notrace() do { } while (0) -#define preempt_enable_notrace() do { } while (0) +/* + * Even if we don't have any preemption, we need preempt disable/enable + * to be barriers, so that we don't have things like get_user/put_user + * that can cause faults and scheduling migrate into our preempt-protected + * region. + */ +#define preempt_disable() barrier() +#define preempt_enable_no_resched() barrier() +#define preempt_enable() barrier() +#define preempt_check_resched() barrier() + +#define preempt_disable_notrace() barrier() +#define preempt_enable_no_resched_notrace() barrier() +#define preempt_enable_notrace() barrier() #endif diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index a26e2fb..e2369c1 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -16,7 +16,10 @@ * In the debug case, 1 means unlocked, 0 means locked. (the values * are inverted, to catch initialization bugs) * - * No atomicity anywhere, we are on UP. + * No atomicity anywhere, we are on UP. However, we still need + * the compiler barriers, because we do not want the compiler to + * move potentially faulting instructions (notably user accesses) + * into the locked sequence, resulting in non-atomic execution. 
*/ #ifdef CONFIG_DEBUG_SPINLOCK @@ -25,6 +28,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) { lock->slock = 0; + barrier(); } static inline void @@ -32,6 +36,7 @@ arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { local_irq_save(flags); lock->slock = 0; + barrier(); } static inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -39,32 +44,34 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) char oldval = lock->slock; lock->slock = 0; + barrier(); return oldval > 0; } static inline void arch_spin_unlock(arch_spinlock_t *lock) { + barrier(); lock->slock = 1; } /* * Read-write spinlocks. No debug version. */ -#define arch_read_lock(lock) do { (void)(lock); } while (0) -#define arch_write_lock(lock) do { (void)(lock); } while (0) -#define arch_read_trylock(lock) ({ (void)(lock); 1; }) -#define arch_write_trylock(lock) ({ (void)(lock); 1; }) -#define arch_read_unlock(lock) do { (void)(lock); } while (0) -#define arch_write_unlock(lock) do { (void)(lock); } while (0) +#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched.c and kernel_lock.c: */ -# define arch_spin_lock(lock) do { (void)(lock); } while (0) -# define arch_spin_lock_flags(lock, flags) do { (void)(lock); } while (0) -# define arch_spin_unlock(lock) do { (void)(lock); } while (0) -# define arch_spin_trylock(lock) ({ (void)(lock); 1; }) +# define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) +# define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) +# define arch_spin_unlock(lock) do { barrier(); (void)(lock); } while (0) +# define arch_spin_trylock(lock) ({ barrier(); (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ #define arch_spin_is_contended(lock) (((void)(lock), 0)) -- cgit v1.1 From d715cdddb8cdf1c17bf1c5ff8fcc9852cd6ba79e Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Fri, 29 Mar 2013 09:35:21 -0700 Subject: KVM: Allow cross page reads and writes from cached translations. commit 8f964525a121f2ff2df948dac908dcc65be21b5b upstream. This patch adds support for kvm_gfn_to_hva_cache_init functions for reads and writes that will cross a page. If the range falls within the same memslot, then this will be a fast operation. If the range is split between two memslots, then the slower kvm_read_guest and kvm_write_guest are used. Tested: Test against kvm_clock unit tests. 
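The arithmetic at the heart of the fix is just splitting a copy at a page boundary. A simplified userspace sketch with invented demo_* names; the real code additionally checks whether both pages sit in the same memslot and otherwise falls back to the slower kvm_read_guest/kvm_write_guest paths, as described above:

#include <stdio.h>
#include <stddef.h>

#define DEMO_PAGE_SIZE 4096u

/* Stand-in for the per-page slow path; a real implementation would copy the bytes. */
static void demo_write_page(unsigned long gpa, const void *data, size_t len)
{
	(void)data;
	printf("  write %zu bytes at 0x%lx\n", len, gpa);
}

/*
 * Copy 'len' bytes to a guest-physical address, splitting the copy in two
 * when the range straddles a page boundary (the case that used to break).
 */
static void demo_cached_write(unsigned long gpa, const void *data, size_t len)
{
	size_t first = DEMO_PAGE_SIZE - (gpa & (DEMO_PAGE_SIZE - 1));

	if (first >= len) {
		demo_write_page(gpa, data, len);	/* fast path: one page */
		return;
	}
	demo_write_page(gpa, data, first);		/* tail of the first page */
	demo_write_page(gpa + first, (const char *)data + first, len - first);
}

int main(void)
{
	char buf[32] = "spans two pages";

	demo_cached_write(0x1ff8, buf, sizeof(buf));	/* crosses 0x2000 */
	return 0;
}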
Signed-off-by: Andrew Honig Signed-off-by: Gleb Natapov Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- include/linux/kvm_host.h | 2 +- include/linux/kvm_types.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8663a26..8cd0f20 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -388,7 +388,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - gpa_t gpa); + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index fa7cc72..b0bcce0 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -71,6 +71,7 @@ struct gfn_to_hva_cache { u64 generation; gpa_t gpa; unsigned long hva; + unsigned long len; struct kvm_memory_slot *memslot; }; -- cgit v1.1 From d1a01d18320e37367e23f006f0dfbd74ff32de68 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Apr 2013 13:45:37 -0700 Subject: vm: add vm_iomap_memory() helper function commit b4cbb197c7e7a68dbad0d491242e3ca67420c13e upstream. Various drivers end up replicating the code to mmap() their memory buffers into user space, and our core memory remapping function may be very flexible but it is unnecessarily complicated for the common cases to use. Our internal VM uses pfn's ("page frame numbers") which simplifies things for the VM, and allows us to pass physical addresses around in a denser and more efficient format than passing a "phys_addr_t" around, and having to shift it up and down by the page size. But it just means that drivers end up doing that shifting instead at the interface level. It also means that drivers end up mucking around with internal VM things like the vma details (vm_pgoff, vm_start/end) way more than they really need to. So this just exports a function to map a certain physical memory range into user space (using a phys_addr_t based interface that is much more natural for a driver) and hides all the complexity from the driver. Some drivers will still end up tweaking the vm_page_prot details for things like prefetching or cacheability etc, but that's actually relevant to the driver, rather than caring about what the page offset of the mapping is into the particular IO memory region. Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 18eea05..f6e0b4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1542,6 +1542,8 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); + struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); -- cgit v1.1 From d569e833b770b21d29147c1ed937ab3882647252 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 2 Apr 2013 17:10:07 -0400 Subject: net: count hw_addr syncs so that unsync works properly. 
[ Upstream commit 4543fbefe6e06a9e40d9f2b28d688393a299f079 ] A few drivers use dev_uc_sync/unsync to synchronize the address lists from master down to slave/lower devices. In some cases (bond/team) a single address list is synched down to multiple devices. At the time of unsync, we have a leak in these lower devices, because "synced" is treated as a boolean and the address will not be unsynced for anything after the first device/call. Treat "synced" as a count (same as refcount) and allow all unsync calls to work. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c037215..02f887a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -231,9 +231,9 @@ struct netdev_hw_addr { #define NETDEV_HW_ADDR_T_SLAVE 3 #define NETDEV_HW_ADDR_T_UNICAST 4 #define NETDEV_HW_ADDR_T_MULTICAST 5 - bool synced; bool global_use; int refcount; + int synced; struct rcu_head rcu_head; }; -- cgit v1.1 From a57d91ae48c1bca556dcde0d0a6273f7d8fabe1e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 5 Apr 2013 20:42:05 +0200 Subject: netfilter: don't reset nf_trace in nf_reset() [ Upstream commit 124dff01afbdbff251f0385beca84ba1b9adda68 ] Commit 130549fe ("netfilter: reset nf_trace in nf_reset") added code to reset nf_trace in nf_reset(). This is wrong and unnecessary. nf_reset() is used in the following cases: - when passing packets up the the socket layer, at which point we want to release all netfilter references that might keep modules pinned while the packet is queued. nf_trace doesn't matter anymore at this point. - when encapsulating or decapsulating IPsec packets. We want to continue tracing these packets after IPsec processing. - when passing packets through virtual network devices. Only devices on that encapsulate in IPv4/v6 matter since otherwise nf_trace is not used anymore. Its not entirely clear whether those packets should be traced after that, however we've always done that. - when passing packets through virtual network devices that make the packet cross network namespace boundaries. This is the only cases where we clearly want to reset nf_trace and is also what the original patch intended to fix. Add a new function nf_reset_trace() and use it in dev_forward_skb() to fix this properly. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 37b643b..7b547c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2135,6 +2135,14 @@ static inline void nf_reset(struct sk_buff *skb) #endif } +static inline void nf_reset_trace(struct sk_buff *skb) +{ +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + skb->nf_trace = 0; +#endif +} + /* Note: This doesn't put any conntrack and bridge info in dst. 
*/ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) { -- cgit v1.1 From 73d2de1ad017f674ec21e57405e47028dbc884bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Apr 2013 15:32:32 +0000 Subject: net: fix incorrect credentials passing [ Upstream commit 83f1b4ba917db5dc5a061a44b3403ddb6e783494 ] Commit 257b5358b32f ("scm: Capture the full credentials of the scm sender") changed the credentials passing code to pass in the effective uid/gid instead of the real uid/gid. Obviously this doesn't matter most of the time (since normally they are the same), but it results in differences for suid binaries when the wrong uid/gid ends up being used. This just undoes that (presumably unintentional) part of the commit. Reported-by: Andy Lutomirski Cc: Eric W. Biederman Cc: Serge E. Hallyn Cc: David S. Miller Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/socket.h | 3 ++- include/net/scm.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 635c213..2417952 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -312,7 +312,8 @@ struct ucred { /* IPX options */ #define IPX_TYPE 1 -extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred); +extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred, + bool use_effective); extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, diff --git a/include/net/scm.h b/include/net/scm.h index 745460f..820c1b3 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -50,7 +50,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, { scm->pid = get_pid(pid); scm->cred = get_cred(cred); - cred_to_ucred(pid, cred, &scm->creds); + cred_to_ucred(pid, cred, &scm->creds, false); } static __inline__ void scm_destroy_cred(struct scm_cookie *scm) -- cgit v1.1 From 9b2bdb66b65fcbdd4f3a3d08c28e4c46b4a59364 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 30 Apr 2013 19:15:54 -0700 Subject: ipc: sysv shared memory limited to 8TiB commit d69f3bad4675ac519d41ca2b11e1c00ca115cecd upstream. Trying to run an application which was trying to put data into half of memory using shmget(), we found that having a shmall value below 8EiB-8TiB would prevent us from using anything more than 8TiB. By setting kernel.shmall greater than 8EiB-8TiB would make the job work. In the newseg() function, ns->shm_tot which, at 8TiB is INT_MAX. ipc/shm.c: 458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params) 459 { ... 465 int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; ... 
474 if (ns->shm_tot + numpages > ns->shm_ctlall) 475 return -ENOSPC; [akpm@linux-foundation.org: make ipc/shm.c:newseg()'s numpages size_t, not int] Signed-off-by: Robin Holt Reported-by: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/ipc_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index a6d1655..f1605b5 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -42,8 +42,8 @@ struct ipc_namespace { size_t shm_ctlmax; size_t shm_ctlall; + unsigned long shm_tot; int shm_ctlmni; - int shm_tot; struct notifier_block ipcns_nb; -- cgit v1.1 From a0464b18acc3f17ff86aa25df34c5066a21c8291 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Apr 2013 18:34:55 -0700 Subject: tcp: force a dst refcount when prequeue packet [ Upstream commit 093162553c33e9479283e107b4431378271c735d ] Before escaping RCU protected section and adding packet into prequeue, make sure the dst is refcounted. Reported-by: Mike Galbraith Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b28a49f..4881cb6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -902,6 +902,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (sysctl_tcp_low_latency || !tp->ucopy.task) return 0; + skb_dst_force(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; if (tp->ucopy.memory > sk->sk_rcvbuf) { -- cgit v1.1 From 1d81283ce68fb5f6841aeea620a133e5f9707e33 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 8 May 2013 09:45:47 +0000 Subject: if_cablemodem.h: Add parenthesis around ioctl macros [ Upstream commit 4f924b2aa4d3cb30f07e57d6b608838edcbc0d88 ] Protect the SIOCGCM* ioctl macros with parenthesis. Reported-by: Paul Wouters Signed-off-by: Josh Boyer Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/if_cablemodem.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/if_cablemodem.h b/include/linux/if_cablemodem.h index 9ca1007..ee6b3c4 100644 --- a/include/linux/if_cablemodem.h +++ b/include/linux/if_cablemodem.h @@ -12,11 +12,11 @@ */ /* some useful defines for sb1000.c e cmconfig.c - fv */ -#define SIOCGCMSTATS SIOCDEVPRIVATE+0 /* get cable modem stats */ -#define SIOCGCMFIRMWARE SIOCDEVPRIVATE+1 /* get cm firmware version */ -#define SIOCGCMFREQUENCY SIOCDEVPRIVATE+2 /* get cable modem frequency */ -#define SIOCSCMFREQUENCY SIOCDEVPRIVATE+3 /* set cable modem frequency */ -#define SIOCGCMPIDS SIOCDEVPRIVATE+4 /* get cable modem PIDs */ -#define SIOCSCMPIDS SIOCDEVPRIVATE+5 /* set cable modem PIDs */ +#define SIOCGCMSTATS (SIOCDEVPRIVATE+0) /* get cable modem stats */ +#define SIOCGCMFIRMWARE (SIOCDEVPRIVATE+1) /* get cm firmware version */ +#define SIOCGCMFREQUENCY (SIOCDEVPRIVATE+2) /* get cable modem frequency */ +#define SIOCSCMFREQUENCY (SIOCDEVPRIVATE+3) /* set cable modem frequency */ +#define SIOCGCMPIDS (SIOCDEVPRIVATE+4) /* get cable modem PIDs */ +#define SIOCSCMPIDS (SIOCDEVPRIVATE+5) /* set cable modem PIDs */ #endif -- cgit v1.1 From d5bf240fa193989d605a715bda7cb3283b1abc89 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 9 May 2013 04:23:40 +0000 Subject: macvlan: fix passthru mode race between dev removal and rx path [ Upstream commit 233c7df0821c4190e2d3f4be0f2ca0ab40a5ed8c, note that I had to add list_first_or_null_rcu to rculist.h in order to accomodate this fix. ] Currently, if macvlan in passthru mode is created and data are rxed and you remove this device, following panic happens: NULL pointer dereference at 0000000000000198 IP: [] macvlan_handle_frame+0x153/0x1f7 [macvlan] I'm using following script to trigger this: I run this script while "ping -f" is running on another machine to send packets to e1 rx. Reason of the panic is that list_first_entry() is blindly called in macvlan_handle_frame() even if the list was empty. vlan is set to incorrect pointer which leads to the crash. I'm fixing this by protecting port->vlans list by rcu and by preventing from getting incorrect pointer in case the list is empty. Introduced by: commit eb06acdc85585f2 "macvlan: Introduce 'passthru' mode to takeover the underlying device" Signed-off-by: Jiri Pirko Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/rculist.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e3beb31..c30ffd8 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -242,6 +242,23 @@ static inline void list_splice_init_rcu(struct list_head *list, list_entry_rcu((ptr)->next, type, member) /** + * list_first_or_null_rcu - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_first_or_null_rcu(ptr, type, member) \ + ({struct list_head *__ptr = (ptr); \ + struct list_head __rcu *__next = list_next_rcu(__ptr); \ + likely(__ptr != __next) ? 
container_of(__next, type, member) : NULL; \ + }) + +/** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. -- cgit v1.1 From 1e74f2ea952f201c5ee5edce74daab21aea89b31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 May 2013 10:28:16 +0000 Subject: ipv6: do not clear pinet6 field [ Upstream commit f77d602124d865c38705df7fa25c03de9c284ad2 ] We have seen multiple NULL dereferences in __inet6_lookup_established() After analysis, I found that inet6_sk() could be NULL while the check for sk_family == AF_INET6 was true. Bug was added in linux-2.6.29 when RCU lookups were introduced in UDP and TCP stacks. Once an IPv6 socket, using SLAB_DESTROY_BY_RCU is inserted in a hash table, we no longer can clear pinet6 field. This patch extends logic used in commit fcbdf09d9652c891 ("net: fix nulls list corruptions in sk_prot_alloc") TCP/UDP/UDPLite IPv6 protocols provide their own .clear_sk() method to make sure we do not clear pinet6 field. At socket clone phase, we do not really care, as cloning the parent (non NULL) pinet6 is not adding a fatal race. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/sock.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b2deeab..b6abd4f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -721,6 +721,18 @@ struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; +/* + * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * un-modified. Special care is taken when initializing object to zero. + */ +static inline void sk_prot_clear_nulls(struct sock *sk, int size) +{ + if (offsetof(struct sock, sk_node.next) != 0) + memset(sk, 0, offsetof(struct sock, sk_node.next)); + memset(&sk->sk_node.pprev, 0, + size - offsetof(struct sock, sk_node.pprev)); +} + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface * transport -> network interface is defined by struct inet_proto -- cgit v1.1 From 26deb18bd5aa6e3d7099b291038fef47b31cbf69 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2013 15:55:09 -0700 Subject: wait: fix false timeouts when using wait_event_timeout() commit 4c663cfc523a88d97a8309b04a089c27dc57fd7e upstream. Many callers of the wait_event_timeout() and wait_event_interruptible_timeout() expect that the return value will be positive if the specified condition becomes true before the timeout elapses. However, at the moment this isn't guaranteed. If the wake-up handler is delayed enough, the time remaining until timeout will be calculated as 0 - and passed back as a return value - even if the condition became true before the timeout has passed. Fix this by returning at least 1 if the condition becomes true. This semantic is in line with what wait_for_condition_timeout() does; see commit bb10ed09 ("sched: fix wait_for_completion_timeout() spurious failure under heavy load"). Daniel said "We have 3 instances of this bug in drm/i915. One case even where we switch between the interruptible and not interruptible wait_event_timeout variants, foolishly presuming they have the same semantics. I very much like this." One such bug is reported at https://bugs.freedesktop.org/show_bug.cgi?id=64133 Signed-off-by: Imre Deak Acked-by: Daniel Vetter Acked-by: David Howells Acked-by: Jens Axboe Cc: "Paul E. 
McKenney" Cc: Dave Jones Cc: Lukas Czerner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/wait.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 3efc9f3..bea7ad5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -233,6 +233,8 @@ do { \ if (!ret) \ break; \ } \ + if (!ret && (condition)) \ + ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -249,8 +251,9 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, and the remaining - * jiffies if the condition evaluated to true before the timeout elapsed. + * The function returns 0 if the @timeout elapsed, or the remaining + * jiffies (at least 1) if the @condition evaluated to %true before + * the @timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) \ ({ \ @@ -318,6 +321,8 @@ do { \ ret = -ERESTARTSYS; \ break; \ } \ + if (!ret && (condition)) \ + ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -334,9 +339,10 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it - * was interrupted by a signal, and the remaining jiffies otherwise - * if the condition evaluated to true before the timeout elapsed. + * Returns: + * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by + * a signal, or the remaining jiffies (at least 1) if the @condition + * evaluated to %true before the @timeout elapsed. */ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ -- cgit v1.1 From 79848ba66d91e0c171ff203363e0c96629279c15 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Jun 2013 14:05:04 -0700 Subject: mm: migration: add migrate_entry_wait_huge() commit 30dad30922ccc733cfdbfe232090cf674dc374dc upstream. When we have a page fault for the address which is backed by a hugepage under migration, the kernel can't wait correctly and do busy looping on hugepage fault until the migration finishes. As a result, users who try to kick hugepage migration (via soft offlining, for example) occasionally experience long delay or soft lockup. This is because pte_offset_map_lock() can't get a correct migration entry or a correct page table lock for hugepage. This patch introduces migration_entry_wait_huge() to solve this. 
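As a rough illustration only (the real caller lives in mm/hugetlb.c; the wrapper function below is invented for this sketch), a hugetlb fault path can use the new helper to sleep on the huge mapping's page table lock instead of spinning:

    /*
     * Sketch, not the upstream caller: detect a migration entry behind a
     * huge pte and wait for the migration to finish before retrying the
     * fault. Passing the pte_t * lets the helper take the page table
     * lock that actually covers the huge mapping.
     */
    static int wait_if_hugepage_under_migration(struct mm_struct *mm,
                                                pte_t *ptep)
    {
            pte_t entry = huge_ptep_get(ptep);

            if (!pte_present(entry) &&
                is_migration_entry(pte_to_swp_entry(entry))) {
                    migration_entry_wait_huge(mm, ptep);
                    return 1;       /* caller retries the fault */
            }
            return 0;
    }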
Signed-off-by: Naoya Horiguchi Reviewed-by: Rik van Riel Reviewed-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Andi Kleen Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/swapops.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cd42e30..96c7110 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -113,6 +113,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -124,6 +125,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } +static inline void migration_entry_wait_huge(struct mm_struct *mm, + pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; -- cgit v1.1 From e1b796f9408a33d18709e9fdbf18ce91dfede962 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 May 2013 14:07:44 -0700 Subject: net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg [ Upstream commits 1be374a0518a288147c6a7398792583200a67261 and a7526eb5d06b0084ef12d7b168d008fcf516caab ] MSG_CMSG_COMPAT is (AFAIK) not intended to be part of the API -- it's a hack that steals a bit to indicate to other networking code that a compat entry was used. So don't allow it from a non-compat syscall. This prevents an oops when running this code: int main() { int s; struct sockaddr_in addr; struct msghdr *hdr; char *highpage = mmap((void*)(TASK_SIZE_MAX - 4096), 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); if (highpage == MAP_FAILED) err(1, "mmap"); s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); if (s == -1) err(1, "socket"); addr.sin_family = AF_INET; addr.sin_port = htons(1); addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); if (connect(s, (struct sockaddr*)&addr, sizeof(addr)) != 0) err(1, "connect"); void *evil = highpage + 4096 - COMPAT_MSGHDR_SIZE; printf("Evil address is %p\n", evil); if (syscall(__NR_sendmmsg, s, evil, 1, MSG_CMSG_COMPAT) < 0) err(1, "sendmmsg"); return 0; } Cc: David S. Miller Signed-off-by: Andy Lutomirski Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/socket.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 2417952..50b2530 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -332,6 +332,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); struct timespec; +/* The __sys_...msg variants allow MSG_CMSG_COMPAT */ +extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags); +extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec *timeout); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, -- cgit v1.1 From 64274c35beebe1be22650a9353c0c33a7b8b723c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 May 2013 09:06:27 +0000 Subject: net: force a reload of first item in hlist_nulls_for_each_entry_rcu [ Upstream commit c87a124a5d5e8cf8e21c4363c3372bcaf53ea190 ] Roman Gushchin discovered that udp4_lib_lookup2() was not reloading first item in the rcu protected list, in case the loop was restarted. This produced soft lockups as in https://lkml.org/lkml/2013/4/16/37 rcu_dereference(X)/ACCESS_ONCE(X) seem to not work as intended if X is ptr->field : In some cases, gcc caches the value or ptr->field in a register. Use a barrier() to disallow such caching, as documented in Documentation/atomic_ops.txt line 114 Thanks a lot to Roman for providing analysis and numerous patches. Diagnosed-by: Roman Gushchin Signed-off-by: Eric Dumazet Reported-by: Boris Zhmurov Signed-off-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/rculist_nulls.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 2ae1371..1c33dd7 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -105,9 +105,14 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @head: the head for your list. * @member: the name of the hlist_nulls_node within the struct. * + * The barrier() is needed to make sure compiler doesn't cache first element [1], + * as this loop can be restarted [2] + * [1] Documentation/atomic_ops.txt around line 114 + * [2] Documentation/RCU/rculist_nulls.txt around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + for (({barrier();}), \ + pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) -- cgit v1.1 From e533fb01a23b2c572b11f5357c330daba10039a9 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 29 Jun 2013 00:15:51 +0800 Subject: net: Swap ver and type in pppoe_hdr [ Upstream commit b1a5a34bd0b8767ea689e68f8ea513e9710b671e ] Ver and type in pppoe_hdr should be swapped as defined by RFC2516 section-4. Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/if_pppox.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 397921b..3c7d8ba 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -127,11 +127,11 @@ struct pppoe_tag { struct pppoe_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 ver : 4; __u8 type : 4; + __u8 ver : 4; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 type : 4; __u8 ver : 4; + __u8 type : 4; #else #error "Please fix " #endif -- cgit v1.1 From 52ef39eeff06aecc56266902bba6bf28891cabd3 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Sat, 29 Jun 2013 21:30:49 +0800 Subject: ipv6,mcast: always hold idev->lock before mca_lock [ Upstream commit 8965779d2c0e6ab246c82a405236b1fb2adae6b2, with some bits from commit b7b1bfce0bb68bd8f6e62a28295922785cc63781 ("ipv6: split duplicate address detection and router solicitation timer") to get the __ipv6_get_lladdr() used by this patch. ] dingtianhong reported the following deadlock detected by lockdep: ====================================================== [ INFO: possible circular locking dependency detected ] 3.4.24.05-0.1-default #1 Not tainted ------------------------------------------------------- ksoftirqd/0/3 is trying to acquire lock: (&ndev->lock){+.+...}, at: [] ipv6_get_lladdr+0x74/0x120 but task is already holding lock: (&mc->mca_lock){+.+...}, at: [] mld_send_report+0x40/0x150 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mc->mca_lock){+.+...}: [] validate_chain+0x637/0x730 [] __lock_acquire+0x2f7/0x500 [] lock_acquire+0x114/0x150 [] rt_spin_lock+0x4a/0x60 [] igmp6_group_added+0x3b/0x120 [] ipv6_mc_up+0x38/0x60 [] ipv6_find_idev+0x3d/0x80 [] addrconf_notify+0x3d5/0x4b0 [] notifier_call_chain+0x3f/0x80 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] __dev_notify_flags+0x34/0x80 [] dev_change_flags+0x40/0x70 [] do_setlink+0x237/0x8a0 [] rtnl_newlink+0x3ec/0x600 [] rtnetlink_rcv_msg+0x160/0x310 [] netlink_rcv_skb+0x89/0xb0 [] rtnetlink_rcv+0x27/0x40 [] netlink_unicast+0x140/0x180 [] netlink_sendmsg+0x33e/0x380 [] sock_sendmsg+0x112/0x130 [] __sys_sendmsg+0x44e/0x460 [] sys_sendmsg+0x44/0x70 [] system_call_fastpath+0x16/0x1b -> #0 (&ndev->lock){+.+...}: [] check_prev_add+0x3de/0x440 [] validate_chain+0x637/0x730 [] __lock_acquire+0x2f7/0x500 [] lock_acquire+0x114/0x150 [] rt_read_lock+0x42/0x60 [] ipv6_get_lladdr+0x74/0x120 [] mld_newpack+0xb6/0x160 [] add_grhead+0xab/0xc0 [] add_grec+0x3ab/0x460 [] mld_send_report+0x5a/0x150 [] igmp6_timer_handler+0x4e/0xb0 [] call_timer_fn+0xca/0x1d0 [] run_timer_softirq+0x1df/0x2e0 [] handle_pending_softirqs+0xf7/0x1f0 [] __do_softirq_common+0x7b/0xf0 [] __thread_do_softirq+0x1af/0x210 [] run_ksoftirqd+0xe1/0x1f0 [] kthread+0xae/0xc0 [] kernel_thread_helper+0x4/0x10 actually we can just hold idev->lock before taking pmc->mca_lock, and avoid taking idev->lock again when iterating idev->addr_list, since the upper callers of mld_newpack() already take read_lock_bh(&idev->lock). Reported-by: dingtianhong Cc: dingtianhong Cc: Hideaki YOSHIFUJI Cc: David S. Miller Cc: Hannes Frederic Sowa Tested-by: Ding Tianhong Tested-by: Chen Weilong Signed-off-by: Cong Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/net/addrconf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 582e4ae..561fd2a 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -81,6 +81,9 @@ extern int ipv6_dev_get_saddr(struct net *net, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); +extern int __ipv6_get_lladdr(struct inet6_dev *idev, + struct in6_addr *addr, + unsigned char banned_flags); extern int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, unsigned char banned_flags); -- cgit v1.1 From 639e5920a9ae14b1eefc44a8740f5d0f816adb9a Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 1 Jul 2013 20:21:30 +0200 Subject: ipv6: call udp_push_pending_frames when uncorking a socket with AF_INET pending data [ Upstream commit 8822b64a0fa64a5dd1dfcf837c5b0be83f8c05d1 ] We accidentally call down to ip6_push_pending_frames when uncorking pending AF_INET data on a ipv6 socket. This results in the following splat (from Dave Jones): skbuff: skb_under_panic: text:ffffffff816765f6 len:48 put:40 head:ffff88013deb6df0 data:ffff88013deb6dec tail:0x2c end:0xc0 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:126! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: dccp_ipv4 dccp 8021q garp bridge stp dlci mpoa snd_seq_dummy sctp fuse hidp tun bnep nfnetlink scsi_transport_iscsi rfcomm can_raw can_bcm af_802154 appletalk caif_socket can caif ipt_ULOG x25 rose af_key pppoe pppox ipx phonet irda llc2 ppp_generic slhc p8023 psnap p8022 llc crc_ccitt atm bluetooth +netrom ax25 nfc rfkill rds af_rxrpc coretemp hwmon kvm_intel kvm crc32c_intel snd_hda_codec_realtek ghash_clmulni_intel microcode pcspkr snd_hda_codec_hdmi snd_hda_intel snd_hda_codec snd_hwdep usb_debug snd_seq snd_seq_device snd_pcm e1000e snd_page_alloc snd_timer ptp snd pps_core soundcore xfs libcrc32c CPU: 2 PID: 8095 Comm: trinity-child2 Not tainted 3.10.0-rc7+ #37 task: ffff8801f52c2520 ti: ffff8801e6430000 task.ti: ffff8801e6430000 RIP: 0010:[] [] skb_panic+0x63/0x65 RSP: 0018:ffff8801e6431de8 EFLAGS: 00010282 RAX: 0000000000000086 RBX: ffff8802353d3cc0 RCX: 0000000000000006 RDX: 0000000000003b90 RSI: ffff8801f52c2ca0 RDI: ffff8801f52c2520 RBP: ffff8801e6431e08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff88022ea0c800 R13: ffff88022ea0cdf8 R14: ffff8802353ecb40 R15: ffffffff81cc7800 FS: 00007f5720a10740(0000) GS:ffff880244c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000005862000 CR3: 000000022843c000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Stack: ffff88013deb6dec 000000000000002c 00000000000000c0 ffffffff81a3f6e4 ffff8801e6431e18 ffffffff8159a9aa ffff8801e6431e90 ffffffff816765f6 ffffffff810b756b 0000000700000002 ffff8801e6431e40 0000fea9292aa8c0 Call Trace: [] skb_push+0x3a/0x40 [] ip6_push_pending_frames+0x1f6/0x4d0 [] ? mark_held_locks+0xbb/0x140 [] udp_v6_push_pending_frames+0x2b9/0x3d0 [] ? udplite_getfrag+0x20/0x20 [] udp_lib_setsockopt+0x1aa/0x1f0 [] ? 
fget_light+0x387/0x4f0 [] udpv6_setsockopt+0x34/0x40 [] sock_common_setsockopt+0x14/0x20 [] SyS_setsockopt+0x71/0xd0 [] tracesys+0xdd/0xe2 Code: 00 00 48 89 44 24 10 8b 87 d8 00 00 00 48 89 44 24 08 48 8b 87 e8 00 00 00 48 c7 c7 c0 04 aa 81 48 89 04 24 31 c0 e8 e1 7e ff ff <0f> 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 48 89 e5 0f 0b 55 RIP [] skb_panic+0x63/0x65 RSP This patch adds a check if the pending data is of address family AF_INET and directly calls udp_push_ending_frames from udp_v6_push_pending_frames if that is the case. This bug was found by Dave Jones with trinity. (Also move the initialization of fl6 below the AF_INET check, even if not strictly necessary.) Signed-off-by: Hannes Frederic Sowa Cc: Dave Jones Cc: YOSHIFUJI Hideaki Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/udp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 67ea6fc..e723c9d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -180,6 +180,7 @@ extern int udp_get_port(struct sock *sk, unsigned short snum, extern void udp_err(struct sk_buff *, u32); extern int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); +extern int udp_push_pending_frames(struct sock *sk); extern void udp_flush_pending_frames(struct sock *sk); extern int udp_rcv(struct sk_buff *skb); extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); -- cgit v1.1 From 8ff3d73133cb5a8a3514cc184152c553637e74a8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 4 Aug 2013 15:43:40 +0800 Subject: virtio: support unlocked queue poll commit cc229884d3f77ec3b1240e467e0236c3e0647c0c upstream. This adds a way to check ring empty state after enable_cb outside any locks. Will be used by virtio_net. Note: there's room for more optimization: caller is likely to have a memory barrier already, which means we might be able to get rid of a barrier here. Deferring this optimization until we do some benchmarking. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller [wg: Backported to 3.2] Signed-off-by: Wolfram Gloger Signed-off-by: Greg Kroah-Hartman --- include/linux/virtio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7108857..a39e962 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -93,6 +93,10 @@ void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); +unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq); + +bool virtqueue_poll(struct virtqueue *vq, unsigned); + bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); -- cgit v1.1 From c63eea737793f3562cc62d1395b6b1d325804d27 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Fri, 2 Aug 2013 21:16:43 +0400 Subject: tracing: Fix fields of struct trace_iterator that are zeroed by mistake commit ed5467da0e369e65b247b99eb6403cb79172bcda upstream. tracing_read_pipe zeros all fields bellow "seq". The declaration contains a comment about that, but it doesn't help. The first field is "snapshot", it's true when current open file is snapshot. Looks obvious, that it should not be zeroed. The second field is "started". It was converted from cpumask_t to cpumask_var_t (v2.6.28-4983-g4462344), in other words it was converted from cpumask to pointer on cpumask. 
Currently the reference on "started" memory is lost after the first read from tracing_read_pipe and a proper object will never be freed. The "started" is never dereferenced for trace_pipe, because trace_pipe can't have the TRACE_FILE_ANNOTATE options. Link: http://lkml.kernel.org/r/1375463803-3085183-1-git-send-email-avagin@openvz.org Signed-off-by: Andrew Vagin Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- include/linux/ftrace_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 59d3ef1..217b3c2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -71,6 +71,8 @@ struct trace_iterator { /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; + cpumask_var_t started; + /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; @@ -82,7 +84,7 @@ struct trace_iterator { loff_t pos; long idx; - cpumask_var_t started; + /* All new field here will be zeroed out in pipe_read */ }; -- cgit v1.1 From 481101ccce3d7cb2247ff29fc7b8ae1916deeae4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 25 Jun 2013 21:19:31 +0800 Subject: futex: Take hugepages into account when generating futex_key commit 13d60f4b6ab5b702dc8d2ee20999f98a93728aec upstream. The futex_keys of process shared futexes are generated from the page offset, the mapping host and the mapping index of the futex user space address. This should result in an unique identifier for each futex. Though this is not true when futexes are located in different subpages of an hugepage. The reason is, that the mapping index for all those futexes evaluates to the index of the base page of the hugetlbfs mapping. So a futex at offset 0 of the hugepage mapping and another one at offset PAGE_SIZE of the same hugepage mapping have identical futex_keys. This happens because the futex code blindly uses page->index. Steps to reproduce the bug: 1. Map a file from hugetlbfs. Initialize pthread_mutex1 at offset 0 and pthread_mutex2 at offset PAGE_SIZE of the hugetlbfs mapping. The mutexes must be initialized as PTHREAD_PROCESS_SHARED because PTHREAD_PROCESS_PRIVATE mutexes are not affected by this issue as their keys solely depend on the user space address. 2. Lock mutex1 and mutex2 3. Create thread1 and in the thread function lock mutex1, which results in thread1 blocking on the locked mutex1. 4. Create thread2 and in the thread function lock mutex2, which results in thread2 blocking on the locked mutex2. 5. Unlock mutex2. Despite the fact that mutex2 got unlocked, thread2 still blocks on mutex2 because the futex_key points to mutex1. To solve this issue we need to take the normal page index of the page which contains the futex into account, if the futex is in an hugetlbfs mapping. In other words, we calculate the normal page mapping index of the subpage in the hugetlbfs mapping. Mappings which are not based on hugetlbfs are not affected and still use page->index. Thanks to Mel Gorman who provided a patch for adding proper evaluation functions to the hugetlbfs code to avoid exposing hugetlbfs specific details to the futex code. 
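The five steps above translate into a small userspace reproducer. This is a hedged sketch only (the hugetlbfs path, the 2 MiB mapping size and the availability of reserved huge pages are assumptions; error handling is trimmed); build with -pthread:

    #include <pthread.h>
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    static void *blocker(void *arg)
    {
            pthread_mutex_lock(arg);        /* should block, then wake on unlock */
            return NULL;
    }

    int main(void)
    {
            pthread_mutexattr_t attr;
            pthread_t t1, t2;
            int fd = open("/mnt/huge/futex-test", O_CREAT | O_RDWR, 0600);
            char *map = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            pthread_mutex_t *m1 = (pthread_mutex_t *)map;                   /* offset 0 */
            pthread_mutex_t *m2 = (pthread_mutex_t *)(map + sysconf(_SC_PAGESIZE)); /* offset PAGE_SIZE */

            pthread_mutexattr_init(&attr);
            pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
            pthread_mutex_init(m1, &attr);
            pthread_mutex_init(m2, &attr);

            pthread_mutex_lock(m1);
            pthread_mutex_lock(m2);
            pthread_create(&t1, NULL, blocker, m1);
            pthread_create(&t2, NULL, blocker, m2);
            sleep(1);
            pthread_mutex_unlock(m2);
            /* On an affected kernel t2 never wakes up: both futexes hash to
             * the same futex_key because page->index of the head page is used. */
            pthread_join(t2, NULL);
            puts("t2 woke up - kernel not affected");
            return 0;
    }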
[ tglx: Massaged changelog ] Signed-off-by: Zhang Yi Reviewed-by: Jiang Biao Tested-by: Ma Chenggong Reviewed-by: 'Mel Gorman' Acked-by: 'Darren Hart' Cc: 'Peter Zijlstra' Link: http://lkml.kernel.org/r/000101ce71a6%24a83c5880%24f8b50980%24@com Signed-off-by: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Greg Kroah-Hartman --- include/linux/hugetlb.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 59225ef..db70f1b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -313,6 +313,17 @@ static inline unsigned hstate_index_to_shift(unsigned index) return hstates[index].order + PAGE_SHIFT; } +pgoff_t __basepage_index(struct page *page); + +/* Return page->index in PAGE_SIZE units */ +static inline pgoff_t basepage_index(struct page *page) +{ + if (!PageCompound(page)) + return page->index; + + return __basepage_index(page); +} + #else struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -331,6 +342,11 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) return 1; } #define hstate_index_to_shift(index) 0 + +static inline pgoff_t basepage_index(struct page *page) +{ + return page->index; +} #endif #endif /* _LINUX_HUGETLB_H */ -- cgit v1.1 From 374172589a9c99e63c373b1c74588553237a74d3 Mon Sep 17 00:00:00 2001 From: Martin Peschke Date: Thu, 22 Aug 2013 17:45:36 +0200 Subject: SCSI: zfcp: fix lock imbalance by reworking request queue locking commit d79ff142624e1be080ad8d09101f7004d79c36e1 upstream. This patch adds wait_event_interruptible_lock_irq_timeout(), which is a straight-forward descendant of wait_event_interruptible_timeout() and wait_event_interruptible_lock_irq(). The zfcp driver used to call wait_event_interruptible_timeout() in combination with some intricate and error-prone locking. Using wait_event_interruptible_lock_irq_timeout() as a replacement nicely cleans up that locking. This rework removes a situation that resulted in a locking imbalance in zfcp_qdio_sbal_get(): BUG: workqueue leaked lock or atomic: events/1/0xffffff00/10 last function: zfcp_fc_wka_port_offline+0x0/0xa0 [zfcp] It was introduced by commit c2af7545aaff3495d9bf9a7608c52f0af86fb194 "[SCSI] zfcp: Do not wait for SBALs on stopped queue", which had a new code path related to ZFCP_STATUS_ADAPTER_QDIOUP that took an early exit without a required lock being held. The problem occured when a special, non-SCSI I/O request was being submitted in process context, when the adapter's queues had been torn down. In this case the bug surfaced when the Fibre Channel port connection for a well-known address was closed during a concurrent adapter shut-down procedure, which is a rare constellation. This patch also fixes these warnings from the sparse tool (make C=1): drivers/s390/scsi/zfcp_qdio.c:224:12: warning: context imbalance in 'zfcp_qdio_sbal_check' - wrong count at exit drivers/s390/scsi/zfcp_qdio.c:244:5: warning: context imbalance in 'zfcp_qdio_sbal_get' - unexpected unlock Last but not least, we get rid of that crappy lock-unlock-lock sequence at the beginning of the critical section. It is okay to call zfcp_erp_adapter_reopen() with req_q_lock held. 
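The calling pattern the new macro enables, sketched against an invented struct my_queue (the real user is zfcp_qdio_sbal_get(); see the wait.h hunk below for the macro itself):

    /*
     * Sketch only: the lock is held on entry, dropped by the macro while
     * sleeping, and reacquired before the condition is rechecked and
     * before control returns to the caller.
     */
    static int reserve_queue_slot(struct my_queue *q)   /* my_queue is invented */
    {
            long ret;

            spin_lock_irq(&q->lock);
            ret = wait_event_interruptible_lock_irq_timeout(q->wait,
                            atomic_read(&q->free_slots) > 0,
                            q->lock, 5 * HZ);
            if (ret > 0)
                    atomic_dec(&q->free_slots);     /* still under q->lock */
            spin_unlock_irq(&q->lock);

            if (ret == 0)
                    return -ETIME;          /* timed out */
            if (ret < 0)
                    return ret;             /* -ERESTARTSYS: interrupted */
            return 0;
    }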
Reported-by: Mikulas Patocka Reported-by: Heiko Carstens Signed-off-by: Martin Peschke Signed-off-by: Steffen Maier Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- include/linux/wait.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index bea7ad5..e007f76 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -530,6 +530,63 @@ do { \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) +#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ + lock, ret) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + spin_unlock_irq(&lock); \ + ret = schedule_timeout(ret); \ + spin_lock_irq(&lock); \ + if (!ret) \ + break; \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. + * The condition is checked under the lock. This is expected + * to be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + * + * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it + * was interrupted by a signal, and the remaining jiffies otherwise + * if the condition evaluated to true before the timeout elapsed. + */ +#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ + timeout) \ +({ \ + int __ret = timeout; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq_timeout( \ + wq, condition, lock, __ret); \ + __ret; \ +}) + #define __wait_event_killable(wq, condition, ret) \ do { \ -- cgit v1.1 From 1eeceae48fdc6e6fcb71403010ef5dd863b7ef2f Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 16 Aug 2013 13:30:07 +0200 Subject: ipv6: drop packets with multiple fragmentation headers [ Upstream commit f46078cfcd77fa5165bf849f5e568a7ac5fa569c ] It is not allowed for an ipv6 packet to contain multiple fragmentation headers. So discard packets which were already reassembled by fragmentation logic and send back a parameter problem icmp. The updates for RFC 6980 will come in later, I have to do a bit more research here. Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0c99776..84b1447 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -255,6 +255,7 @@ struct inet6_skb_parm { #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 +#define IP6SKB_FRAGMENTED 16 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) -- cgit v1.1 From 2eeeacf627ab4bac67c3c1735b2c96fccbea6262 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Fri, 30 Aug 2013 11:18:45 +0200 Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO [ Upstream commit 61e76b178dbe7145e8d6afa84bb4ccea71918994 ] RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination unreachable) messages: 5 - Source address failed ingress/egress policy 6 - Reject route to destination Now they are treated as protocol error and icmpv6_err_convert() converts them to EPROTO. RFC 4443 says: "Codes 5 and 6 are more informative subsets of code 1." Treat codes 5 and 6 as code 1 (EACCES) Btw, connect() returning -EPROTO confuses firefox, so that fallback to other/IPv4 addresses does not work: https://bugzilla.mozilla.org/show_bug.cgi?id=910773 Signed-off-by: Jiri Bohac Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/icmpv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index ba45e6b..f5a21d0 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -123,6 +123,8 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) #define ICMPV6_NOT_NEIGHBOUR 2 #define ICMPV6_ADDR_UNREACH 3 #define ICMPV6_PORT_UNREACH 4 +#define ICMPV6_POLICY_FAIL 5 +#define ICMPV6_REJECT_ROUTE 6 /* * Codes for Time Exceeded -- cgit v1.1 From c97081c634fdee51845ca8a30f394cf06125a509 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 10:34:48 -0700 Subject: rculist: list_first_or_null_rcu() should use list_entry_rcu() commit c34ac00caefbe49d40058ae7200bd58725cebb45 upstream. list_first_or_null() should test whether the list is empty and return pointer to the first entry if not in a RCU safe manner. It's broken in several ways. * It compares __kernel @__ptr with __rcu @__next triggering the following sparse warning. net/core/dev.c:4331:17: error: incompatible types in comparison expression (different address spaces) * It doesn't perform rcu_dereference*() and computes the entry address using container_of() directly from the __rcu pointer which is inconsitent with other rculist interface. As a result, all three in-kernel users - net/core/dev.c, macvlan, cgroup - are buggy. They dereference the pointer w/o going through read barrier. * While ->next dereference passes through list_next_rcu(), the compiler is still free to fetch ->next more than once and thus nullify the "__ptr != __next" condition check. Fix it by making list_first_or_null_rcu() dereference ->next directly using ACCESS_ONCE() and then use list_entry_rcu() on it like other rculist accessors. v2: Paul pointed out that the compiler may fetch the pointer more than once nullifying the condition check. ACCESS_ONCE() added on ->next dereference. v3: Restored () around macro param which was accidentally removed. Spotted by Paul. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Cc: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. 
Miller" Cc: Li Zefan Cc: Patrick McHardy Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Signed-off-by: Greg Kroah-Hartman --- include/linux/rculist.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c30ffd8..52d280b 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -254,8 +254,9 @@ static inline void list_splice_init_rcu(struct list_head *list, */ #define list_first_or_null_rcu(ptr, type, member) \ ({struct list_head *__ptr = (ptr); \ - struct list_head __rcu *__next = list_next_rcu(__ptr); \ - likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \ + struct list_head *__next = ACCESS_ONCE(__ptr->next); \ + likely(__ptr != __next) ? \ + list_entry_rcu(__next, type, member) : NULL; \ }) /** -- cgit v1.1 From 645c9dcabb2c234e6d4a546125830d04988a1f90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 28 Aug 2013 22:29:55 +0200 Subject: HID: validate HID report id size commit 43622021d2e2b82ea03d883926605bdd0525e1d1 upstream. The "Report ID" field of a HID report is used to build indexes of reports. The kernel's index of these is limited to 256 entries, so any malicious device that sets a Report ID greater than 255 will trigger memory corruption on the host: [ 1347.156239] BUG: unable to handle kernel paging request at ffff88094958a878 [ 1347.156261] IP: [] hid_register_report+0x2a/0x8b CVE-2013-2888 Signed-off-by: Kees Cook Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- include/linux/hid.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index 42f7e2f..af30c64 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -414,10 +414,12 @@ struct hid_report { struct hid_device *device; /* associated device */ }; +#define HID_MAX_IDS 256 + struct hid_report_enum { unsigned numbered; struct list_head report_list; - struct hid_report *report_id_hash[256]; + struct hid_report *report_id_hash[HID_MAX_IDS]; }; #define HID_REPORT_TYPES 3 -- cgit v1.1 From 7247e66f07168b28410549634ea5d29a84602f81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Sep 2013 21:56:50 +0200 Subject: HID: provide a helper for validating hid reports commit 331415ff16a12147d57d5c953f3a961b7ede348b upstream. Many drivers need to validate the characteristics of their HID report during initialization to avoid misusing the reports. This adds a common helper to perform validation of the report exisitng, the field existing, and the expected number of values within the field. 
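A hedged sketch of the intended use (the driver hook and the report layout below are illustrative assumptions; the prototype is the one added to hid.h in this patch):

    /*
     * Sketch: verify during initialization that output report 0 exists,
     * has a field 0, and that the field holds at least 4 values, before
     * any report->field[0]->value[] access.
     */
    static int my_ff_init(struct hid_device *hid)
    {
            struct hid_report *report;

            report = hid_validate_values(hid, HID_OUTPUT_REPORT, 0, 0, 4);
            if (!report)
                    return -ENODEV;         /* malformed or malicious descriptor */

            report->field[0]->value[0] = 0x00;      /* now known to be in bounds */
            return 0;
    }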
Signed-off-by: Kees Cook Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- include/linux/hid.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index af30c64..4649e29 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -713,6 +713,10 @@ void hid_output_report(struct hid_report *report, __u8 *data); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, unsigned type, unsigned id); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +struct hid_report *hid_validate_values(struct hid_device *hid, + unsigned int type, unsigned int id, + unsigned int field_index, + unsigned int report_counts); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); void hid_disconnect(struct hid_device *hid); -- cgit v1.1 From 38770b82bdd3523d92596a0807d3751bb7d25224 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Oct 2012 15:38:52 +0200 Subject: perf: Clarify perf_cpu_context::active_pmu usage by renaming it to ::unique_pmu commit 3f1f33206c16c7b3839d71372bc2ac3f305aa802 upstream. Stephane thought the perf_cpu_context::active_pmu name confusing and suggested using 'unique_pmu' instead. This pointer is a pointer to a 'random' pmu sharing the cpuctx instance, therefore limiting a for_each_pmu loop to those where cpuctx->unique_pmu matches the pmu we get a loop over unique cpuctx instances. Suggested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kxyjqpfj2fn9gt7kwu5ag9ks@git.kernel.org Signed-off-by: Ingo Molnar Cc: Li Zefan Signed-off-by: Greg Kroah-Hartman --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 67b9fbc..e4d3640 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -939,7 +939,7 @@ struct perf_cpu_context { int exclusive; struct list_head rotation_list; int jiffies_interval; - struct pmu *active_pmu; + struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; -- cgit v1.1 From 2e4e7cb96933d2c9794125b038bd9ea9cba9bcfc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses commit 047fe3605235888f3ebcda0c728cb31937eadfe6 upstream. Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. 
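Sketch of the adjusted caller pattern (loosely modelled on a splice read path; `pipe`, `flags` and the pipe_buf_operations/release names are assumed to come from the enclosing code):

    /* The spd now records its own array capacity in nr_pages_max, so
     * splice_shrink_spd() no longer needs the (possibly resized) pipe. */
    struct page *pages[PIPE_DEF_BUFFERS];
    struct partial_page partial[PIPE_DEF_BUFFERS];
    struct splice_pipe_desc spd = {
            .pages          = pages,
            .partial        = partial,
            .nr_pages_max   = PIPE_DEF_BUFFERS,     /* capacity of both arrays */
            .flags          = flags,
            .ops            = &page_cache_pipe_buf_ops,
            .spd_release    = spd_release_page,
    };
    ssize_t ret;

    if (splice_grow_spd(pipe, &spd))        /* may switch to larger arrays */
            return -ENOMEM;

    /* ... fill spd.pages[]/spd.partial[] and set spd.nr_pages ... */
    ret = splice_to_pipe(pipe, &spd);

    splice_shrink_spd(&spd);                /* was: splice_shrink_spd(pipe, &spd) */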
Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Cc: stable # 2.6.35 Tested-by: Dave Jones Signed-off-by: Jens Axboe Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/splice.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/splice.h b/include/linux/splice.h index 997c3b4..dbbe79c 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -51,7 +51,8 @@ struct partial_page { struct splice_pipe_desc { struct page **pages; /* page map */ struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ + int nr_pages; /* number of populated pages in map */ + unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ void (*spd_release)(struct splice_pipe_desc *, unsigned int); @@ -85,8 +86,7 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, /* * for dynamic pipe sizing */ -extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); -extern void splice_shrink_spd(struct pipe_inode_info *, - struct splice_pipe_desc *); +extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct splice_pipe_desc *); #endif -- cgit v1.1 From 2a6a2791b1e6ebd7ad29f137a309471f92d71c55 Mon Sep 17 00:00:00 2001 From: Ansis Atteka Date: Wed, 18 Sep 2013 15:29:53 -0700 Subject: ip: generate unique IP identificator if local fragmentation is allowed [ Upstream commit 703133de331a7a7df47f31fb9de51dc6f68a9de8 ] If local fragmentation is allowed, then ip_select_ident() and ip_select_ident_more() need to generate unique IDs to ensure correct defragmentation on the peer. For example, if IPsec (tunnel mode) has to encrypt large skbs that have local_df bit set, then all IP fragments that belonged to different ESP datagrams would have used the same identificator. If one of these IP fragments would get lost or reordered, then peer could possibly stitch together wrong IP fragments that did not belong to the same datagram. This would lead to a packet loss or data corruption. Signed-off-by: Ansis Atteka Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip.h | 12 ++++++++---- include/net/ipip.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 66dd491..2370f47 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -262,9 +262,11 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst) extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more); -static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, struct sock *sk) +static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk) { - if (iph->frag_off & htons(IP_DF)) { + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { /* This is only to work around buggy Windows95/2000 * VJ compression implementations. 
If the ID field * does not change, they drop every other packet in @@ -276,9 +278,11 @@ static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, str __ip_select_ident(iph, dst, 0); } -static inline void ip_select_ident_more(struct iphdr *iph, struct dst_entry *dst, struct sock *sk, int more) +static inline void ip_select_ident_more(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk, int more) { - if (iph->frag_off & htons(IP_DF)) { + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) && !skb->local_df) { if (sk && inet_sk(sk)->inet_daddr) { iph->id = htons(inet_sk(sk)->inet_id); inet_sk(sk)->inet_id += 1 + more; diff --git a/include/net/ipip.h b/include/net/ipip.h index a32654d..4dccfe3 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -50,7 +50,7 @@ struct ip_tunnel_prl_entry { int pkt_len = skb->len - skb_transport_offset(skb); \ \ skb->ip_summed = CHECKSUM_NONE; \ - ip_select_ident(iph, &rt->dst, NULL); \ + ip_select_ident(skb, &rt->dst, NULL); \ \ err = ip_local_out(skb); \ if (likely(net_xmit_eval(err) == 0)) { \ -- cgit v1.1 From fa759b5d100c8b0d2ec2b83fcac990f2c3ac13b7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 10 Sep 2013 10:52:35 -0400 Subject: random: run random_int_secret_init() run after all late_initcalls commit 47d06e532e95b71c0db3839ebdef3fe8812fca2c upstream. The some platforms (e.g., ARM) initializes their clocks as late_initcalls for some unknown reason. So make sure random_int_secret_init() is run after all of the late_initcalls are run. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- include/linux/random.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/random.h b/include/linux/random.h index ac621ce..7e58ad2 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -56,6 +56,7 @@ extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); +extern int random_int_secret_init(void); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; -- cgit v1.1 From 6d0e5f5dec80d74730912ed3d19899a6da191317 Mon Sep 17 00:00:00 2001 From: DerTeufel Date: Wed, 26 Nov 2014 20:05:39 +0100 Subject: smdk4412: network: squashed commits commit 9792f37daba788506559f99832c62b240402296c Author: Sreeram Ramachandran Date: Tue Jul 8 11:37:03 2014 -0700 Handle 'sk' being NULL in UID-based routing. Bug: 15413527 Change-Id: If33bebb7b52c0ebfa8dac2452607bce0c2b0faa0 Signed-off-by: Sreeram Ramachandran commit 7ab80d7fd3f1e3faebb14313119700fd7416ad54 Author: Lorenzo Colitti Date: Mon Mar 31 16:23:51 2014 +0900 net: core: Support UID-based routing. This contains the following commits: 1. 0149763 net: core: Add a UID range to fib rules. 2. 1650474 net: core: Use the socket UID in routing lookups. 3. 0b16771 net: ipv4: Add the UID to the route cache. 4. ee058f1 net: core: Add a RTA_UID attribute to routes. This is so that userspace can do per-UID route lookups. 
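As a hedged userspace sketch of what "per-UID route lookups" means here (assumes a kernel carrying this patch and headers that export RTA_UID; reply parsing is omitted), an RTM_GETROUTE request can carry the destination plus the UID of interest:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>

    struct req {
            struct nlmsghdr nh;
            struct rtmsg    rt;
            char            attrs[64];
    };

    /* Append one rtattr to the request and grow nlmsg_len accordingly. */
    static void add_attr(struct req *r, int type, const void *data, int len)
    {
            struct rtattr *rta = (struct rtattr *)((char *)r +
                                    NLMSG_ALIGN(r->nh.nlmsg_len));

            rta->rta_type = type;
            rta->rta_len = RTA_LENGTH(len);
            memcpy(RTA_DATA(rta), data, len);
            r->nh.nlmsg_len = NLMSG_ALIGN(r->nh.nlmsg_len) +
                              RTA_ALIGN(rta->rta_len);
    }

    int main(void)
    {
            struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
            struct req r = {
                    .nh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
                    .nh.nlmsg_type  = RTM_GETROUTE,
                    .nh.nlmsg_flags = NLM_F_REQUEST,
                    .rt.rtm_family  = AF_INET,
                    .rt.rtm_dst_len = 32,
            };
            struct in_addr dst;
            uint32_t uid = 10023;           /* example: an app UID */
            char reply[4096];

            inet_pton(AF_INET, "8.8.8.8", &dst);
            add_attr(&r, RTA_DST, &dst, sizeof(dst));
            add_attr(&r, RTA_UID, &uid, sizeof(uid));       /* the new attribute */

            sendto(fd, &r, r.nh.nlmsg_len, 0,
                   (struct sockaddr *)&kernel, sizeof(kernel));
            recv(fd, reply, sizeof(reply), 0);      /* RTM_NEWROUTE reply */
            puts("route reply received; parsing omitted in this sketch");
            close(fd);
            return 0;
    }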
Bug: 15413527 Change-Id: I1285474c6734614d3bda6f61d88dfe89a4af7892 Signed-off-by: Lorenzo Colitti commit a769ab7f07dcbbf29f2a8658aa5486bb6a2a66c3 Author: Hannes Frederic Sowa Date: Fri Mar 8 02:07:16 2013 +0000 ipv6: introdcue __ipv6_addr_needs_scope_id and ipv6_iface_scope_id helper functions [net-next commit b7ef213ef65256168df83ddfbb8131ed9adc10f9] __ipv6_addr_needs_scope_id checks if an ipv6 address needs to supply a 'sin6_scope_id != 0'. 'sin6_scope_id != 0' was enforced in case of link-local addresses. To support interface-local multicast these checks had to be enhanced and are now consolidated into these new helper functions. v2: a) migrated to struct ipv6_addr_props v3: a) reverted changes for ipv6_addr_props b) test for address type instead of comparing scope v4: a) unchanged Change-Id: Id6fc54cec61f967928e08a9eba4f857157d973a3 Suggested-by: YOSHIFUJI Hideaki Cc: YOSHIFUJI Hideaki Acked-by: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller commit af9b98af02a072c3eb0f3dd7d3df7242d8294e5c Author: Hannes Frederic Sowa Date: Mon Nov 18 07:07:45 2013 +0100 ping: prevent NULL pointer dereference on write to msg_name A plain read() on a socket does set msg->msg_name to NULL. So check for NULL pointer first. [Backport of net-next cf970c002d270c36202bd5b9c2804d3097a52da0] Bug: 12780426 Change-Id: I29d9cb95ef05ec76d37517e01317f4a29e60931c Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller Signed-off-by: Lorenzo Colitti commit d66ae9bbbf35cd6e7a3d04f6946d506b3148f06b Author: Cong Wang Date: Sun Jun 2 22:43:52 2013 +0000 ping: always initialize ->sin6_scope_id and ->sin6_flowinfo [net-next commit c26d6b46da3ee86fa8a864347331e5513ca84c2b] If we don't need scope id, we should initialize it to zero. Same for ->sin6_flowinfo. Change-Id: I28e4bc9593e76fc3434052182466fab4bb8ccf3a Cc: Lorenzo Colitti Cc: David S. Miller Signed-off-by: Cong Wang Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller commit 22d188e621c143108e1207831e5817f24d0cccc0 Author: Lorenzo Colitti Date: Thu Jul 4 00:12:40 2013 +0900 net: ipv6: fix wrong ping_v6_sendmsg return value [net-next commit fbfe80c890a1dc521d0b629b870e32fcffff0da5] ping_v6_sendmsg currently returns 0 on success. It should return the number of bytes written instead. Bug: 9469865 Change-Id: I82b7d3a37ba91ad24e6dbd97a4880745ce16ad31 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller commit b691b1c9931f86c3fc7a10208030752f205d1adf Author: Lorenzo Colitti Date: Thu Jul 4 00:52:49 2013 +0900 net: ipv6: add missing lock in ping_v6_sendmsg [net-next commit a1bdc45580fc19e968b32ad27cd7e476a4aa58f6] Bug: 9469865 Change-Id: I480f8ce95956dd8f17fbbb26dc60cc162f8ec933 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller commit 515b76147e907579254cd5997a4ab9e64da32268 Author: Lorenzo Colitti Date: Wed Jan 16 22:09:49 2013 +0000 net: ipv6: Add IPv6 support to the ping socket. [backport of net-next 6d0bfe22611602f36617bc7aa2ffa1bbb2f54c67] This adds the ability to send ICMPv6 echo requests without a raw socket. The equivalent ability for ICMPv4 was added in 2011. Instead of having separate code paths for IPv4 and IPv6, make most of the code in net/ipv4/ping.c dual-stack and only add a few IPv6-specific bits (like the protocol definition) to a new net/ipv6/ping.c. Hopefully this will reduce divergence and/or duplication of bugs in the future. Caveats: - Setting options via ancillary data (e.g., using IPV6_PKTINFO to specify the outgoing interface) is not yet supported. 
- There are no separate security settings for IPv4 and IPv6; everything is controlled by /proc/net/ipv4/ping_group_range. - The proc interface does not yet display IPv6 ping sockets properly. Tested with a patched copy of ping6 and using raw socket calls. Compiles and works with all of CONFIG_IPV6={n,m,y}. Change-Id: Ia359af556021344fc7f890c21383aadf950b6498 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller [lorenzo@google.com: backported to 3.0] Signed-off-by: Lorenzo Colitti commit d72b1c37bab1bbdebb096421b5ef88ceec6eae8e Author: Li Wei Date: Thu Feb 21 00:09:54 2013 +0000 ipv4: fix a bug in ping_err(). [ Upstream commit b531ed61a2a2a77eeb2f7c88b49aa5ec7d9880d8 ] We should get 'type' and 'code' from the outer ICMP header. Change-Id: I9a467b4aa794127f22dbc5f802d17ae618aa0c74 Signed-off-by: Li Wei Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman commit ead1926fc318a4c97e735a885db40e77135c0531 Author: Eric Dumazet Date: Mon Oct 24 03:06:21 2011 -0400 ipv4: tcp: fix TOS value in ACK messages sent from TIME_WAIT There is a long standing bug in linux tcp stack, about ACK messages sent on behalf of TIME_WAIT sockets. In the IP header of the ACK message, we choose to reflect TOS field of incoming message, and this might break some setups. Example of things that were broken : - Routing using TOS as a selector - Firewalls - Trafic classification / shaping We now remember in timewait structure the inet tos field and use it in ACK generation, and route lookup. Notes : - We still reflect incoming TOS in RST messages. - We could extend MuraliRaja Muniraju patch to report TOS value in netlink messages for TIME_WAIT sockets. - A patch is needed for IPv6 Change-Id: Ic7ad8a7b858de181bfe2a789c472f84955397d4c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller commit 47ef68bdd0ceb7113496f3325068202e5d1f3eba Author: Eric Dumazet Date: Wed Nov 30 19:00:53 2011 +0000 ipv4: use a 64bit load/store in output path gcc compiler is smart enough to use a single load/store if we memcpy(dptr, sptr, 8) on x86_64, regardless of CONFIG_CC_OPTIMIZE_FOR_SIZE In IP header, daddr immediately follows saddr, this wont change in the future. We only need to make sure our flowi4 (saddr,daddr) fields wont break the rule. Change-Id: Iad9c8fd9121ec84c2599b013badaebba92db7c39 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller commit 5b7251328273e10d0d768a24f7b555d1e1f671e6 Author: Julian Anastasov Date: Sun Aug 7 09:16:09 2011 +0000 ipv4: route non-local sources for raw socket The raw sockets can provide source address for routing but their privileges are not considered. We can provide non-local source address, make sure the FLOWI_FLAG_ANYSRC flag is set if socket has privileges for this, i.e. based on hdrincl (IP_HDRINCL) and transparent flags. Change-Id: I136b161c584deac3885efbf217e959e1a829fc1d Signed-off-by: Julian Anastasov Signed-off-by: David S. 
Miller Change-Id: I0022e9536ee1861bf163e5bba4a86a3e94669960 --- include/linux/fib_rules.h | 2 ++ include/linux/rtnetlink.h | 2 ++ include/net/fib_rules.h | 6 ++++- include/net/flow.h | 13 +++++++++-- include/net/inet_sock.h | 2 +- include/net/inet_timewait_sock.h | 3 ++- include/net/ip.h | 4 +++- include/net/ipv6.h | 20 ++++++++++++++++ include/net/ping.h | 50 +++++++++++++++++++++++++++++++++++++--- include/net/route.h | 6 +++-- include/net/transp_v6.h | 3 +++ 11 files changed, 100 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h index 51da65b..9dcdb62 100644 --- a/include/linux/fib_rules.h +++ b/include/linux/fib_rules.h @@ -49,6 +49,8 @@ enum { FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, + FRA_UID_START, /* UID range */ + FRA_UID_END, __FRA_MAX }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5415dfb..bd95696 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -283,6 +283,8 @@ enum rtattr_type_t { RTA_MP_ALGO, /* no longer used */ RTA_TABLE, RTA_MARK, + RTA_MFC_STATS, /* not used - backported from the future */ + RTA_UID, __RTA_MAX }; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 075f1e3..52e77a3 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -23,6 +23,8 @@ struct fib_rule { struct fib_rule __rcu *ctarget; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; + uid_t uid_start; + uid_t uid_end; struct rcu_head rcu; struct net * fr_net; }; @@ -79,7 +81,9 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 } + [FRA_GOTO] = { .type = NLA_U32 }, \ + [FRA_UID_START] = { .type = NLA_U32 }, \ + [FRA_UID_END] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/net/flow.h b/include/net/flow.h index e37cfda..34c3dd4 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -23,6 +23,7 @@ struct flowi_common { #define FLOWI_FLAG_PRECOW_METRICS 0x02 #define FLOWI_FLAG_CAN_SLEEP 0x04 __u32 flowic_secid; + uid_t flowic_uid; }; union flowi_uli { @@ -59,8 +60,12 @@ struct flowi4 { #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid - __be32 daddr; +#define flowi4_uid __fl_common.flowic_uid + + /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; + __be32 daddr; + union flowi_uli uli; #define fl4_sport uli.ports.sport #define fl4_dport uli.ports.dport @@ -75,7 +80,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be32 sport) + __be16 dport, __be32 sport, + uid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = 0; @@ -85,6 +91,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; + fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -112,6 +119,7 @@ struct flowi6 { #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid +#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; @@ -155,6 +163,7 @@ struct flowi { #define flowi_proto 
u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid +#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 14dd9c7..f941964 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -240,7 +240,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent) + if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; if (sk->sk_protocol == IPPROTO_TCP) flags |= FLOWI_FLAG_PRECOW_METRICS; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 17404b5..1658150 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -126,7 +126,8 @@ struct inet_timewait_sock { /* And these are ours. */ unsigned int tw_ipv6only : 1, tw_transparent : 1, - tw_pad : 14, /* 14 bits hole */ + tw_pad : 6, /* 6 bits hole */ + tw_tos : 8, tw_ipv6_offset : 16; kmemcheck_bitfield_end(flags); unsigned long tw_ttd; diff --git a/include/net/ip.h b/include/net/ip.h index 66dd491..24352d0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -165,6 +165,8 @@ struct ip_reply_arg { int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; + u8 tos; + uid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 @@ -175,7 +177,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) } void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - struct ip_reply_arg *arg, unsigned int len); + const struct ip_reply_arg *arg, unsigned int len); struct ipv4_config { int log_martians; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c39121f..eafd2f4 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -243,6 +243,14 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) atomic_dec(&fl->users); } +extern void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); + +int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); + +struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, + struct sock *sk, struct flowi6 *fl6); + extern int ip6_ra_control(struct sock *sk, int sel); extern int ipv6_parse_hopopts(struct sk_buff *skb); @@ -285,6 +293,18 @@ static inline int ipv6_addr_src_scope(const struct in6_addr *addr) return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } +static inline bool __ipv6_addr_needs_scope_id(int type) +{ + return type & IPV6_ADDR_LINKLOCAL || + (type & IPV6_ADDR_MULTICAST && + (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); +} + +static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) +{ + return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? 
iface : 0; +} + static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); diff --git a/include/net/ping.h b/include/net/ping.h index 682b5ae..c103135e 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -13,6 +13,7 @@ #ifndef _PING_H #define _PING_H +#include #include /* PING_HTABLE_SIZE must be power of 2 */ @@ -28,6 +29,18 @@ */ #define GID_T_MAX (((gid_t)~0U) >> 1) +/* Compatibility glue so we can support IPv6 when it's compiled as a module */ +struct pingv6_ops { + int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len); + int (*datagram_recv_ctl)(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); + int (*icmpv6_err_convert)(u8 type, u8 code, int *err); + void (*ipv6_icmp_error)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); + int (*ipv6_chk_addr)(struct net *net, const struct in6_addr *addr, + struct net_device *dev, int strict); +}; + struct ping_table { struct hlist_nulls_head hash[PING_HTABLE_SIZE]; rwlock_t lock; @@ -39,10 +52,40 @@ struct ping_iter_state { }; extern struct proto ping_prot; +extern struct ping_table ping_table; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +extern struct pingv6_ops pingv6_ops; +#endif +struct pingfakehdr { + struct icmphdr icmph; + struct iovec *iov; + sa_family_t family; + __wsum wcheck; +}; -extern void ping_rcv(struct sk_buff *); -extern void ping_err(struct sk_buff *, u32 info); +int ping_get_port(struct sock *sk, unsigned short ident); +void ping_hash(struct sock *sk); +void ping_unhash(struct sock *sk); + +int ping_init_sock(struct sock *sk); +void ping_close(struct sock *sk, long timeout); +int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len); +void ping_err(struct sk_buff *skb, int offset, u32 info); +void ping_v4_err(struct sk_buff *skb, u32 info); +int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, + struct sk_buff *); + +int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, + void *user_icmph, size_t icmph_len); +int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len); +int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len); +int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void ping_rcv(struct sk_buff *skb); #ifdef CONFIG_PROC_FS extern int __init ping_proc_init(void); @@ -50,6 +93,7 @@ extern void ping_proc_exit(void); #endif void __init ping_init(void); - +int __init pingv6_init(void); +void pingv6_exit(void); #endif /* _PING_H */ diff --git a/include/net/route.h b/include/net/route.h index 5d7aae4..f4b1489 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -60,6 +60,7 @@ struct rtable { int rt_iif; int rt_oif; __u32 rt_mark; + uid_t rt_uid; /* Info on neighbour */ __be32 rt_gateway; @@ -146,7 +147,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport); + daddr, saddr, dport, sport, sk ? 
sock_i_uid(sk) : 0); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -250,7 +251,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_CAN_SLEEP; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport); + protocol, flow_flags, dst, src, dport, sport, + sock_i_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 45ce307..fb36dd5 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -11,6 +11,7 @@ extern struct proto rawv6_prot; extern struct proto udpv6_prot; extern struct proto udplitev6_prot; extern struct proto tcpv6_prot; +extern struct proto pingv6_prot; struct flowi6; @@ -23,6 +24,8 @@ extern int ipv6_frag_init(void); extern void ipv6_frag_exit(void); /* transport protocols */ +extern int pingv6_init(void); +extern void pingv6_exit(void); extern int rawv6_init(void); extern void rawv6_exit(void); extern int udpv6_init(void); -- cgit v1.1 From bea86fff3cf23c564a5e700ea0e22b53b3972bda Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Thu, 10 Feb 2011 02:04:45 -0800 Subject: cpu_pm: Add cpu power management notifiers During some CPU power modes entered during idle, hotplug and suspend, peripherals located in the CPU power domain, such as the GIC, localtimers, and VFP, may be powered down. Add a notifier chain that allows drivers for those peripherals to be notified before and after they may be reset. Notified drivers can include the VFP co-processor, the interrupt controller and its PM extensions, and local CPU timer context save/restore, which shouldn't be interrupted. Hence CPU PM event APIs must be called with interrupts disabled. Change-Id: I2918473962a75cd55c148f91a3c09a773c99162c Signed-off-by: Colin Cross Signed-off-by: Santosh Shilimkar Reviewed-by: Kevin Hilman Tested-and-Acked-by: Shawn Guo Tested-by: Kevin Hilman Tested-by: Vishwanath BS Signed-off-by: Lanchon --- include/linux/cpu_pm.h | 109 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 include/linux/cpu_pm.h (limited to 'include') diff --git a/include/linux/cpu_pm.h b/include/linux/cpu_pm.h new file mode 100644 index 0000000..455b233 --- /dev/null +++ b/include/linux/cpu_pm.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2011 Google, Inc. + * + * Author: + * Colin Cross + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_CPU_PM_H +#define _LINUX_CPU_PM_H + +#include +#include + +/* + * When a CPU goes to a low power state that turns off power to the CPU's + * power domain, the contents of some blocks (floating point coprocessors, + * interrupt controllers, caches, timers) in the same power domain can + * be lost. The cpm_pm notifiers provide a method for platform idle, suspend, + * and hotplug implementations to notify the drivers for these blocks that + * they may be reset. + * + * All cpu_pm notifications must be called with interrupts disabled.
+ * + * The notifications are split into two classes: CPU notifications and CPU + * cluster notifications. + * + * CPU notifications apply to a single CPU and must be called on the affected + * CPU. They are used to save per-cpu context for affected blocks. + * + * CPU cluster notifications apply to all CPUs in a single power domain. They + * are used to save any global context for affected blocks, and must be called + * after all the CPUs in the power domain have been notified of the low power + * state. + */ + +/* + * Event codes passed as unsigned long val to notifier calls + */ +enum cpu_pm_event { + /* A single cpu is entering a low power state */ + CPU_PM_ENTER, + + /* A single cpu failed to enter a low power state */ + CPU_PM_ENTER_FAILED, + + /* A single cpu is exiting a low power state */ + CPU_PM_EXIT, + + /* A cpu power domain is entering a low power state */ + CPU_CLUSTER_PM_ENTER, + + /* A cpu power domain failed to enter a low power state */ + CPU_CLUSTER_PM_ENTER_FAILED, + + /* A cpu power domain is exiting a low power state */ + CPU_CLUSTER_PM_EXIT, +}; + +#ifdef CONFIG_CPU_PM +int cpu_pm_register_notifier(struct notifier_block *nb); +int cpu_pm_unregister_notifier(struct notifier_block *nb); +int cpu_pm_enter(void); +int cpu_pm_exit(void); +int cpu_cluster_pm_enter(void); +int cpu_cluster_pm_exit(void); + +#else + +static inline int cpu_pm_register_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int cpu_pm_unregister_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int cpu_pm_enter(void) +{ + return 0; +} + +static inline int cpu_pm_exit(void) +{ + return 0; +} + +static inline int cpu_cluster_pm_enter(void) +{ + return 0; +} + +static inline int cpu_cluster_pm_exit(void) +{ + return 0; +} +#endif +#endif -- cgit v1.1 From 361003a3fcdb7bf1dfdf9a009cedbe5eb80ac173 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 5 Nov 2012 08:15:34 -0500 Subject: Add security hooks to binder and implement the hooks for SELinux. Add security hooks to the binder and implement the hooks for SELinux. The security hooks enable security modules such as SELinux to implement controls over binder IPC. The security hooks include support for controlling what process can become the binder context manager (binder_set_context_mgr), controlling the ability of a process to invoke a binder transaction/IPC to another process (binder_transaction), controlling the ability of a process to transfer a binder reference to another process (binder_transfer_binder), and controlling the ability of a process to transfer an open file to another process (binder_transfer_file). This support is used by SE Android, http://selinuxproject.org/page/SEAndroid.
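For illustration only (this sketch is not part of the patch): a driver-side caller would gate an IPC roughly as below, using the security_binder_*() wrappers declared in this change. The helper name and the sender/receiver/passed_file variables are hypothetical stand-ins for the binder driver's own bookkeeping; only the hook signatures come from the patch.

#include <linux/sched.h>
#include <linux/security.h>

/* Ask the LSM whether 'sender' may transact with 'receiver', and
 * whether it may pass 'passed_file' along with the transaction.
 * A non-zero return means policy denied the operation and the
 * transaction should be failed back to the caller. */
static int example_binder_gate_transaction(struct task_struct *sender,
					   struct task_struct *receiver,
					   struct file *passed_file)
{
	int ret;

	ret = security_binder_transaction(sender, receiver);
	if (ret)
		return ret;

	if (passed_file)
		ret = security_binder_transfer_file(sender, receiver,
						    passed_file);

	return ret;
}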
Signed-off-by: Stephen Smalley Change-Id: I9a64a87825df2e60b9c51400377af4a9cd1c4049 --- include/linux/security.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 8ce59ef..8388914 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1375,6 +1375,11 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; + int (*binder_set_context_mgr) (struct task_struct *mgr); + int (*binder_transaction) (struct task_struct *from, struct task_struct *to); + int (*binder_transfer_binder) (struct task_struct *from, struct task_struct *to); + int (*binder_transfer_file) (struct task_struct *from, struct task_struct *to, struct file *file); + int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, @@ -1657,6 +1662,10 @@ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); /* Security operations */ +int security_binder_set_context_mgr(struct task_struct *mgr); +int security_binder_transaction(struct task_struct *from, struct task_struct *to); +int security_binder_transfer_binder(struct task_struct *from, struct task_struct *to); +int security_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -1837,6 +1846,26 @@ static inline int security_init(void) return 0; } +static inline int security_binder_set_context_mgr(struct task_struct *mgr) +{ + return 0; +} + +static inline int security_binder_transaction(struct task_struct *from, struct task_struct *to) +{ + return 0; +} + +static inline int security_binder_transfer_binder(struct task_struct *from, struct task_struct *to) +{ + return 0; +} + +static inline int security_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file) +{ + return 0; +} + static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { -- cgit v1.1 From e3c7a358e9e78b085c8db7446dd4b854a0faeffe Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 28 Jan 2014 14:45:41 -0500 Subject: selinux: add SOCK_DIAG_BY_FAMILY to the list of netlink message types commit 6a96e15096da6e7491107321cfa660c7c2aa119d upstream. The SELinux AF_NETLINK/NETLINK_SOCK_DIAG socket class was missing the SOCK_DIAG_BY_FAMILY definition which caused SELINUX_ERR messages when the ss tool was run. 
# ss Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port u_str ESTAB 0 0 * 14189 * 14190 u_str ESTAB 0 0 * 14145 * 14144 u_str ESTAB 0 0 * 14151 * 14150 {...} # ausearch -m SELINUX_ERR ---- time->Thu Jan 23 11:11:16 2014 type=SYSCALL msg=audit(1390493476.445:374): arch=c000003e syscall=44 success=yes exit=40 a0=3 a1=7fff03aa11f0 a2=28 a3=0 items=0 ppid=1852 pid=1895 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="ss" exe="/usr/sbin/ss" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=SELINUX_ERR msg=audit(1390493476.445:374): SELinux: unrecognized netlink message type=20 for sclass=32 Change-Id: I22218ec620bc3ee6396145f1c2ad8ed222648309 Signed-off-by: Paul Moore --- include/linux/sock_diag.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 include/linux/sock_diag.h (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h new file mode 100644 index 0000000..251729a --- /dev/null +++ b/include/linux/sock_diag.h @@ -0,0 +1,48 @@ +#ifndef __SOCK_DIAG_H__ +#define __SOCK_DIAG_H__ + +#include + +#define SOCK_DIAG_BY_FAMILY 20 + +struct sock_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; +}; + +enum { + SK_MEMINFO_RMEM_ALLOC, + SK_MEMINFO_RCVBUF, + SK_MEMINFO_WMEM_ALLOC, + SK_MEMINFO_SNDBUF, + SK_MEMINFO_FWD_ALLOC, + SK_MEMINFO_WMEM_QUEUED, + SK_MEMINFO_OPTMEM, + + SK_MEMINFO_VARS, +}; + +#ifdef __KERNEL__ +struct sk_buff; +struct nlmsghdr; +struct sock; + +struct sock_diag_handler { + __u8 family; + int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); +}; + +int sock_diag_register(struct sock_diag_handler *h); +void sock_diag_unregister(struct sock_diag_handler *h); + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); + +int sock_diag_check_cookie(void *sk, __u32 *cookie); +void sock_diag_save_cookie(void *sk, __u32 *cookie); + +int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); + +extern struct sock *sock_diag_nlsk; +#endif /* KERNEL */ +#endif -- cgit v1.1 From ccec3aa9fda8483a2e0e0b9de6ba612f4bdcc314 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sat, 4 Apr 2015 16:15:54 -0700 Subject: security: lsm_audit: add ioctl specific auditing Add information about ioctl calls to the LSM audit data. Log the file path and command number. Bug: 18087110 Change-Id: Idbbd106db6226683cb30022d9e8f6f3b8fab7f84 Signed-off-by: Jeff Vander Stoep --- include/linux/lsm_audit.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 88e78de..65e2962 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -24,6 +24,11 @@ #include +struct lsm_ioctlop_audit { + struct path path; + u16 cmd; +}; + /* Auxiliary data to use in generating the audit record. 
*/ struct common_audit_data { char type; @@ -37,6 +42,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_KMOD 8 #define LSM_AUDIT_DATA_INODE 9 #define LSM_AUDIT_DATA_DENTRY 10 +#define LSM_AUDIT_DATA_IOCTL_OP 11 struct task_struct *tsk; union { struct path path; @@ -69,6 +75,7 @@ struct common_audit_data { } key_struct; #endif char *kmod_name; + struct lsm_ioctlop_audit *op; } u; /* this union contains LSM specific data */ union { -- cgit v1.1 From fa7b4d4938aa43fa7cbf8a75af5d0b9213b62c7d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 30 Oct 2012 14:45:57 -0400 Subject: hashtable: introduce a small and naive hashtable This hashtable implementation is using hlist buckets to provide a simple hashtable to prevent it from getting reimplemented all over the kernel. Signed-off-by: Sasha Levin [ Merging this now, so that subsystems can start applying Sasha's patches that use this - Linus ] Signed-off-by: Linus Torvalds Change-Id: I08357176e20fb805170de4736915cde9103db7d2 --- include/linux/hashtable.h | 192 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 include/linux/hashtable.h (limited to 'include') diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h new file mode 100644 index 0000000..227c624 --- /dev/null +++ b/include/linux/hashtable.h @@ -0,0 +1,192 @@ +/* + * Statically sized hash table implementation + * (C) 2012 Sasha Levin + */ + +#ifndef _LINUX_HASHTABLE_H +#define _LINUX_HASHTABLE_H + +#include +#include +#include +#include +#include + +#define DEFINE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + +#define DECLARE_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] + +#define HASH_SIZE(name) (ARRAY_SIZE(name)) +#define HASH_BITS(name) ilog2(HASH_SIZE(name)) + +/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ +#define hash_min(val, bits) \ + (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) + +static inline void __hash_init(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + INIT_HLIST_HEAD(&ht[i]); +} + +/** + * hash_init - initialize a hash table + * @hashtable: hashtable to be initialized + * + * Calculates the size of the hashtable from the given parameter, otherwise + * same as hash_init_size. + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. 
+ */ +#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_add - add an object to a hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add(hashtable, node, key) \ + hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_add_rcu - add an object to a rcu enabled hashtable + * @hashtable: hashtable to add to + * @node: the &struct hlist_node of the object to be added + * @key: the key of the object to be added + */ +#define hash_add_rcu(hashtable, node, key) \ + hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) + +/** + * hash_hashed - check whether an object is in any hashtable + * @node: the &struct hlist_node of the object to be checked + */ +static inline bool hash_hashed(struct hlist_node *node) +{ + return !hlist_unhashed(node); +} + +static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) +{ + unsigned int i; + + for (i = 0; i < sz; i++) + if (!hlist_empty(&ht[i])) + return false; + + return true; +} + +/** + * hash_empty - check whether a hashtable is empty + * @hashtable: hashtable to check + * + * This has to be a macro since HASH_BITS() will not work on pointers since + * it calculates the size during preprocessing. + */ +#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) + +/** + * hash_del - remove an object from a hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del(struct hlist_node *node) +{ + hlist_del_init(node); +} + +/** + * hash_del_rcu - remove an object from a rcu enabled hashtable + * @node: &struct hlist_node of the object to remove + */ +static inline void hash_del_rcu(struct hlist_node *node) +{ + hlist_del_init_rcu(node); +} + +/** + * hash_for_each - iterate over a hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @node: the &struct list_head to use as a loop cursor for each entry + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each(name, bkt, node, obj, member) \ + for ((bkt) = 0, node = NULL; node == NULL && (bkt) < HASH_SIZE(name); (bkt)++)\ + hlist_for_each_entry(obj, node, &name[bkt], member) + +/** + * hash_for_each_rcu - iterate over a rcu enabled hashtable + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @node: the &struct list_head to use as a loop cursor for each entry + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_rcu(name, bkt, node, obj, member) \ + for ((bkt) = 0, node = NULL; node == NULL && (bkt) < HASH_SIZE(name); (bkt)++)\ + hlist_for_each_entry_rcu(obj, node, &name[bkt], member) + +/** + * hash_for_each_safe - iterate over a hashtable safe against removal of + * hash entry + * @name: hashtable to iterate + * @bkt: integer to use as bucket loop cursor + * @node: the &struct list_head to use as a loop cursor for each entry + * @tmp: a &struct used for temporary storage + * @obj: the type * to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + */ +#define hash_for_each_safe(name, bkt, node, tmp, obj, member) \ + for ((bkt) = 0, node = NULL; node == NULL && (bkt) < HASH_SIZE(name); (bkt)++)\ + hlist_for_each_entry_safe(obj, node, tmp, &name[bkt], 
member) + +/** + * hash_for_each_possible - iterate over all possible objects hashing to the + * same bucket + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @node: the &struct list_head to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible(name, obj, node, member, key) \ + hlist_for_each_entry(obj, node, &name[hash_min(key, HASH_BITS(name))], member) + +/** + * hash_for_each_possible_rcu - iterate over all possible objects hashing to the + * same bucket in an rcu enabled hashtable + * in a rcu enabled hashtable + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @node: the &struct list_head to use as a loop cursor for each entry + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_rcu(name, obj, node, member, key) \ + hlist_for_each_entry_rcu(obj, node, &name[hash_min(key, HASH_BITS(name))], member) + +/** + * hash_for_each_possible_safe - iterate over all possible objects hashing to the + * same bucket safe against removals + * @name: hashtable to iterate + * @obj: the type * to use as a loop cursor for each entry + * @node: the &struct list_head to use as a loop cursor for each entry + * @tmp: a &struct used for temporary storage + * @member: the name of the hlist_node within the struct + * @key: the key of the objects to iterate over + */ +#define hash_for_each_possible_safe(name, obj, node, tmp, member, key) \ + hlist_for_each_entry_safe(obj, node, tmp, \ + &name[hash_min(key, HASH_BITS(name))], member) + + +#endif -- cgit v1.1 From 60b47b4138ff19c2d6558f6bc7a8b481789279be Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:36:46 +0200 Subject: cputime: Clean up cputime_to_usecs and usecs_to_cputime macros Get rid of semicolon so that those expressions can be used also somewhere else than just in an assignment. Signed-off-by: Michal Hocko Acked-by: Arnd Bergmann Cc: Dave Jones Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/7565417ce30d7e6b1ddc169843af0777dbf66e75.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner Change-Id: I0ffcd25ee16589fd98906d3d9f5ee20200542175 --- include/asm-generic/cputime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 61e03dd..62ce682 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -38,8 +38,8 @@ typedef u64 cputime64_t; /* * Convert cputime to microseconds and back. */ -#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct); -#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs); +#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) +#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) /* * Convert cputime to seconds and back. -- cgit v1.1
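To make the point of the last change concrete, here is a minimal sketch (an illustrative caller, not code from the tree; the function name is hypothetical and it assumes the asm-generic cputime definitions above are pulled in via asm/cputime.h):

#include <asm/cputime.h>
#include <linux/jiffies.h>

/* With the old definition, cputime_to_usecs(ct) expanded to
 * "jiffies_to_usecs(ct);".  The trailing semicolon meant the macro only
 * worked as the right-hand side of an assignment; the return statement
 * below would have expanded to "return jiffies_to_usecs(ct); > limit_us;"
 * and failed to compile.  Without the semicolon the macro is an ordinary
 * expression and can be used anywhere an expression is allowed. */
static inline int cputime_exceeds_usecs(cputime_t ct, unsigned int limit_us)
{
	return cputime_to_usecs(ct) > limit_us;
}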