From 6b7d2f5b6ef27a89a0aee245a94d988e9ce8315e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Aug 2015 17:35:07 -0500 Subject: [PATCH 01/55] unshare: Unsharing a thread does not require unsharing a vm commit 12c641ab8270f787dfcce08b5f20ce8b65008096 upstream. In the logic in the initial commit of unshare made creating a new thread group for a process, contingent upon creating a new memory address space for that process. That is wrong. Two separate processes in different thread groups can share a memory address space and clone allows creation of such proceses. This is significant because it was observed that mm_users > 1 does not mean that a process is multi-threaded, as reading /proc/PID/maps temporarily increments mm_users, which allows other processes to (accidentally) interfere with unshare() calls. Correct the check in check_unshare_flags() to test for !thread_group_empty() for CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM. For sighand->count > 1 for CLONE_SIGHAND and CLONE_VM. For !current_is_single_threaded instead of mm_users > 1 for CLONE_VM. By using the correct checks in unshare this removes the possibility of an accidental denial of service attack. Additionally using the correct checks in unshare ensures that only an explicit unshare(CLONE_VM) can possibly trigger the slow path of current_is_single_threaded(). As an explict unshare(CLONE_VM) is pointless it is not expected there are many applications that make that call. Fixes: b2e0d98705e60e45bbb3c0032c48824ad7ae0704 userns: Implement unshare of the user namespace Reported-by: Ricky Zhou Reported-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 514dbc40f98..2358bd4c875 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1760,13 +1760,21 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* - * Not implemented, but pretend it works if there is nothing to - * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND - * needs to unshare vm. + * Not implemented, but pretend it works if there is nothing + * to unshare. Note that unsharing the address space or the + * signal handlers also need to unshare the signal queues (aka + * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { - /* FIXME: get_task_mm() increments ->mm_users */ - if (atomic_read(¤t->mm->mm_users) > 1) + if (!thread_group_empty(current)) + return -EINVAL; + } + if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { + if (atomic_read(¤t->sighand->count) > 1) + return -EINVAL; + } + if (unshare_flags & CLONE_VM) { + if (!current_is_single_threaded()) return -EINVAL; } @@ -1839,16 +1847,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & CLONE_NEWPID) unshare_flags |= CLONE_THREAD; - /* - * If unsharing a thread from a thread group, must also unshare vm. - */ - if (unshare_flags & CLONE_THREAD) - unshare_flags |= CLONE_VM; /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; + /* + * If unsharing a signal handlers, must also unshare the signal queues. + */ + if (unshare_flags & CLONE_SIGHAND) + unshare_flags |= CLONE_THREAD; /* * If unsharing namespace, must also unshare filesystem information. */ From f04fce5fcb1a6de2cf2f80c414ec9f16dbb15f6e Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Wed, 19 Aug 2015 17:33:12 +0200 Subject: [PATCH 02/55] rtlwifi: rtl8192cu: Add new device ID commit 1642d09fb9b128e8e538b2a4179962a34f38dff9 upstream. The v2 of NetGear WNA1000M uses a different idProduct: USB ID 0846:9043 Signed-off-by: Adrien Schildknecht Acked-by: Larry Finger Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rtlwifi/rtl8192cu/sw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c index 7555095e0b7..fa669b52fc9 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c +++ b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c @@ -313,6 +313,7 @@ static struct usb_device_id rtl8192c_usb_ids[] = { {RTL_USB_DEVICE(0x07b8, 0x8188, rtl92cu_hal_cfg)}, /*Abocom - Abocom*/ {RTL_USB_DEVICE(0x07b8, 0x8189, rtl92cu_hal_cfg)}, /*Funai - Abocom*/ {RTL_USB_DEVICE(0x0846, 0x9041, rtl92cu_hal_cfg)}, /*NetGear WNA1000M*/ + {RTL_USB_DEVICE(0x0846, 0x9043, rtl92cu_hal_cfg)}, /*NG WNA1000Mv2*/ {RTL_USB_DEVICE(0x0b05, 0x17ba, rtl92cu_hal_cfg)}, /*ASUS-Edimax*/ {RTL_USB_DEVICE(0x0bda, 0x5088, rtl92cu_hal_cfg)}, /*Thinkware-CC&C*/ {RTL_USB_DEVICE(0x0df6, 0x0052, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/ From f4487c483c26c0374450e0e910a45a7c0e91407d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 1 Sep 2015 18:07:41 +0200 Subject: [PATCH 03/55] tg3: Fix temperature reporting commit d3d11fe08ccc9bff174fc958722b5661f0932486 upstream. The temperature registers appear to report values in degrees Celsius while the hwmon API mandates values to be exposed in millidegrees Celsius. Do the conversion so that the values reported by "sensors" are correct. Fixes: aed93e0bf493 ("tg3: Add hwmon support for temperature") Signed-off-by: Jean Delvare Cc: Prashant Sreedharan Cc: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/tg3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 680d26d6d2c..518cc4b6c7d 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -10518,7 +10518,7 @@ static ssize_t tg3_show_temp(struct device *dev, tg3_ape_scratchpad_read(tp, &temperature, attr->index, sizeof(temperature)); spin_unlock_bh(&tp->lock); - return sprintf(buf, "%u\n", temperature); + return sprintf(buf, "%u\n", temperature * 1000); } From 957c0c65c71536c95d0cd92e9a4583272402567d Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sat, 13 Jun 2015 10:16:31 -0400 Subject: [PATCH 04/55] mac80211: enable assoc check for mesh interfaces commit 3633ebebab2bbe88124388b7620442315c968e8f upstream. We already set a station to be associated when peering completes, both in user space and in the kernel. Thus we should always have an associated sta before sending data frames to that station. Failure to check assoc state can cause crashes in the lower-level driver due to transmitting unicast data frames before driver sta structures (e.g. ampdu state in ath9k) are initialized. This occurred when forwarding in the presence of fixed mesh paths: frames were transmitted to stations with whom we hadn't yet completed peering. Reported-by: Alexis Green Tested-by: Jesse Jones Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/tx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 10eea232602..e960fbe9e27 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -281,9 +281,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; From f828609ff36c1180c06e307d2c51d4ede337f7da Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 18 Aug 2015 20:50:10 +0100 Subject: [PATCH 05/55] arm64: kconfig: Move LIST_POISON to a safe value commit bf0c4e04732479f650ff59d1ee82de761c0071f0 upstream. Move the poison pointer offset to 0xdead000000000000, a recognized value that is not mappable by user-space exploits. Acked-by: Catalin Marinas Signed-off-by: Thierry Strudel Signed-off-by: Jeff Vander Stoep Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0677ff4814f..661ccf87b9a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -56,6 +56,10 @@ config NO_IOPORT config STACKTRACE_SUPPORT def_bool y +config ILLEGAL_POINTER_VALUE + hex + default 0xdead000000000000 + config LOCKDEP_SUPPORT def_bool y From a507adf4f05a41a638cd6cbdfd78149c35cec8db Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2015 12:07:06 +0100 Subject: [PATCH 06/55] arm64: compat: fix vfp save/restore across signal handlers in big-endian commit bdec97a855ef1e239f130f7a11584721c9a1bf04 upstream. When saving/restoring the VFP registers from a compat (AArch32) signal frame, we rely on the compat registers forming a prefix of the native register file and therefore make use of copy_{to,from}_user to transfer between the native fpsimd_state and the compat_vfp_sigframe. Unfortunately, this doesn't work so well in a big-endian environment. Our fpsimd save/restore code operates directly on 128-bit quantities (Q registers) whereas the compat_vfp_sigframe represents the registers as an array of 64-bit (D) registers. The architecture packs the compat D registers into the Q registers, with the least significant bytes holding the lower register. Consequently, we need to swap the 64-bit halves when converting between these two representations on a big-endian machine. This patch replaces the __copy_{to,from}_user invocations in our compat VFP signal handling code with explicit __put_user loops that operate on 64-bit values and swap them accordingly. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/signal32.c | 47 +++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b9564b8d6ba..1e60acc6a4d 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -231,14 +231,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) /* * VFP save/restore code. + * + * We have to be careful with endianness, since the fpsimd context-switch + * code operates on 128-bit (Q) register values whereas the compat ABI + * uses an array of 64-bit (D) registers. Consequently, we need to swap + * the two halves of each Q register when running on a big-endian CPU. */ +union __fpsimd_vreg { + __uint128_t raw; + struct { +#ifdef __AARCH64EB__ + u64 hi; + u64 lo; +#else + u64 lo; + u64 hi; +#endif + }; +}; + static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) { struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr, fpexc; - int err = 0; + int i, err = 0; /* * Save the hardware registers to the fpsimd_state structure. @@ -254,10 +272,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) /* * Now copy the FP registers. Since the registers are packed, * we can copy the prefix we want (V0-V15) as it is. - * FIXME: Won't work if big endian. */ - err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs, - sizeof(frame->ufp.fpregs)); + for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) { + union __fpsimd_vreg vreg = { + .raw = fpsimd->vregs[i >> 1], + }; + + __put_user_error(vreg.lo, &frame->ufp.fpregs[i], err); + __put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err); + } /* Create an AArch32 fpscr from the fpsr and the fpcr. */ fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) | @@ -282,7 +305,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr; - int err = 0; + int i, err = 0; __get_user_error(magic, &frame->magic, err); __get_user_error(size, &frame->size, err); @@ -292,12 +315,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE) return -EINVAL; - /* - * Copy the FP registers into the start of the fpsimd_state. - * FIXME: Won't work if big endian. - */ - err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs, - sizeof(frame->ufp.fpregs)); + /* Copy the FP registers into the start of the fpsimd_state. */ + for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) { + union __fpsimd_vreg vreg; + + __get_user_error(vreg.lo, &frame->ufp.fpregs[i], err); + __get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err); + fpsimd.vregs[i >> 1] = vreg.raw; + } /* Extract the fpsr and the fpcr from the fpscr */ __get_user_error(fpscr, &frame->ufp.fpscr, err); From 8a31f0de7f474c13bcb4312f5c517bc594330f68 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 2 Sep 2015 18:49:28 +0100 Subject: [PATCH 07/55] arm64: head.S: initialise mdcr_el2 in el2_setup commit d10bcd473301888f957ec4b6b12aa3621be78d59 upstream. When entering the kernel at EL2, we fail to initialise the MDCR_EL2 register which controls debug access and PMU capabilities at EL1. This patch ensures that the register is initialised so that all traps are disabled and all the PMU counters are available to the host. When a guest is scheduled, KVM takes care to configure trapping appropriately. Acked-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53dcae49e72..f480e7d6e8b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -184,6 +184,11 @@ ENTRY(el2_setup) msr hstr_el2, xzr // Disable CP15 traps to EL2 #endif + /* EL2 debug */ + mrs x0, pmcr_el0 // Disable debug access traps + ubfx x0, x0, #11, #5 // to EL2 and allow access to + msr mdcr_el2, x0 // all PMU counters from EL1 + /* Stage-2 translation */ msr vttbr_el2, xzr From e8c2bbe98ae6636123043bf7a9bf908635b1c5cc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 13 Aug 2015 18:02:39 +0200 Subject: [PATCH 08/55] ALSA: hda - Enable headphone jack detect on old Fujitsu laptops commit bb148bdeb0ab16fc0ae8009799471e4d7180073b upstream. According to the bug report, FSC Amilo laptops with ALC880 can detect the headphone jack but currently the driver disables it. It's partly intentionally, as non-working jack detect was reported in the past. Let's enable now. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=102501 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d30252e7f3e..23f8241924b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -1137,7 +1137,7 @@ static const struct hda_fixup alc880_fixups[] = { /* override all pins as BIOS on old Amilo is broken */ .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { - { 0x14, 0x0121411f }, /* HP */ + { 0x14, 0x0121401f }, /* HP */ { 0x15, 0x99030120 }, /* speaker */ { 0x16, 0x99030130 }, /* bass speaker */ { 0x17, 0x411111f0 }, /* N/A */ @@ -1157,7 +1157,7 @@ static const struct hda_fixup alc880_fixups[] = { /* almost compatible with FUJITSU, but no bass and SPDIF */ .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { - { 0x14, 0x0121411f }, /* HP */ + { 0x14, 0x0121401f }, /* HP */ { 0x15, 0x99030120 }, /* speaker */ { 0x16, 0x411111f0 }, /* N/A */ { 0x17, 0x411111f0 }, /* N/A */ From 4c9510d519440ca46a4bf8f72ad72f842db22432 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 13 Aug 2015 18:05:06 +0200 Subject: [PATCH 09/55] ALSA: hda - Use ALC880_FIXUP_FUJITSU for FSC Amilo M1437 commit a161574e200ae63a5042120e0d8c36830e81bde3 upstream. It turned out that the machine has a bass speaker, so take a correct fixup entry. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=102501 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 23f8241924b..183a96ab253 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -1365,7 +1365,7 @@ static const struct snd_pci_quirk alc880_fixup_tbl[] = { SND_PCI_QUIRK(0x161f, 0x203d, "W810", ALC880_FIXUP_W810), SND_PCI_QUIRK(0x161f, 0x205d, "Medion Rim 2150", ALC880_FIXUP_MEDION_RIM), SND_PCI_QUIRK(0x1631, 0xe011, "PB 13201056", ALC880_FIXUP_6ST_AUTOMUTE), - SND_PCI_QUIRK(0x1734, 0x107c, "FSC F1734", ALC880_FIXUP_F1734), + SND_PCI_QUIRK(0x1734, 0x107c, "FSC Amilo M1437", ALC880_FIXUP_FUJITSU), SND_PCI_QUIRK(0x1734, 0x1094, "FSC Amilo M1451G", ALC880_FIXUP_FUJITSU), SND_PCI_QUIRK(0x1734, 0x10ac, "FSC AMILO Xi 1526", ALC880_FIXUP_F1734), SND_PCI_QUIRK(0x1734, 0x10b0, "FSC Amilo Pi1556", ALC880_FIXUP_FUJITSU), From 36e5789bc706a67a0281a14b3888b3f5d0f723e0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 7 Aug 2015 16:19:43 +1000 Subject: [PATCH 10/55] powerpc/mm: Fix pte_pagesize_index() crash on 4K w/64K hash commit 74b5037baa2011a2799e2c43adde7d171b072f9e upstream. The powerpc kernel can be built to have either a 4K PAGE_SIZE or a 64K PAGE_SIZE. However when built with a 4K PAGE_SIZE there is an additional config option which can be enabled, PPC_HAS_HASH_64K, which means the kernel also knows how to hash a 64K page even though the base PAGE_SIZE is 4K. This is used in one obscure configuration, to support 64K pages for SPU local store on the Cell processor when the rest of the kernel is using 4K pages. In this configuration, pte_pagesize_index() is defined to just pass through its arguments to get_slice_psize(). However pte_pagesize_index() is called for both user and kernel addresses, whereas get_slice_psize() only knows how to handle user addresses. This has been broken forever, however until recently it happened to work. That was because in get_slice_psize() the large kernel address would cause the right shift of the slice mask to return zero. However in commit 7aa0727f3302 ("powerpc/mm: Increase the slice range to 64TB"), the get_slice_psize() code was changed so that instead of a right shift we do an array lookup based on the address. When passed a kernel address this means we index way off the end of the slice array and return random junk. That is only fatal if we happen to hit something non-zero, but when we do return a non-zero value we confuse the MMU code and eventually cause a check stop. This fix is ugly, but simple. When we're called for a kernel address we return 4K, which is always correct in this configuration, otherwise we use the slice mask. Fixes: 7aa0727f3302 ("powerpc/mm: Increase the slice range to 64TB") Reported-by: Cyril Bur Signed-off-by: Michael Ellerman Reviewed-by: Aneesh Kumar K.V Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/pgtable-ppc64.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f24f..6fbb2b46098 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -130,7 +130,19 @@ #define pte_iterate_hashed_end() } while(0) #ifdef CONFIG_PPC_HAS_HASH_64K -#define pte_pagesize_index(mm, addr, pte) get_slice_psize(mm, addr) +/* + * We expect this to be called only for user addresses or kernel virtual + * addresses other than the linear mapping. + */ +#define pte_pagesize_index(mm, addr, pte) \ + ({ \ + unsigned int psize; \ + if (is_kernel_addr(addr)) \ + psize = MMU_PAGE_4K; \ + else \ + psize = get_slice_psize(mm, addr); \ + psize; \ + }) #else #define pte_pagesize_index(mm, addr, pte) MMU_PAGE_4K #endif From 2ba90c0e0638dba68ffe6e78b135729e41a4540a Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 17 Jul 2015 12:46:58 +0200 Subject: [PATCH 11/55] powerpc/rtas: Introduce rtas_get_sensor_fast() for IRQ handlers commit 1c2cb594441d02815d304cccec9742ff5c707495 upstream. The EPOW interrupt handler uses rtas_get_sensor(), which in turn uses rtas_busy_delay() to wait for RTAS becoming ready in case it is necessary. But rtas_busy_delay() is annotated with might_sleep() and thus may not be used by interrupts handlers like the EPOW handler! This leads to the following BUG when CONFIG_DEBUG_ATOMIC_SLEEP is enabled: BUG: sleeping function called from invalid context at arch/powerpc/kernel/rtas.c:496 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.2.0-rc2-thuth #6 Call Trace: [c00000007ffe7b90] [c000000000807670] dump_stack+0xa0/0xdc (unreliable) [c00000007ffe7bc0] [c0000000000e1f14] ___might_sleep+0x134/0x180 [c00000007ffe7c20] [c00000000002aec0] rtas_busy_delay+0x30/0xd0 [c00000007ffe7c50] [c00000000002bde4] rtas_get_sensor+0x74/0xe0 [c00000007ffe7ce0] [c000000000083264] ras_epow_interrupt+0x44/0x450 [c00000007ffe7d90] [c000000000120260] handle_irq_event_percpu+0xa0/0x300 [c00000007ffe7e70] [c000000000120524] handle_irq_event+0x64/0xc0 [c00000007ffe7eb0] [c000000000124dbc] handle_fasteoi_irq+0xec/0x260 [c00000007ffe7ef0] [c00000000011f4f0] generic_handle_irq+0x50/0x80 [c00000007ffe7f20] [c000000000010f3c] __do_irq+0x8c/0x200 [c00000007ffe7f90] [c0000000000236cc] call_do_irq+0x14/0x24 [c00000007e6f39e0] [c000000000011144] do_IRQ+0x94/0x110 [c00000007e6f3a30] [c000000000002594] hardware_interrupt_common+0x114/0x180 Fix this issue by introducing a new rtas_get_sensor_fast() function that does not use rtas_busy_delay() - and thus can only be used for sensors that do not cause a BUSY condition - known as "fast" sensors. The EPOW sensor is defined to be "fast" in sPAPR - mpe. Fixes: 587f83e8dd50 ("powerpc/pseries: Use rtas_get_sensor in RAS code") Signed-off-by: Thomas Huth Reviewed-by: Nathan Fontenot Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/rtas.h | 1 + arch/powerpc/kernel/rtas.c | 17 +++++++++++++++++ arch/powerpc/platforms/pseries/ras.c | 3 ++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 34fd70488d8..c5d5cb36f6c 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -255,6 +255,7 @@ extern void rtas_power_off(void); extern void rtas_halt(void); extern void rtas_os_term(char *str); extern int rtas_get_sensor(int sensor, int index, int *state); +extern int rtas_get_sensor_fast(int sensor, int index, int *state); extern int rtas_get_power_level(int powerdomain, int *level); extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); extern bool rtas_indicator_present(int token, int *maxindex); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 52add6f3e20..f956a2f84a1 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -584,6 +584,23 @@ int rtas_get_sensor(int sensor, int index, int *state) } EXPORT_SYMBOL(rtas_get_sensor); +int rtas_get_sensor_fast(int sensor, int index, int *state) +{ + int token = rtas_token("get-sensor-state"); + int rc; + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + rc = rtas_call(token, 2, 2, state, sensor, index); + WARN_ON(rc == RTAS_BUSY || (rc >= RTAS_EXTENDED_DELAY_MIN && + rc <= RTAS_EXTENDED_DELAY_MAX)); + + if (rc < 0) + return rtas_error_rc(rc); + return rc; +} + bool rtas_indicator_present(int token, int *maxindex) { int proplen, count, i; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index c4dfccd3a3d..2338e6e9848 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -187,7 +187,8 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) int state; int critical; - status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); + status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, + &state); if (state > 3) critical = 1; /* Time Critical */ From 1d6c45737c391f980df36e9183f81f8b19632d37 Mon Sep 17 00:00:00 2001 From: Jeffery Miller Date: Tue, 1 Sep 2015 11:23:02 -0400 Subject: [PATCH 12/55] Add radeon suspend/resume quirk for HP Compaq dc5750. commit 09bfda10e6efd7b65bcc29237bee1765ed779657 upstream. With the radeon driver loaded the HP Compaq dc5750 Small Form Factor machine fails to resume from suspend. Adding a quirk similar to other devices avoids the problem and the system resumes properly. Signed-off-by: Jeffery Miller Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_combios.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index 8cac6981905..9c64a973190 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -3403,6 +3403,14 @@ void radeon_combios_asic_init(struct drm_device *dev) rdev->pdev->subsystem_device == 0x30ae) return; + /* quirk for rs4xx HP Compaq dc5750 Small Form Factor to make it resume + * - it hangs on resume inside the dynclk 1 table. + */ + if (rdev->family == CHIP_RS480 && + rdev->pdev->subsystem_vendor == 0x103c && + rdev->pdev->subsystem_device == 0x280a) + return; + /* DYN CLK 1 */ table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE); if (table) From 55b9029eab6e1cf44afa35b9823308246a0605d9 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Sun, 12 Jul 2015 20:18:42 +0800 Subject: [PATCH 13/55] x86/mm: Initialize pmd_idx in page_table_range_init_count() commit 9962eea9e55f797f05f20ba6448929cab2a9f018 upstream. The variable pmd_idx is not initialized for the first iteration of the for loop. Assign the proper value which indexes the start address. Fixes: 719272c45b82 'x86, mm: only call early_ioremap_page_table_range_init() once' Signed-off-by: Minfei Huang Cc: tony.luck@intel.com Cc: wangnan0@huawei.com Cc: david.vrabel@citrix.com Reviewed-by: yinghai@kernel.org Link: http://lkml.kernel.org/r/1436703522-29552-1-git-send-email-mhuang@redhat.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ac7e319918..6a70f0ee409 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end) vaddr = start; pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); From 92a6eef0fb73b10dcc9fa3061d3ec2cf604d6d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Tue, 19 May 2015 19:03:12 -0300 Subject: [PATCH 14/55] rc-core: fix remove uevent generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a66b0c41ad277ae62a3ae6ac430a71882f899557 upstream. The input_dev is already gone when the rc device is being unregistered so checking for its presence only means that no remove uevent will be generated. Signed-off-by: David Härdeman Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 1cf382a0b27..cf7bbb6c980 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -943,9 +943,6 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) { struct rc_dev *dev = to_rc_dev(device); - if (!dev || !dev->input_dev) - return -ENODEV; - if (dev->rc_map.name) ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name); if (dev->driver_name) From 9520ac796ef656c8ca8f7b0dfc56651f185d845f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 30 Jul 2015 13:00:56 +1000 Subject: [PATCH 15/55] NFSv4: don't set SETATTR for O_RDONLY|O_EXCL commit efcbc04e16dfa95fef76309f89710dd1d99a5453 upstream. It is unusual to combine the open flags O_RDONLY and O_EXCL, but it appears that libre-office does just that. [pid 3250] stat("/home/USER/.config", {st_mode=S_IFDIR|0700, st_size=8192, ...}) = 0 [pid 3250] open("/home/USER/.config/libreoffice/4-suse/user/extensions/buildid", O_RDONLY|O_EXCL NFSv4 takes O_EXCL as a sign that a setattr command should be sent, probably to reset the timestamps. When it was an O_RDONLY open, the SETATTR command does not identify any actual attributes to change. If no delegation was provided to the open, the SETATTR uses the all-zeros stateid and the request is accepted (at least by the Linux NFS server - no harm, no foul). If a read-delegation was provided, this is used in the SETATTR request, and a Netapp filer will justifiably claim NFS4ERR_BAD_STATEID, which the Linux client takes as a sign to retry - indefinitely. So only treat O_EXCL specially if O_CREAT was also given. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 20ebcfa3c92..78679b48948 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2043,7 +2043,7 @@ static int _nfs4_do_open(struct inode *dir, if (status != 0) goto err_opendata_put; - if ((opendata->o_arg.open_flags & O_EXCL) && + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr); From 690eb5ee31ee70eb31565de1052147e20cc40ff0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Aug 2015 12:57:07 -0500 Subject: [PATCH 16/55] NFS: nfs_set_pgio_error sometimes misses errors commit e9ae58aeee8842a50f7e199d602a5ccb2e41a95f upstream. We should ensure that we always set the pgio_header's error field if a READ or WRITE RPC call returns an error. The current code depends on 'hdr->good_bytes' always being initialised to a large value, which is not always done correctly by callers. When this happens, applications may end up missing important errors. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/pagelist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 29cfb7ade12..d852ca281c1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -60,8 +60,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { spin_lock(&hdr->lock); - if (pos < hdr->io_start + hdr->good_bytes) { - set_bit(NFS_IOHDR_ERROR, &hdr->flags); + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) + || pos < hdr->io_start + hdr->good_bytes) { clear_bit(NFS_IOHDR_EOF, &hdr->flags); hdr->good_bytes = pos - hdr->io_start; hdr->error = error; From 706ad8dcb5a2db85e6c2b2da30449c9a9918fa9d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 3 Sep 2015 22:45:21 +0200 Subject: [PATCH 17/55] parisc: Filter out spurious interrupts in PA-RISC irq handler commit b1b4e435e4ef7de77f07bf2a42c8380b960c2d44 upstream. When detecting a serial port on newer PA-RISC machines (with iosapic) we have a long way to go to find the right IRQ line, registering it, then registering the serial port and the irq handler for the serial port. During this phase spurious interrupts for the serial port may happen which then crashes the kernel because the action handler might not have been set up yet. So, basically it's a race condition between the serial port hardware and the CPU which sets up the necessary fields in the irq sructs. The main reason for this race is, that we unmask the serial port irqs too early without having set up everything properly before (which isn't easily possible because we need the IRQ number to register the serial ports). This patch is a work-around for this problem. It adds checks to the CPU irq handler to verify if the IRQ action field has been initialized already. If not, we just skip this interrupt (which isn't critical for a serial port at bootup). The real fix would probably involve rewriting all PA-RISC specific IRQ code (for CPU, IOSAPIC, GSC and EISA) to use IRQ domains with proper parenting of the irq chips and proper irq enabling along this line. This bug has been in the PA-RISC port since the beginning, but the crashes happened very rarely with currently used hardware. But on the latest machine which I bought (a C8000 workstation), which uses the fastest CPUs (4 x PA8900, 1GHz) and which has the largest possible L1 cache size (64MB each), the kernel crashed at every boot because of this race. So, without this patch the machine would currently be unuseable. For the record, here is the flow logic: 1. serial_init_chip() in 8250_gsc.c calls iosapic_serial_irq(). 2. iosapic_serial_irq() calls txn_alloc_irq() to find the irq. 3. iosapic_serial_irq() calls cpu_claim_irq() to register the CPU irq 4. cpu_claim_irq() unmasks the CPU irq (which it shouldn't!) 5. serial_init_chip() then registers the 8250 port. Problems: - In step 4 the CPU irq shouldn't have been registered yet, but after step 5 - If serial irq happens between 4 and 5 have finished, the kernel will crash Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/irq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 2e6443b1e92..c32a37e0e0d 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -524,8 +524,8 @@ void do_cpu_irq_mask(struct pt_regs *regs) struct pt_regs *old_regs; unsigned long eirr_val; int irq, cpu = smp_processor_id(); -#ifdef CONFIG_SMP struct irq_desc *desc; +#ifdef CONFIG_SMP cpumask_t dest; #endif @@ -538,8 +538,12 @@ void do_cpu_irq_mask(struct pt_regs *regs) goto set_out; irq = eirr_to_irq(eirr_val); -#ifdef CONFIG_SMP + /* Filter out spurious interrupts, mostly from serial port at bootup */ desc = irq_to_desc(irq); + if (unlikely(!desc->action)) + goto set_out; + +#ifdef CONFIG_SMP cpumask_copy(&dest, desc->irq_data.affinity); if (irqd_is_per_cpu(&desc->irq_data) && !cpu_isset(smp_processor_id(), dest)) { From de047ce49582e2fc3303efc58b40f4dbe7a4519f Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Tue, 8 Sep 2015 15:02:21 -0700 Subject: [PATCH 18/55] vmscan: fix increasing nr_isolated incurred by putback unevictable pages commit c54839a722a02818677bcabe57e957f0ce4f841d upstream. reclaim_clean_pages_from_list() assumes that shrink_page_list() returns number of pages removed from the candidate list. But shrink_page_list() puts back mlocked pages without passing it to caller and without counting as nr_reclaimed. This increases nr_isolated. To fix this, this patch changes shrink_page_list() to pass unevictable pages back to caller. Caller will take care those pages. Minchan said: It fixes two issues. 1. With unevictable page, cma_alloc will be successful. Exactly speaking, cma_alloc of current kernel will fail due to unevictable pages. 2. fix leaking of NR_ISOLATED counter of vmstat With it, too_many_isolated works. Otherwise, it could make hang until the process get SIGKILL. Signed-off-by: Jaewon Kim Acked-by: Minchan Kim Cc: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 233f0011f76..a1e3becef05 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -925,7 +925,7 @@ cull_mlocked: if (PageSwapCache(page)) try_to_free_swap(page); unlock_page(page); - putback_lru_page(page); + list_add(&page->lru, &ret_pages); continue; activate_locked: From 9bdee2f90804f2c5865b1c4a75d3208ef28b9007 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 9 Sep 2015 15:38:28 -0700 Subject: [PATCH 19/55] fs: if a coredump already exists, unlink and recreate with O_EXCL commit fbb1816942c04429e85dbf4c1a080accc534299e upstream. It was possible for an attacking user to trick root (or another user) into writing his coredumps into an attacker-readable, pre-existing file using rename() or link(), causing the disclosure of secret data from the victim process' virtual memory. Depending on the configuration, it was also possible to trick root into overwriting system files with coredumps. Fix that issue by never writing coredumps into existing files. Requirements for the attack: - The attack only applies if the victim's process has a nonzero RLIMIT_CORE and is dumpable. - The attacker can trick the victim into coredumping into an attacker-writable directory D, either because the core_pattern is relative and the victim's cwd is attacker-writable or because an absolute core_pattern pointing to a world-writable directory is used. - The attacker has one of these: A: on a system with protected_hardlinks=0: execute access to a folder containing a victim-owned, attacker-readable file on the same partition as D, and the victim-owned file will be deleted before the main part of the attack takes place. (In practice, there are lots of files that fulfill this condition, e.g. entries in Debian's /var/lib/dpkg/info/.) This does not apply to most Linux systems because most distros set protected_hardlinks=1. B: on a system with protected_hardlinks=1: execute access to a folder containing a victim-owned, attacker-readable and attacker-writable file on the same partition as D, and the victim-owned file will be deleted before the main part of the attack takes place. (This seems to be uncommon.) C: on any system, independent of protected_hardlinks: write access to a non-sticky folder containing a victim-owned, attacker-readable file on the same partition as D (This seems to be uncommon.) The basic idea is that the attacker moves the victim-owned file to where he expects the victim process to dump its core. The victim process dumps its core into the existing file, and the attacker reads the coredump from it. If the attacker can't move the file because he does not have write access to the containing directory, he can instead link the file to a directory he controls, then wait for the original link to the file to be deleted (because the kernel checks that the link count of the corefile is 1). A less reliable variant that requires D to be non-sticky works with link() and does not require deletion of the original link: link() the file into D, but then unlink() it directly before the kernel performs the link count check. On systems with protected_hardlinks=0, this variant allows an attacker to not only gain information from coredumps, but also clobber existing, victim-writable files with coredumps. (This could theoretically lead to a privilege escalation.) Signed-off-by: Jann Horn Cc: Kees Cook Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 1d402ce5b72..4f03b2b5037 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -491,10 +491,10 @@ void do_coredump(siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - int flag = 0; int ispipe; struct files_struct *displaced; - bool need_nonrelative = false; + /* require nonrelative corefile path and be extra careful */ + bool need_suid_safe = false; bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { @@ -528,9 +528,8 @@ void do_coredump(siginfo_t *siginfo) */ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ - flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_nonrelative = true; + need_suid_safe = true; } retval = coredump_wait(siginfo->si_signo, &core_state); @@ -611,7 +610,7 @@ void do_coredump(siginfo_t *siginfo) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - if (need_nonrelative && cn.corename[0] != '/') { + if (need_suid_safe && cn.corename[0] != '/') { printk(KERN_WARNING "Pid %d(%s) can only dump core "\ "to fully qualified path!\n", task_tgid_vnr(current), current->comm); @@ -619,8 +618,35 @@ void do_coredump(siginfo_t *siginfo) goto fail_unlock; } + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!need_suid_safe) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + (void) sys_unlink((const char __user *)cn.corename); + set_fs(old_fs); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + O_CREAT | 2 | O_NOFOLLOW | + O_LARGEFILE | O_EXCL, 0600); if (IS_ERR(cprm.file)) goto fail_unlock; From ab7a4b4b9d31e0458a5f327d1da66d649d814066 Mon Sep 17 00:00:00 2001 From: Jialing Fu Date: Fri, 28 Aug 2015 11:13:09 +0800 Subject: [PATCH 20/55] mmc: core: fix race condition in mmc_wait_data_done commit 71f8a4b81d040b3d094424197ca2f1bf811b1245 upstream. The following panic is captured in ker3.14, but the issue still exists in latest kernel. --------------------------------------------------------------------- [ 20.738217] c0 3136 (Compiler) Unable to handle kernel NULL pointer dereference at virtual address 00000578 ...... [ 20.738499] c0 3136 (Compiler) PC is at _raw_spin_lock_irqsave+0x24/0x60 [ 20.738527] c0 3136 (Compiler) LR is at _raw_spin_lock_irqsave+0x20/0x60 [ 20.740134] c0 3136 (Compiler) Call trace: [ 20.740165] c0 3136 (Compiler) [] _raw_spin_lock_irqsave+0x24/0x60 [ 20.740200] c0 3136 (Compiler) [] __wake_up+0x1c/0x54 [ 20.740230] c0 3136 (Compiler) [] mmc_wait_data_done+0x28/0x34 [ 20.740262] c0 3136 (Compiler) [] mmc_request_done+0xa4/0x220 [ 20.740314] c0 3136 (Compiler) [] sdhci_tasklet_finish+0xac/0x264 [ 20.740352] c0 3136 (Compiler) [] tasklet_action+0xa0/0x158 [ 20.740382] c0 3136 (Compiler) [] __do_softirq+0x10c/0x2e4 [ 20.740411] c0 3136 (Compiler) [] irq_exit+0x8c/0xc0 [ 20.740439] c0 3136 (Compiler) [] handle_IRQ+0x48/0xac [ 20.740469] c0 3136 (Compiler) [] gic_handle_irq+0x38/0x7c ---------------------------------------------------------------------- Because in SMP, "mrq" has race condition between below two paths: path1: CPU0: static void mmc_wait_data_done(struct mmc_request *mrq) { mrq->host->context_info.is_done_rcv = true; // // If CPU0 has just finished "is_done_rcv = true" in path1, and at // this moment, IRQ or ICache line missing happens in CPU0. // What happens in CPU1 (path2)? // // If the mmcqd thread in CPU1(path2) hasn't entered to sleep mode: // path2 would have chance to break from wait_event_interruptible // in mmc_wait_for_data_req_done and continue to run for next // mmc_request (mmc_blk_rw_rq_prep). // // Within mmc_blk_rq_prep, mrq is cleared to 0. // If below line still gets host from "mrq" as the result of // compiler, the panic happens as we traced. wake_up_interruptible(&mrq->host->context_info.wait); } path2: CPU1: static int mmc_wait_for_data_req_done(... { ... while (1) { wait_event_interruptible(context_info->wait, (context_info->is_done_rcv || context_info->is_new_req)); static void mmc_blk_rw_rq_prep(... { ... memset(brq, 0, sizeof(struct mmc_blk_request)); This issue happens very coincidentally; however adding mdelay(1) in mmc_wait_data_done as below could duplicate it easily. static void mmc_wait_data_done(struct mmc_request *mrq) { mrq->host->context_info.is_done_rcv = true; + mdelay(1); wake_up_interruptible(&mrq->host->context_info.wait); } At runtime, IRQ or ICache line missing may just happen at the same place of the mdelay(1). This patch gets the mmc_context_info at the beginning of function, it can avoid this race condition. Signed-off-by: Jialing Fu Tested-by: Shawn Lin Fixes: 2220eedfd7ae ("mmc: fix async request mechanism ....") Signed-off-by: Shawn Lin Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index c40396f2320..68ab26385d0 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -327,8 +327,10 @@ EXPORT_SYMBOL(mmc_start_bkops); */ static void mmc_wait_data_done(struct mmc_request *mrq) { - mrq->host->context_info.is_done_rcv = true; - wake_up_interruptible(&mrq->host->context_info.wait); + struct mmc_context_info *context_info = &mrq->host->context_info; + + context_info->is_done_rcv = true; + wake_up_interruptible(&context_info->wait); } static void mmc_wait_done(struct mmc_request *mrq) From d3e972d5e77997cf0944d2c91162a9e893264c92 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Jul 2015 17:37:49 +1000 Subject: [PATCH 21/55] md/raid10: always set reshape_safe when initializing reshape_position. commit 299b0685e31c9f3dcc2d58ee3beca761a40b44b3 upstream. 'reshape_position' tracks where in the reshape we have reached. 'reshape_safe' tracks where in the reshape we have safely recorded in the metadata. These are compared to determine when to update the metadata. So it is important that reshape_safe is initialised properly. Currently it isn't. When starting a reshape from the beginning it usually has the correct value by luck. But when reducing the number of devices in a RAID10, it has the wrong value and this leads to the metadata not being updated correctly. This can lead to corruption if the reshape is not allowed to complete. This patch is suitable for any -stable kernel which supports RAID10 reshape, which is 3.5 and later. Fixes: 3ea7daa5d7fd ("md/raid10: add reshape support") Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid10.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a1ea2a75391..5b2a1eaea34 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3578,6 +3578,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* far_copies must be 1 */ conf->prev.stride = conf->dev_sectors; } + conf->reshape_safe = conf->reshape_progress; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -3785,7 +3786,6 @@ static int run(struct mddev *mddev) } conf->offset_diff = min_offset_diff; - conf->reshape_safe = conf->reshape_progress; clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -4130,6 +4130,7 @@ static int raid10_start_reshape(struct mddev *mddev) conf->reshape_progress = size; } else conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { @@ -4196,6 +4197,7 @@ abort: rdev->new_data_offset = rdev->data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); return ret; @@ -4543,6 +4545,7 @@ static void end_reshape(struct r10conf *conf) md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); /* read-ahead size must cover two whole stripes, which is From f8cb6399b16470397e7870e272ee7f3b03ed76ef Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 9 Jan 2015 18:06:12 +0000 Subject: [PATCH 22/55] xen/gntdev: convert priv->lock to a mutex commit 1401c00e59ea021c575f74612fe2dbba36d6a4ee upstream. Unmapping may require sleeping and we unmap while holding priv->lock, so convert it to a mutex. Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini Cc: Ian Campbell Signed-off-by: Greg Kroah-Hartman --- drivers/xen/gntdev.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 474d11499d0..e68205cbc46 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -65,7 +65,7 @@ struct gntdev_priv { * Only populated if populate_freeable_maps == 1 */ struct list_head freeable_maps; /* lock protects maps and freeable_maps */ - spinlock_t lock; + struct mutex lock; struct mm_struct *mm; struct mmu_notifier mn; }; @@ -214,9 +214,9 @@ static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) } if (populate_freeable_maps && priv) { - spin_lock(&priv->lock); + mutex_lock(&priv->lock); list_del(&map->next); - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); } if (map->pages && !use_ptemod) @@ -392,9 +392,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma) * not do any unmapping, since that has been done prior to * closing the vma, but it may still iterate the unmap_ops list. */ - spin_lock(&priv->lock); + mutex_lock(&priv->lock); map->vma = NULL; - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); } vma->vm_private_data = NULL; gntdev_put_map(priv, map); @@ -438,14 +438,14 @@ static void mn_invl_range_start(struct mmu_notifier *mn, struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - spin_lock(&priv->lock); + mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { unmap_if_in_range(map, start, end); } list_for_each_entry(map, &priv->freeable_maps, next) { unmap_if_in_range(map, start, end); } - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); } static void mn_invl_page(struct mmu_notifier *mn, @@ -462,7 +462,7 @@ static void mn_release(struct mmu_notifier *mn, struct grant_map *map; int err; - spin_lock(&priv->lock); + mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { if (!map->vma) continue; @@ -481,7 +481,7 @@ static void mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); } static struct mmu_notifier_ops gntdev_mmu_ops = { @@ -503,7 +503,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) INIT_LIST_HEAD(&priv->maps); INIT_LIST_HEAD(&priv->freeable_maps); - spin_lock_init(&priv->lock); + mutex_init(&priv->lock); if (use_ptemod) { priv->mm = get_task_mm(current); @@ -579,10 +579,10 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, return -EFAULT; } - spin_lock(&priv->lock); + mutex_lock(&priv->lock); gntdev_add_map(priv, map); op.index = map->index << PAGE_SHIFT; - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); if (copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; @@ -601,7 +601,7 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, return -EFAULT; pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); - spin_lock(&priv->lock); + mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); @@ -609,7 +609,7 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, list_add_tail(&map->next, &priv->freeable_maps); err = 0; } - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); if (map) gntdev_put_map(priv, map); return err; @@ -677,7 +677,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) out_flags = op.action; out_event = op.event_channel_port; - spin_lock(&priv->lock); + mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { uint64_t begin = map->index << PAGE_SHIFT; @@ -705,7 +705,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) rc = 0; unlock_out: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); /* Drop the reference to the event channel we did not save in the map */ if (out_flags & UNMAP_NOTIFY_SEND_EVENT) @@ -755,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) pr_debug("map %d+%d at %lx (pgoff %lx)\n", index, count, vma->vm_start, vma->vm_pgoff); - spin_lock(&priv->lock); + mutex_lock(&priv->lock); map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; @@ -790,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map->flags |= GNTMAP_readonly; } - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); if (use_ptemod) { err = apply_to_page_range(vma->vm_mm, vma->vm_start, @@ -818,11 +818,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) return 0; unlock_out: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); return err; out_unlock_put: - spin_unlock(&priv->lock); + mutex_unlock(&priv->lock); out_put_map: if (use_ptemod) map->vma = NULL; From 431152b6b341eefac26cef3d812ae52bc24df566 Mon Sep 17 00:00:00 2001 From: Hin-Tak Leung Date: Wed, 9 Sep 2015 15:38:07 -0700 Subject: [PATCH 23/55] hfs: fix B-tree corruption after insertion at position 0 commit b4cc0efea4f0bfa2477c56af406cfcf3d3e58680 upstream. Fix B-tree corruption when a new record is inserted at position 0 in the node in hfs_brec_insert(). This is an identical change to the corresponding hfs b-tree code to Sergei Antonov's "hfsplus: fix B-tree corruption after insertion at position 0", to keep similar code paths in the hfs and hfsplus drivers in sync, where appropriate. Signed-off-by: Hin-Tak Leung Cc: Sergei Antonov Cc: Joe Perches Reviewed-by: Vyacheslav Dubeyko Cc: Anton Altaparmakov Cc: Al Viro Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfs/brec.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9f4ee7f5202..6fc766df046 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -166,9 +169,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; From 939f8043048f59a4d77db1f5eeed1526a74a1503 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Aug 2015 11:00:37 +0200 Subject: [PATCH 24/55] IB/uverbs: reject invalid or unknown opcodes commit b632ffa7cee439ba5dce3b3bc4a5cbe2b3e20133 upstream. We have many WR opcodes that are only supported in kernel space and/or require optional information to be copied into the WR structure. Reject all those not explicitly handled so that we can't pass invalid information to drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/uverbs_cmd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a7d00f6b3bc..44c15cebd43 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2106,6 +2106,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->send_flags = user_wr->send_flags; if (is_ud) { + if (next->opcode != IB_WR_SEND && + next->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); if (!next->wr.ud.ah) { @@ -2142,9 +2148,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->wr.atomic.compare_add; next->wr.atomic.swap = user_wr->wr.atomic.swap; next->wr.atomic.rkey = user_wr->wr.atomic.rkey; + case IB_WR_SEND: break; default: - break; + ret = -EINVAL; + goto out_put; } } From caf233503f3ce58ebe564c5d11aa53f2344a1053 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 13 Aug 2015 18:32:03 +0300 Subject: [PATCH 25/55] IB/uverbs: Fix race between ib_uverbs_open and remove_one commit 35d4a0b63dc0c6d1177d4f532a9deae958f0662c upstream. Fixes: 2a72f212263701b927559f6850446421d5906c41 ("IB/uverbs: Remove dev_table") Before this commit there was a device look-up table that was protected by a spin_lock used by ib_uverbs_open and by ib_uverbs_remove_one. When it was dropped and container_of was used instead, it enabled the race with remove_one as dev might be freed just after: dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev) but before the kref_get. In addition, this buggy patch added some dead code as container_of(x,y,z) can never be NULL and so dev can never be NULL. As a result the comment above ib_uverbs_open saying "the open method will either immediately run -ENXIO" is wrong as it can never happen. The solution follows Jason Gunthorpe suggestion from below URL: https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg25692.html cdev will hold a kref on the parent (the containing structure, ib_uverbs_device) and only when that kref is released it is guaranteed that open will never be called again. In addition, fixes the active count scheme to use an atomic not a kref to prevent WARN_ON as pointed by above comment from Jason. Signed-off-by: Yishai Hadas Signed-off-by: Shachar Raindel Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/uverbs.h | 3 +- drivers/infiniband/core/uverbs_main.c | 43 +++++++++++++++++++-------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 0fcd7aa26fa..8b8de21bfdc 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -69,7 +69,7 @@ */ struct ib_uverbs_device { - struct kref ref; + atomic_t refcount; int num_comp_vectors; struct completion comp; struct device *dev; @@ -78,6 +78,7 @@ struct ib_uverbs_device { struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct kobject kobj; }; struct ib_uverbs_event_file { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 949b3863349..b6062b9236a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -119,14 +119,18 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); -static void ib_uverbs_release_dev(struct kref *ref) +static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = - container_of(ref, struct ib_uverbs_device, ref); + container_of(kobj, struct ib_uverbs_device, kobj); - complete(&dev->comp); + kfree(dev); } +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + static void ib_uverbs_release_event_file(struct kref *ref) { struct ib_uverbs_event_file *file = @@ -282,13 +286,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, return context->device->dealloc_ucontext(context); } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); module_put(file->device->ib_dev->owner); - kref_put(&file->device->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); kfree(file); } @@ -629,9 +639,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) int ret; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); - if (dev) - kref_get(&dev->ref); - else + if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; if (!try_module_get(dev->ib_dev->owner)) { @@ -652,6 +660,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->mutex); filp->private_data = file; + kobject_get(&dev->kobj); return nonseekable_open(inode, filp); @@ -659,13 +668,16 @@ err_module: module_put(dev->ib_dev->owner); err: - kref_put(&dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + return ret; } static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_device *dev = file->device; ib_uverbs_cleanup_ucontext(file, file->ucontext); @@ -673,6 +685,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); kref_put(&file->ref, ib_uverbs_release_file); + kobject_put(&dev->kobj); return 0; } @@ -768,10 +781,11 @@ static void ib_uverbs_add_one(struct ib_device *device) if (!uverbs_dev) return; - kref_init(&uverbs_dev->ref); + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -798,6 +812,7 @@ static void ib_uverbs_add_one(struct ib_device *device) cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; @@ -828,9 +843,10 @@ err_cdev: clear_bit(devnum, overflow_map); err: - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); return; } @@ -850,9 +866,10 @@ static void ib_uverbs_remove_one(struct ib_device *device) else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); } static char *uverbs_devnode(struct device *dev, umode_t *mode) From a6d452e0f3d91f697b90650f21fb540c13d55b44 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 30 Jul 2015 17:34:23 +0300 Subject: [PATCH 26/55] IB/mlx4: Forbid using sysfs to change RoCE pkeys commit 2b135db3e81301d0452e6aa107349abe67b097d6 upstream. The pkey mapping for RoCE must remain the default mapping: VFs: virtual index 0 = mapped to real index 0 (0xFFFF) All others indices: mapped to a real pkey index containing an invalid pkey. PF: virtual index i = real index i. Don't allow users to change these mappings using files found in sysfs. Fixes: c1e7e466120b ('IB/mlx4: Add iov directory in sysfs under the ib device') Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/sysfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index 97516eb363b..c5ce4082fdc 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -563,6 +563,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) struct mlx4_port *p; int i; int ret; + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) == + IB_LINK_LAYER_ETHERNET; p = kzalloc(sizeof *p, GFP_KERNEL); if (!p) @@ -580,7 +582,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) p->pkey_group.name = "pkey_idx"; p->pkey_group.attrs = - alloc_group_attrs(show_port_pkey, store_port_pkey, + alloc_group_attrs(show_port_pkey, + is_eth ? NULL : store_port_pkey, dev->dev->caps.pkey_table_len[port_num]); if (!p->pkey_group.attrs) goto err_alloc; From 2698f5747a861bc1ecfb35b08ffa3ffabc91a77f Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Thu, 30 Jul 2015 17:34:24 +0300 Subject: [PATCH 27/55] IB/mlx4: Use correct SL on AH query under RoCE commit 5e99b139f1b68acd65e36515ca347b03856dfb5a upstream. The mlx4 IB driver implementation for ib_query_ah used a wrong offset (28 instead of 29) when link type is Ethernet. Fixed to use the correct one. Fixes: fa417f7b520e ('IB/mlx4: Add support for IBoE') Signed-off-by: Shani Michaeli Signed-off-by: Noa Osherovich Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/ah.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becdaa9..890c23b3d71 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -169,9 +169,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) enum rdma_link_layer ll; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + if (ll == IB_LINK_LAYER_ETHERNET) + ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29; + else + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; if (ah->av.ib.stat_rate) ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; From 61cabc7d549fde1afddac0efbbf03a5c63161b33 Mon Sep 17 00:00:00 2001 From: Hin-Tak Leung Date: Wed, 9 Sep 2015 15:38:04 -0700 Subject: [PATCH 28/55] hfs,hfsplus: cache pages correctly between bnode_create and bnode_free commit 7cb74be6fd827e314f81df3c5889b87e4c87c569 upstream. Pages looked up by __hfs_bnode_create() (called by hfs_bnode_create() and hfs_bnode_find() for finding or creating pages corresponding to an inode) are immediately kmap()'ed and used (both read and write) and kunmap()'ed, and should not be page_cache_release()'ed until hfs_bnode_free(). This patch fixes a problem I first saw in July 2012: merely running "du" on a large hfsplus-mounted directory a few times on a reasonably loaded system would get the hfsplus driver all confused and complaining about B-tree inconsistencies, and generates a "BUG: Bad page state". Most recently, I can generate this problem on up-to-date Fedora 22 with shipped kernel 4.0.5, by running "du /" (="/" + "/home" + "/mnt" + other smaller mounts) and "du /mnt" simultaneously on two windows, where /mnt is a lightly-used QEMU VM image of the full Mac OS X 10.9: $ df -i / /home /mnt Filesystem Inodes IUsed IFree IUse% Mounted on /dev/mapper/fedora-root 3276800 551665 2725135 17% / /dev/mapper/fedora-home 52879360 716221 52163139 2% /home /dev/nbd0p2 4294967295 1387818 4293579477 1% /mnt After applying the patch, I was able to run "du /" (60+ times) and "du /mnt" (150+ times) continuously and simultaneously for 6+ hours. There are many reports of the hfsplus driver getting confused under load and generating "BUG: Bad page state" or other similar issues over the years. [1] The unpatched code [2] has always been wrong since it entered the kernel tree. The only reason why it gets away with it is that the kmap/memcpy/kunmap follow very quickly after the page_cache_release() so the kernel has not had a chance to reuse the memory for something else, most of the time. The current RW driver appears to have followed the design and development of the earlier read-only hfsplus driver [3], where-by version 0.1 (Dec 2001) had a B-tree node-centric approach to read_cache_page()/page_cache_release() per bnode_get()/bnode_put(), migrating towards version 0.2 (June 2002) of caching and releasing pages per inode extents. When the current RW code first entered the kernel [2] in 2005, there was an REF_PAGES conditional (and "//" commented out code) to switch between B-node centric paging to inode-centric paging. There was a mistake with the direction of one of the REF_PAGES conditionals in __hfs_bnode_create(). In a subsequent "remove debug code" commit [4], the read_cache_page()/page_cache_release() per bnode_get()/bnode_put() were removed, but a page_cache_release() was mistakenly left in (propagating the "REF_PAGES <-> !REF_PAGE" mistake), and the commented-out page_cache_release() in bnode_release() (which should be spanned by !REF_PAGES) was never enabled. References: [1]: Michael Fox, Apr 2013 http://www.spinics.net/lists/linux-fsdevel/msg63807.html ("hfsplus volume suddenly inaccessable after 'hfs: recoff %d too large'") Sasha Levin, Feb 2015 http://lkml.org/lkml/2015/2/20/85 ("use after free") https://bugs.launchpad.net/ubuntu/+source/linux/+bug/740814 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1027887 https://bugzilla.kernel.org/show_bug.cgi?id=42342 https://bugzilla.kernel.org/show_bug.cgi?id=63841 https://bugzilla.kernel.org/show_bug.cgi?id=78761 [2]: http://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/commit/\ fs/hfs/bnode.c?id=d1081202f1d0ee35ab0beb490da4b65d4bc763db commit d1081202f1d0ee35ab0beb490da4b65d4bc763db Author: Andrew Morton Date: Wed Feb 25 16:17:36 2004 -0800 [PATCH] HFS rewrite http://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/commit/\ fs/hfsplus/bnode.c?id=91556682e0bf004d98a529bf829d339abb98bbbd commit 91556682e0bf004d98a529bf829d339abb98bbbd Author: Andrew Morton Date: Wed Feb 25 16:17:48 2004 -0800 [PATCH] HFS+ support [3]: http://sourceforge.net/projects/linux-hfsplus/ http://sourceforge.net/projects/linux-hfsplus/files/Linux%202.4.x%20patch/hfsplus%200.1/ http://sourceforge.net/projects/linux-hfsplus/files/Linux%202.4.x%20patch/hfsplus%200.2/ http://linux-hfsplus.cvs.sourceforge.net/viewvc/linux-hfsplus/linux/\ fs/hfsplus/bnode.c?r1=1.4&r2=1.5 Date: Thu Jun 6 09:45:14 2002 +0000 Use buffer cache instead of page cache in bnode.c. Cache inode extents. [4]: http://git.kernel.org/cgit/linux/kernel/git/\ stable/linux-stable.git/commit/?id=a5e3985fa014029eb6795664c704953720cc7f7d commit a5e3985fa014029eb6795664c704953720cc7f7d Author: Roman Zippel Date: Tue Sep 6 15:18:47 2005 -0700 [PATCH] hfs: remove debug code Signed-off-by: Hin-Tak Leung Signed-off-by: Sergei Antonov Reviewed-by: Anton Altaparmakov Reported-by: Sasha Levin Cc: Al Viro Cc: Christoph Hellwig Cc: Vyacheslav Dubeyko Cc: Sougata Santra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfs/bnode.c | 9 ++++----- fs/hfsplus/bnode.c | 3 --- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index d3fa6bd9503..221719eac5d 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -398,11 +397,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 11c86020452..bedfe5f7d33 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -456,7 +456,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -568,13 +567,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } From 7bf24986e3c2e4b818be4a6172aebb3784c6bcda Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 12 Jun 2015 10:16:41 -0300 Subject: [PATCH 29/55] sctp: fix ASCONF list handling commit 2d45a02d0166caf2627fe91897c6ffc3b19514c4 upstream. ->auto_asconf_splist is per namespace and mangled by functions like sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization. Also, the call to inet_sk_copy_descendant() was backuping ->auto_asconf_list through the copy but was not honoring ->do_auto_asconf, which could lead to list corruption if it was different between both sockets. This commit thus fixes the list handling by using ->addr_wq_lock spinlock to protect the list. A special handling is done upon socket creation and destruction for that. Error handlig on sctp_init_sock() will never return an error after having initialized asconf, so sctp_destroy_sock() can be called without addrq_wq_lock. The lock now will be take on sctp_close_sock(), before locking the socket, so we don't do it in inverse order compared to sctp_addr_wq_timeout_handler(). Instead of taking the lock on sctp_sock_migrate() for copying and restoring the list values, it's preferred to avoid rewritting it by implementing sctp_copy_descendant(). Issue was found with a test application that kept flipping sysctl default_auto_asconf on and off, but one could trigger it by issuing simultaneous setsockopt() calls on multiple sockets or by creating/destroying sockets fast enough. This is only triggerable locally. Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).") Reported-by: Ji Jianwen Suggested-by: Neil Horman Suggested-by: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller [wangkai: backport to 3.10: adjust context] Signed-off-by: Wang Kai Signed-off-by: Greg Kroah-Hartman --- include/net/netns/sctp.h | 1 + include/net/sctp/structs.h | 4 ++++ net/sctp/socket.c | 43 ++++++++++++++++++++++++++++---------- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 3573a81815a..8ba379f9e46 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -31,6 +31,7 @@ struct netns_sctp { struct list_head addr_waitq; struct timer_list addr_wq_timer; struct list_head auto_asconf_splist; + /* Lock that protects both addr_waitq and auto_asconf_splist */ spinlock_t addr_wq_lock; /* Lock that protects the local_addr_list writers */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index da6b9a01ff7..b30c1d95be2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -228,6 +228,10 @@ struct sctp_sock { atomic_t pd_mode; /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; + + /* These must be the last fields, as they will skipped on copies, + * like on accept and peeloff operations + */ struct list_head auto_asconf_list; int do_auto_asconf; }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index dfb9b133e66..ec5766dc394 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1548,8 +1548,10 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout) /* Supposedly, no process has access to the socket, but * the net layers still may. + * Also, sctp_destroy_sock() needs to be called with addr_wq_lock + * held and that should be grabbed before socket lock. */ - sctp_local_bh_disable(); + spin_lock_bh(&net->sctp.addr_wq_lock); sctp_bh_lock_sock(sk); /* Hold the sock, since sk_common_release() will put sock_put() @@ -1559,7 +1561,7 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout) sk_common_release(sk); sctp_bh_unlock_sock(sk); - sctp_local_bh_enable(); + spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); @@ -3508,6 +3510,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf)) return 0; + spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); if (val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; @@ -3516,6 +3519,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; } + spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); return 0; } @@ -4007,18 +4011,28 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) local_bh_disable(); percpu_counter_inc(&sctp_sockets_allocated); sock_prot_inuse_add(net, sk->sk_prot, 1); + + /* Nothing can fail after this block, otherwise + * sctp_destroy_sock() will be called without addr_wq_lock held + */ if (net->sctp.default_auto_asconf) { + spin_lock(&sock_net(sk)->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; - } else + spin_unlock(&sock_net(sk)->sctp.addr_wq_lock); + } else { sp->do_auto_asconf = 0; + } + local_bh_enable(); return 0; } -/* Cleanup any SCTP per socket resources. */ +/* Cleanup any SCTP per socket resources. Must be called with + * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true + */ SCTP_STATIC void sctp_destroy_sock(struct sock *sk) { struct sctp_sock *sp; @@ -6957,6 +6971,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_list = NULL; } +static inline void sctp_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + int ancestor_size = sizeof(struct inet_sock) + + sizeof(struct sctp_sock) - + offsetof(struct sctp_sock, auto_asconf_list); + + if (sk_from->sk_family == PF_INET6) + ancestor_size += sizeof(struct ipv6_pinfo); + + __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); +} + /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ @@ -6971,7 +6998,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; struct sctp_bind_hashbucket *head; - struct list_head tmplist; /* Migrate socket buffer sizes and all the socket level options to the * new socket. @@ -6979,12 +7005,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, newsk->sk_sndbuf = oldsk->sk_sndbuf; newsk->sk_rcvbuf = oldsk->sk_rcvbuf; /* Brute force copy old sctp opt. */ - if (oldsp->do_auto_asconf) { - memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist)); - inet_sk_copy_descendant(newsk, oldsk); - memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist)); - } else - inet_sk_copy_descendant(newsk, oldsk); + sctp_copy_descendant(newsk, oldsk); /* Restore the ep value that was overwritten with the above structure * copy. From fa83234f6a4e7b378f0da63938a09b9e8d535c4d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 5 Feb 2015 10:37:33 +0300 Subject: [PATCH 30/55] vhost/scsi: potential memory corruption commit 59c816c1f24df0204e01851431d3bab3eb76719c upstream. This code in vhost_scsi_make_tpg() is confusing because we limit "tpgt" to UINT_MAX but the data type of "tpg->tport_tpgt" and that is a u16. I looked at the context and it turns out that in vhost_scsi_set_endpoint(), "tpg->tport_tpgt" is used as an offset into the vs_tpg[] array which has VHOST_SCSI_MAX_TARGET (256) elements so anything higher than 255 then it is invalid. I have made that the limit now. In vhost_scsi_send_evt() we mask away values higher than 255, but now that the limit has changed, we don't need the mask. Signed-off-by: Dan Carpenter Signed-off-by: Nicholas Bellinger [ The affected function was renamed to vhost_scsi_make_tpg before the vulnerability was announced, I ported it to 3.10 stable and changed the code in function tcm_vhost_make_tpg] Signed-off-by: Wang Long Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/scsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index fb97bc0b80e..2947eda522b 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1088,7 +1088,7 @@ static void tcm_vhost_send_evt(struct vhost_scsi *vs, struct tcm_vhost_tpg *tpg, * lun[4-7] need to be zero according to virtio-scsi spec. */ evt->event.lun[0] = 0x01; - evt->event.lun[1] = tpg->tport_tpgt & 0xFF; + evt->event.lun[1] = tpg->tport_tpgt; if (lun->unpacked_lun >= 256) evt->event.lun[2] = lun->unpacked_lun >> 8 | 0x40 ; evt->event.lun[3] = lun->unpacked_lun & 0xFF; @@ -1894,12 +1894,12 @@ static struct se_portal_group *tcm_vhost_make_tpg(struct se_wwn *wwn, struct tcm_vhost_tport, tport_wwn); struct tcm_vhost_tpg *tpg; - unsigned long tpgt; + u16 tpgt; int ret; if (strstr(name, "tpgt_") != name) return ERR_PTR(-EINVAL); - if (kstrtoul(name + 5, 10, &tpgt) || tpgt > UINT_MAX) + if (kstrtou16(name + 5, 10, &tpgt) || tpgt >= VHOST_SCSI_MAX_TARGET) return ERR_PTR(-EINVAL); tpg = kzalloc(sizeof(struct tcm_vhost_tpg), GFP_KERNEL); From 9f6191daa545384ce5cc90b770f0d2bf64c0ba22 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 22 May 2015 15:42:55 -0700 Subject: [PATCH 31/55] x86: bpf_jit: fix compilation of large bpf programs commit 3f7352bf21f8fd7ba3e2fcef9488756f188e12be upstream. x86 has variable length encoding. x86 JIT compiler is trying to pick the shortest encoding for given bpf instruction. While doing so the jump targets are changing, so JIT is doing multiple passes over the program. Typical program needs 3 passes. Some very short programs converge with 2 passes. Large programs may need 4 or 5. But specially crafted bpf programs may hit the pass limit and if the program converges on the last iteration the JIT compiler will be producing an image full of 'int 3' insns. Fix this corner case by doing final iteration over bpf program. Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64") Reported-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0c966fecfb8..5479d677f9b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -176,7 +176,12 @@ void bpf_jit_compile(struct sk_filter *fp) } cleanup_addr = proglen; /* epilogue address */ - for (pass = 0; pass < 10; pass++) { + /* JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large bpf programs + * may converge on the last pass. In such case do one more + * pass to emit the final image + */ + for (pass = 0; pass < 10 || image; pass++) { u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen; /* no prologue/epilogue for trivial filters (RET something) */ proglen = 0; From 3b9393dc1d4cc226bce44708ab899518cc598e57 Mon Sep 17 00:00:00 2001 From: Angga Date: Fri, 3 Jul 2015 14:40:52 +1200 Subject: [PATCH 32/55] ipv6: Make MLD packets to only be processed locally [ Upstream commit 4c938d22c88a9ddccc8c55a85e0430e9c62b1ac5 ] Before commit daad151263cf ("ipv6: Make ipv6_is_mld() inline and use it from ip6_mc_input().") MLD packets were only processed locally. After the change, a copy of MLD packet goes through ip6_mr_input, causing MRT6MSG_NOCACHE message to be generated to user space. Make MLD packet only processed locally. Fixes: daad151263cf ("ipv6: Make ipv6_is_mld() inline and use it from ip6_mc_input().") Signed-off-by: Hermin Anggawijaya Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 774b09cb292..63264c9a15c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -325,10 +325,10 @@ int ip6_mc_input(struct sk_buff *skb) if (offset < 0) goto out; - if (!ipv6_is_mld(skb, nexthdr, offset)) - goto out; + if (ipv6_is_mld(skb, nexthdr, offset)) + deliver = true; - deliver = true; + goto out; } /* unknown RA - process it normally */ } From afabf2a8b621f2c300cbdf6adb0f8855f612b3a6 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 7 Jul 2015 09:43:45 -0400 Subject: [PATCH 33/55] net/tipc: initialize security state for new connection socket [ Upstream commit fdd75ea8df370f206a8163786e7470c1277a5064 ] Calling connect() with an AF_TIPC socket would trigger a series of error messages from SELinux along the lines of: SELinux: Invalid class 0 type=AVC msg=audit(1434126658.487:34500): avc: denied { } for pid=292 comm="kworker/u16:5" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:unlabeled_t:s0 tclass= permissive=0 This was due to a failure to initialize the security state of the new connection sock by the tipc code, leaving it with junk in the security class field and an unlabeled secid. Add a call to security_sk_clone() to inherit the security state from the parent socket. Reported-by: Tim Shearer Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Ying Xue Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 2b1d7c2d677..e0cb5edc6d1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1528,6 +1528,7 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags) res = tipc_create(sock_net(sock->sk), new_sock, 0, 0); if (res) goto exit; + security_sk_clone(sock->sk, new_sock->sk); new_sk = new_sock->sk; new_tsock = tipc_sk(new_sk); From 7865ece30f06072ea61f7db2ebbe24b632800e17 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 7 Jul 2015 15:55:56 +0200 Subject: [PATCH 34/55] bridge: mdb: zero out the local br_ip variable before use [ Upstream commit f1158b74e54f2e2462ba5e2f45a118246d9d5b43 ] Since commit b0e9a30dd669 ("bridge: Add vlan id to multicast groups") there's a check in br_ip_equal() for a matching vlan id, but the mdb functions were not modified to use (or at least zero it) so when an entry was added it would have a garbage vlan id (from the local br_ip variable in __br_mdb_add/del) and this would prevent it from being matched and also deleted. So zero out the whole local ip var to protect ourselves from future changes and also to fix the current bug, since there's no vlan id support in the mdb uapi - use always vlan id 0. Example before patch: root@debian:~# bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent root@debian:~# bridge mdb dev br0 port eth1 grp 239.0.0.1 permanent root@debian:~# bridge mdb del dev br0 port eth1 grp 239.0.0.1 permanent RTNETLINK answers: Invalid argument After patch: root@debian:~# bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent root@debian:~# bridge mdb dev br0 port eth1 grp 239.0.0.1 permanent root@debian:~# bridge mdb del dev br0 port eth1 grp 239.0.0.1 permanent root@debian:~# bridge mdb Signed-off-by: Nikolay Aleksandrov Fixes: b0e9a30dd669 ("bridge: Add vlan id to multicast groups") Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_mdb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 19942e38fd2..ff61e200bf0 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -368,6 +368,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; + memset(&ip, 0, sizeof(ip)); ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; @@ -417,6 +418,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (timer_pending(&br->multicast_querier_timer)) return -EBUSY; + memset(&ip, 0, sizeof(ip)); ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; From f85eee641c7a3bb928a4b605db632c2e90b9574f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Jul 2015 21:42:11 +0200 Subject: [PATCH 35/55] net: pktgen: fix race between pktgen_thread_worker() and kthread_stop() [ Upstream commit fecdf8be2d91e04b0a9a4f79ff06499a36f5d14f ] pktgen_thread_worker() is obviously racy, kthread_stop() can come between the kthread_should_stop() check and set_current_state(). Signed-off-by: Oleg Nesterov Reported-by: Jan Stancek Reported-by: Marcelo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/pktgen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index ebbea537196..21a23d97e99 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3377,8 +3377,10 @@ static int pktgen_thread_worker(void *arg) pktgen_rem_thread(t); /* Wait for kthread_stop */ - while (!kthread_should_stop()) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; schedule(); } __set_current_state(TASK_RUNNING); From c987fa7146e0c18acc2392b25349cca45c177175 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 9 Jul 2015 09:59:10 +0300 Subject: [PATCH 36/55] net: call rcu_read_lock early in process_backlog [ Upstream commit 2c17d27c36dcce2b6bf689f41a46b9e909877c21 ] Incoming packet should be either in backlog queue or in RCU read-side section. Otherwise, the final sequence of flush_backlog() and synchronize_net() may miss packets that can run without device reference: CPU 1 CPU 2 skb->dev: no reference process_backlog:__skb_dequeue process_backlog:local_irq_enable on_each_cpu for flush_backlog => IPI(hardirq): flush_backlog - packet not found in backlog CPU delayed ... synchronize_net - no ongoing RCU read-side sections netdev_run_todo, rcu_barrier: no ongoing callbacks __netif_receive_skb_core:rcu_read_lock - too late free dev process packet for freed dev Fixes: 6e583ce5242f ("net: eliminate refcounting in backlog queue") Cc: Eric W. Biederman Cc: Stephen Hemminger Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index aeca8dd88b2..1ccfc49683b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3443,8 +3443,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) pt_prev = NULL; - rcu_read_lock(); - another_round: skb->skb_iif = skb->dev->ifindex; @@ -3454,7 +3452,7 @@ another_round: skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = vlan_untag(skb); if (unlikely(!skb)) - goto unlock; + goto out; } #ifdef CONFIG_NET_CLS_ACT @@ -3479,7 +3477,7 @@ skip_taps: #ifdef CONFIG_NET_CLS_ACT skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) - goto unlock; + goto out; ncls: #endif @@ -3494,7 +3492,7 @@ ncls: if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) - goto unlock; + goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); @@ -3506,7 +3504,7 @@ ncls: switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; - goto unlock; + goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: @@ -3558,8 +3556,6 @@ drop: ret = NET_RX_DROP; } -unlock: - rcu_read_unlock(); out: return ret; } @@ -3606,29 +3602,30 @@ static int __netif_receive_skb(struct sk_buff *skb) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; + rcu_read_lock(); + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; - - rcu_read_lock(); - - cpu = get_rps_cpu(skb->dev, skb, &rflow); + int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } - rcu_read_unlock(); } #endif - return __netif_receive_skb(skb); + ret = __netif_receive_skb(skb); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -4038,8 +4035,10 @@ static int process_backlog(struct napi_struct *napi, int quota) unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { + rcu_read_lock(); local_irq_enable(); __netif_receive_skb(skb); + rcu_read_unlock(); local_irq_disable(); input_queue_head_incr(sd); if (++work >= quota) { From 0ba48ae94c393dc4c43b257400046feeeb9c6fad Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 13 Jul 2015 16:04:13 +0800 Subject: [PATCH 37/55] net: Clone skb before setting peeked flag [ Upstream commit 738ac1ebb96d02e0d23bc320302a6ea94c612dec ] Shared skbs must not be modified and this is crucial for broadcast and/or multicast paths where we use it as an optimisation to avoid unnecessary cloning. The function skb_recv_datagram breaks this rule by setting peeked without cloning the skb first. This causes funky races which leads to double-free. This patch fixes this by cloning the skb and replacing the skb in the list when setting skb->peeked. Fixes: a59322be07c9 ("[UDP]: Only increment counter on first peek/recv") Reported-by: Konstantin Khlebnikov Signed-off-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/datagram.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index b71423db778..f1506c7d414 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -128,6 +128,35 @@ out_noerr: goto out; } +static int skb_set_peeked(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (skb->peeked) + return 0; + + /* We have to unshare an skb before modifying it. */ + if (!skb_shared(skb)) + goto done; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + skb->prev->next = nskb; + skb->next->prev = nskb; + nskb->prev = skb->prev; + nskb->next = skb->next; + + consume_skb(skb); + skb = nskb; + +done: + skb->peeked = 1; + + return 0; +} + /** * __skb_recv_datagram - Receive a datagram skbuff * @sk: socket @@ -162,7 +191,9 @@ out_noerr: struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, int *peeked, int *off, int *err) { + struct sk_buff_head *queue = &sk->sk_receive_queue; struct sk_buff *skb, *last; + unsigned long cpu_flags; long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() @@ -181,8 +212,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ - unsigned long cpu_flags; - struct sk_buff_head *queue = &sk->sk_receive_queue; int _off = *off; last = (struct sk_buff *)queue; @@ -196,7 +225,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, _off -= skb->len; continue; } - skb->peeked = 1; + + error = skb_set_peeked(skb); + if (error) + goto unlock_err; + atomic_inc(&skb->users); } else __skb_unlink(skb, queue); @@ -216,6 +249,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, return NULL; +unlock_err: + spin_unlock_irqrestore(&queue->lock, cpu_flags); no_packet: *err = error; return NULL; From 4164cda8cad7303614d211192d27a912de53a463 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 13 Jul 2015 20:01:42 +0800 Subject: [PATCH 38/55] net: Fix skb csum races when peeking [ Upstream commit 89c22d8c3b278212eef6a8cc66b570bc840a6f5a ] When we calculate the checksum on the recv path, we store the result in the skb as an optimisation in case we need the checksum again down the line. This is in fact bogus for the MSG_PEEK case as this is done without any locking. So multiple threads can peek and then store the result to the same skb, potentially resulting in bogus skb states. This patch fixes this by only storing the result if the skb is not shared. This preserves the optimisations for the few cases where it can be done safely due to locking or other reasons, e.g., SIOCINQ. Signed-off-by: Herbert Xu Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/datagram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index f1506c7d414..80b0fd83fac 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -700,7 +700,8 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!skb_shared(skb)) + skb->ip_summed = CHECKSUM_UNNECESSARY; } return sum; } From 5fa39f16036eb627fe50a38c307fa99eace26ee3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 4 Aug 2015 15:42:47 +0800 Subject: [PATCH 39/55] net: Fix skb_set_peeked use-after-free bug [ Upstream commit a0a2a6602496a45ae838a96db8b8173794b5d398 ] The commit 738ac1ebb96d02e0d23bc320302a6ea94c612dec ("net: Clone skb before setting peeked flag") introduced a use-after-free bug in skb_recv_datagram. This is because skb_set_peeked may create a new skb and free the existing one. As it stands the caller will continue to use the old freed skb. This patch fixes it by making skb_set_peeked return the new skb (or the old one if unchanged). Fixes: 738ac1ebb96d ("net: Clone skb before setting peeked flag") Reported-by: Brenden Blanco Signed-off-by: Herbert Xu Tested-by: Brenden Blanco Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/datagram.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index 80b0fd83fac..052b71c5b1b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -128,12 +128,12 @@ out_noerr: goto out; } -static int skb_set_peeked(struct sk_buff *skb) +static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) - return 0; + return skb; /* We have to unshare an skb before modifying it. */ if (!skb_shared(skb)) @@ -141,7 +141,7 @@ static int skb_set_peeked(struct sk_buff *skb) nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) - return -ENOMEM; + return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; @@ -154,7 +154,7 @@ static int skb_set_peeked(struct sk_buff *skb) done: skb->peeked = 1; - return 0; + return skb; } /** @@ -226,8 +226,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, continue; } - error = skb_set_peeked(skb); - if (error) + skb = skb_set_peeked(skb); + error = PTR_ERR(skb); + if (IS_ERR(skb)) goto unlock_err; atomic_inc(&skb->users); From 8d228c93f31c90894fa86c597beb42b67c5ac978 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 13 Jul 2015 06:36:19 -0700 Subject: [PATCH 40/55] bridge: mdb: fix double add notification [ Upstream commit 5ebc784625ea68a9570d1f70557e7932988cd1b4 ] Since the mdb add/del code was introduced there have been 2 br_mdb_notify calls when doing br_mdb_add() resulting in 2 notifications on each add. Example: Command: bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent Before patch: root@debian:~# bridge monitor all [MDB]dev br0 port eth1 grp 239.0.0.1 permanent [MDB]dev br0 port eth1 grp 239.0.0.1 permanent After patch: root@debian:~# bridge monitor all [MDB]dev br0 port eth1 grp 239.0.0.1 permanent Signed-off-by: Nikolay Aleksandrov Fixes: cfd567543590 ("bridge: add support of adding and deleting mdb entries") Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_mdb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index ff61e200bf0..4e76d2a1128 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -345,7 +345,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -ENOMEM; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); return 0; } From c6419a861e56cf6c6446bf0e8f9c75557c2a3a02 Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Tue, 14 Jul 2015 00:37:13 +0200 Subject: [PATCH 41/55] isdn/gigaset: reset tty->receive_room when attaching ser_gigaset [ Upstream commit fd98e9419d8d622a4de91f76b306af6aa627aa9c ] Commit 79901317ce80 ("n_tty: Don't flush buffer when closing ldisc"), first merged in kernel release 3.10, caused the following regression in the Gigaset M101 driver: Before that commit, when closing the N_TTY line discipline in preparation to switching to N_GIGASET_M101, receive_room would be reset to a non-zero value by the call to n_tty_flush_buffer() in n_tty's close method. With the removal of that call, receive_room might be left at zero, blocking data reception on the serial line. The present patch fixes that regression by setting receive_room to an appropriate value in the ldisc open method. Fixes: 79901317ce80 ("n_tty: Don't flush buffer when closing ldisc") Signed-off-by: Tilman Schmidt Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/isdn/gigaset/ser-gigaset.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c index 8c91fd5eb6f..3ac9c419481 100644 --- a/drivers/isdn/gigaset/ser-gigaset.c +++ b/drivers/isdn/gigaset/ser-gigaset.c @@ -524,9 +524,18 @@ gigaset_tty_open(struct tty_struct *tty) cs->hw.ser->tty = tty; atomic_set(&cs->hw.ser->refcnt, 1); init_completion(&cs->hw.ser->dead_cmp); - tty->disc_data = cs; + /* Set the amount of data we're willing to receive per call + * from the hardware driver to half of the input buffer size + * to leave some reserve. + * Note: We don't do flow control towards the hardware driver. + * If more data is received than will fit into the input buffer, + * it will be dropped and an error will be logged. This should + * never happen as the device is slow and the buffer size ample. + */ + tty->receive_room = RBUFSIZE/2; + /* OK.. Initialization of the datastructures and the HW is done.. Now * startup system and notify the LL that we are ready to run */ From 4b633bbef8730380d624a3d55a54d9e42effdd1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jul 2015 08:10:22 +0200 Subject: [PATCH 42/55] ipv6: lock socket in ip6_datagram_connect() [ Upstream commit 03645a11a570d52e70631838cb786eb4253eb463 ] ip6_datagram_connect() is doing a lot of socket changes without socket being locked. This looks wrong, at least for udp_lib_rehash() which could corrupt lists because of concurrent udp_sk(sk)->udp_portaddr_hash accesses. Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip.h | 1 + net/ipv4/datagram.c | 16 ++++++++++++---- net/ipv6/datagram.c | 20 +++++++++++++++----- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 0a62365149e..ea9be6b407b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -141,6 +141,7 @@ static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) } /* datagram.c */ +int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); extern int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 5f3dc1df04b..291b0821d1a 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -20,7 +20,7 @@ #include #include -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; @@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); - lock_sock(sk); - oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { @@ -81,9 +79,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_set(sk, &rt->dst); err = 0; out: - release_sock(sk); return err; } +EXPORT_SYMBOL(__ip4_datagram_connect); + +int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip4_datagram_connect(sk, uaddr, addr_len); + release_sock(sk); + return res; +} EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ce17d3da9b2..b0d5d7eb946 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -40,7 +40,7 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); @@ -56,7 +56,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (usin->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) return -EAFNOSUPPORT; - err = ip4_datagram_connect(sk, uaddr, addr_len); + err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } @@ -99,9 +99,9 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; - err = ip4_datagram_connect(sk, - (struct sockaddr *) &sin, - sizeof(sin)); + err = __ip4_datagram_connect(sk, + (struct sockaddr *) &sin, + sizeof(sin)); ipv4_connected: if (err) @@ -204,6 +204,16 @@ out: fl6_sock_release(flowlabel); return err; } + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + release_sock(sk); + return res; +} EXPORT_SYMBOL_GPL(ip6_datagram_connect); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, From e3e3caac28a5c7c9428ebc4fa154a007c0636393 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 15 Jul 2015 21:52:51 +0200 Subject: [PATCH 43/55] bonding: fix destruction of bond with devices different from arphrd_ether [ Upstream commit 06f6d1094aa0992432b1e2a0920b0ee86ccd83bf ] When the bonding is being unloaded and the netdevice notifier is unregistered it executes NETDEV_UNREGISTER for each device which should remove the bond's proc entry but if the device enslaved is not of ARPHRD_ETHER type and is in front of the bonding, it may execute bond_release_and_destroy() first which would release the last slave and destroy the bond device leaving the proc entry and thus we will get the following error (with dynamic debug on for bond_netdev_event to see the events order): [ 908.963051] eql: event: 9 [ 908.963052] eql: IFF_SLAVE [ 908.963054] eql: event: 2 [ 908.963056] eql: IFF_SLAVE [ 908.963058] eql: event: 6 [ 908.963059] eql: IFF_SLAVE [ 908.963110] bond0: Releasing active interface eql [ 908.976168] bond0: Destroying bond bond0 [ 908.976266] bond0 (unregistering): Released all slaves [ 908.984097] ------------[ cut here ]------------ [ 908.984107] WARNING: CPU: 0 PID: 1787 at fs/proc/generic.c:575 remove_proc_entry+0x112/0x160() [ 908.984110] remove_proc_entry: removing non-empty directory 'net/bonding', leaking at least 'bond0' [ 908.984111] Modules linked in: bonding(-) eql(O) 9p nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev qxl drm_kms_helper snd_hda_codec_generic aesni_intel ttm aes_x86_64 glue_helper pcspkr lrw gf128mul ablk_helper cryptd snd_hda_intel virtio_console snd_hda_codec psmouse serio_raw snd_hwdep snd_hda_core 9pnet_virtio 9pnet evdev joydev drm virtio_balloon snd_pcm snd_timer snd soundcore i2c_piix4 i2c_core pvpanic acpi_cpufreq parport_pc parport processor thermal_sys button autofs4 ext4 crc16 mbcache jbd2 hid_generic usbhid hid sg sr_mod cdrom ata_generic virtio_blk virtio_net floppy ata_piix e1000 libata ehci_pci virtio_pci scsi_mod uhci_hcd ehci_hcd virtio_ring virtio usbcore usb_common [last unloaded: bonding] [ 908.984168] CPU: 0 PID: 1787 Comm: rmmod Tainted: G W O 4.2.0-rc2+ #8 [ 908.984170] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 908.984172] 0000000000000000 ffffffff81732d41 ffffffff81525b34 ffff8800358dfda8 [ 908.984175] ffffffff8106c521 ffff88003595af78 ffff88003595af40 ffff88003e3a4280 [ 908.984178] ffffffffa058d040 0000000000000000 ffffffff8106c59a ffffffff8172ebd0 [ 908.984181] Call Trace: [ 908.984188] [] ? dump_stack+0x40/0x50 [ 908.984193] [] ? warn_slowpath_common+0x81/0xb0 [ 908.984196] [] ? warn_slowpath_fmt+0x4a/0x50 [ 908.984199] [] ? remove_proc_entry+0x112/0x160 [ 908.984205] [] ? bond_destroy_proc_dir+0x26/0x30 [bonding] [ 908.984208] [] ? bond_net_exit+0x8e/0xa0 [bonding] [ 908.984217] [] ? ops_exit_list.isra.4+0x37/0x70 [ 908.984225] [] ? unregister_pernet_operations+0x8d/0xd0 [ 908.984228] [] ? unregister_pernet_subsys+0x1d/0x30 [ 908.984232] [] ? bonding_exit+0x23/0xdba [bonding] [ 908.984236] [] ? SyS_delete_module+0x18a/0x250 [ 908.984241] [] ? task_work_run+0x89/0xc0 [ 908.984244] [] ? entry_SYSCALL_64_fastpath+0x16/0x75 [ 908.984247] ---[ end trace 7c006ed4abbef24b ]--- Thus remove the proc entry manually if bond_release_and_destroy() is used. Because of the checks in bond_remove_proc_entry() it's not a problem for a bond device to change namespaces (the bug fixed by the Fixes commit) but since commit f9399814927ad ("bonding: Don't allow bond devices to change network namespaces.") that can't happen anyway. Reported-by: Carol Soto Signed-off-by: Nikolay Aleksandrov Fixes: a64d49c3dd50 ("bonding: Manage /proc/net/bonding/ entries from the netdev events") Tested-by: Carol L Soto Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b143ce91e08..6b5baf01512 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2188,6 +2188,7 @@ static int bond_release_and_destroy(struct net_device *bond_dev, bond_dev->priv_flags |= IFF_DISABLE_NETPOLL; pr_info("%s: destroying bond %s.\n", bond_dev->name, bond_dev->name); + bond_remove_proc_entry(bond); unregister_netdevice(bond_dev); } return ret; From cecc56226678a5a218e8b9859382ab0b0c27ad0b Mon Sep 17 00:00:00 2001 From: Edward Hyunkoo Jee Date: Tue, 21 Jul 2015 09:43:59 +0200 Subject: [PATCH 44/55] inet: frags: fix defragmented packet's IP header for af_packet [ Upstream commit 0848f6428ba3a2e42db124d41ac6f548655735bf ] When ip_frag_queue() computes positions, it assumes that the passed sk_buff does not contain L2 headers. However, when PACKET_FANOUT_FLAG_DEFRAG is used, IP reassembly functions can be called on outgoing packets that contain L2 headers. Also, IPv4 checksum is not corrected after reassembly. Fixes: 7736d33f4262 ("packet: Add pre-defragmentation support for ipv4 fanouts.") Signed-off-by: Edward Hyunkoo Jee Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Jerry Chu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_fragment.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4c1884fed54..4d98a6b80b0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -356,7 +356,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ - end = offset + skb->len - ihl; + end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? */ @@ -386,7 +386,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; err = -ENOMEM; - if (pskb_pull(skb, ihl) == NULL) + if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto err; err = pskb_trim_rcsum(skb, end - offset); @@ -627,6 +627,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + + ip_send_check(iph); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; From 3ebe377baaebbed39733c3eb03212b9c22601888 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 Jul 2015 16:33:50 +0200 Subject: [PATCH 45/55] netlink: don't hold mutex in rcu callback when releasing mmapd ring [ Upstream commit 0470eb99b4721586ccac954faac3fa4472da0845 ] Kirill A. Shutemov says: This simple test-case trigers few locking asserts in kernel: int main(int argc, char **argv) { unsigned int block_size = 16 * 4096; struct nl_mmap_req req = { .nm_block_size = block_size, .nm_block_nr = 64, .nm_frame_size = 16384, .nm_frame_nr = 64 * block_size / 16384, }; unsigned int ring_size; int fd; fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0) exit(1); if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) exit(1); ring_size = req.nm_block_nr * req.nm_block_size; mmap(NULL, 2 * ring_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); return 0; } +++ exited with 0 +++ BUG: sleeping function called from invalid context at /home/kas/git/public/linux-mm/kernel/locking/mutex.c:616 in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init 3 locks held by init/1: #0: (reboot_mutex){+.+...}, at: [] SyS_reboot+0xa9/0x220 #1: ((reboot_notifier_list).rwsem){.+.+..}, at: [] __blocking_notifier_call_chain+0x39/0x70 #2: (rcu_callback){......}, at: [] rcu_do_batch.isra.49+0x160/0x10c0 Preemption disabled at:[] __delay+0xf/0x20 CPU: 1 PID: 1 Comm: init Not tainted 4.1.0-00009-gbddf4c4818e0 #253 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Debian-1.8.2-1 04/01/2014 ffff88017b3d8000 ffff88027bc03c38 ffffffff81929ceb 0000000000000102 0000000000000000 ffff88027bc03c68 ffffffff81085a9d 0000000000000002 ffffffff81ca2a20 0000000000000268 0000000000000000 ffff88027bc03c98 Call Trace: [] dump_stack+0x4f/0x7b [] ___might_sleep+0x16d/0x270 [] __might_sleep+0x4d/0x90 [] mutex_lock_nested+0x2f/0x430 [] ? _raw_spin_unlock_irqrestore+0x5d/0x80 [] ? __this_cpu_preempt_check+0x13/0x20 [] netlink_set_ring+0x1ed/0x350 [] ? netlink_undo_bind+0x70/0x70 [] netlink_sock_destruct+0x80/0x150 [] __sk_free+0x1d/0x160 [] sk_free+0x19/0x20 [..] Cong Wang says: We can't hold mutex lock in a rcu callback, [..] Thomas Graf says: The socket should be dead at this point. It might be simpler to add a netlink_release_ring() function which doesn't require locking at all. Reported-by: "Kirill A. Shutemov" Diagnosed-by: Cong Wang Suggested-by: Thomas Graf Signed-off-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 79 ++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f7ad5c630b6..56ff3b45227 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -214,25 +214,52 @@ err1: return NULL; } + +static void +__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, + unsigned int order) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct sk_buff_head *queue; + struct netlink_ring *ring; + + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); +} + static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool closing, bool tx_ring) + bool tx_ring) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ring *ring; - struct sk_buff_head *queue; void **pg_vec = NULL; unsigned int order = 0; - int err; ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - if (!closing) { - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - } + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; if (req->nm_block_nr) { if (ring->pg_vec != NULL) @@ -264,31 +291,19 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, return -EINVAL; } - err = -EBUSY; mutex_lock(&nlk->pg_vec_lock); - if (closing || atomic_read(&nlk->mapped) == 0) { - err = 0; - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); + if (atomic_read(&nlk->mapped) == 0) { + __netlink_set_ring(sk, req, tx_ring, pg_vec, order); + mutex_unlock(&nlk->pg_vec_lock); + return 0; } + mutex_unlock(&nlk->pg_vec_lock); if (pg_vec) free_pg_vec(pg_vec, order, req->nm_block_nr); - return err; + + return -EBUSY; } static void netlink_mm_open(struct vm_area_struct *vma) @@ -762,10 +777,10 @@ static void netlink_sock_destruct(struct sock *sk) memset(&req, 0, sizeof(req)); if (nlk->rx_ring.pg_vec) - netlink_set_ring(sk, &req, true, false); + __netlink_set_ring(sk, &req, false, NULL, 0); memset(&req, 0, sizeof(req)); if (nlk->tx_ring.pg_vec) - netlink_set_ring(sk, &req, true, true); + __netlink_set_ring(sk, &req, true, NULL, 0); } #endif /* CONFIG_NETLINK_MMAP */ @@ -2017,7 +2032,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; - err = netlink_set_ring(sk, &req, false, + err = netlink_set_ring(sk, &req, optname == NETLINK_TX_RING); break; } From 7cd1033116c708c7e3e772cbf053fd9c98163570 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 1 Aug 2015 15:33:26 +0300 Subject: [PATCH 46/55] rds: fix an integer overflow test in rds_info_getsockopt() [ Upstream commit 468b732b6f76b138c0926eadf38ac88467dcd271 ] "len" is a signed integer. We check that len is not negative, so it goes from zero to INT_MAX. PAGE_SIZE is unsigned long so the comparison is type promoted to unsigned long. ULONG_MAX - 4095 is a higher than INT_MAX so the condition can never be true. I don't know if this is harmful but it seems safe to limit "len" to INT_MAX - 4095. Fixes: a8c879a7ee98 ('RDS: Info and stats') Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/info.c b/net/rds/info.c index 9a6b4f66187..140a44a5f7b 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -176,7 +176,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, /* check for all kinds of wrapping and the like */ start = (unsigned long)optval; - if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) { ret = -EINVAL; goto out; } From 6d8c190531d0020870b93eee6a3933cb39fb1f52 Mon Sep 17 00:00:00 2001 From: huaibin Wang Date: Tue, 25 Aug 2015 16:20:34 +0200 Subject: [PATCH 47/55] ip6_gre: release cached dst on tunnel removal [ Upstream commit d4257295ba1b389c693b79de857a96e4b7cd8ac0 ] When a tunnel is deleted, the cached dst entry should be released. This problem may prevent the removal of a netns (seen with a x-netns IPv6 gre tunnel): unregister_netdevice: waiting for lo to become free. Usage count = 3 CC: Dmitry Kozlov Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: huaibin Wang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 65156a73b3f..bf6233cdb75 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -359,6 +359,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, netdev_priv(dev)); + ip6_tnl_dst_reset(netdev_priv(dev)); dev_put(dev); } From fe474009a2167693c1a1cc2f396cb20621770450 Mon Sep 17 00:00:00 2001 From: Eugene Shatokhin Date: Mon, 24 Aug 2015 23:13:42 +0300 Subject: [PATCH 48/55] usbnet: Get EVENT_NO_RUNTIME_PM bit before it is cleared [ Upstream commit f50791ac1aca1ac1b0370d62397b43e9f831421a ] It is needed to check EVENT_NO_RUNTIME_PM bit of dev->flags in usbnet_stop(), but its value should be read before it is cleared when dev->flags is set to 0. The problem was spotted and the fix was provided by Oliver Neukum . Signed-off-by: Eugene Shatokhin Acked-by: Oliver Neukum Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/usbnet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 3d50e7db141..fb068ada0c5 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -753,7 +753,7 @@ int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct driver_info *info = dev->driver_info; - int retval, pm; + int retval, pm, mpn; clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); @@ -784,6 +784,8 @@ int usbnet_stop (struct net_device *net) usbnet_purge_paused_rxq(dev); + mpn = !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags); + /* deferred work (task, timer, softirq) must also stop. * can't flush_scheduled_work() until we drop rtnl (later), * else workers could deadlock; so make workers a NOP. @@ -794,8 +796,7 @@ int usbnet_stop (struct net_device *net) if (!pm) usb_autopm_put_interface(dev->intf); - if (info->manage_power && - !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags)) + if (info->manage_power && mpn) info->manage_power(dev, 0); else usb_autopm_put_interface(dev->intf); From 162e3d1c34b0c7b4a2bd016332b48c171a926965 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Sep 2015 00:29:07 +0200 Subject: [PATCH 49/55] ipv6: fix exthdrs offload registration in out_rt path [ Upstream commit e41b0bedba0293b9e1e8d1e8ed553104b9693656 ] We previously register IPPROTO_ROUTING offload under inet6_add_offload(), but in error path, we try to unregister it with inet_del_offload(). This doesn't seem correct, it should actually be inet6_del_offload(), also ipv6_exthdrs_offload_exit() from that commit seems rather incorrect (it also uses rthdr_offload twice), but it got removed entirely later on. Fixes: 3336288a9fea ("ipv6: Switch to using new offload infrastructure.") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/exthdrs_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 447a7fbd1bb..f5e2ba1c18b 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -36,6 +36,6 @@ out: return ret; out_rt: - inet_del_offload(&rthdr_offload, IPPROTO_ROUTING); + inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING); goto out; } From 02b5ca779d6d15b3af9ea79e1be6e9eb1636f286 Mon Sep 17 00:00:00 2001 From: Richard Laing Date: Thu, 3 Sep 2015 13:52:31 +1200 Subject: [PATCH 50/55] net/ipv6: Correct PIM6 mrt_lock handling [ Upstream commit 25b4a44c19c83d98e8c0807a7ede07c1f28eab8b ] In the IPv6 multicast routing code the mrt_lock was not being released correctly in the MFC iterator, as a result adding or deleting a MIF would cause a hang because the mrt_lock could not be acquired. This fix is a copy of the code for the IPv4 case and ensures that the lock is released correctly. Signed-off-by: Richard Laing Acked-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2c84072b1da..57dd3e7d86c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -552,7 +552,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mrt->mfc6_cache_array) + else if (it->cache == &mrt->mfc6_cache_array[it->ct]) read_unlock(&mrt_lock); } From e7bb902b26f1e8f7c1a4f0cc7b3abfd9b3fbb108 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 10 Sep 2015 17:31:15 -0300 Subject: [PATCH 51/55] sctp: fix race on protocol/netns initialization [ Upstream commit 8e2d61e0aed2b7c4ecb35844fe07e0b2b762dee4 ] Consider sctp module is unloaded and is being requested because an user is creating a sctp socket. During initialization, sctp will add the new protocol type and then initialize pernet subsys: status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_net_ops); The problem is that after those calls to sctp_v{4,6}_protosw_init(), it is possible for userspace to create SCTP sockets like if the module is already fully loaded. If that happens, one of the possible effects is that we will have readers for net->sctp.local_addr_list list earlier than expected and sctp_net_init() does not take precautions while dealing with that list, leading to a potential panic but not limited to that, as sctp_sock_init() will copy a bunch of blank/partially initialized values from net->sctp. The race happens like this: CPU 0 | CPU 1 socket() | __sock_create | socket() inet_create | __sock_create list_for_each_entry_rcu( | answer, &inetsw[sock->type], | list) { | inet_create /* no hits */ | if (unlikely(err)) { | ... | request_module() | /* socket creation is blocked | * the module is fully loaded | */ | sctp_init | sctp_v4_protosw_init | inet_register_protosw | list_add_rcu(&p->list, | last_perm); | | list_for_each_entry_rcu( | answer, &inetsw[sock->type], sctp_v6_protosw_init | list) { | /* hit, so assumes protocol | * is already loaded | */ | /* socket creation continues | * before netns is initialized | */ register_pernet_subsys | Simply inverting the initialization order between register_pernet_subsys() and sctp_v4_protosw_init() is not possible because register_pernet_subsys() will create a control sctp socket, so the protocol must be already visible by then. Deferring the socket creation to a work-queue is not good specially because we loose the ability to handle its errors. So, as suggested by Vlad, the fix is to split netns initialization in two moments: defaults and control socket, so that the defaults are already loaded by when we register the protocol, while control socket initialization is kept at the same moment it is today. Fixes: 4db67e808640 ("sctp: Make the address lists per network namespace") Signed-off-by: Vlad Yasevich Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/protocol.c | 64 +++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5a3c1c0a84a..57c2c4c0c97 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1170,7 +1170,7 @@ static void sctp_v4_del_protocol(void) unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net) { int status; @@ -1263,12 +1263,6 @@ static int __net_init sctp_net_init(struct net *net) sctp_dbg_objcnt_init(net); - /* Initialize the control inode/socket for handling OOTB packets. */ - if ((status = sctp_ctl_sock_init(net))) { - pr_err("Failed to initialize the SCTP control sock\n"); - goto err_ctl_sock_init; - } - /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); @@ -1284,9 +1278,6 @@ static int __net_init sctp_net_init(struct net *net) return 0; -err_ctl_sock_init: - sctp_dbg_objcnt_exit(net); - sctp_proc_exit(net); err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: @@ -1295,15 +1286,12 @@ err_sysctl_register: return status; } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - /* Free the control endpoint. */ - inet_ctl_sock_destroy(net->sctp.ctl_sock); - sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); @@ -1311,9 +1299,32 @@ static void __net_exit sctp_net_exit(struct net *net) sctp_sysctl_net_unregister(net); } -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { + .init = sctp_defaults_init, + .exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ + int status; + + /* Initialize the control inode/socket for handling OOTB packets. */ + status = sctp_ctl_sock_init(net); + if (status) + pr_err("Failed to initialize the SCTP control sock\n"); + + return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ + /* Free the control endpoint. */ + inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { + .init = sctp_ctrlsock_init, + .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ @@ -1448,8 +1459,11 @@ SCTP_STATIC __init int sctp_init(void) sctp_v4_pf_init(); sctp_v6_pf_init(); - status = sctp_v4_protosw_init(); + status = register_pernet_subsys(&sctp_defaults_ops); + if (status) + goto err_register_defaults; + status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; @@ -1457,9 +1471,9 @@ SCTP_STATIC __init int sctp_init(void) if (status) goto err_v6_protosw_init; - status = register_pernet_subsys(&sctp_net_ops); + status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) - goto err_register_pernet_subsys; + goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) @@ -1476,12 +1490,14 @@ out: err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: - unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: + unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: + unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); @@ -1514,12 +1530,14 @@ SCTP_STATIC __exit void sctp_exit(void) sctp_v6_del_protocol(); sctp_v4_del_protocol(); - unregister_pernet_subsys(&sctp_net_ops); + unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); + unregister_pernet_subsys(&sctp_defaults_ops); + /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); From d0550a3f2d313a5310ace03299d45ef63edfb750 Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Tue, 22 Sep 2015 21:40:22 -0700 Subject: [PATCH 52/55] fib_rules: fix fib rule dumps across multiple skbs [ Upstream commit 41fc014332d91ee90c32840bf161f9685b7fbf2b ] dump_rules returns skb length and not error. But when family == AF_UNSPEC, the caller of dump_rules assumes that it returns an error. Hence, when family == AF_UNSPEC, we continue trying to dump on -EMSGSIZE errors resulting in incorrect dump idx carried between skbs belonging to the same dump. This results in fib rule dump always only dumping rules that fit into the first skb. This patch fixes dump_rules to return error so that we exit correctly and idx is correctly maintained between skbs that are part of the same dump. Signed-off-by: Wilson Kok Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/fib_rules.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 55e08e2de3a..627e517077e 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -596,15 +596,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -613,7 +615,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -629,7 +631,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); From e6478de4fad8e6d7cad3ca3440ee4da599fb0b2f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 May 2015 09:25:00 -0500 Subject: [PATCH 53/55] vfs: Remove incorrect debugging WARN in prepend_path commit 93e3bce6287e1fb3e60d3324ed08555b5bbafa89 upstream. The warning message in prepend_path is unclear and outdated. It was added as a warning that the mechanism for generating names of pseudo files had been removed from prepend_path and d_dname should be used instead. Unfortunately the warning reads like a general warning, making it unclear what to do with it. Remove the warning. The transition it was added to warn about is long over, and I added code several years ago which in rare cases causes the warning to fire on legitimate code, and the warning is now firing and scaring people for no good reason. Reported-by: Ivan Delalande Reported-by: Omar Sandoval Fixes: f48cfddc6729e ("vfs: In d_path don't call d_dname on a mount point") Signed-off-by: "Eric W. Biederman" [ vlee: Backported to 3.10. Adjusted context. ] Signed-off-by: Vinson Lee Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 90be2809e15..f1e80178597 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2569,15 +2569,6 @@ static int prepend_path(const struct path *path, return error; global_root: - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, dentry->d_name.name); - } if (!slash) error = prepend(buffer, buflen, "/", 1); if (!error) From d565d87eb95146aef43f0c60a88d4cfdadebb16c Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Wed, 29 Jul 2015 15:46:03 +0200 Subject: [PATCH 54/55] Revert "iio: bmg160: IIO_BUFFER and IIO_TRIGGERED_BUFFER are required" This reverts commit 35c45e8bce3c92fb1ff94d376f1d4bfaae079d66 which was commit 06d2f6ca5a38abe92f1f3a132b331eee773868c3 upstream as it should not have been applied. Reported-by: Luis Henriques Cc: Markus Pargmann Cc: Srinivas Pandruvada Cc: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/gyro/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/gyro/Kconfig b/drivers/iio/gyro/Kconfig index 953a0621c6f..107cafcb89d 100644 --- a/drivers/iio/gyro/Kconfig +++ b/drivers/iio/gyro/Kconfig @@ -73,8 +73,7 @@ config IIO_ST_GYRO_SPI_3AXIS config ITG3200 tristate "InvenSense ITG3200 Digital 3-Axis Gyroscope I2C driver" depends on I2C - select IIO_BUFFER - select IIO_TRIGGERED_BUFFER + select IIO_TRIGGERED_BUFFER if IIO_BUFFER help Say yes here to add support for the InvenSense ITG3200 digital 3-axis gyroscope sensor. From f5552cd830e58c46dffae3617b3ce0c839771981 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 1 Oct 2015 12:07:55 +0200 Subject: [PATCH 55/55] Linux 3.10.90 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca82393e1ee..ce741a9f5b1 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 10 -SUBLEVEL = 89 +SUBLEVEL = 90 EXTRAVERSION = NAME = TOSSUG Baby Fish