From 6b7d2f5b6ef27a89a0aee245a94d988e9ce8315e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Aug 2015 17:35:07 -0500
Subject: [PATCH 01/55] unshare: Unsharing a thread does not require unsharing
 a vm

commit 12c641ab8270f787dfcce08b5f20ce8b65008096 upstream.

In the logic in the initial commit of unshare made creating a new
thread group for a process, contingent upon creating a new memory
address space for that process.  That is wrong.  Two separate
processes in different thread groups can share a memory address space
and clone allows creation of such proceses.

This is significant because it was observed that mm_users > 1 does not
mean that a process is multi-threaded, as reading /proc/PID/maps
temporarily increments mm_users, which allows other processes to
(accidentally) interfere with unshare() calls.

Correct the check in check_unshare_flags() to test for
!thread_group_empty() for CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM.
For sighand->count > 1 for CLONE_SIGHAND and CLONE_VM.
For !current_is_single_threaded instead of mm_users > 1 for CLONE_VM.

By using the correct checks in unshare this removes the possibility of
an accidental denial of service attack.

Additionally using the correct checks in unshare ensures that only an
explicit unshare(CLONE_VM) can possibly trigger the slow path of
current_is_single_threaded().  As an explict unshare(CLONE_VM) is
pointless it is not expected there are many applications that make
that call.

Fixes: b2e0d98705e60e45bbb3c0032c48824ad7ae0704 userns: Implement unshare of the user namespace
Reported-by: Ricky Zhou <rickyz@chromium.org>
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/fork.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 514dbc40f98..2358bd4c875 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1760,13 +1760,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
 				CLONE_NEWUSER|CLONE_NEWPID))
 		return -EINVAL;
 	/*
-	 * Not implemented, but pretend it works if there is nothing to
-	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
-	 * needs to unshare vm.
+	 * Not implemented, but pretend it works if there is nothing
+	 * to unshare.  Note that unsharing the address space or the
+	 * signal handlers also need to unshare the signal queues (aka
+	 * CLONE_THREAD).
 	 */
 	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
-		/* FIXME: get_task_mm() increments ->mm_users */
-		if (atomic_read(&current->mm->mm_users) > 1)
+		if (!thread_group_empty(current))
+			return -EINVAL;
+	}
+	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+		if (atomic_read(&current->sighand->count) > 1)
+			return -EINVAL;
+	}
+	if (unshare_flags & CLONE_VM) {
+		if (!current_is_single_threaded())
 			return -EINVAL;
 	}
 
@@ -1839,16 +1847,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	 */
 	if (unshare_flags & CLONE_NEWPID)
 		unshare_flags |= CLONE_THREAD;
-	/*
-	 * If unsharing a thread from a thread group, must also unshare vm.
-	 */
-	if (unshare_flags & CLONE_THREAD)
-		unshare_flags |= CLONE_VM;
 	/*
 	 * If unsharing vm, must also unshare signal handlers.
 	 */
 	if (unshare_flags & CLONE_VM)
 		unshare_flags |= CLONE_SIGHAND;
+	/*
+	 * If unsharing a signal handlers, must also unshare the signal queues.
+	 */
+	if (unshare_flags & CLONE_SIGHAND)
+		unshare_flags |= CLONE_THREAD;
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */

From f04fce5fcb1a6de2cf2f80c414ec9f16dbb15f6e Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht <adrien+dev@schischi.me>
Date: Wed, 19 Aug 2015 17:33:12 +0200
Subject: [PATCH 02/55] rtlwifi: rtl8192cu: Add new device ID

commit 1642d09fb9b128e8e538b2a4179962a34f38dff9 upstream.

The v2 of NetGear WNA1000M uses a different idProduct: USB ID 0846:9043

Signed-off-by: Adrien Schildknecht <adrien+dev@schischi.me>
Acked-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/net/wireless/rtlwifi/rtl8192cu/sw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
index 7555095e0b7..fa669b52fc9 100644
--- a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
+++ b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
@@ -313,6 +313,7 @@ static struct usb_device_id rtl8192c_usb_ids[] = {
 	{RTL_USB_DEVICE(0x07b8, 0x8188, rtl92cu_hal_cfg)}, /*Abocom - Abocom*/
 	{RTL_USB_DEVICE(0x07b8, 0x8189, rtl92cu_hal_cfg)}, /*Funai - Abocom*/
 	{RTL_USB_DEVICE(0x0846, 0x9041, rtl92cu_hal_cfg)}, /*NetGear WNA1000M*/
+	{RTL_USB_DEVICE(0x0846, 0x9043, rtl92cu_hal_cfg)}, /*NG WNA1000Mv2*/
 	{RTL_USB_DEVICE(0x0b05, 0x17ba, rtl92cu_hal_cfg)}, /*ASUS-Edimax*/
 	{RTL_USB_DEVICE(0x0bda, 0x5088, rtl92cu_hal_cfg)}, /*Thinkware-CC&C*/
 	{RTL_USB_DEVICE(0x0df6, 0x0052, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/

From f4487c483c26c0374450e0e910a45a7c0e91407d Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 1 Sep 2015 18:07:41 +0200
Subject: [PATCH 03/55] tg3: Fix temperature reporting

commit d3d11fe08ccc9bff174fc958722b5661f0932486 upstream.

The temperature registers appear to report values in degrees Celsius
while the hwmon API mandates values to be exposed in millidegrees
Celsius. Do the conversion so that the values reported by "sensors"
are correct.

Fixes: aed93e0bf493 ("tg3: Add hwmon support for temperature")
Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Prashant Sreedharan <prashant@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/net/ethernet/broadcom/tg3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 680d26d6d2c..518cc4b6c7d 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -10518,7 +10518,7 @@ static ssize_t tg3_show_temp(struct device *dev,
 	tg3_ape_scratchpad_read(tp, &temperature, attr->index,
 				sizeof(temperature));
 	spin_unlock_bh(&tp->lock);
-	return sprintf(buf, "%u\n", temperature);
+	return sprintf(buf, "%u\n", temperature * 1000);
 }
 
 
From 957c0c65c71536c95d0cd92e9a4583272402567d Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Sat, 13 Jun 2015 10:16:31 -0400
Subject: [PATCH 04/55] mac80211: enable assoc check for mesh interfaces

commit 3633ebebab2bbe88124388b7620442315c968e8f upstream.

We already set a station to be associated when peering completes, both
in user space and in the kernel.  Thus we should always have an
associated sta before sending data frames to that station.

Failure to check assoc state can cause crashes in the lower-level driver
due to transmitting unicast data frames before driver sta structures
(e.g. ampdu state in ath9k) are initialized.  This occurred when
forwarding in the presence of fixed mesh paths: frames were transmitted
to stations with whom we hadn't yet completed peering.

Reported-by: Alexis Green <agreen@cococorp.com>
Tested-by: Jesse Jones <jjones@cococorp.com>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/mac80211/tx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 10eea232602..e960fbe9e27 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -281,9 +281,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
 		return TX_CONTINUE;
 
-	if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
-		return TX_CONTINUE;
-
 	if (tx->flags & IEEE80211_TX_PS_BUFFERED)
 		return TX_CONTINUE;
 

From f828609ff36c1180c06e307d2c51d4ede337f7da Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Tue, 18 Aug 2015 20:50:10 +0100
Subject: [PATCH 05/55] arm64: kconfig: Move LIST_POISON to a safe value

commit bf0c4e04732479f650ff59d1ee82de761c0071f0 upstream.

Move the poison pointer offset to 0xdead000000000000, a
recognized value that is not mappable by user-space exploits.

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Thierry Strudel <tstrudel@google.com>
Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0677ff4814f..661ccf87b9a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -56,6 +56,10 @@ config NO_IOPORT
 config STACKTRACE_SUPPORT
 	def_bool y
 
+config ILLEGAL_POINTER_VALUE
+	hex
+	default 0xdead000000000000
+
 config LOCKDEP_SUPPORT
 	def_bool y
 

From a507adf4f05a41a638cd6cbdfd78149c35cec8db Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 15 Sep 2015 12:07:06 +0100
Subject: [PATCH 06/55] arm64: compat: fix vfp save/restore across signal
 handlers in big-endian

commit bdec97a855ef1e239f130f7a11584721c9a1bf04 upstream.

When saving/restoring the VFP registers from a compat (AArch32)
signal frame, we rely on the compat registers forming a prefix of the
native register file and therefore make use of copy_{to,from}_user to
transfer between the native fpsimd_state and the compat_vfp_sigframe.

Unfortunately, this doesn't work so well in a big-endian environment.
Our fpsimd save/restore code operates directly on 128-bit quantities
(Q registers) whereas the compat_vfp_sigframe represents the registers
as an array of 64-bit (D) registers. The architecture packs the compat D
registers into the Q registers, with the least significant bytes holding
the lower register. Consequently, we need to swap the 64-bit halves when
converting between these two representations on a big-endian machine.

This patch replaces the __copy_{to,from}_user invocations in our
compat VFP signal handling code with explicit __put_user loops that
operate on 64-bit values and swap them accordingly.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/kernel/signal32.c | 47 +++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index b9564b8d6ba..1e60acc6a4d 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -231,14 +231,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 
 /*
  * VFP save/restore code.
+ *
+ * We have to be careful with endianness, since the fpsimd context-switch
+ * code operates on 128-bit (Q) register values whereas the compat ABI
+ * uses an array of 64-bit (D) registers. Consequently, we need to swap
+ * the two halves of each Q register when running on a big-endian CPU.
  */
+union __fpsimd_vreg {
+	__uint128_t	raw;
+	struct {
+#ifdef __AARCH64EB__
+		u64	hi;
+		u64	lo;
+#else
+		u64	lo;
+		u64	hi;
+#endif
+	};
+};
+
 static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 {
 	struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
 	compat_ulong_t magic = VFP_MAGIC;
 	compat_ulong_t size = VFP_STORAGE_SIZE;
 	compat_ulong_t fpscr, fpexc;
-	int err = 0;
+	int i, err = 0;
 
 	/*
 	 * Save the hardware registers to the fpsimd_state structure.
@@ -254,10 +272,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 	/*
 	 * Now copy the FP registers. Since the registers are packed,
 	 * we can copy the prefix we want (V0-V15) as it is.
-	 * FIXME: Won't work if big endian.
 	 */
-	err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs,
-			      sizeof(frame->ufp.fpregs));
+	for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+		union __fpsimd_vreg vreg = {
+			.raw = fpsimd->vregs[i >> 1],
+		};
+
+		__put_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+		__put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+	}
 
 	/* Create an AArch32 fpscr from the fpsr and the fpcr. */
 	fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) |
@@ -282,7 +305,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
 	compat_ulong_t magic = VFP_MAGIC;
 	compat_ulong_t size = VFP_STORAGE_SIZE;
 	compat_ulong_t fpscr;
-	int err = 0;
+	int i, err = 0;
 
 	__get_user_error(magic, &frame->magic, err);
 	__get_user_error(size, &frame->size, err);
@@ -292,12 +315,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
 	if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE)
 		return -EINVAL;
 
-	/*
-	 * Copy the FP registers into the start of the fpsimd_state.
-	 * FIXME: Won't work if big endian.
-	 */
-	err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs,
-				sizeof(frame->ufp.fpregs));
+	/* Copy the FP registers into the start of the fpsimd_state. */
+	for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+		union __fpsimd_vreg vreg;
+
+		__get_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+		__get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+		fpsimd.vregs[i >> 1] = vreg.raw;
+	}
 
 	/* Extract the fpsr and the fpcr from the fpscr */
 	__get_user_error(fpscr, &frame->ufp.fpscr, err);

From 8a31f0de7f474c13bcb4312f5c517bc594330f68 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 2 Sep 2015 18:49:28 +0100
Subject: [PATCH 07/55] arm64: head.S: initialise mdcr_el2 in el2_setup

commit d10bcd473301888f957ec4b6b12aa3621be78d59 upstream.

When entering the kernel at EL2, we fail to initialise the MDCR_EL2
register which controls debug access and PMU capabilities at EL1.

This patch ensures that the register is initialised so that all traps
are disabled and all the PMU counters are available to the host. When a
guest is scheduled, KVM takes care to configure trapping appropriately.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/kernel/head.S | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 53dcae49e72..f480e7d6e8b 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -184,6 +184,11 @@ ENTRY(el2_setup)
 	msr	hstr_el2, xzr			// Disable CP15 traps to EL2
 #endif
 
+	/* EL2 debug */
+	mrs	x0, pmcr_el0			// Disable debug access traps
+	ubfx	x0, x0, #11, #5			// to EL2 and allow access to
+	msr	mdcr_el2, x0			// all PMU counters from EL1
+
 	/* Stage-2 translation */
 	msr	vttbr_el2, xzr
 

From e8c2bbe98ae6636123043bf7a9bf908635b1c5cc Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 13 Aug 2015 18:02:39 +0200
Subject: [PATCH 08/55] ALSA: hda - Enable headphone jack detect on old Fujitsu
 laptops

commit bb148bdeb0ab16fc0ae8009799471e4d7180073b upstream.

According to the bug report, FSC Amilo laptops with ALC880 can detect
the headphone jack but currently the driver disables it.  It's partly
intentionally, as non-working jack detect was reported in the past.
Let's enable now.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=102501
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 sound/pci/hda/patch_realtek.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index d30252e7f3e..23f8241924b 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -1137,7 +1137,7 @@ static const struct hda_fixup alc880_fixups[] = {
 		/* override all pins as BIOS on old Amilo is broken */
 		.type = HDA_FIXUP_PINS,
 		.v.pins = (const struct hda_pintbl[]) {
-			{ 0x14, 0x0121411f }, /* HP */
+			{ 0x14, 0x0121401f }, /* HP */
 			{ 0x15, 0x99030120 }, /* speaker */
 			{ 0x16, 0x99030130 }, /* bass speaker */
 			{ 0x17, 0x411111f0 }, /* N/A */
@@ -1157,7 +1157,7 @@ static const struct hda_fixup alc880_fixups[] = {
 		/* almost compatible with FUJITSU, but no bass and SPDIF */
 		.type = HDA_FIXUP_PINS,
 		.v.pins = (const struct hda_pintbl[]) {
-			{ 0x14, 0x0121411f }, /* HP */
+			{ 0x14, 0x0121401f }, /* HP */
 			{ 0x15, 0x99030120 }, /* speaker */
 			{ 0x16, 0x411111f0 }, /* N/A */
 			{ 0x17, 0x411111f0 }, /* N/A */

From 4c9510d519440ca46a4bf8f72ad72f842db22432 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 13 Aug 2015 18:05:06 +0200
Subject: [PATCH 09/55] ALSA: hda - Use ALC880_FIXUP_FUJITSU for FSC Amilo
 M1437

commit a161574e200ae63a5042120e0d8c36830e81bde3 upstream.

It turned out that the machine has a bass speaker, so take a correct
fixup entry.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=102501
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 sound/pci/hda/patch_realtek.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 23f8241924b..183a96ab253 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -1365,7 +1365,7 @@ static const struct snd_pci_quirk alc880_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x161f, 0x203d, "W810", ALC880_FIXUP_W810),
 	SND_PCI_QUIRK(0x161f, 0x205d, "Medion Rim 2150", ALC880_FIXUP_MEDION_RIM),
 	SND_PCI_QUIRK(0x1631, 0xe011, "PB 13201056", ALC880_FIXUP_6ST_AUTOMUTE),
-	SND_PCI_QUIRK(0x1734, 0x107c, "FSC F1734", ALC880_FIXUP_F1734),
+	SND_PCI_QUIRK(0x1734, 0x107c, "FSC Amilo M1437", ALC880_FIXUP_FUJITSU),
 	SND_PCI_QUIRK(0x1734, 0x1094, "FSC Amilo M1451G", ALC880_FIXUP_FUJITSU),
 	SND_PCI_QUIRK(0x1734, 0x10ac, "FSC AMILO Xi 1526", ALC880_FIXUP_F1734),
 	SND_PCI_QUIRK(0x1734, 0x10b0, "FSC Amilo Pi1556", ALC880_FIXUP_FUJITSU),

From 36e5789bc706a67a0281a14b3888b3f5d0f723e0 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 7 Aug 2015 16:19:43 +1000
Subject: [PATCH 10/55] powerpc/mm: Fix pte_pagesize_index() crash on 4K w/64K
 hash

commit 74b5037baa2011a2799e2c43adde7d171b072f9e upstream.

The powerpc kernel can be built to have either a 4K PAGE_SIZE or a 64K
PAGE_SIZE.

However when built with a 4K PAGE_SIZE there is an additional config
option which can be enabled, PPC_HAS_HASH_64K, which means the kernel
also knows how to hash a 64K page even though the base PAGE_SIZE is 4K.

This is used in one obscure configuration, to support 64K pages for SPU
local store on the Cell processor when the rest of the kernel is using
4K pages.

In this configuration, pte_pagesize_index() is defined to just pass
through its arguments to get_slice_psize(). However pte_pagesize_index()
is called for both user and kernel addresses, whereas get_slice_psize()
only knows how to handle user addresses.

This has been broken forever, however until recently it happened to
work. That was because in get_slice_psize() the large kernel address
would cause the right shift of the slice mask to return zero.

However in commit 7aa0727f3302 ("powerpc/mm: Increase the slice range to
64TB"), the get_slice_psize() code was changed so that instead of a
right shift we do an array lookup based on the address. When passed a
kernel address this means we index way off the end of the slice array
and return random junk.

That is only fatal if we happen to hit something non-zero, but when we
do return a non-zero value we confuse the MMU code and eventually cause
a check stop.

This fix is ugly, but simple. When we're called for a kernel address we
return 4K, which is always correct in this configuration, otherwise we
use the slice mask.

Fixes: 7aa0727f3302 ("powerpc/mm: Increase the slice range to 64TB")
Reported-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f24f..6fbb2b46098 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -130,7 +130,19 @@
 #define pte_iterate_hashed_end() } while(0)
 
 #ifdef CONFIG_PPC_HAS_HASH_64K
-#define pte_pagesize_index(mm, addr, pte)	get_slice_psize(mm, addr)
+/*
+ * We expect this to be called only for user addresses or kernel virtual
+ * addresses other than the linear mapping.
+ */
+#define pte_pagesize_index(mm, addr, pte)			\
+	({							\
+		unsigned int psize;				\
+		if (is_kernel_addr(addr))			\
+			psize = MMU_PAGE_4K;			\
+		else						\
+			psize = get_slice_psize(mm, addr);	\
+		psize;						\
+	})
 #else
 #define pte_pagesize_index(mm, addr, pte)	MMU_PAGE_4K
 #endif

From 2ba90c0e0638dba68ffe6e78b135729e41a4540a Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Fri, 17 Jul 2015 12:46:58 +0200
Subject: [PATCH 11/55] powerpc/rtas: Introduce rtas_get_sensor_fast() for IRQ
 handlers

commit 1c2cb594441d02815d304cccec9742ff5c707495 upstream.

The EPOW interrupt handler uses rtas_get_sensor(), which in turn
uses rtas_busy_delay() to wait for RTAS becoming ready in case it
is necessary. But rtas_busy_delay() is annotated with might_sleep()
and thus may not be used by interrupts handlers like the EPOW handler!
This leads to the following BUG when CONFIG_DEBUG_ATOMIC_SLEEP is
enabled:

 BUG: sleeping function called from invalid context at arch/powerpc/kernel/rtas.c:496
 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.2.0-rc2-thuth #6
 Call Trace:
 [c00000007ffe7b90] [c000000000807670] dump_stack+0xa0/0xdc (unreliable)
 [c00000007ffe7bc0] [c0000000000e1f14] ___might_sleep+0x134/0x180
 [c00000007ffe7c20] [c00000000002aec0] rtas_busy_delay+0x30/0xd0
 [c00000007ffe7c50] [c00000000002bde4] rtas_get_sensor+0x74/0xe0
 [c00000007ffe7ce0] [c000000000083264] ras_epow_interrupt+0x44/0x450
 [c00000007ffe7d90] [c000000000120260] handle_irq_event_percpu+0xa0/0x300
 [c00000007ffe7e70] [c000000000120524] handle_irq_event+0x64/0xc0
 [c00000007ffe7eb0] [c000000000124dbc] handle_fasteoi_irq+0xec/0x260
 [c00000007ffe7ef0] [c00000000011f4f0] generic_handle_irq+0x50/0x80
 [c00000007ffe7f20] [c000000000010f3c] __do_irq+0x8c/0x200
 [c00000007ffe7f90] [c0000000000236cc] call_do_irq+0x14/0x24
 [c00000007e6f39e0] [c000000000011144] do_IRQ+0x94/0x110
 [c00000007e6f3a30] [c000000000002594] hardware_interrupt_common+0x114/0x180

Fix this issue by introducing a new rtas_get_sensor_fast() function
that does not use rtas_busy_delay() - and thus can only be used for
sensors that do not cause a BUSY condition - known as "fast" sensors.

The EPOW sensor is defined to be "fast" in sPAPR - mpe.

Fixes: 587f83e8dd50 ("powerpc/pseries: Use rtas_get_sensor in RAS code")
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/include/asm/rtas.h      |  1 +
 arch/powerpc/kernel/rtas.c           | 17 +++++++++++++++++
 arch/powerpc/platforms/pseries/ras.c |  3 ++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 34fd70488d8..c5d5cb36f6c 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -255,6 +255,7 @@ extern void rtas_power_off(void);
 extern void rtas_halt(void);
 extern void rtas_os_term(char *str);
 extern int rtas_get_sensor(int sensor, int index, int *state);
+extern int rtas_get_sensor_fast(int sensor, int index, int *state);
 extern int rtas_get_power_level(int powerdomain, int *level);
 extern int rtas_set_power_level(int powerdomain, int level, int *setlevel);
 extern bool rtas_indicator_present(int token, int *maxindex);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 52add6f3e20..f956a2f84a1 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -584,6 +584,23 @@ int rtas_get_sensor(int sensor, int index, int *state)
 }
 EXPORT_SYMBOL(rtas_get_sensor);
 
+int rtas_get_sensor_fast(int sensor, int index, int *state)
+{
+	int token = rtas_token("get-sensor-state");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	rc = rtas_call(token, 2, 2, state, sensor, index);
+	WARN_ON(rc == RTAS_BUSY || (rc >= RTAS_EXTENDED_DELAY_MIN &&
+				    rc <= RTAS_EXTENDED_DELAY_MAX));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
 bool rtas_indicator_present(int token, int *maxindex)
 {
 	int proplen, count, i;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index c4dfccd3a3d..2338e6e9848 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -187,7 +187,8 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 	int state;
 	int critical;
 
-	status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
+	status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX,
+				      &state);
 
 	if (state > 3)
 		critical = 1;		/* Time Critical */

From 1d6c45737c391f980df36e9183f81f8b19632d37 Mon Sep 17 00:00:00 2001
From: Jeffery Miller <jmiller@neverware.com>
Date: Tue, 1 Sep 2015 11:23:02 -0400
Subject: [PATCH 12/55] Add radeon suspend/resume quirk for HP Compaq dc5750.

commit 09bfda10e6efd7b65bcc29237bee1765ed779657 upstream.

With the radeon driver loaded the HP Compaq dc5750
Small Form Factor machine fails to resume from suspend.
Adding a quirk similar to other devices avoids
the problem and the system resumes properly.

Signed-off-by: Jeffery Miller <jmiller@neverware.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/radeon/radeon_combios.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c
index 8cac6981905..9c64a973190 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -3403,6 +3403,14 @@ void radeon_combios_asic_init(struct drm_device *dev)
 	    rdev->pdev->subsystem_device == 0x30ae)
 		return;
 
+	/* quirk for rs4xx HP Compaq dc5750 Small Form Factor to make it resume
+	 * - it hangs on resume inside the dynclk 1 table.
+	 */
+	if (rdev->family == CHIP_RS480 &&
+	    rdev->pdev->subsystem_vendor == 0x103c &&
+	    rdev->pdev->subsystem_device == 0x280a)
+		return;
+
 	/* DYN CLK 1 */
 	table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE);
 	if (table)

From 55b9029eab6e1cf44afa35b9823308246a0605d9 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnfhuang@gmail.com>
Date: Sun, 12 Jul 2015 20:18:42 +0800
Subject: [PATCH 13/55] x86/mm: Initialize pmd_idx in
 page_table_range_init_count()

commit 9962eea9e55f797f05f20ba6448929cab2a9f018 upstream.

The variable pmd_idx is not initialized for the first iteration of the
for loop.

Assign the proper value which indexes the start address.

Fixes: 719272c45b82 'x86, mm: only call early_ioremap_page_table_range_init() once'
Signed-off-by: Minfei Huang <mnfhuang@gmail.com>
Cc: tony.luck@intel.com
Cc: wangnan0@huawei.com
Cc: david.vrabel@citrix.com
Reviewed-by: yinghai@kernel.org
Link: http://lkml.kernel.org/r/1436703522-29552-1-git-send-email-mhuang@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/mm/init_32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3ac7e319918..6a70f0ee409 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end)
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
+	pmd_idx = pmd_index(vaddr);
 
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);

From 92a6eef0fb73b10dcc9fa3061d3ec2cf604d6d0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20H=C3=A4rdeman?= <david@hardeman.nu>
Date: Tue, 19 May 2015 19:03:12 -0300
Subject: [PATCH 14/55] rc-core: fix remove uevent generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit a66b0c41ad277ae62a3ae6ac430a71882f899557 upstream.

The input_dev is already gone when the rc device is being unregistered
so checking for its presence only means that no remove uevent will be
generated.

Signed-off-by: David Härdeman <david@hardeman.nu>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/media/rc/rc-main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 1cf382a0b27..cf7bbb6c980 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -943,9 +943,6 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env)
 {
 	struct rc_dev *dev = to_rc_dev(device);
 
-	if (!dev || !dev->input_dev)
-		return -ENODEV;
-
 	if (dev->rc_map.name)
 		ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name);
 	if (dev->driver_name)

From 9520ac796ef656c8ca8f7b0dfc56651f185d845f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 30 Jul 2015 13:00:56 +1000
Subject: [PATCH 15/55] NFSv4: don't set SETATTR for O_RDONLY|O_EXCL

commit efcbc04e16dfa95fef76309f89710dd1d99a5453 upstream.

It is unusual to combine the open flags O_RDONLY and O_EXCL, but
it appears that libre-office does just that.

[pid  3250] stat("/home/USER/.config", {st_mode=S_IFDIR|0700, st_size=8192, ...}) = 0
[pid  3250] open("/home/USER/.config/libreoffice/4-suse/user/extensions/buildid", O_RDONLY|O_EXCL <unfinished ...>

NFSv4 takes O_EXCL as a sign that a setattr command should be sent,
probably to reset the timestamps.

When it was an O_RDONLY open, the SETATTR command does not
identify any actual attributes to change.
If no delegation was provided to the open, the SETATTR uses the
all-zeros stateid and the request is accepted (at least by the
Linux NFS server - no harm, no foul).

If a read-delegation was provided, this is used in the SETATTR
request, and a Netapp filer will justifiably claim
NFS4ERR_BAD_STATEID, which the Linux client takes as a sign
to retry - indefinitely.

So only treat O_EXCL specially if O_CREAT was also given.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 20ebcfa3c92..78679b48948 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2043,7 +2043,7 @@ static int _nfs4_do_open(struct inode *dir,
 	if (status != 0)
 		goto err_opendata_put;
 
-	if ((opendata->o_arg.open_flags & O_EXCL) &&
+	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
 		nfs4_exclusive_attrset(opendata, sattr);
 

From 690eb5ee31ee70eb31565de1052147e20cc40ff0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Aug 2015 12:57:07 -0500
Subject: [PATCH 16/55] NFS: nfs_set_pgio_error sometimes misses errors

commit e9ae58aeee8842a50f7e199d602a5ccb2e41a95f upstream.

We should ensure that we always set the pgio_header's error field
if a READ or WRITE RPC call returns an error. The current code depends
on 'hdr->good_bytes' always being initialised to a large value, which
is not always done correctly by callers.
When this happens, applications may end up missing important errors.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/pagelist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29cfb7ade12..d852ca281c1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,8 +60,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
 	spin_lock(&hdr->lock);
-	if (pos < hdr->io_start + hdr->good_bytes) {
-		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+	if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+	    || pos < hdr->io_start + hdr->good_bytes) {
 		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
 		hdr->good_bytes = pos - hdr->io_start;
 		hdr->error = error;

From 706ad8dcb5a2db85e6c2b2da30449c9a9918fa9d Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 3 Sep 2015 22:45:21 +0200
Subject: [PATCH 17/55] parisc: Filter out spurious interrupts in PA-RISC irq
 handler

commit b1b4e435e4ef7de77f07bf2a42c8380b960c2d44 upstream.

When detecting a serial port on newer PA-RISC machines (with iosapic) we have a
long way to go to find the right IRQ line, registering it, then registering the
serial port and the irq handler for the serial port. During this phase spurious
interrupts for the serial port may happen which then crashes the kernel because
the action handler might not have been set up yet.

So, basically it's a race condition between the serial port hardware and the
CPU which sets up the necessary fields in the irq sructs. The main reason for
this race is, that we unmask the serial port irqs too early without having set
up everything properly before (which isn't easily possible because we need the
IRQ number to register the serial ports).

This patch is a work-around for this problem. It adds checks to the CPU irq
handler to verify if the IRQ action field has been initialized already. If not,
we just skip this interrupt (which isn't critical for a serial port at bootup).
The real fix would probably involve rewriting all PA-RISC specific IRQ code
(for CPU, IOSAPIC, GSC and EISA) to use IRQ domains with proper parenting of
the irq chips and proper irq enabling along this line.

This bug has been in the PA-RISC port since the beginning, but the crashes
happened very rarely with currently used hardware.  But on the latest machine
which I bought (a C8000 workstation), which uses the fastest CPUs (4 x PA8900,
1GHz) and which has the largest possible L1 cache size (64MB each), the kernel
crashed at every boot because of this race. So, without this patch the machine
would currently be unuseable.

For the record, here is the flow logic:
1. serial_init_chip() in 8250_gsc.c calls iosapic_serial_irq().
2. iosapic_serial_irq() calls txn_alloc_irq() to find the irq.
3. iosapic_serial_irq() calls cpu_claim_irq() to register the CPU irq
4. cpu_claim_irq() unmasks the CPU irq (which it shouldn't!)
5. serial_init_chip() then registers the 8250 port.
Problems:
- In step 4 the CPU irq shouldn't have been registered yet, but after step 5
- If serial irq happens between 4 and 5 have finished, the kernel will crash

Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/parisc/kernel/irq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 2e6443b1e92..c32a37e0e0d 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -524,8 +524,8 @@ void do_cpu_irq_mask(struct pt_regs *regs)
 	struct pt_regs *old_regs;
 	unsigned long eirr_val;
 	int irq, cpu = smp_processor_id();
-#ifdef CONFIG_SMP
 	struct irq_desc *desc;
+#ifdef CONFIG_SMP
 	cpumask_t dest;
 #endif
 
@@ -538,8 +538,12 @@ void do_cpu_irq_mask(struct pt_regs *regs)
 		goto set_out;
 	irq = eirr_to_irq(eirr_val);
 
-#ifdef CONFIG_SMP
+	/* Filter out spurious interrupts, mostly from serial port at bootup */
 	desc = irq_to_desc(irq);
+	if (unlikely(!desc->action))
+		goto set_out;
+
+#ifdef CONFIG_SMP
 	cpumask_copy(&dest, desc->irq_data.affinity);
 	if (irqd_is_per_cpu(&desc->irq_data) &&
 	    !cpu_isset(smp_processor_id(), dest)) {

From de047ce49582e2fc3303efc58b40f4dbe7a4519f Mon Sep 17 00:00:00 2001
From: Jaewon Kim <jaewon31.kim@samsung.com>
Date: Tue, 8 Sep 2015 15:02:21 -0700
Subject: [PATCH 18/55] vmscan: fix increasing nr_isolated incurred by putback
 unevictable pages

commit c54839a722a02818677bcabe57e957f0ce4f841d upstream.

reclaim_clean_pages_from_list() assumes that shrink_page_list() returns
number of pages removed from the candidate list.  But shrink_page_list()
puts back mlocked pages without passing it to caller and without
counting as nr_reclaimed.  This increases nr_isolated.

To fix this, this patch changes shrink_page_list() to pass unevictable
pages back to caller.  Caller will take care those pages.

Minchan said:

It fixes two issues.

1. With unevictable page, cma_alloc will be successful.

Exactly speaking, cma_alloc of current kernel will fail due to
unevictable pages.

2. fix leaking of NR_ISOLATED counter of vmstat

With it, too_many_isolated works.  Otherwise, it could make hang until
the process get SIGKILL.

Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 233f0011f76..a1e3becef05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -925,7 +925,7 @@ cull_mlocked:
 		if (PageSwapCache(page))
 			try_to_free_swap(page);
 		unlock_page(page);
-		putback_lru_page(page);
+		list_add(&page->lru, &ret_pages);
 		continue;
 
 activate_locked:

From 9bdee2f90804f2c5865b1c4a75d3208ef28b9007 Mon Sep 17 00:00:00 2001
From: Jann Horn <jann@thejh.net>
Date: Wed, 9 Sep 2015 15:38:28 -0700
Subject: [PATCH 19/55] fs: if a coredump already exists, unlink and recreate
 with O_EXCL

commit fbb1816942c04429e85dbf4c1a080accc534299e upstream.

It was possible for an attacking user to trick root (or another user) into
writing his coredumps into an attacker-readable, pre-existing file using
rename() or link(), causing the disclosure of secret data from the victim
process' virtual memory.  Depending on the configuration, it was also
possible to trick root into overwriting system files with coredumps.  Fix
that issue by never writing coredumps into existing files.

Requirements for the attack:
 - The attack only applies if the victim's process has a nonzero
   RLIMIT_CORE and is dumpable.
 - The attacker can trick the victim into coredumping into an
   attacker-writable directory D, either because the core_pattern is
   relative and the victim's cwd is attacker-writable or because an
   absolute core_pattern pointing to a world-writable directory is used.
 - The attacker has one of these:
  A: on a system with protected_hardlinks=0:
     execute access to a folder containing a victim-owned,
     attacker-readable file on the same partition as D, and the
     victim-owned file will be deleted before the main part of the attack
     takes place. (In practice, there are lots of files that fulfill
     this condition, e.g. entries in Debian's /var/lib/dpkg/info/.)
     This does not apply to most Linux systems because most distros set
     protected_hardlinks=1.
  B: on a system with protected_hardlinks=1:
     execute access to a folder containing a victim-owned,
     attacker-readable and attacker-writable file on the same partition
     as D, and the victim-owned file will be deleted before the main part
     of the attack takes place.
     (This seems to be uncommon.)
  C: on any system, independent of protected_hardlinks:
     write access to a non-sticky folder containing a victim-owned,
     attacker-readable file on the same partition as D
     (This seems to be uncommon.)

The basic idea is that the attacker moves the victim-owned file to where
he expects the victim process to dump its core.  The victim process dumps
its core into the existing file, and the attacker reads the coredump from
it.

If the attacker can't move the file because he does not have write access
to the containing directory, he can instead link the file to a directory
he controls, then wait for the original link to the file to be deleted
(because the kernel checks that the link count of the corefile is 1).

A less reliable variant that requires D to be non-sticky works with link()
and does not require deletion of the original link: link() the file into
D, but then unlink() it directly before the kernel performs the link count
check.

On systems with protected_hardlinks=0, this variant allows an attacker to
not only gain information from coredumps, but also clobber existing,
victim-writable files with coredumps.  (This could theoretically lead to a
privilege escalation.)

Signed-off-by: Jann Horn <jann@thejh.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/coredump.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 1d402ce5b72..4f03b2b5037 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -491,10 +491,10 @@ void do_coredump(siginfo_t *siginfo)
 	const struct cred *old_cred;
 	struct cred *cred;
 	int retval = 0;
-	int flag = 0;
 	int ispipe;
 	struct files_struct *displaced;
-	bool need_nonrelative = false;
+	/* require nonrelative corefile path and be extra careful */
+	bool need_suid_safe = false;
 	bool core_dumped = false;
 	static atomic_t core_dump_count = ATOMIC_INIT(0);
 	struct coredump_params cprm = {
@@ -528,9 +528,8 @@ void do_coredump(siginfo_t *siginfo)
 	 */
 	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
 		/* Setuid core dump mode */
-		flag = O_EXCL;		/* Stop rewrite attacks */
 		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
-		need_nonrelative = true;
+		need_suid_safe = true;
 	}
 
 	retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -611,7 +610,7 @@ void do_coredump(siginfo_t *siginfo)
 		if (cprm.limit < binfmt->min_coredump)
 			goto fail_unlock;
 
-		if (need_nonrelative && cn.corename[0] != '/') {
+		if (need_suid_safe && cn.corename[0] != '/') {
 			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
 				"to fully qualified path!\n",
 				task_tgid_vnr(current), current->comm);
@@ -619,8 +618,35 @@ void do_coredump(siginfo_t *siginfo)
 			goto fail_unlock;
 		}
 
+		/*
+		 * Unlink the file if it exists unless this is a SUID
+		 * binary - in that case, we're running around with root
+		 * privs and don't want to unlink another user's coredump.
+		 */
+		if (!need_suid_safe) {
+			mm_segment_t old_fs;
+
+			old_fs = get_fs();
+			set_fs(KERNEL_DS);
+			/*
+			 * If it doesn't exist, that's fine. If there's some
+			 * other problem, we'll catch it at the filp_open().
+			 */
+			(void) sys_unlink((const char __user *)cn.corename);
+			set_fs(old_fs);
+		}
+
+		/*
+		 * There is a race between unlinking and creating the
+		 * file, but if that causes an EEXIST here, that's
+		 * fine - another process raced with us while creating
+		 * the corefile, and the other process won. To userspace,
+		 * what matters is that at least one of the two processes
+		 * writes its coredump successfully, not which one.
+		 */
 		cprm.file = filp_open(cn.corename,
-				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+				 O_CREAT | 2 | O_NOFOLLOW |
+				 O_LARGEFILE | O_EXCL,
 				 0600);
 		if (IS_ERR(cprm.file))
 			goto fail_unlock;

From ab7a4b4b9d31e0458a5f327d1da66d649d814066 Mon Sep 17 00:00:00 2001
From: Jialing Fu <jlfu@marvell.com>
Date: Fri, 28 Aug 2015 11:13:09 +0800
Subject: [PATCH 20/55] mmc: core: fix race condition in mmc_wait_data_done

commit 71f8a4b81d040b3d094424197ca2f1bf811b1245 upstream.

The following panic is captured in ker3.14, but the issue still exists
in latest kernel.
---------------------------------------------------------------------
[   20.738217] c0 3136 (Compiler) Unable to handle kernel NULL pointer dereference
at virtual address 00000578
......
[   20.738499] c0 3136 (Compiler) PC is at _raw_spin_lock_irqsave+0x24/0x60
[   20.738527] c0 3136 (Compiler) LR is at _raw_spin_lock_irqsave+0x20/0x60
[   20.740134] c0 3136 (Compiler) Call trace:
[   20.740165] c0 3136 (Compiler) [<ffffffc0008ee900>] _raw_spin_lock_irqsave+0x24/0x60
[   20.740200] c0 3136 (Compiler) [<ffffffc0000dd024>] __wake_up+0x1c/0x54
[   20.740230] c0 3136 (Compiler) [<ffffffc000639414>] mmc_wait_data_done+0x28/0x34
[   20.740262] c0 3136 (Compiler) [<ffffffc0006391a0>] mmc_request_done+0xa4/0x220
[   20.740314] c0 3136 (Compiler) [<ffffffc000656894>] sdhci_tasklet_finish+0xac/0x264
[   20.740352] c0 3136 (Compiler) [<ffffffc0000a2b58>] tasklet_action+0xa0/0x158
[   20.740382] c0 3136 (Compiler) [<ffffffc0000a2078>] __do_softirq+0x10c/0x2e4
[   20.740411] c0 3136 (Compiler) [<ffffffc0000a24bc>] irq_exit+0x8c/0xc0
[   20.740439] c0 3136 (Compiler) [<ffffffc00008489c>] handle_IRQ+0x48/0xac
[   20.740469] c0 3136 (Compiler) [<ffffffc000081428>] gic_handle_irq+0x38/0x7c
----------------------------------------------------------------------
Because in SMP, "mrq" has race condition between below two paths:
path1: CPU0: <tasklet context>
  static void mmc_wait_data_done(struct mmc_request *mrq)
  {
     mrq->host->context_info.is_done_rcv = true;
     //
     // If CPU0 has just finished "is_done_rcv = true" in path1, and at
     // this moment, IRQ or ICache line missing happens in CPU0.
     // What happens in CPU1 (path2)?
     //
     // If the mmcqd thread in CPU1(path2) hasn't entered to sleep mode:
     // path2 would have chance to break from wait_event_interruptible
     // in mmc_wait_for_data_req_done and continue to run for next
     // mmc_request (mmc_blk_rw_rq_prep).
     //
     // Within mmc_blk_rq_prep, mrq is cleared to 0.
     // If below line still gets host from "mrq" as the result of
     // compiler, the panic happens as we traced.
     wake_up_interruptible(&mrq->host->context_info.wait);
  }

path2: CPU1: <The mmcqd thread runs mmc_queue_thread>
  static int mmc_wait_for_data_req_done(...
  {
     ...
     while (1) {
           wait_event_interruptible(context_info->wait,
                   (context_info->is_done_rcv ||
                    context_info->is_new_req));
     	   static void mmc_blk_rw_rq_prep(...
           {
           ...
           memset(brq, 0, sizeof(struct mmc_blk_request));

This issue happens very coincidentally; however adding mdelay(1) in
mmc_wait_data_done as below could duplicate it easily.

   static void mmc_wait_data_done(struct mmc_request *mrq)
   {
     mrq->host->context_info.is_done_rcv = true;
+    mdelay(1);
     wake_up_interruptible(&mrq->host->context_info.wait);
    }

At runtime, IRQ or ICache line missing may just happen at the same place
of the mdelay(1).

This patch gets the mmc_context_info at the beginning of function, it can
avoid this race condition.

Signed-off-by: Jialing Fu <jlfu@marvell.com>
Tested-by: Shawn Lin <shawn.lin@rock-chips.com>
Fixes: 2220eedfd7ae ("mmc: fix async request mechanism ....")
Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mmc/core/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index c40396f2320..68ab26385d0 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -327,8 +327,10 @@ EXPORT_SYMBOL(mmc_start_bkops);
  */
 static void mmc_wait_data_done(struct mmc_request *mrq)
 {
-	mrq->host->context_info.is_done_rcv = true;
-	wake_up_interruptible(&mrq->host->context_info.wait);
+	struct mmc_context_info *context_info = &mrq->host->context_info;
+
+	context_info->is_done_rcv = true;
+	wake_up_interruptible(&context_info->wait);
 }
 
 static void mmc_wait_done(struct mmc_request *mrq)

From d3e972d5e77997cf0944d2c91162a9e893264c92 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 6 Jul 2015 17:37:49 +1000
Subject: [PATCH 21/55] md/raid10: always set reshape_safe when initializing
 reshape_position.

commit 299b0685e31c9f3dcc2d58ee3beca761a40b44b3 upstream.

'reshape_position' tracks where in the reshape we have reached.
'reshape_safe' tracks where in the reshape we have safely recorded
in the metadata.

These are compared to determine when to update the metadata.
So it is important that reshape_safe is initialised properly.
Currently it isn't.  When starting a reshape from the beginning
it usually has the correct value by luck.  But when reducing the
number of devices in a RAID10, it has the wrong value and this leads
to the metadata not being updated correctly.
This can lead to corruption if the reshape is not allowed to complete.

This patch is suitable for any -stable kernel which supports RAID10
reshape, which is 3.5 and later.

Fixes: 3ea7daa5d7fd ("md/raid10: add reshape support")
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/md/raid10.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a1ea2a75391..5b2a1eaea34 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3578,6 +3578,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 			/* far_copies must be 1 */
 			conf->prev.stride = conf->dev_sectors;
 	}
+	conf->reshape_safe = conf->reshape_progress;
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
@@ -3785,7 +3786,6 @@ static int run(struct mddev *mddev)
 		}
 		conf->offset_diff = min_offset_diff;
 
-		conf->reshape_safe = conf->reshape_progress;
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -4130,6 +4130,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 		conf->reshape_progress = size;
 	} else
 		conf->reshape_progress = 0;
+	conf->reshape_safe = conf->reshape_progress;
 	spin_unlock_irq(&conf->device_lock);
 
 	if (mddev->delta_disks && mddev->bitmap) {
@@ -4196,6 +4197,7 @@ abort:
 		rdev->new_data_offset = rdev->data_offset;
 	smp_wmb();
 	conf->reshape_progress = MaxSector;
+	conf->reshape_safe = MaxSector;
 	mddev->reshape_position = MaxSector;
 	spin_unlock_irq(&conf->device_lock);
 	return ret;
@@ -4543,6 +4545,7 @@ static void end_reshape(struct r10conf *conf)
 	md_finish_reshape(conf->mddev);
 	smp_wmb();
 	conf->reshape_progress = MaxSector;
+	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* read-ahead size must cover two whole stripes, which is

From f8cb6399b16470397e7870e272ee7f3b03ed76ef Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 9 Jan 2015 18:06:12 +0000
Subject: [PATCH 22/55] xen/gntdev: convert priv->lock to a mutex

commit 1401c00e59ea021c575f74612fe2dbba36d6a4ee upstream.

Unmapping may require sleeping and we unmap while holding priv->lock, so
convert it to a mutex.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/xen/gntdev.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 474d11499d0..e68205cbc46 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -65,7 +65,7 @@ struct gntdev_priv {
 	 * Only populated if populate_freeable_maps == 1 */
 	struct list_head freeable_maps;
 	/* lock protects maps and freeable_maps */
-	spinlock_t lock;
+	struct mutex lock;
 	struct mm_struct *mm;
 	struct mmu_notifier mn;
 };
@@ -214,9 +214,9 @@ static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map)
 	}
 
 	if (populate_freeable_maps && priv) {
-		spin_lock(&priv->lock);
+		mutex_lock(&priv->lock);
 		list_del(&map->next);
-		spin_unlock(&priv->lock);
+		mutex_unlock(&priv->lock);
 	}
 
 	if (map->pages && !use_ptemod)
@@ -392,9 +392,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
 		 * not do any unmapping, since that has been done prior to
 		 * closing the vma, but it may still iterate the unmap_ops list.
 		 */
-		spin_lock(&priv->lock);
+		mutex_lock(&priv->lock);
 		map->vma = NULL;
-		spin_unlock(&priv->lock);
+		mutex_unlock(&priv->lock);
 	}
 	vma->vm_private_data = NULL;
 	gntdev_put_map(priv, map);
@@ -438,14 +438,14 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
 	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
 	struct grant_map *map;
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 	list_for_each_entry(map, &priv->maps, next) {
 		unmap_if_in_range(map, start, end);
 	}
 	list_for_each_entry(map, &priv->freeable_maps, next) {
 		unmap_if_in_range(map, start, end);
 	}
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 }
 
 static void mn_invl_page(struct mmu_notifier *mn,
@@ -462,7 +462,7 @@ static void mn_release(struct mmu_notifier *mn,
 	struct grant_map *map;
 	int err;
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 	list_for_each_entry(map, &priv->maps, next) {
 		if (!map->vma)
 			continue;
@@ -481,7 +481,7 @@ static void mn_release(struct mmu_notifier *mn,
 		err = unmap_grant_pages(map, /* offset */ 0, map->count);
 		WARN_ON(err);
 	}
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 }
 
 static struct mmu_notifier_ops gntdev_mmu_ops = {
@@ -503,7 +503,7 @@ static int gntdev_open(struct inode *inode, struct file *flip)
 
 	INIT_LIST_HEAD(&priv->maps);
 	INIT_LIST_HEAD(&priv->freeable_maps);
-	spin_lock_init(&priv->lock);
+	mutex_init(&priv->lock);
 
 	if (use_ptemod) {
 		priv->mm = get_task_mm(current);
@@ -579,10 +579,10 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
 		return -EFAULT;
 	}
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 	gntdev_add_map(priv, map);
 	op.index = map->index << PAGE_SHIFT;
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 
 	if (copy_to_user(u, &op, sizeof(op)) != 0)
 		return -EFAULT;
@@ -601,7 +601,7 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
 		return -EFAULT;
 	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
 	if (map) {
 		list_del(&map->next);
@@ -609,7 +609,7 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
 			list_add_tail(&map->next, &priv->freeable_maps);
 		err = 0;
 	}
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 	if (map)
 		gntdev_put_map(priv, map);
 	return err;
@@ -677,7 +677,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
 	out_flags = op.action;
 	out_event = op.event_channel_port;
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 
 	list_for_each_entry(map, &priv->maps, next) {
 		uint64_t begin = map->index << PAGE_SHIFT;
@@ -705,7 +705,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
 	rc = 0;
 
  unlock_out:
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 
 	/* Drop the reference to the event channel we did not save in the map */
 	if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
@@ -755,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	pr_debug("map %d+%d at %lx (pgoff %lx)\n",
 			index, count, vma->vm_start, vma->vm_pgoff);
 
-	spin_lock(&priv->lock);
+	mutex_lock(&priv->lock);
 	map = gntdev_find_map_index(priv, index, count);
 	if (!map)
 		goto unlock_out;
@@ -790,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 			map->flags |= GNTMAP_readonly;
 	}
 
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 
 	if (use_ptemod) {
 		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
@@ -818,11 +818,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	return 0;
 
 unlock_out:
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 	return err;
 
 out_unlock_put:
-	spin_unlock(&priv->lock);
+	mutex_unlock(&priv->lock);
 out_put_map:
 	if (use_ptemod)
 		map->vma = NULL;

From 431152b6b341eefac26cef3d812ae52bc24df566 Mon Sep 17 00:00:00 2001
From: Hin-Tak Leung <htl10@users.sourceforge.net>
Date: Wed, 9 Sep 2015 15:38:07 -0700
Subject: [PATCH 23/55] hfs: fix B-tree corruption after insertion at position
 0

commit b4cc0efea4f0bfa2477c56af406cfcf3d3e58680 upstream.

Fix B-tree corruption when a new record is inserted at position 0 in the
node in hfs_brec_insert().

This is an identical change to the corresponding hfs b-tree code to Sergei
Antonov's "hfsplus: fix B-tree corruption after insertion at position 0",
to keep similar code paths in the hfs and hfsplus drivers in sync, where
appropriate.

Signed-off-by: Hin-Tak Leung <htl10@users.sourceforge.net>
Cc: Sergei Antonov <saproj@gmail.com>
Cc: Joe Perches <joe@perches.com>
Reviewed-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/hfs/brec.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f5202..6fc766df046 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
 	hfs_bnode_write(node, entry, data_off + key_len, entry_len);
 	hfs_bnode_dump(node);
 
-	if (new_node) {
-		/* update parent key if we inserted a key
-		 * at the start of the first node
-		 */
-		if (!rec && new_node != node)
-			hfs_brec_update_parent(fd);
+	/*
+	 * update parent key if we inserted a key
+	 * at the start of the node and it is not the new node
+	 */
+	if (!rec && new_node != node) {
+		hfs_bnode_read_key(node, fd->search_key, data_off + size);
+		hfs_brec_update_parent(fd);
+	}
 
+	if (new_node) {
 		hfs_bnode_put(fd->bnode);
 		if (!new_node->parent) {
 			hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
 		goto again;
 	}
 
-	if (!rec)
-		hfs_brec_update_parent(fd);
-
 	return 0;
 }
 
@@ -366,6 +366,8 @@ again:
 	if (IS_ERR(parent))
 		return PTR_ERR(parent);
 	__hfs_brec_find(parent, fd);
+	if (fd->record < 0)
+		return -ENOENT;
 	hfs_bnode_dump(parent);
 	rec = fd->record;
 

From 939f8043048f59a4d77db1f5eeed1526a74a1503 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Aug 2015 11:00:37 +0200
Subject: [PATCH 24/55] IB/uverbs: reject invalid or unknown opcodes

commit b632ffa7cee439ba5dce3b3bc4a5cbe2b3e20133 upstream.

We have many WR opcodes that are only supported in kernel space
and/or require optional information to be copied into the WR
structure.  Reject all those not explicitly handled so that we
can't pass invalid information to drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/infiniband/core/uverbs_cmd.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a7d00f6b3bc..44c15cebd43 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2106,6 +2106,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 		next->send_flags = user_wr->send_flags;
 
 		if (is_ud) {
+			if (next->opcode != IB_WR_SEND &&
+			    next->opcode != IB_WR_SEND_WITH_IMM) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+
 			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
 						     file->ucontext);
 			if (!next->wr.ud.ah) {
@@ -2142,9 +2148,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 					user_wr->wr.atomic.compare_add;
 				next->wr.atomic.swap = user_wr->wr.atomic.swap;
 				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+			case IB_WR_SEND:
 				break;
 			default:
-				break;
+				ret = -EINVAL;
+				goto out_put;
 			}
 		}
 

From caf233503f3ce58ebe564c5d11aa53f2344a1053 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 13 Aug 2015 18:32:03 +0300
Subject: [PATCH 25/55] IB/uverbs: Fix race between ib_uverbs_open and
 remove_one

commit 35d4a0b63dc0c6d1177d4f532a9deae958f0662c upstream.

Fixes: 2a72f212263701b927559f6850446421d5906c41 ("IB/uverbs: Remove dev_table")

Before this commit there was a device look-up table that was protected
by a spin_lock used by ib_uverbs_open and by ib_uverbs_remove_one. When
it was dropped and container_of was used instead, it enabled the race
with remove_one as dev might be freed just after:
dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev) but
before the kref_get.

In addition, this buggy patch added some dead code as
container_of(x,y,z) can never be NULL and so dev can never be NULL.
As a result the comment above ib_uverbs_open saying "the open method
will either immediately run -ENXIO" is wrong as it can never happen.

The solution follows Jason Gunthorpe suggestion from below URL:
https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg25692.html

cdev will hold a kref on the parent (the containing structure,
ib_uverbs_device) and only when that kref is released it is
guaranteed that open will never be called again.

In addition, fixes the active count scheme to use an atomic
not a kref to prevent WARN_ON as pointed by above comment
from Jason.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/infiniband/core/uverbs.h      |  3 +-
 drivers/infiniband/core/uverbs_main.c | 43 +++++++++++++++++++--------
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 0fcd7aa26fa..8b8de21bfdc 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -69,7 +69,7 @@
  */
 
 struct ib_uverbs_device {
-	struct kref				ref;
+	atomic_t				refcount;
 	int					num_comp_vectors;
 	struct completion			comp;
 	struct device			       *dev;
@@ -78,6 +78,7 @@ struct ib_uverbs_device {
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
+	struct kobject				kobj;
 };
 
 struct ib_uverbs_event_file {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 949b3863349..b6062b9236a 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -119,14 +119,18 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device);
 
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
 {
 	struct ib_uverbs_device *dev =
-		container_of(ref, struct ib_uverbs_device, ref);
+		container_of(kobj, struct ib_uverbs_device, kobj);
 
-	complete(&dev->comp);
+	kfree(dev);
 }
 
+static struct kobj_type ib_uverbs_dev_ktype = {
+	.release = ib_uverbs_release_dev,
+};
+
 static void ib_uverbs_release_event_file(struct kref *ref)
 {
 	struct ib_uverbs_event_file *file =
@@ -282,13 +286,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	return context->device->dealloc_ucontext(context);
 }
 
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+	complete(&dev->comp);
+}
+
 static void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
 
 	module_put(file->device->ib_dev->owner);
-	kref_put(&file->device->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&file->device->refcount))
+		ib_uverbs_comp_dev(file->device);
 
 	kfree(file);
 }
@@ -629,9 +639,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	int ret;
 
 	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
-	if (dev)
-		kref_get(&dev->ref);
-	else
+	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
 	if (!try_module_get(dev->ib_dev->owner)) {
@@ -652,6 +660,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	mutex_init(&file->mutex);
 
 	filp->private_data = file;
+	kobject_get(&dev->kobj);
 
 	return nonseekable_open(inode, filp);
 
@@ -659,13 +668,16 @@ err_module:
 	module_put(dev->ib_dev->owner);
 
 err:
-	kref_put(&dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&dev->refcount))
+		ib_uverbs_comp_dev(dev);
+
 	return ret;
 }
 
 static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_device *dev = file->device;
 
 	ib_uverbs_cleanup_ucontext(file, file->ucontext);
 
@@ -673,6 +685,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
 
 	kref_put(&file->ref, ib_uverbs_release_file);
+	kobject_put(&dev->kobj);
 
 	return 0;
 }
@@ -768,10 +781,11 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!uverbs_dev)
 		return;
 
-	kref_init(&uverbs_dev->ref);
+	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
+	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
 
 	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -798,6 +812,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	cdev_init(&uverbs_dev->cdev, NULL);
 	uverbs_dev->cdev.owner = THIS_MODULE;
 	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
 	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
 	if (cdev_add(&uverbs_dev->cdev, base, 1))
 		goto err_cdev;
@@ -828,9 +843,10 @@ err_cdev:
 		clear_bit(devnum, overflow_map);
 
 err:
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 	return;
 }
 
@@ -850,9 +866,10 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	else
 		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)

From a6d452e0f3d91f697b90650f21fb540c13d55b44 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 30 Jul 2015 17:34:23 +0300
Subject: [PATCH 26/55] IB/mlx4: Forbid using sysfs to change RoCE pkeys

commit 2b135db3e81301d0452e6aa107349abe67b097d6 upstream.

The pkey mapping for RoCE must remain the default mapping:
VFs:
  virtual index 0 = mapped to real index 0 (0xFFFF)
  All others indices: mapped to a real pkey index containing an
                      invalid pkey.
PF:
  virtual index i = real index i.

Don't allow users to change these mappings using files found in
sysfs.

Fixes: c1e7e466120b ('IB/mlx4: Add iov directory in sysfs under the ib device')
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/infiniband/hw/mlx4/sysfs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
index 97516eb363b..c5ce4082fdc 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -563,6 +563,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
 	struct mlx4_port *p;
 	int i;
 	int ret;
+	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
+			IB_LINK_LAYER_ETHERNET;
 
 	p = kzalloc(sizeof *p, GFP_KERNEL);
 	if (!p)
@@ -580,7 +582,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
 
 	p->pkey_group.name  = "pkey_idx";
 	p->pkey_group.attrs =
-		alloc_group_attrs(show_port_pkey, store_port_pkey,
+		alloc_group_attrs(show_port_pkey,
+				  is_eth ? NULL : store_port_pkey,
 				  dev->dev->caps.pkey_table_len[port_num]);
 	if (!p->pkey_group.attrs)
 		goto err_alloc;

From 2698f5747a861bc1ecfb35b08ffa3ffabc91a77f Mon Sep 17 00:00:00 2001
From: Noa Osherovich <noaos@mellanox.com>
Date: Thu, 30 Jul 2015 17:34:24 +0300
Subject: [PATCH 27/55] IB/mlx4: Use correct SL on AH query under RoCE

commit 5e99b139f1b68acd65e36515ca347b03856dfb5a upstream.

The mlx4 IB driver implementation for ib_query_ah used a wrong offset
(28 instead of 29) when link type is Ethernet. Fixed to use the correct one.

Fixes: fa417f7b520e ('IB/mlx4: Add support for IBoE')
Signed-off-by: Shani Michaeli <shanim@mellanox.com>
Signed-off-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/infiniband/hw/mlx4/ah.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index a251becdaa9..890c23b3d71 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -169,9 +169,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
 	enum rdma_link_layer ll;
 
 	memset(ah_attr, 0, sizeof *ah_attr);
-	ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
 	ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
 	ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+	if (ll == IB_LINK_LAYER_ETHERNET)
+		ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29;
+	else
+		ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+
 	ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
 	if (ah->av.ib.stat_rate)
 		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;

From 61cabc7d549fde1afddac0efbbf03a5c63161b33 Mon Sep 17 00:00:00 2001
From: Hin-Tak Leung <htl10@users.sourceforge.net>
Date: Wed, 9 Sep 2015 15:38:04 -0700
Subject: [PATCH 28/55] hfs,hfsplus: cache pages correctly between bnode_create
 and bnode_free

commit 7cb74be6fd827e314f81df3c5889b87e4c87c569 upstream.

Pages looked up by __hfs_bnode_create() (called by hfs_bnode_create() and
hfs_bnode_find() for finding or creating pages corresponding to an inode)
are immediately kmap()'ed and used (both read and write) and kunmap()'ed,
and should not be page_cache_release()'ed until hfs_bnode_free().

This patch fixes a problem I first saw in July 2012: merely running "du"
on a large hfsplus-mounted directory a few times on a reasonably loaded
system would get the hfsplus driver all confused and complaining about
B-tree inconsistencies, and generates a "BUG: Bad page state".  Most
recently, I can generate this problem on up-to-date Fedora 22 with shipped
kernel 4.0.5, by running "du /" (="/" + "/home" + "/mnt" + other smaller
mounts) and "du /mnt" simultaneously on two windows, where /mnt is a
lightly-used QEMU VM image of the full Mac OS X 10.9:

$ df -i / /home /mnt
Filesystem                  Inodes   IUsed      IFree IUse% Mounted on
/dev/mapper/fedora-root    3276800  551665    2725135   17% /
/dev/mapper/fedora-home   52879360  716221   52163139    2% /home
/dev/nbd0p2             4294967295 1387818 4293579477    1% /mnt

After applying the patch, I was able to run "du /" (60+ times) and "du
/mnt" (150+ times) continuously and simultaneously for 6+ hours.

There are many reports of the hfsplus driver getting confused under load
and generating "BUG: Bad page state" or other similar issues over the
years.  [1]

The unpatched code [2] has always been wrong since it entered the kernel
tree.  The only reason why it gets away with it is that the
kmap/memcpy/kunmap follow very quickly after the page_cache_release() so
the kernel has not had a chance to reuse the memory for something else,
most of the time.

The current RW driver appears to have followed the design and development
of the earlier read-only hfsplus driver [3], where-by version 0.1 (Dec
2001) had a B-tree node-centric approach to
read_cache_page()/page_cache_release() per bnode_get()/bnode_put(),
migrating towards version 0.2 (June 2002) of caching and releasing pages
per inode extents.  When the current RW code first entered the kernel [2]
in 2005, there was an REF_PAGES conditional (and "//" commented out code)
to switch between B-node centric paging to inode-centric paging.  There
was a mistake with the direction of one of the REF_PAGES conditionals in
__hfs_bnode_create().  In a subsequent "remove debug code" commit [4], the
read_cache_page()/page_cache_release() per bnode_get()/bnode_put() were
removed, but a page_cache_release() was mistakenly left in (propagating
the "REF_PAGES <-> !REF_PAGE" mistake), and the commented-out
page_cache_release() in bnode_release() (which should be spanned by
!REF_PAGES) was never enabled.

References:
[1]:
Michael Fox, Apr 2013
http://www.spinics.net/lists/linux-fsdevel/msg63807.html
("hfsplus volume suddenly inaccessable after 'hfs: recoff %d too large'")

Sasha Levin, Feb 2015
http://lkml.org/lkml/2015/2/20/85 ("use after free")

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/740814
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1027887
https://bugzilla.kernel.org/show_bug.cgi?id=42342
https://bugzilla.kernel.org/show_bug.cgi?id=63841
https://bugzilla.kernel.org/show_bug.cgi?id=78761

[2]:
http://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/commit/\
fs/hfs/bnode.c?id=d1081202f1d0ee35ab0beb490da4b65d4bc763db
commit d1081202f1d0ee35ab0beb490da4b65d4bc763db
Author: Andrew Morton <akpm@osdl.org>
Date:   Wed Feb 25 16:17:36 2004 -0800

    [PATCH] HFS rewrite

http://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/commit/\
fs/hfsplus/bnode.c?id=91556682e0bf004d98a529bf829d339abb98bbbd

commit 91556682e0bf004d98a529bf829d339abb98bbbd
Author: Andrew Morton <akpm@osdl.org>
Date:   Wed Feb 25 16:17:48 2004 -0800

    [PATCH] HFS+ support

[3]:
http://sourceforge.net/projects/linux-hfsplus/

http://sourceforge.net/projects/linux-hfsplus/files/Linux%202.4.x%20patch/hfsplus%200.1/
http://sourceforge.net/projects/linux-hfsplus/files/Linux%202.4.x%20patch/hfsplus%200.2/

http://linux-hfsplus.cvs.sourceforge.net/viewvc/linux-hfsplus/linux/\
fs/hfsplus/bnode.c?r1=1.4&r2=1.5

Date:   Thu Jun 6 09:45:14 2002 +0000
Use buffer cache instead of page cache in bnode.c. Cache inode extents.

[4]:
http://git.kernel.org/cgit/linux/kernel/git/\
stable/linux-stable.git/commit/?id=a5e3985fa014029eb6795664c704953720cc7f7d

commit a5e3985fa014029eb6795664c704953720cc7f7d
Author: Roman Zippel <zippel@linux-m68k.org>
Date:   Tue Sep 6 15:18:47 2005 -0700

[PATCH] hfs: remove debug code

Signed-off-by: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Sergei Antonov <saproj@gmail.com>
Reviewed-by: Anton Altaparmakov <anton@tuxera.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Sougata Santra <sougata@tuxera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/hfs/bnode.c     | 9 ++++-----
 fs/hfsplus/bnode.c | 3 ---
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503..221719eac5d 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-		page_cache_release(page);
 		node->page[i] = page;
 	}
 
@@ -398,11 +397,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-	//int i;
+	int i;
 
-	//for (i = 0; i < node->tree->pages_per_bnode; i++)
-	//	if (node->page[i])
-	//		page_cache_release(node->page[i]);
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
 	kfree(node);
 }
 
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 11c86020452..bedfe5f7d33 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -456,7 +456,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-		page_cache_release(page);
 		node->page[i] = page;
 	}
 
@@ -568,13 +567,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-#if 0
 	int i;
 
 	for (i = 0; i < node->tree->pages_per_bnode; i++)
 		if (node->page[i])
 			page_cache_release(node->page[i]);
-#endif
 	kfree(node);
 }
 

From 7bf24986e3c2e4b818be4a6172aebb3784c6bcda Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 12 Jun 2015 10:16:41 -0300
Subject: [PATCH 29/55] sctp: fix ASCONF list handling

commit 2d45a02d0166caf2627fe91897c6ffc3b19514c4 upstream.

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by using ->addr_wq_lock
spinlock to protect the list. A special handling is done upon socket
creation and destruction for that. Error handlig on sctp_init_sock()
will never return an error after having initialized asconf, so
sctp_destroy_sock() can be called without addrq_wq_lock. The lock now
will be take on sctp_close_sock(), before locking the socket, so we
don't do it in inverse order compared to sctp_addr_wq_timeout_handler().

Instead of taking the lock on sctp_sock_migrate() for copying and
restoring the list values, it's preferred to avoid rewritting it by
implementing sctp_copy_descendant().

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off, but one could trigger it by issuing
simultaneous setsockopt() calls on multiple sockets or by
creating/destroying sockets fast enough. This is only triggerable
locally.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Reported-by: Ji Jianwen <jiji@redhat.com>
Suggested-by: Neil Horman <nhorman@tuxdriver.com>
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[wangkai: backport to 3.10: adjust context]
Signed-off-by: Wang Kai <morgan.wang@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/net/netns/sctp.h   |  1 +
 include/net/sctp/structs.h |  4 ++++
 net/sctp/socket.c          | 43 ++++++++++++++++++++++++++++----------
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815a..8ba379f9e46 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -31,6 +31,7 @@ struct netns_sctp {
 	struct list_head addr_waitq;
 	struct timer_list addr_wq_timer;
 	struct list_head auto_asconf_splist;
+	/* Lock that protects both addr_waitq and auto_asconf_splist */
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index da6b9a01ff7..b30c1d95be2 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -228,6 +228,10 @@ struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations
+	 */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index dfb9b133e66..ec5766dc394 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1548,8 +1548,10 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 
 	/* Supposedly, no process has access to the socket, but
 	 * the net layers still may.
+	 * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+	 * held and that should be grabbed before socket lock.
 	 */
-	sctp_local_bh_disable();
+	spin_lock_bh(&net->sctp.addr_wq_lock);
 	sctp_bh_lock_sock(sk);
 
 	/* Hold the sock, since sk_common_release() will put sock_put()
@@ -1559,7 +1561,7 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 	sk_common_release(sk);
 
 	sctp_bh_unlock_sock(sk);
-	sctp_local_bh_enable();
+	spin_unlock_bh(&net->sctp.addr_wq_lock);
 
 	sock_put(sk);
 
@@ -3508,6 +3510,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	if (val == 0 && sp->do_auto_asconf) {
 		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
@@ -3516,6 +3519,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
 	}
+	spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
 	return 0;
 }
 
@@ -4007,18 +4011,28 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	local_bh_disable();
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
+
+	/* Nothing can fail after this block, otherwise
+	 * sctp_destroy_sock() will be called without addr_wq_lock held
+	 */
 	if (net->sctp.default_auto_asconf) {
+		spin_lock(&sock_net(sk)->sctp.addr_wq_lock);
 		list_add_tail(&sp->auto_asconf_list,
 		    &net->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
-	} else
+		spin_unlock(&sock_net(sk)->sctp.addr_wq_lock);
+	} else {
 		sp->do_auto_asconf = 0;
+	}
+
 	local_bh_enable();
 
 	return 0;
 }
 
-/* Cleanup any SCTP per socket resources.  */
+/* Cleanup any SCTP per socket resources. Must be called with
+ * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true
+ */
 SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
 {
 	struct sctp_sock *sp;
@@ -6957,6 +6971,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -6971,7 +6998,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -6979,12 +7005,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.

From fa83234f6a4e7b378f0da63938a09b9e8d535c4d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 5 Feb 2015 10:37:33 +0300
Subject: [PATCH 30/55] vhost/scsi: potential memory corruption

commit 59c816c1f24df0204e01851431d3bab3eb76719c upstream.

This code in vhost_scsi_make_tpg() is confusing because we limit "tpgt"
to UINT_MAX but the data type of "tpg->tport_tpgt" and that is a u16.

I looked at the context and it turns out that in
vhost_scsi_set_endpoint(), "tpg->tport_tpgt" is used as an offset into
the vs_tpg[] array which has VHOST_SCSI_MAX_TARGET (256) elements so
anything higher than 255 then it is invalid.  I have made that the limit
now.

In vhost_scsi_send_evt() we mask away values higher than 255, but now
that the limit has changed, we don't need the mask.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
[ The affected function was renamed to vhost_scsi_make_tpg before
  the vulnerability was announced, I ported it to 3.10 stable and
  changed the code in function tcm_vhost_make_tpg]
Signed-off-by: Wang Long <long.wanglong@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/vhost/scsi.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index fb97bc0b80e..2947eda522b 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1088,7 +1088,7 @@ static void tcm_vhost_send_evt(struct vhost_scsi *vs, struct tcm_vhost_tpg *tpg,
 		 * lun[4-7] need to be zero according to virtio-scsi spec.
 		 */
 		evt->event.lun[0] = 0x01;
-		evt->event.lun[1] = tpg->tport_tpgt & 0xFF;
+		evt->event.lun[1] = tpg->tport_tpgt;
 		if (lun->unpacked_lun >= 256)
 			evt->event.lun[2] = lun->unpacked_lun >> 8 | 0x40 ;
 		evt->event.lun[3] = lun->unpacked_lun & 0xFF;
@@ -1894,12 +1894,12 @@ static struct se_portal_group *tcm_vhost_make_tpg(struct se_wwn *wwn,
 			struct tcm_vhost_tport, tport_wwn);
 
 	struct tcm_vhost_tpg *tpg;
-	unsigned long tpgt;
+	u16 tpgt;
 	int ret;
 
 	if (strstr(name, "tpgt_") != name)
 		return ERR_PTR(-EINVAL);
-	if (kstrtoul(name + 5, 10, &tpgt) || tpgt > UINT_MAX)
+	if (kstrtou16(name + 5, 10, &tpgt) || tpgt >= VHOST_SCSI_MAX_TARGET)
 		return ERR_PTR(-EINVAL);
 
 	tpg = kzalloc(sizeof(struct tcm_vhost_tpg), GFP_KERNEL);

From 9f6191daa545384ce5cc90b770f0d2bf64c0ba22 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 22 May 2015 15:42:55 -0700
Subject: [PATCH 31/55] x86: bpf_jit: fix compilation of large bpf programs

commit 3f7352bf21f8fd7ba3e2fcef9488756f188e12be upstream.

x86 has variable length encoding. x86 JIT compiler is trying
to pick the shortest encoding for given bpf instruction.
While doing so the jump targets are changing, so JIT is doing
multiple passes over the program. Typical program needs 3 passes.
Some very short programs converge with 2 passes. Large programs
may need 4 or 5. But specially crafted bpf programs may hit the
pass limit and if the program converges on the last iteration
the JIT compiler will be producing an image full of 'int 3' insns.
Fix this corner case by doing final iteration over bpf program.

Fixes: 0a14842f5a3c ("net: filter: Just In Time compiler for x86-64")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/net/bpf_jit_comp.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0c966fecfb8..5479d677f9b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -176,7 +176,12 @@ void bpf_jit_compile(struct sk_filter *fp)
 	}
 	cleanup_addr = proglen; /* epilogue address */
 
-	for (pass = 0; pass < 10; pass++) {
+	/* JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large bpf programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image
+	 */
+	for (pass = 0; pass < 10 || image; pass++) {
 		u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;
 		/* no prologue/epilogue for trivial filters (RET something) */
 		proglen = 0;

From 3b9393dc1d4cc226bce44708ab899518cc598e57 Mon Sep 17 00:00:00 2001
From: Angga <Hermin.Anggawijaya@alliedtelesis.co.nz>
Date: Fri, 3 Jul 2015 14:40:52 +1200
Subject: [PATCH 32/55] ipv6: Make MLD packets to only be processed locally

[ Upstream commit 4c938d22c88a9ddccc8c55a85e0430e9c62b1ac5 ]

Before commit daad151263cf ("ipv6: Make ipv6_is_mld() inline and use it
from ip6_mc_input().") MLD packets were only processed locally. After the
change, a copy of MLD packet goes through ip6_mr_input, causing
MRT6MSG_NOCACHE message to be generated to user space.

Make MLD packet only processed locally.

Fixes: daad151263cf ("ipv6: Make ipv6_is_mld() inline and use it from ip6_mc_input().")
Signed-off-by: Hermin Anggawijaya <hermin.anggawijaya@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv6/ip6_input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 774b09cb292..63264c9a15c 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -325,10 +325,10 @@ int ip6_mc_input(struct sk_buff *skb)
 				if (offset < 0)
 					goto out;
 
-				if (!ipv6_is_mld(skb, nexthdr, offset))
-					goto out;
+				if (ipv6_is_mld(skb, nexthdr, offset))
+					deliver = true;
 
-				deliver = true;
+				goto out;
 			}
 			/* unknown RA - process it normally */
 		}

From afabf2a8b621f2c300cbdf6adb0f8855f612b3a6 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Tue, 7 Jul 2015 09:43:45 -0400
Subject: [PATCH 33/55] net/tipc: initialize security state for new connection
 socket

[ Upstream commit fdd75ea8df370f206a8163786e7470c1277a5064 ]

Calling connect() with an AF_TIPC socket would trigger a series
of error messages from SELinux along the lines of:
SELinux: Invalid class 0
type=AVC msg=audit(1434126658.487:34500): avc:  denied  { <unprintable> }
  for pid=292 comm="kworker/u16:5" scontext=system_u:system_r:kernel_t:s0
  tcontext=system_u:object_r:unlabeled_t:s0 tclass=<unprintable>
  permissive=0

This was due to a failure to initialize the security state of the new
connection sock by the tipc code, leaving it with junk in the security
class field and an unlabeled secid.  Add a call to security_sk_clone()
to inherit the security state from the parent socket.

Reported-by: Tim Shearer <tim.shearer@overturenetworks.com>
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/tipc/socket.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 2b1d7c2d677..e0cb5edc6d1 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1528,6 +1528,7 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags)
 	res = tipc_create(sock_net(sock->sk), new_sock, 0, 0);
 	if (res)
 		goto exit;
+	security_sk_clone(sock->sk, new_sock->sk);
 
 	new_sk = new_sock->sk;
 	new_tsock = tipc_sk(new_sk);

From 7865ece30f06072ea61f7db2ebbe24b632800e17 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Tue, 7 Jul 2015 15:55:56 +0200
Subject: [PATCH 34/55] bridge: mdb: zero out the local br_ip variable before
 use

[ Upstream commit f1158b74e54f2e2462ba5e2f45a118246d9d5b43 ]

Since commit b0e9a30dd669 ("bridge: Add vlan id to multicast groups")
there's a check in br_ip_equal() for a matching vlan id, but the mdb
functions were not modified to use (or at least zero it) so when an
entry was added it would have a garbage vlan id (from the local br_ip
variable in __br_mdb_add/del) and this would prevent it from being
matched and also deleted. So zero out the whole local ip var to protect
ourselves from future changes and also to fix the current bug, since
there's no vlan id support in the mdb uapi - use always vlan id 0.
Example before patch:
root@debian:~# bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent
root@debian:~# bridge mdb
dev br0 port eth1 grp 239.0.0.1 permanent
root@debian:~# bridge mdb del dev br0 port eth1 grp 239.0.0.1 permanent
RTNETLINK answers: Invalid argument

After patch:
root@debian:~# bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent
root@debian:~# bridge mdb
dev br0 port eth1 grp 239.0.0.1 permanent
root@debian:~# bridge mdb del dev br0 port eth1 grp 239.0.0.1 permanent
root@debian:~# bridge mdb

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Fixes: b0e9a30dd669 ("bridge: Add vlan id to multicast groups")
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/bridge/br_mdb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 19942e38fd2..ff61e200bf0 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -368,6 +368,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 	if (!p || p->br != br || p->state == BR_STATE_DISABLED)
 		return -EINVAL;
 
+	memset(&ip, 0, sizeof(ip));
 	ip.proto = entry->addr.proto;
 	if (ip.proto == htons(ETH_P_IP))
 		ip.u.ip4 = entry->addr.u.ip4;
@@ -417,6 +418,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 	if (timer_pending(&br->multicast_querier_timer))
 		return -EBUSY;
 
+	memset(&ip, 0, sizeof(ip));
 	ip.proto = entry->addr.proto;
 	if (ip.proto == htons(ETH_P_IP))
 		ip.u.ip4 = entry->addr.u.ip4;

From f85eee641c7a3bb928a4b605db632c2e90b9574f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 8 Jul 2015 21:42:11 +0200
Subject: [PATCH 35/55] net: pktgen: fix race between pktgen_thread_worker()
 and kthread_stop()

[ Upstream commit fecdf8be2d91e04b0a9a4f79ff06499a36f5d14f ]

pktgen_thread_worker() is obviously racy, kthread_stop() can come
between the kthread_should_stop() check and set_current_state().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Reported-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/pktgen.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index ebbea537196..21a23d97e99 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3377,8 +3377,10 @@ static int pktgen_thread_worker(void *arg)
 	pktgen_rem_thread(t);
 
 	/* Wait for kthread_stop */
-	while (!kthread_should_stop()) {
+	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
 		schedule();
 	}
 	__set_current_state(TASK_RUNNING);

From c987fa7146e0c18acc2392b25349cca45c177175 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 9 Jul 2015 09:59:10 +0300
Subject: [PATCH 36/55] net: call rcu_read_lock early in process_backlog

[ Upstream commit 2c17d27c36dcce2b6bf689f41a46b9e909877c21 ]

Incoming packet should be either in backlog queue or
in RCU read-side section. Otherwise, the final sequence of
flush_backlog() and synchronize_net() may miss packets
that can run without device reference:

CPU 1                  CPU 2
                       skb->dev: no reference
                       process_backlog:__skb_dequeue
                       process_backlog:local_irq_enable

on_each_cpu for
flush_backlog =>       IPI(hardirq): flush_backlog
                       - packet not found in backlog

                       CPU delayed ...
synchronize_net
- no ongoing RCU
read-side sections

netdev_run_todo,
rcu_barrier: no
ongoing callbacks
                       __netif_receive_skb_core:rcu_read_lock
                       - too late
free dev
                       process packet for freed dev

Fixes: 6e583ce5242f ("net: eliminate refcounting in backlog queue")
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/dev.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index aeca8dd88b2..1ccfc49683b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3443,8 +3443,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
 	pt_prev = NULL;
 
-	rcu_read_lock();
-
 another_round:
 	skb->skb_iif = skb->dev->ifindex;
 
@@ -3454,7 +3452,7 @@ another_round:
 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 		skb = vlan_untag(skb);
 		if (unlikely(!skb))
-			goto unlock;
+			goto out;
 	}
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -3479,7 +3477,7 @@ skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto unlock;
+		goto out;
 ncls:
 #endif
 
@@ -3494,7 +3492,7 @@ ncls:
 		if (vlan_do_receive(&skb))
 			goto another_round;
 		else if (unlikely(!skb))
-			goto unlock;
+			goto out;
 	}
 
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
@@ -3506,7 +3504,7 @@ ncls:
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
 			ret = NET_RX_SUCCESS;
-			goto unlock;
+			goto out;
 		case RX_HANDLER_ANOTHER:
 			goto another_round;
 		case RX_HANDLER_EXACT:
@@ -3558,8 +3556,6 @@ drop:
 		ret = NET_RX_DROP;
 	}
 
-unlock:
-	rcu_read_unlock();
 out:
 	return ret;
 }
@@ -3606,29 +3602,30 @@ static int __netif_receive_skb(struct sk_buff *skb)
  */
 int netif_receive_skb(struct sk_buff *skb)
 {
+	int ret;
+
 	net_timestamp_check(netdev_tstamp_prequeue, skb);
 
 	if (skb_defer_rx_timestamp(skb))
 		return NET_RX_SUCCESS;
 
+	rcu_read_lock();
+
 #ifdef CONFIG_RPS
 	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
-		int cpu, ret;
-
-		rcu_read_lock();
-
-		cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
 		if (cpu >= 0) {
 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 			rcu_read_unlock();
 			return ret;
 		}
-		rcu_read_unlock();
 	}
 #endif
-	return __netif_receive_skb(skb);
+	ret = __netif_receive_skb(skb);
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
@@ -4038,8 +4035,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			rcu_read_lock();
 			local_irq_enable();
 			__netif_receive_skb(skb);
+			rcu_read_unlock();
 			local_irq_disable();
 			input_queue_head_incr(sd);
 			if (++work >= quota) {

From 0ba48ae94c393dc4c43b257400046feeeb9c6fad Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 13 Jul 2015 16:04:13 +0800
Subject: [PATCH 37/55] net: Clone skb before setting peeked flag

[ Upstream commit 738ac1ebb96d02e0d23bc320302a6ea94c612dec ]

Shared skbs must not be modified and this is crucial for broadcast
and/or multicast paths where we use it as an optimisation to avoid
unnecessary cloning.

The function skb_recv_datagram breaks this rule by setting peeked
without cloning the skb first.  This causes funky races which leads
to double-free.

This patch fixes this by cloning the skb and replacing the skb
in the list when setting skb->peeked.

Fixes: a59322be07c9 ("[UDP]: Only increment counter on first peek/recv")
Reported-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/datagram.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index b71423db778..f1506c7d414 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -128,6 +128,35 @@ out_noerr:
 	goto out;
 }
 
+static int skb_set_peeked(struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+
+	if (skb->peeked)
+		return 0;
+
+	/* We have to unshare an skb before modifying it. */
+	if (!skb_shared(skb))
+		goto done;
+
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (!nskb)
+		return -ENOMEM;
+
+	skb->prev->next = nskb;
+	skb->next->prev = nskb;
+	nskb->prev = skb->prev;
+	nskb->next = skb->next;
+
+	consume_skb(skb);
+	skb = nskb;
+
+done:
+	skb->peeked = 1;
+
+	return 0;
+}
+
 /**
  *	__skb_recv_datagram - Receive a datagram skbuff
  *	@sk: socket
@@ -162,7 +191,9 @@ out_noerr:
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 				    int *peeked, int *off, int *err)
 {
+	struct sk_buff_head *queue = &sk->sk_receive_queue;
 	struct sk_buff *skb, *last;
+	unsigned long cpu_flags;
 	long timeo;
 	/*
 	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
@@ -181,8 +212,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 		 * Look at current nfs client by the way...
 		 * However, this function was correct in any case. 8)
 		 */
-		unsigned long cpu_flags;
-		struct sk_buff_head *queue = &sk->sk_receive_queue;
 		int _off = *off;
 
 		last = (struct sk_buff *)queue;
@@ -196,7 +225,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 					_off -= skb->len;
 					continue;
 				}
-				skb->peeked = 1;
+
+				error = skb_set_peeked(skb);
+				if (error)
+					goto unlock_err;
+
 				atomic_inc(&skb->users);
 			} else
 				__skb_unlink(skb, queue);
@@ -216,6 +249,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 
 	return NULL;
 
+unlock_err:
+	spin_unlock_irqrestore(&queue->lock, cpu_flags);
 no_packet:
 	*err = error;
 	return NULL;

From 4164cda8cad7303614d211192d27a912de53a463 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 13 Jul 2015 20:01:42 +0800
Subject: [PATCH 38/55] net: Fix skb csum races when peeking

[ Upstream commit 89c22d8c3b278212eef6a8cc66b570bc840a6f5a ]

When we calculate the checksum on the recv path, we store the
result in the skb as an optimisation in case we need the checksum
again down the line.

This is in fact bogus for the MSG_PEEK case as this is done without
any locking.  So multiple threads can peek and then store the result
to the same skb, potentially resulting in bogus skb states.

This patch fixes this by only storing the result if the skb is not
shared.  This preserves the optimisations for the few cases where
it can be done safely due to locking or other reasons, e.g., SIOCINQ.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/datagram.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index f1506c7d414..80b0fd83fac 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -700,7 +700,8 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
 	if (likely(!sum)) {
 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
 			netdev_rx_csum_fault(skb->dev);
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!skb_shared(skb))
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 	return sum;
 }

From 5fa39f16036eb627fe50a38c307fa99eace26ee3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 4 Aug 2015 15:42:47 +0800
Subject: [PATCH 39/55] net: Fix skb_set_peeked use-after-free bug

[ Upstream commit a0a2a6602496a45ae838a96db8b8173794b5d398 ]

The commit 738ac1ebb96d02e0d23bc320302a6ea94c612dec ("net: Clone
skb before setting peeked flag") introduced a use-after-free bug
in skb_recv_datagram.  This is because skb_set_peeked may create
a new skb and free the existing one.  As it stands the caller will
continue to use the old freed skb.

This patch fixes it by making skb_set_peeked return the new skb
(or the old one if unchanged).

Fixes: 738ac1ebb96d ("net: Clone skb before setting peeked flag")
Reported-by: Brenden Blanco <bblanco@plumgrid.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Tested-by: Brenden Blanco <bblanco@plumgrid.com>
Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/datagram.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 80b0fd83fac..052b71c5b1b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -128,12 +128,12 @@ out_noerr:
 	goto out;
 }
 
-static int skb_set_peeked(struct sk_buff *skb)
+static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
 {
 	struct sk_buff *nskb;
 
 	if (skb->peeked)
-		return 0;
+		return skb;
 
 	/* We have to unshare an skb before modifying it. */
 	if (!skb_shared(skb))
@@ -141,7 +141,7 @@ static int skb_set_peeked(struct sk_buff *skb)
 
 	nskb = skb_clone(skb, GFP_ATOMIC);
 	if (!nskb)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	skb->prev->next = nskb;
 	skb->next->prev = nskb;
@@ -154,7 +154,7 @@ static int skb_set_peeked(struct sk_buff *skb)
 done:
 	skb->peeked = 1;
 
-	return 0;
+	return skb;
 }
 
 /**
@@ -226,8 +226,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 					continue;
 				}
 
-				error = skb_set_peeked(skb);
-				if (error)
+				skb = skb_set_peeked(skb);
+				error = PTR_ERR(skb);
+				if (IS_ERR(skb))
 					goto unlock_err;
 
 				atomic_inc(&skb->users);

From 8d228c93f31c90894fa86c597beb42b67c5ac978 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 13 Jul 2015 06:36:19 -0700
Subject: [PATCH 40/55] bridge: mdb: fix double add notification

[ Upstream commit 5ebc784625ea68a9570d1f70557e7932988cd1b4 ]

Since the mdb add/del code was introduced there have been 2 br_mdb_notify
calls when doing br_mdb_add() resulting in 2 notifications on each add.

Example:
 Command: bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent
 Before patch:
 root@debian:~# bridge monitor all
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent

 After patch:
 root@debian:~# bridge monitor all
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Fixes: cfd567543590 ("bridge: add support of adding and deleting mdb entries")
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/bridge/br_mdb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index ff61e200bf0..4e76d2a1128 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -345,7 +345,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
 
-	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
 	return 0;
 }
 

From c6419a861e56cf6c6446bf0e8f9c75557c2a3a02 Mon Sep 17 00:00:00 2001
From: Tilman Schmidt <tilman@imap.cc>
Date: Tue, 14 Jul 2015 00:37:13 +0200
Subject: [PATCH 41/55] isdn/gigaset: reset tty->receive_room when attaching
 ser_gigaset

[ Upstream commit fd98e9419d8d622a4de91f76b306af6aa627aa9c ]

Commit 79901317ce80 ("n_tty: Don't flush buffer when closing ldisc"),
first merged in kernel release 3.10, caused the following regression
in the Gigaset M101 driver:

Before that commit, when closing the N_TTY line discipline in
preparation to switching to N_GIGASET_M101, receive_room would be
reset to a non-zero value by the call to n_tty_flush_buffer() in
n_tty's close method. With the removal of that call, receive_room
might be left at zero, blocking data reception on the serial line.

The present patch fixes that regression by setting receive_room
to an appropriate value in the ldisc open method.

Fixes: 79901317ce80 ("n_tty: Don't flush buffer when closing ldisc")
Signed-off-by: Tilman Schmidt <tilman@imap.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/isdn/gigaset/ser-gigaset.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c
index 8c91fd5eb6f..3ac9c419481 100644
--- a/drivers/isdn/gigaset/ser-gigaset.c
+++ b/drivers/isdn/gigaset/ser-gigaset.c
@@ -524,9 +524,18 @@ gigaset_tty_open(struct tty_struct *tty)
 	cs->hw.ser->tty = tty;
 	atomic_set(&cs->hw.ser->refcnt, 1);
 	init_completion(&cs->hw.ser->dead_cmp);
-
 	tty->disc_data = cs;
 
+	/* Set the amount of data we're willing to receive per call
+	 * from the hardware driver to half of the input buffer size
+	 * to leave some reserve.
+	 * Note: We don't do flow control towards the hardware driver.
+	 * If more data is received than will fit into the input buffer,
+	 * it will be dropped and an error will be logged. This should
+	 * never happen as the device is slow and the buffer size ample.
+	 */
+	tty->receive_room = RBUFSIZE/2;
+
 	/* OK.. Initialization of the datastructures and the HW is done.. Now
 	 * startup system and notify the LL that we are ready to run
 	 */

From 4b633bbef8730380d624a3d55a54d9e42effdd1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Jul 2015 08:10:22 +0200
Subject: [PATCH 42/55] ipv6: lock socket in ip6_datagram_connect()

[ Upstream commit 03645a11a570d52e70631838cb786eb4253eb463 ]

ip6_datagram_connect() is doing a lot of socket changes without
socket being locked.

This looks wrong, at least for udp_lib_rehash() which could corrupt
lists because of concurrent udp_sk(sk)->udp_portaddr_hash accesses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/net/ip.h    |  1 +
 net/ipv4/datagram.c | 16 ++++++++++++----
 net/ipv6/datagram.c | 20 +++++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 0a62365149e..ea9be6b407b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -141,6 +141,7 @@ static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
 }
 
 /* datagram.c */
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 extern int		ip4_datagram_connect(struct sock *sk, 
 					     struct sockaddr *uaddr, int addr_len);
 
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5f3dc1df04b..291b0821d1a 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -20,7 +20,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	sk_dst_reset(sk);
 
-	lock_sock(sk);
-
 	oif = sk->sk_bound_dev_if;
 	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -81,9 +79,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_set(sk, &rt->dst);
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip4_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL(ip4_datagram_connect);
 
 /* Because UDP xmit path can manipulate sk_dst_cache without holding
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ce17d3da9b2..b0d5d7eb946 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,7 +40,7 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
 	return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
 }
 
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock      	*inet = inet_sk(sk);
@@ -56,7 +56,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (usin->sin6_family == AF_INET) {
 		if (__ipv6_only_sock(sk))
 			return -EAFNOSUPPORT;
-		err = ip4_datagram_connect(sk, uaddr, addr_len);
+		err = __ip4_datagram_connect(sk, uaddr, addr_len);
 		goto ipv4_connected;
 	}
 
@@ -99,9 +99,9 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		sin.sin_addr.s_addr = daddr->s6_addr32[3];
 		sin.sin_port = usin->sin6_port;
 
-		err = ip4_datagram_connect(sk,
-					   (struct sockaddr *) &sin,
-					   sizeof(sin));
+		err = __ip4_datagram_connect(sk,
+					     (struct sockaddr *) &sin,
+					     sizeof(sin));
 
 ipv4_connected:
 		if (err)
@@ -204,6 +204,16 @@ out:
 	fl6_sock_release(flowlabel);
 	return err;
 }
+
+int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip6_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL_GPL(ip6_datagram_connect);
 
 void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,

From e3e3caac28a5c7c9428ebc4fa154a007c0636393 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 15 Jul 2015 21:52:51 +0200
Subject: [PATCH 43/55] bonding: fix destruction of bond with devices different
 from arphrd_ether

[ Upstream commit 06f6d1094aa0992432b1e2a0920b0ee86ccd83bf ]

When the bonding is being unloaded and the netdevice notifier is
unregistered it executes NETDEV_UNREGISTER for each device which should
remove the bond's proc entry but if the device enslaved is not of
ARPHRD_ETHER type and is in front of the bonding, it may execute
bond_release_and_destroy() first which would release the last slave and
destroy the bond device leaving the proc entry and thus we will get the
following error (with dynamic debug on for bond_netdev_event to see the
events order):
[  908.963051] eql: event: 9
[  908.963052] eql: IFF_SLAVE
[  908.963054] eql: event: 2
[  908.963056] eql: IFF_SLAVE
[  908.963058] eql: event: 6
[  908.963059] eql: IFF_SLAVE
[  908.963110] bond0: Releasing active interface eql
[  908.976168] bond0: Destroying bond bond0
[  908.976266] bond0 (unregistering): Released all slaves
[  908.984097] ------------[ cut here ]------------
[  908.984107] WARNING: CPU: 0 PID: 1787 at fs/proc/generic.c:575
remove_proc_entry+0x112/0x160()
[  908.984110] remove_proc_entry: removing non-empty directory
'net/bonding', leaking at least 'bond0'
[  908.984111] Modules linked in: bonding(-) eql(O) 9p nfsd auth_rpcgss
oid_registry nfs_acl nfs lockd grace fscache sunrpc crct10dif_pclmul
crc32_pclmul crc32c_intel ghash_clmulni_intel ppdev qxl drm_kms_helper
snd_hda_codec_generic aesni_intel ttm aes_x86_64 glue_helper pcspkr lrw
gf128mul ablk_helper cryptd snd_hda_intel virtio_console snd_hda_codec
psmouse serio_raw snd_hwdep snd_hda_core 9pnet_virtio 9pnet evdev joydev
drm virtio_balloon snd_pcm snd_timer snd soundcore i2c_piix4 i2c_core
pvpanic acpi_cpufreq parport_pc parport processor thermal_sys button
autofs4 ext4 crc16 mbcache jbd2 hid_generic usbhid hid sg sr_mod cdrom
ata_generic virtio_blk virtio_net floppy ata_piix e1000 libata ehci_pci
virtio_pci scsi_mod uhci_hcd ehci_hcd virtio_ring virtio usbcore
usb_common [last unloaded: bonding]

[  908.984168] CPU: 0 PID: 1787 Comm: rmmod Tainted: G        W  O
4.2.0-rc2+ #8
[  908.984170] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  908.984172]  0000000000000000 ffffffff81732d41 ffffffff81525b34
ffff8800358dfda8
[  908.984175]  ffffffff8106c521 ffff88003595af78 ffff88003595af40
ffff88003e3a4280
[  908.984178]  ffffffffa058d040 0000000000000000 ffffffff8106c59a
ffffffff8172ebd0
[  908.984181] Call Trace:
[  908.984188]  [<ffffffff81525b34>] ? dump_stack+0x40/0x50
[  908.984193]  [<ffffffff8106c521>] ? warn_slowpath_common+0x81/0xb0
[  908.984196]  [<ffffffff8106c59a>] ? warn_slowpath_fmt+0x4a/0x50
[  908.984199]  [<ffffffff81218352>] ? remove_proc_entry+0x112/0x160
[  908.984205]  [<ffffffffa05850e6>] ? bond_destroy_proc_dir+0x26/0x30
[bonding]
[  908.984208]  [<ffffffffa057540e>] ? bond_net_exit+0x8e/0xa0 [bonding]
[  908.984217]  [<ffffffff8142f407>] ? ops_exit_list.isra.4+0x37/0x70
[  908.984225]  [<ffffffff8142f52d>] ?
unregister_pernet_operations+0x8d/0xd0
[  908.984228]  [<ffffffff8142f58d>] ?
unregister_pernet_subsys+0x1d/0x30
[  908.984232]  [<ffffffffa0585269>] ? bonding_exit+0x23/0xdba [bonding]
[  908.984236]  [<ffffffff810e28ba>] ? SyS_delete_module+0x18a/0x250
[  908.984241]  [<ffffffff81086f99>] ? task_work_run+0x89/0xc0
[  908.984244]  [<ffffffff8152b732>] ?
entry_SYSCALL_64_fastpath+0x16/0x75
[  908.984247] ---[ end trace 7c006ed4abbef24b ]---

Thus remove the proc entry manually if bond_release_and_destroy() is
used. Because of the checks in bond_remove_proc_entry() it's not a
problem for a bond device to change namespaces (the bug fixed by the
Fixes commit) but since commit
f9399814927ad ("bonding: Don't allow bond devices to change network
namespaces.") that can't happen anyway.

Reported-by: Carol Soto <clsoto@linux.vnet.ibm.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Fixes: a64d49c3dd50 ("bonding: Manage /proc/net/bonding/ entries from
                      the netdev events")
Tested-by: Carol L Soto <clsoto@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/net/bonding/bond_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b143ce91e08..6b5baf01512 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2188,6 +2188,7 @@ static int  bond_release_and_destroy(struct net_device *bond_dev,
 		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
 		pr_info("%s: destroying bond %s.\n",
 			bond_dev->name, bond_dev->name);
+		bond_remove_proc_entry(bond);
 		unregister_netdevice(bond_dev);
 	}
 	return ret;

From cecc56226678a5a218e8b9859382ab0b0c27ad0b Mon Sep 17 00:00:00 2001
From: Edward Hyunkoo Jee <edjee@google.com>
Date: Tue, 21 Jul 2015 09:43:59 +0200
Subject: [PATCH 44/55] inet: frags: fix defragmented packet's IP header for
 af_packet

[ Upstream commit 0848f6428ba3a2e42db124d41ac6f548655735bf ]

When ip_frag_queue() computes positions, it assumes that the passed
sk_buff does not contain L2 headers.

However, when PACKET_FANOUT_FLAG_DEFRAG is used, IP reassembly
functions can be called on outgoing packets that contain L2 headers.

Also, IPv4 checksum is not corrected after reassembly.

Fixes: 7736d33f4262 ("packet: Add pre-defragmentation support for ipv4 fanouts.")
Signed-off-by: Edward Hyunkoo Jee <edjee@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv4/ip_fragment.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4c1884fed54..4d98a6b80b0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -356,7 +356,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	ihl = ip_hdrlen(skb);
 
 	/* Determine the position of this fragment. */
-	end = offset + skb->len - ihl;
+	end = offset + skb->len - skb_network_offset(skb) - ihl;
 	err = -EINVAL;
 
 	/* Is this the final fragment? */
@@ -386,7 +386,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		goto err;
 
 	err = -ENOMEM;
-	if (pskb_pull(skb, ihl) == NULL)
+	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
 		goto err;
 
 	err = pskb_trim_rcsum(skb, end - offset);
@@ -627,6 +627,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
+
+	ip_send_check(iph);
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;

From 3ebe377baaebbed39733c3eb03212b9c22601888 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 21 Jul 2015 16:33:50 +0200
Subject: [PATCH 45/55] netlink: don't hold mutex in rcu callback when
 releasing mmapd ring

[ Upstream commit 0470eb99b4721586ccac954faac3fa4472da0845 ]

Kirill A. Shutemov says:

This simple test-case trigers few locking asserts in kernel:

int main(int argc, char **argv)
{
        unsigned int block_size = 16 * 4096;
        struct nl_mmap_req req = {
                .nm_block_size          = block_size,
                .nm_block_nr            = 64,
                .nm_frame_size          = 16384,
                .nm_frame_nr            = 64 * block_size / 16384,
        };
        unsigned int ring_size;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
        if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
                exit(1);
        if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
                exit(1);

	ring_size = req.nm_block_nr * req.nm_block_size;
	mmap(NULL, 2 * ring_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	return 0;
}

+++ exited with 0 +++
BUG: sleeping function called from invalid context at /home/kas/git/public/linux-mm/kernel/locking/mutex.c:616
in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init
3 locks held by init/1:
 #0:  (reboot_mutex){+.+...}, at: [<ffffffff81080959>] SyS_reboot+0xa9/0x220
 #1:  ((reboot_notifier_list).rwsem){.+.+..}, at: [<ffffffff8107f379>] __blocking_notifier_call_chain+0x39/0x70
 #2:  (rcu_callback){......}, at: [<ffffffff810d32e0>] rcu_do_batch.isra.49+0x160/0x10c0
Preemption disabled at:[<ffffffff8145365f>] __delay+0xf/0x20

CPU: 1 PID: 1 Comm: init Not tainted 4.1.0-00009-gbddf4c4818e0 #253
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Debian-1.8.2-1 04/01/2014
 ffff88017b3d8000 ffff88027bc03c38 ffffffff81929ceb 0000000000000102
 0000000000000000 ffff88027bc03c68 ffffffff81085a9d 0000000000000002
 ffffffff81ca2a20 0000000000000268 0000000000000000 ffff88027bc03c98
Call Trace:
 <IRQ>  [<ffffffff81929ceb>] dump_stack+0x4f/0x7b
 [<ffffffff81085a9d>] ___might_sleep+0x16d/0x270
 [<ffffffff81085bed>] __might_sleep+0x4d/0x90
 [<ffffffff8192e96f>] mutex_lock_nested+0x2f/0x430
 [<ffffffff81932fed>] ? _raw_spin_unlock_irqrestore+0x5d/0x80
 [<ffffffff81464143>] ? __this_cpu_preempt_check+0x13/0x20
 [<ffffffff8182fc3d>] netlink_set_ring+0x1ed/0x350
 [<ffffffff8182e000>] ? netlink_undo_bind+0x70/0x70
 [<ffffffff8182fe20>] netlink_sock_destruct+0x80/0x150
 [<ffffffff817e484d>] __sk_free+0x1d/0x160
 [<ffffffff817e49a9>] sk_free+0x19/0x20
[..]

Cong Wang says:

We can't hold mutex lock in a rcu callback, [..]

Thomas Graf says:

The socket should be dead at this point. It might be simpler to
add a netlink_release_ring() function which doesn't require
locking at all.

Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Diagnosed-by: Cong Wang <cwang@twopensource.com>
Suggested-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/netlink/af_netlink.c | 79 ++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 32 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f7ad5c630b6..56ff3b45227 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -214,25 +214,52 @@ err1:
 	return NULL;
 }
 
+
+static void
+__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
+		   unsigned int order)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct sk_buff_head *queue;
+	struct netlink_ring *ring;
+
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+
+	spin_lock_bh(&queue->lock);
+
+	ring->frame_max		= req->nm_frame_nr - 1;
+	ring->head		= 0;
+	ring->frame_size	= req->nm_frame_size;
+	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
+
+	swap(ring->pg_vec_len, req->nm_block_nr);
+	swap(ring->pg_vec_order, order);
+	swap(ring->pg_vec, pg_vec);
+
+	__skb_queue_purge(queue);
+	spin_unlock_bh(&queue->lock);
+
+	WARN_ON(atomic_read(&nlk->mapped));
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+}
+
 static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool closing, bool tx_ring)
+			    bool tx_ring)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 	struct netlink_ring *ring;
-	struct sk_buff_head *queue;
 	void **pg_vec = NULL;
 	unsigned int order = 0;
-	int err;
 
 	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
 
-	if (!closing) {
-		if (atomic_read(&nlk->mapped))
-			return -EBUSY;
-		if (atomic_read(&ring->pending))
-			return -EBUSY;
-	}
+	if (atomic_read(&nlk->mapped))
+		return -EBUSY;
+	if (atomic_read(&ring->pending))
+		return -EBUSY;
 
 	if (req->nm_block_nr) {
 		if (ring->pg_vec != NULL)
@@ -264,31 +291,19 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
 			return -EINVAL;
 	}
 
-	err = -EBUSY;
 	mutex_lock(&nlk->pg_vec_lock);
-	if (closing || atomic_read(&nlk->mapped) == 0) {
-		err = 0;
-		spin_lock_bh(&queue->lock);
-
-		ring->frame_max		= req->nm_frame_nr - 1;
-		ring->head		= 0;
-		ring->frame_size	= req->nm_frame_size;
-		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-		swap(ring->pg_vec_len, req->nm_block_nr);
-		swap(ring->pg_vec_order, order);
-		swap(ring->pg_vec, pg_vec);
-
-		__skb_queue_purge(queue);
-		spin_unlock_bh(&queue->lock);
-
-		WARN_ON(atomic_read(&nlk->mapped));
+	if (atomic_read(&nlk->mapped) == 0) {
+		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
+		mutex_unlock(&nlk->pg_vec_lock);
+		return 0;
 	}
+
 	mutex_unlock(&nlk->pg_vec_lock);
 
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->nm_block_nr);
-	return err;
+
+	return -EBUSY;
 }
 
 static void netlink_mm_open(struct vm_area_struct *vma)
@@ -762,10 +777,10 @@ static void netlink_sock_destruct(struct sock *sk)
 
 		memset(&req, 0, sizeof(req));
 		if (nlk->rx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, false);
+			__netlink_set_ring(sk, &req, false, NULL, 0);
 		memset(&req, 0, sizeof(req));
 		if (nlk->tx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, true);
+			__netlink_set_ring(sk, &req, true, NULL, 0);
 	}
 #endif /* CONFIG_NETLINK_MMAP */
 
@@ -2017,7 +2032,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			return -EINVAL;
 		if (copy_from_user(&req, optval, sizeof(req)))
 			return -EFAULT;
-		err = netlink_set_ring(sk, &req, false,
+		err = netlink_set_ring(sk, &req,
 				       optname == NETLINK_TX_RING);
 		break;
 	}

From 7cd1033116c708c7e3e772cbf053fd9c98163570 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 1 Aug 2015 15:33:26 +0300
Subject: [PATCH 46/55] rds: fix an integer overflow test in
 rds_info_getsockopt()

[ Upstream commit 468b732b6f76b138c0926eadf38ac88467dcd271 ]

"len" is a signed integer.  We check that len is not negative, so it
goes from zero to INT_MAX.  PAGE_SIZE is unsigned long so the comparison
is type promoted to unsigned long.  ULONG_MAX - 4095 is a higher than
INT_MAX so the condition can never be true.

I don't know if this is harmful but it seems safe to limit "len" to
INT_MAX - 4095.

Fixes: a8c879a7ee98 ('RDS: Info and stats')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/rds/info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/info.c b/net/rds/info.c
index 9a6b4f66187..140a44a5f7b 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -176,7 +176,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 
 	/* check for all kinds of wrapping and the like */
 	start = (unsigned long)optval;
-	if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
+	if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
 		ret = -EINVAL;
 		goto out;
 	}

From 6d8c190531d0020870b93eee6a3933cb39fb1f52 Mon Sep 17 00:00:00 2001
From: huaibin Wang <huaibin.wang@6wind.com>
Date: Tue, 25 Aug 2015 16:20:34 +0200
Subject: [PATCH 47/55] ip6_gre: release cached dst on tunnel removal

[ Upstream commit d4257295ba1b389c693b79de857a96e4b7cd8ac0 ]

When a tunnel is deleted, the cached dst entry should be released.

This problem may prevent the removal of a netns (seen with a x-netns IPv6
gre tunnel):
  unregister_netdevice: waiting for lo to become free. Usage count = 3

CC: Dmitry Kozlov <xeb@mail.ru>
Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Signed-off-by: huaibin Wang <huaibin.wang@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv6/ip6_gre.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 65156a73b3f..bf6233cdb75 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -359,6 +359,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
 
 	ip6gre_tunnel_unlink(ign, netdev_priv(dev));
+	ip6_tnl_dst_reset(netdev_priv(dev));
 	dev_put(dev);
 }
 

From fe474009a2167693c1a1cc2f396cb20621770450 Mon Sep 17 00:00:00 2001
From: Eugene Shatokhin <eugene.shatokhin@rosalab.ru>
Date: Mon, 24 Aug 2015 23:13:42 +0300
Subject: [PATCH 48/55] usbnet: Get EVENT_NO_RUNTIME_PM bit before it is
 cleared

[ Upstream commit f50791ac1aca1ac1b0370d62397b43e9f831421a ]

It is needed to check EVENT_NO_RUNTIME_PM bit of dev->flags in
usbnet_stop(), but its value should be read before it is cleared
when dev->flags is set to 0.

The problem was spotted and the fix was provided by
Oliver Neukum <oneukum@suse.de>.

Signed-off-by: Eugene Shatokhin <eugene.shatokhin@rosalab.ru>
Acked-by: Oliver Neukum <oneukum@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/net/usb/usbnet.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 3d50e7db141..fb068ada0c5 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -753,7 +753,7 @@ int usbnet_stop (struct net_device *net)
 {
 	struct usbnet		*dev = netdev_priv(net);
 	struct driver_info	*info = dev->driver_info;
-	int			retval, pm;
+	int			retval, pm, mpn;
 
 	clear_bit(EVENT_DEV_OPEN, &dev->flags);
 	netif_stop_queue (net);
@@ -784,6 +784,8 @@ int usbnet_stop (struct net_device *net)
 
 	usbnet_purge_paused_rxq(dev);
 
+	mpn = !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags);
+
 	/* deferred work (task, timer, softirq) must also stop.
 	 * can't flush_scheduled_work() until we drop rtnl (later),
 	 * else workers could deadlock; so make workers a NOP.
@@ -794,8 +796,7 @@ int usbnet_stop (struct net_device *net)
 	if (!pm)
 		usb_autopm_put_interface(dev->intf);
 
-	if (info->manage_power &&
-	    !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags))
+	if (info->manage_power && mpn)
 		info->manage_power(dev, 0);
 	else
 		usb_autopm_put_interface(dev->intf);

From 162e3d1c34b0c7b4a2bd016332b48c171a926965 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Sep 2015 00:29:07 +0200
Subject: [PATCH 49/55] ipv6: fix exthdrs offload registration in out_rt path

[ Upstream commit e41b0bedba0293b9e1e8d1e8ed553104b9693656 ]

We previously register IPPROTO_ROUTING offload under inet6_add_offload(),
but in error path, we try to unregister it with inet_del_offload(). This
doesn't seem correct, it should actually be inet6_del_offload(), also
ipv6_exthdrs_offload_exit() from that commit seems rather incorrect (it
also uses rthdr_offload twice), but it got removed entirely later on.

Fixes: 3336288a9fea ("ipv6: Switch to using new offload infrastructure.")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv6/exthdrs_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c
index 447a7fbd1bb..f5e2ba1c18b 100644
--- a/net/ipv6/exthdrs_offload.c
+++ b/net/ipv6/exthdrs_offload.c
@@ -36,6 +36,6 @@ out:
 	return ret;
 
 out_rt:
-	inet_del_offload(&rthdr_offload, IPPROTO_ROUTING);
+	inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING);
 	goto out;
 }

From 02b5ca779d6d15b3af9ea79e1be6e9eb1636f286 Mon Sep 17 00:00:00 2001
From: Richard Laing <richard.laing@alliedtelesis.co.nz>
Date: Thu, 3 Sep 2015 13:52:31 +1200
Subject: [PATCH 50/55] net/ipv6: Correct PIM6 mrt_lock handling

[ Upstream commit 25b4a44c19c83d98e8c0807a7ede07c1f28eab8b ]

In the IPv6 multicast routing code the mrt_lock was not being released
correctly in the MFC iterator, as a result adding or deleting a MIF would
cause a hang because the mrt_lock could not be acquired.

This fix is a copy of the code for the IPv4 case and ensures that the lock
is released correctly.

Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
Acked-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv6/ip6mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2c84072b1da..57dd3e7d86c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -552,7 +552,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &mrt->mfc6_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == mrt->mfc6_cache_array)
+	else if (it->cache == &mrt->mfc6_cache_array[it->ct])
 		read_unlock(&mrt_lock);
 }
 

From e7bb902b26f1e8f7c1a4f0cc7b3abfd9b3fbb108 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 10 Sep 2015 17:31:15 -0300
Subject: [PATCH 51/55] sctp: fix race on protocol/netns initialization

[ Upstream commit 8e2d61e0aed2b7c4ecb35844fe07e0b2b762dee4 ]

Consider sctp module is unloaded and is being requested because an user
is creating a sctp socket.

During initialization, sctp will add the new protocol type and then
initialize pernet subsys:

        status = sctp_v4_protosw_init();
        if (status)
                goto err_protosw_init;

        status = sctp_v6_protosw_init();
        if (status)
                goto err_v6_protosw_init;

        status = register_pernet_subsys(&sctp_net_ops);

The problem is that after those calls to sctp_v{4,6}_protosw_init(), it
is possible for userspace to create SCTP sockets like if the module is
already fully loaded. If that happens, one of the possible effects is
that we will have readers for net->sctp.local_addr_list list earlier
than expected and sctp_net_init() does not take precautions while
dealing with that list, leading to a potential panic but not limited to
that, as sctp_sock_init() will copy a bunch of blank/partially
initialized values from net->sctp.

The race happens like this:

     CPU 0                           |  CPU 1
  socket()                           |
   __sock_create                     | socket()
    inet_create                      |  __sock_create
     list_for_each_entry_rcu(        |
        answer, &inetsw[sock->type], |
        list) {                      |   inet_create
      /* no hits */                  |
     if (unlikely(err)) {            |
      ...                            |
      request_module()               |
      /* socket creation is blocked  |
       * the module is fully loaded  |
       */                            |
       sctp_init                     |
        sctp_v4_protosw_init         |
         inet_register_protosw       |
          list_add_rcu(&p->list,     |
                       last_perm);   |
                                     |  list_for_each_entry_rcu(
                                     |     answer, &inetsw[sock->type],
        sctp_v6_protosw_init         |     list) {
                                     |     /* hit, so assumes protocol
                                     |      * is already loaded
                                     |      */
                                     |  /* socket creation continues
                                     |   * before netns is initialized
                                     |   */
        register_pernet_subsys       |

Simply inverting the initialization order between
register_pernet_subsys() and sctp_v4_protosw_init() is not possible
because register_pernet_subsys() will create a control sctp socket, so
the protocol must be already visible by then. Deferring the socket
creation to a work-queue is not good specially because we loose the
ability to handle its errors.

So, as suggested by Vlad, the fix is to split netns initialization in
two moments: defaults and control socket, so that the defaults are
already loaded by when we register the protocol, while control socket
initialization is kept at the same moment it is today.

Fixes: 4db67e808640 ("sctp: Make the address lists per network namespace")
Signed-off-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/sctp/protocol.c | 64 +++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5a3c1c0a84a..57c2c4c0c97 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1170,7 +1170,7 @@ static void sctp_v4_del_protocol(void)
 	unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
 }
 
-static int __net_init sctp_net_init(struct net *net)
+static int __net_init sctp_defaults_init(struct net *net)
 {
 	int status;
 
@@ -1263,12 +1263,6 @@ static int __net_init sctp_net_init(struct net *net)
 
 	sctp_dbg_objcnt_init(net);
 
-	/* Initialize the control inode/socket for handling OOTB packets.  */
-	if ((status = sctp_ctl_sock_init(net))) {
-		pr_err("Failed to initialize the SCTP control sock\n");
-		goto err_ctl_sock_init;
-	}
-
 	/* Initialize the local address list. */
 	INIT_LIST_HEAD(&net->sctp.local_addr_list);
 	spin_lock_init(&net->sctp.local_addr_lock);
@@ -1284,9 +1278,6 @@ static int __net_init sctp_net_init(struct net *net)
 
 	return 0;
 
-err_ctl_sock_init:
-	sctp_dbg_objcnt_exit(net);
-	sctp_proc_exit(net);
 err_init_proc:
 	cleanup_sctp_mibs(net);
 err_init_mibs:
@@ -1295,15 +1286,12 @@ err_sysctl_register:
 	return status;
 }
 
-static void __net_exit sctp_net_exit(struct net *net)
+static void __net_exit sctp_defaults_exit(struct net *net)
 {
 	/* Free the local address list */
 	sctp_free_addr_wq(net);
 	sctp_free_local_addr_list(net);
 
-	/* Free the control endpoint.  */
-	inet_ctl_sock_destroy(net->sctp.ctl_sock);
-
 	sctp_dbg_objcnt_exit(net);
 
 	sctp_proc_exit(net);
@@ -1311,9 +1299,32 @@ static void __net_exit sctp_net_exit(struct net *net)
 	sctp_sysctl_net_unregister(net);
 }
 
-static struct pernet_operations sctp_net_ops = {
-	.init = sctp_net_init,
-	.exit = sctp_net_exit,
+static struct pernet_operations sctp_defaults_ops = {
+	.init = sctp_defaults_init,
+	.exit = sctp_defaults_exit,
+};
+
+static int __net_init sctp_ctrlsock_init(struct net *net)
+{
+	int status;
+
+	/* Initialize the control inode/socket for handling OOTB packets.  */
+	status = sctp_ctl_sock_init(net);
+	if (status)
+		pr_err("Failed to initialize the SCTP control sock\n");
+
+	return status;
+}
+
+static void __net_init sctp_ctrlsock_exit(struct net *net)
+{
+	/* Free the control endpoint.  */
+	inet_ctl_sock_destroy(net->sctp.ctl_sock);
+}
+
+static struct pernet_operations sctp_ctrlsock_ops = {
+	.init = sctp_ctrlsock_init,
+	.exit = sctp_ctrlsock_exit,
 };
 
 /* Initialize the universe into something sensible.  */
@@ -1448,8 +1459,11 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_v4_pf_init();
 	sctp_v6_pf_init();
 
-	status = sctp_v4_protosw_init();
+	status = register_pernet_subsys(&sctp_defaults_ops);
+	if (status)
+		goto err_register_defaults;
 
+	status = sctp_v4_protosw_init();
 	if (status)
 		goto err_protosw_init;
 
@@ -1457,9 +1471,9 @@ SCTP_STATIC __init int sctp_init(void)
 	if (status)
 		goto err_v6_protosw_init;
 
-	status = register_pernet_subsys(&sctp_net_ops);
+	status = register_pernet_subsys(&sctp_ctrlsock_ops);
 	if (status)
-		goto err_register_pernet_subsys;
+		goto err_register_ctrlsock;
 
 	status = sctp_v4_add_protocol();
 	if (status)
@@ -1476,12 +1490,14 @@ out:
 err_v6_add_protocol:
 	sctp_v4_del_protocol();
 err_add_protocol:
-	unregister_pernet_subsys(&sctp_net_ops);
-err_register_pernet_subsys:
+	unregister_pernet_subsys(&sctp_ctrlsock_ops);
+err_register_ctrlsock:
 	sctp_v6_protosw_exit();
 err_v6_protosw_init:
 	sctp_v4_protosw_exit();
 err_protosw_init:
+	unregister_pernet_subsys(&sctp_defaults_ops);
+err_register_defaults:
 	sctp_v4_pf_exit();
 	sctp_v6_pf_exit();
 	sctp_sysctl_unregister();
@@ -1514,12 +1530,14 @@ SCTP_STATIC __exit void sctp_exit(void)
 	sctp_v6_del_protocol();
 	sctp_v4_del_protocol();
 
-	unregister_pernet_subsys(&sctp_net_ops);
+	unregister_pernet_subsys(&sctp_ctrlsock_ops);
 
 	/* Free protosw registrations */
 	sctp_v6_protosw_exit();
 	sctp_v4_protosw_exit();
 
+	unregister_pernet_subsys(&sctp_defaults_ops);
+
 	/* Unregister with socket layer. */
 	sctp_v6_pf_exit();
 	sctp_v4_pf_exit();

From d0550a3f2d313a5310ace03299d45ef63edfb750 Mon Sep 17 00:00:00 2001
From: Wilson Kok <wkok@cumulusnetworks.com>
Date: Tue, 22 Sep 2015 21:40:22 -0700
Subject: [PATCH 52/55] fib_rules: fix fib rule dumps across multiple skbs

[ Upstream commit 41fc014332d91ee90c32840bf161f9685b7fbf2b ]

dump_rules returns skb length and not error.
But when family == AF_UNSPEC, the caller of dump_rules
assumes that it returns an error. Hence, when family == AF_UNSPEC,
we continue trying to dump on -EMSGSIZE errors resulting in
incorrect dump idx carried between skbs belonging to the same dump.
This results in fib rule dump always only dumping rules that fit
into the first skb.

This patch fixes dump_rules to return error so that we exit correctly
and idx is correctly maintained between skbs that are part of the
same dump.

Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/core/fib_rules.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 55e08e2de3a..627e517077e 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -596,15 +596,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	int idx = 0;
 	struct fib_rule *rule;
+	int err = 0;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
 		if (idx < cb->args[1])
 			goto skip;
 
-		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
-				     cb->nlh->nlmsg_seq, RTM_NEWRULE,
-				     NLM_F_MULTI, ops) < 0)
+		err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
+				       cb->nlh->nlmsg_seq, RTM_NEWRULE,
+				       NLM_F_MULTI, ops);
+		if (err)
 			break;
 skip:
 		idx++;
@@ -613,7 +615,7 @@ skip:
 	cb->args[1] = idx;
 	rules_ops_put(ops);
 
-	return skb->len;
+	return err;
 }
 
 static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
@@ -629,7 +631,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
 		if (ops == NULL)
 			return -EAFNOSUPPORT;
 
-		return dump_rules(skb, cb, ops);
+		dump_rules(skb, cb, ops);
+
+		return skb->len;
 	}
 
 	rcu_read_lock();

From e6478de4fad8e6d7cad3ca3440ee4da599fb0b2f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 24 May 2015 09:25:00 -0500
Subject: [PATCH 53/55] vfs: Remove incorrect debugging WARN in prepend_path

commit 93e3bce6287e1fb3e60d3324ed08555b5bbafa89 upstream.

The warning message in prepend_path is unclear and outdated.  It was
added as a warning that the mechanism for generating names of pseudo
files had been removed from prepend_path and d_dname should be used
instead.  Unfortunately the warning reads like a general warning,
making it unclear what to do with it.

Remove the warning.  The transition it was added to warn about is long
over, and I added code several years ago which in rare cases causes
the warning to fire on legitimate code, and the warning is now firing
and scaring people for no good reason.

Reported-by: Ivan Delalande <colona@arista.com>
Reported-by: Omar Sandoval <osandov@osandov.com>
Fixes: f48cfddc6729e ("vfs: In d_path don't call d_dname on a mount point")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
[ vlee: Backported to 3.10. Adjusted context. ]
Signed-off-by: Vinson Lee <vlee@twitter.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dcache.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 90be2809e15..f1e80178597 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2569,15 +2569,6 @@ static int prepend_path(const struct path *path,
 	return error;
 
 global_root:
-	/*
-	 * Filesystems needing to implement special "root names"
-	 * should do so with ->d_dname()
-	 */
-	if (IS_ROOT(dentry) &&
-	    (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
-		WARN(1, "Root dentry has weird name <%.*s>\n",
-		     (int) dentry->d_name.len, dentry->d_name.name);
-	}
 	if (!slash)
 		error = prepend(buffer, buflen, "/", 1);
 	if (!error)

From d565d87eb95146aef43f0c60a88d4cfdadebb16c Mon Sep 17 00:00:00 2001
From: Markus Pargmann <mpa@pengutronix.de>
Date: Wed, 29 Jul 2015 15:46:03 +0200
Subject: [PATCH 54/55] Revert "iio: bmg160: IIO_BUFFER and
 IIO_TRIGGERED_BUFFER are required"

This reverts commit 35c45e8bce3c92fb1ff94d376f1d4bfaae079d66 which was
commit 06d2f6ca5a38abe92f1f3a132b331eee773868c3 upstream as it should
not have been applied.


Reported-by: Luis Henriques <luis.henriques@canonical.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/iio/gyro/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/iio/gyro/Kconfig b/drivers/iio/gyro/Kconfig
index 953a0621c6f..107cafcb89d 100644
--- a/drivers/iio/gyro/Kconfig
+++ b/drivers/iio/gyro/Kconfig
@@ -73,8 +73,7 @@ config IIO_ST_GYRO_SPI_3AXIS
 config ITG3200
 	tristate "InvenSense ITG3200 Digital 3-Axis Gyroscope I2C driver"
 	depends on I2C
-	select IIO_BUFFER
-	select IIO_TRIGGERED_BUFFER
+	select IIO_TRIGGERED_BUFFER if IIO_BUFFER
 	help
 	  Say yes here to add support for the InvenSense ITG3200 digital
 	  3-axis gyroscope sensor.

From f5552cd830e58c46dffae3617b3ce0c839771981 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 1 Oct 2015 12:07:55 +0200
Subject: [PATCH 55/55] Linux 3.10.90

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index ca82393e1ee..ce741a9f5b1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 3
 PATCHLEVEL = 10
-SUBLEVEL = 89
+SUBLEVEL = 90
 EXTRAVERSION =
 NAME = TOSSUG Baby Fish