From f8d9605243280f1870dd2c6c37a735b925c15f3c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@infradead.org>
Date: Thu, 7 Jul 2011 00:28:35 +0000
Subject: sctp: Enforce retransmission limit during shutdown

When initiating a graceful shutdown while having data chunks
on the retransmission queue with a peer which is in zero
window mode the shutdown is never completed because the
retransmission error count is reset periodically by the
following two rules:

 - Do not timeout association while doing zero window probe.
 - Reset overall error count when a heartbeat request has
   been acknowledged.

The graceful shutdown will wait for all outstanding TSN to
be acknowledged before sending the SHUTDOWN request. This
never happens due to the peer's zero window not acknowledging
the continuously retransmitted data chunks. Although the
error counter is incremented for each failed retransmission,
the receiving of the SACK announcing the zero window clears
the error count again immediately. Also heartbeat requests
continue to be sent periodically. The peer acknowledges these
requests causing the error counter to be reset as well.

This patch changes behaviour to only reset the overall error
counter for the above rules while not in shutdown. After
reaching the maximum number of retransmission attempts, the
T5 shutdown guard timer is scheduled to give the receiver
some additional time to recover. The timer is stopped as soon
as the receiver acknowledges any data.

The issue can be easily reproduced by establishing a sctp
association over the loopback device, constantly queueing
data at the sender while not reading any at the receiver.
Wait for the window to reach zero, then initiate a shutdown
by killing both processes simultaneously. The association
will never be freed and the chunks on the retransmission
queue will be retransmitted indefinitely.

Signed-off-by: Thomas Graf <tgraf@infradead.org>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/command.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h
index dd6847e5d6e4..6506458ccd33 100644
--- a/include/net/sctp/command.h
+++ b/include/net/sctp/command.h
@@ -63,6 +63,7 @@ typedef enum {
 	SCTP_CMD_ECN_ECNE,	/* Do delayed ECNE processing. */
 	SCTP_CMD_ECN_CWR,	/* Do delayed CWR processing.  */
 	SCTP_CMD_TIMER_START,	/* Start a timer.  */
+	SCTP_CMD_TIMER_START_ONCE, /* Start a timer once */
 	SCTP_CMD_TIMER_RESTART,	/* Restart a timer. */
 	SCTP_CMD_TIMER_STOP,	/* Stop a timer. */
 	SCTP_CMD_INIT_CHOOSE_TRANSPORT, /* Choose transport for an INIT. */
-- 
cgit v1.2.3


From cd4fcc704f30f2064ab30b5300d44d431e46db50 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@infradead.org>
Date: Fri, 8 Jul 2011 04:37:46 +0000
Subject: sctp: ABORT if receive, reassmbly, or reodering queue is not empty
 while closing socket

Trigger user ABORT if application closes a socket which has data
queued on the socket receive queue or chunks waiting on the
reassembly or ordering queue as this would imply data being lost
which defeats the point of a graceful shutdown.

This behavior is already practiced in TCP.

We do not check the input queue because that would mean to parse
all chunks on it to look for unacknowledged data which seems too
much of an effort. Control chunks or duplicated chunks may also
be in the input queue and should not be stopping a graceful
shutdown.

Signed-off-by: Thomas Graf <tgraf@infradead.org>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpevent.h |  2 +-
 net/sctp/socket.c           | 13 ++++++++-----
 net/sctp/ulpevent.c         | 16 +++++++++++++---
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 99b027b2adce..ca4693b4e09e 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -80,7 +80,7 @@ static inline struct sctp_ulpevent *sctp_skb2event(struct sk_buff *skb)
 
 void sctp_ulpevent_free(struct sctp_ulpevent *);
 int sctp_ulpevent_is_notification(const struct sctp_ulpevent *);
-void sctp_queue_purge_ulpevents(struct sk_buff_head *list);
+unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list);
 
 struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
 	const struct sctp_association *asoc,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 08c6238802de..d3ccf7973c59 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1384,6 +1384,7 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 	struct sctp_endpoint *ep;
 	struct sctp_association *asoc;
 	struct list_head *pos, *temp;
+	unsigned int data_was_unread;
 
 	SCTP_DEBUG_PRINTK("sctp_close(sk: 0x%p, timeout:%ld)\n", sk, timeout);
 
@@ -1393,6 +1394,10 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 
 	ep = sctp_sk(sk)->ep;
 
+	/* Clean up any skbs sitting on the receive queue.  */
+	data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue);
+	data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby);
+
 	/* Walk all associations on an endpoint.  */
 	list_for_each_safe(pos, temp, &ep->asocs) {
 		asoc = list_entry(pos, struct sctp_association, asocs);
@@ -1410,7 +1415,9 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 			}
 		}
 
-		if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
+		    !skb_queue_empty(&asoc->ulpq.reasm) ||
+		    (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
 			struct sctp_chunk *chunk;
 
 			chunk = sctp_make_abort_user(asoc, NULL, 0);
@@ -1420,10 +1427,6 @@ SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
 			sctp_primitive_SHUTDOWN(asoc, NULL);
 	}
 
-	/* Clean up any skbs sitting on the receive queue.  */
-	sctp_queue_purge_ulpevents(&sk->sk_receive_queue);
-	sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby);
-
 	/* On a TCP-style socket, block for at most linger_time if set. */
 	if (sctp_style(sk, TCP) && timeout)
 		sctp_wait_for_close(sk, timeout);
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e70e5fc87890..8a84017834c2 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -1081,9 +1081,19 @@ void sctp_ulpevent_free(struct sctp_ulpevent *event)
 }
 
 /* Purge the skb lists holding ulpevents. */
-void sctp_queue_purge_ulpevents(struct sk_buff_head *list)
+unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list)
 {
 	struct sk_buff *skb;
-	while ((skb = skb_dequeue(list)) != NULL)
-		sctp_ulpevent_free(sctp_skb2event(skb));
+	unsigned int data_unread = 0;
+
+	while ((skb = skb_dequeue(list)) != NULL) {
+		struct sctp_ulpevent *event = sctp_skb2event(skb);
+
+		if (!sctp_ulpevent_is_notification(event))
+			data_unread += skb->len;
+
+		sctp_ulpevent_free(event);
+	}
+
+	return data_unread;
 }
-- 
cgit v1.2.3


From 07e49a7a31153a95caa270d8ad7350a0bcd4d511 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 6 Jul 2011 20:44:25 +0200
Subject: ACPI: Fix lockdep false positives in acpi_power_off()

All ACPICA locks are allocated by the same function,
acpi_os_create_lock(), with the help of a local variable called
"lock".  Thus, when lockdep is enabled, it uses "lock" as the
name of all those locks and regards them as instances of the same
lock, which causes it to report possible locking problems with them
when there aren't any.

To work around this problem, define acpi_os_create_lock() as a macro
and make it pass its argument to spin_lock_init(), so that lockdep
uses it as the name of the new lock.  Define this macron in a
Linux-specific file, to minimize the resulting modifications of
the OS-independent ACPICA parts.

This change is based on an earlier patch from Andrea Righi and it
addresses a regression from 2.6.39 tracked as
https://bugzilla.kernel.org/show_bug.cgi?id=38152

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Tested-by: Andrea Righi <andrea@betterlinux.com>
Reviewed-by: Florian Mickler <florian@mickler.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/osl.c              | 17 -----------------
 include/acpi/acpiosxf.h         |  3 +++
 include/acpi/platform/aclinux.h | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 52ca9649d769..372f9b70f7f4 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1332,23 +1332,6 @@ int acpi_resources_are_enforced(void)
 }
 EXPORT_SYMBOL(acpi_resources_are_enforced);
 
-/*
- * Create and initialize a spinlock.
- */
-acpi_status
-acpi_os_create_lock(acpi_spinlock *out_handle)
-{
-	spinlock_t *lock;
-
-	lock = ACPI_ALLOCATE(sizeof(spinlock_t));
-	if (!lock)
-		return AE_NO_MEMORY;
-	spin_lock_init(lock);
-	*out_handle = lock;
-
-	return AE_OK;
-}
-
 /*
  * Deallocate the memory for a spinlock.
  */
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index a756bc8d866d..4543b6f75867 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -98,8 +98,11 @@ acpi_os_table_override(struct acpi_table_header *existing_table,
 /*
  * Spinlock primitives
  */
+
+#ifndef acpi_os_create_lock
 acpi_status
 acpi_os_create_lock(acpi_spinlock *out_handle);
+#endif
 
 void acpi_os_delete_lock(acpi_spinlock handle);
 
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 5d2a5e9544d9..2ce1be9f6291 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -159,6 +159,24 @@ static inline void *acpi_os_acquire_object(acpi_cache_t * cache)
 	} while (0)
 #endif
 
+/*
+ * When lockdep is enabled, the spin_lock_init() macro stringifies it's
+ * argument and uses that as a name for the lock in debugging.
+ * By executing spin_lock_init() in a macro the key changes from "lock" for
+ * all locks to the name of the argument of acpi_os_create_lock(), which
+ * prevents lockdep from reporting false positives for ACPICA locks.
+ */
+#define acpi_os_create_lock(__handle)				\
+({								\
+	spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));	\
+								\
+	if (lock) {						\
+		*(__handle) = lock;				\
+		spin_lock_init(*(__handle));			\
+	}							\
+	lock ? AE_OK : AE_NO_MEMORY;				\
+})
+
 #endif /* __KERNEL__ */
 
 #endif /* __ACLINUX_H__ */
-- 
cgit v1.2.3


From f39b2dd9d065151a04f5996656d1f27a7eb32d45 Mon Sep 17 00:00:00 2001
From: Philip Rakity <prakity@marvell.com>
Date: Thu, 7 Jul 2011 09:04:55 -0700
Subject: mmc: core: Bus width testing needs to handle suspend/resume

On reading the ext_csd for the first time (in 1 bit mode), save the
ext_csd information needed for bus width compare.

On every pass we make re-reading the ext_csd, compare the data
against the saved ext_csd data.

This fixes a regression introduced in 3.0-rc1 by 08ee80cc397ac1a3
("mmc: core: eMMC bus width may not work on all platforms"), which
incorrectly assumed we would be re-reading the ext_csd at resume-
time.

Signed-off-by: Philip Rakity <prakity@marvell.com>
Tested-by: Jaehoon Chung <jh80.chung@samsung.com>
Signed-off-by: Chris Ball <cjb@laptop.org>
---
 drivers/mmc/core/mmc.c   | 77 +++++++++++++++++++++++++++++++-----------------
 include/linux/mmc/card.h | 13 ++++++++
 2 files changed, 63 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 2a7e43bc796d..aa7d1d79b8c5 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -247,12 +247,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		return 0;
 
 	/* Version is coded in the CSD_STRUCTURE byte in the EXT_CSD register */
+	card->ext_csd.raw_ext_csd_structure = ext_csd[EXT_CSD_STRUCTURE];
 	if (card->csd.structure == 3) {
-		int ext_csd_struct = ext_csd[EXT_CSD_STRUCTURE];
-		if (ext_csd_struct > 2) {
+		if (card->ext_csd.raw_ext_csd_structure > 2) {
 			printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
 				"version %d\n", mmc_hostname(card->host),
-					ext_csd_struct);
+					card->ext_csd.raw_ext_csd_structure);
 			err = -EINVAL;
 			goto out;
 		}
@@ -266,6 +266,10 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		goto out;
 	}
 
+	card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
+	card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
+	card->ext_csd.raw_sectors[2] = ext_csd[EXT_CSD_SEC_CNT + 2];
+	card->ext_csd.raw_sectors[3] = ext_csd[EXT_CSD_SEC_CNT + 3];
 	if (card->ext_csd.rev >= 2) {
 		card->ext_csd.sectors =
 			ext_csd[EXT_CSD_SEC_CNT + 0] << 0 |
@@ -277,7 +281,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		if (card->ext_csd.sectors > (2u * 1024 * 1024 * 1024) / 512)
 			mmc_card_set_blockaddr(card);
 	}
-
+	card->ext_csd.raw_card_type = ext_csd[EXT_CSD_CARD_TYPE];
 	switch (ext_csd[EXT_CSD_CARD_TYPE] & EXT_CSD_CARD_TYPE_MASK) {
 	case EXT_CSD_CARD_TYPE_DDR_52 | EXT_CSD_CARD_TYPE_52 |
 	     EXT_CSD_CARD_TYPE_26:
@@ -307,6 +311,11 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 			mmc_hostname(card->host));
 	}
 
+	card->ext_csd.raw_s_a_timeout = ext_csd[EXT_CSD_S_A_TIMEOUT];
+	card->ext_csd.raw_erase_timeout_mult =
+		ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT];
+	card->ext_csd.raw_hc_erase_grp_size =
+		ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 	if (card->ext_csd.rev >= 3) {
 		u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT];
 		card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG];
@@ -334,6 +343,16 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		card->ext_csd.boot_size = ext_csd[EXT_CSD_BOOT_MULT] << 17;
 	}
 
+	card->ext_csd.raw_hc_erase_gap_size =
+		ext_csd[EXT_CSD_PARTITION_ATTRIBUTE];
+	card->ext_csd.raw_sec_trim_mult =
+		ext_csd[EXT_CSD_SEC_TRIM_MULT];
+	card->ext_csd.raw_sec_erase_mult =
+		ext_csd[EXT_CSD_SEC_ERASE_MULT];
+	card->ext_csd.raw_sec_feature_support =
+		ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT];
+	card->ext_csd.raw_trim_mult =
+		ext_csd[EXT_CSD_TRIM_MULT];
 	if (card->ext_csd.rev >= 4) {
 		/*
 		 * Enhanced area feature support -- check whether the eMMC
@@ -341,7 +360,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		 * area offset and size to user by adding sysfs interface.
 		 */
 		if ((ext_csd[EXT_CSD_PARTITION_SUPPORT] & 0x2) &&
-				(ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
+		    (ext_csd[EXT_CSD_PARTITION_ATTRIBUTE] & 0x1)) {
 			u8 hc_erase_grp_sz =
 				ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE];
 			u8 hc_wp_grp_sz =
@@ -401,17 +420,17 @@ static inline void mmc_free_ext_csd(u8 *ext_csd)
 }
 
 
-static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
-			unsigned bus_width)
+static int mmc_compare_ext_csds(struct mmc_card *card, unsigned bus_width)
 {
 	u8 *bw_ext_csd;
 	int err;
 
+	if (bus_width == MMC_BUS_WIDTH_1)
+		return 0;
+
 	err = mmc_get_ext_csd(card, &bw_ext_csd);
-	if (err)
-		return err;
 
-	if ((ext_csd == NULL || bw_ext_csd == NULL)) {
+	if (err || bw_ext_csd == NULL) {
 		if (bus_width != MMC_BUS_WIDTH_1)
 			err = -EINVAL;
 		goto out;
@@ -421,35 +440,40 @@ static int mmc_compare_ext_csds(struct mmc_card *card, u8 *ext_csd,
 		goto out;
 
 	/* only compare read only fields */
-	err = (!(ext_csd[EXT_CSD_PARTITION_SUPPORT] ==
+	err = (!(card->ext_csd.raw_partition_support ==
 			bw_ext_csd[EXT_CSD_PARTITION_SUPPORT]) &&
-		(ext_csd[EXT_CSD_ERASED_MEM_CONT] ==
+		(card->ext_csd.raw_erased_mem_count ==
 			bw_ext_csd[EXT_CSD_ERASED_MEM_CONT]) &&
-		(ext_csd[EXT_CSD_REV] ==
+		(card->ext_csd.rev ==
 			bw_ext_csd[EXT_CSD_REV]) &&
-		(ext_csd[EXT_CSD_STRUCTURE] ==
+		(card->ext_csd.raw_ext_csd_structure ==
 			bw_ext_csd[EXT_CSD_STRUCTURE]) &&
-		(ext_csd[EXT_CSD_CARD_TYPE] ==
+		(card->ext_csd.raw_card_type ==
 			bw_ext_csd[EXT_CSD_CARD_TYPE]) &&
-		(ext_csd[EXT_CSD_S_A_TIMEOUT] ==
+		(card->ext_csd.raw_s_a_timeout ==
 			bw_ext_csd[EXT_CSD_S_A_TIMEOUT]) &&
-		(ext_csd[EXT_CSD_HC_WP_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_gap_size ==
 			bw_ext_csd[EXT_CSD_HC_WP_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT] ==
+		(card->ext_csd.raw_erase_timeout_mult ==
 			bw_ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]) &&
-		(ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE] ==
+		(card->ext_csd.raw_hc_erase_grp_size ==
 			bw_ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]) &&
-		(ext_csd[EXT_CSD_SEC_TRIM_MULT] ==
+		(card->ext_csd.raw_sec_trim_mult ==
 			bw_ext_csd[EXT_CSD_SEC_TRIM_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_ERASE_MULT] ==
+		(card->ext_csd.raw_sec_erase_mult ==
 			bw_ext_csd[EXT_CSD_SEC_ERASE_MULT]) &&
-		(ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT] ==
+		(card->ext_csd.raw_sec_feature_support ==
 			bw_ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT]) &&
-		(ext_csd[EXT_CSD_TRIM_MULT] ==
+		(card->ext_csd.raw_trim_mult ==
 			bw_ext_csd[EXT_CSD_TRIM_MULT]) &&
-		memcmp(&ext_csd[EXT_CSD_SEC_CNT],
-		       &bw_ext_csd[EXT_CSD_SEC_CNT],
-		       4) != 0);
+		(card->ext_csd.raw_sectors[0] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 0]) &&
+		(card->ext_csd.raw_sectors[1] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 1]) &&
+		(card->ext_csd.raw_sectors[2] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 2]) &&
+		(card->ext_csd.raw_sectors[3] ==
+			bw_ext_csd[EXT_CSD_SEC_CNT + 3]));
 	if (err)
 		err = -EINVAL;
 
@@ -770,7 +794,6 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 				 */
 				if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
 					err = mmc_compare_ext_csds(card,
-						ext_csd,
 						bus_width);
 				else
 					err = mmc_bus_test(card, bus_width);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index c6927a4d157f..6ad43554ac05 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -64,6 +64,19 @@ struct mmc_ext_csd {
 	unsigned long long	enhanced_area_offset;	/* Units: Byte */
 	unsigned int		enhanced_area_size;	/* Units: KB */
 	unsigned int		boot_size;		/* in bytes */
+	u8			raw_partition_support;	/* 160 */
+	u8			raw_erased_mem_count;	/* 181 */
+	u8			raw_ext_csd_structure;	/* 194 */
+	u8			raw_card_type;		/* 196 */
+	u8			raw_s_a_timeout;		/* 217 */
+	u8			raw_hc_erase_gap_size;	/* 221 */
+	u8			raw_erase_timeout_mult;	/* 223 */
+	u8			raw_hc_erase_grp_size;	/* 224 */
+	u8			raw_sec_trim_mult;	/* 229 */
+	u8			raw_sec_erase_mult;	/* 230 */
+	u8			raw_sec_feature_support;/* 231 */
+	u8			raw_trim_mult;		/* 232 */
+	u8			raw_sectors[4];		/* 212 - 4 bytes */
 };
 
 struct sd_scr {
-- 
cgit v1.2.3


From b4a03b9aa96cc186bf3cfd7a55cb7d7227f0cf4d Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 1 Jun 2011 23:54:02 +0800
Subject: ACPI: Fixes device power states array overflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 28c2103 added new state ACPI_STATE_D3_COLD, so the device power
states array must be expanded by one also.

v2: Use ACPI_D_STATE_COUNT instead of number 5 for the array size.

Reported-by: Dan Carpenter <error27@gmail.com>
Suggested-by: Oldřich Jedlička <oldium.pro@seznam.cz>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/acpi/acpi_bus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a10ef5914eb..6cd5b6403a7b 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -210,7 +210,7 @@ struct acpi_device_power_state {
 struct acpi_device_power {
 	int state;		/* Current state */
 	struct acpi_device_power_flags flags;
-	struct acpi_device_power_state states[4];	/* Power states (D0-D3) */
+	struct acpi_device_power_state states[ACPI_D_STATE_COUNT];	/* Power states (D0-D3Cold) */
 };
 
 /* Performance Management */
-- 
cgit v1.2.3


From 62f2a3a48bdc99822a24356e667e52c30df287c9 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Wed, 13 Jul 2011 14:10:29 +0000
Subject: net: remove NETIF_F_ALL_TX_OFFLOADS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no software fallback implemented for SCTP or FCoE checksumming,
and so it should not be passed on by software devices like bridge or bonding.

For VLAN devices, this is different. First, the driver for underlying device
should be prepared to get offloaded packets even when the feature is disabled
(especially if it advertises it in vlan_features). Second, devices under
VLANs do not get replaced without tearing down the VLAN first.

This fixes a mess I accidentally introduced while converting bonding to
ndo_fix_features.

NETIF_F_SOFT_FEATURES are removed from BOND_VLAN_FEATURES because they
are unused as of commit 712ae51afd.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 6 +++---
 include/linux/netdevice.h       | 6 ------
 net/8021q/vlan_dev.c            | 6 +++++-
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index eafe44a528ac..63c22b0bb5ad 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1428,9 +1428,9 @@ out:
 	return features;
 }
 
-#define BOND_VLAN_FEATURES	(NETIF_F_ALL_TX_OFFLOADS | \
-				 NETIF_F_SOFT_FEATURES | \
-				 NETIF_F_LRO)
+#define BOND_VLAN_FEATURES	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
+				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+				 NETIF_F_HIGHDMA | NETIF_F_LRO)
 
 static void bond_compute_features(struct bonding *bond)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 54b8b4d7b68f..9e19477991ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1097,12 +1097,6 @@ struct net_device {
 #define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
 				 NETIF_F_FSO)
 
-#define NETIF_F_ALL_TX_OFFLOADS	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
-				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
-				 NETIF_F_HIGHDMA | \
-				 NETIF_F_SCTP_CSUM | \
-				 NETIF_F_ALL_FCOE)
-
 	/*
 	 * If one device supports one of these features, then enable them
 	 * for all in netdev_increment_features.
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 86bff9b1ac47..6e82148edfc8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -528,7 +528,11 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
-	dev->hw_features = NETIF_F_ALL_TX_OFFLOADS;
+	dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+			   NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+			   NETIF_F_ALL_FCOE;
+
 	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
-- 
cgit v1.2.3


From e3c1620434ac77b618ce74c024ace3559602ac99 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexdeucher@gmail.com>
Date: Fri, 15 Jul 2011 14:39:10 +0000
Subject: drm/radeon/kms: add new NI pci ids

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@kernel.org
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/drm/drm_pciids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h
index e08f344c6cff..3d53efd25ab9 100644
--- a/include/drm/drm_pciids.h
+++ b/include/drm/drm_pciids.h
@@ -182,6 +182,7 @@
 	{0x1002, 0x6750, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6758, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6759, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \
+	{0x1002, 0x675F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6760, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6761, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6762, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
@@ -192,6 +193,7 @@
 	{0x1002, 0x6767, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6768, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6770, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
+	{0x1002, 0x6778, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6779, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6880, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6888, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \
-- 
cgit v1.2.3


From 9c8b04be443b33939f374a811c82abeebe0a61d1 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Sat, 25 Jun 2011 21:07:52 +0400
Subject: ACPI: constify ops structs

Structs battery_file, acpi_dock_ops, file_operations,
thermal_cooling_device_ops, thermal_zone_device_ops, kernel_param_ops
are not changed in runtime.  It is safe to make them const.
register_hotplug_dock_device() was altered to take const "ops" argument
to respect acpi_dock_ops' const notion.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Acked-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/battery.c             | 2 +-
 drivers/acpi/dock.c                | 4 ++--
 drivers/acpi/ec_sys.c              | 2 +-
 drivers/acpi/fan.c                 | 2 +-
 drivers/acpi/processor_thermal.c   | 2 +-
 drivers/acpi/sysfs.c               | 4 ++--
 drivers/acpi/thermal.c             | 2 +-
 drivers/acpi/video.c               | 2 +-
 drivers/ata/libata-acpi.c          | 4 ++--
 drivers/pci/hotplug/acpiphp_glue.c | 2 +-
 include/acpi/acpi_drivers.h        | 2 +-
 include/acpi/processor.h           | 2 +-
 12 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index fcc13ac0aa18..746b47507e9f 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -863,7 +863,7 @@ DECLARE_FILE_FUNCTIONS(alarm);
 		}, \
 	}
 
-static struct battery_file {
+static const struct battery_file {
 	struct file_operations ops;
 	mode_t mode;
 	const char *name;
diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c
index 1864ad3cf895..19a61136d848 100644
--- a/drivers/acpi/dock.c
+++ b/drivers/acpi/dock.c
@@ -77,7 +77,7 @@ struct dock_dependent_device {
 	struct list_head list;
 	struct list_head hotplug_list;
 	acpi_handle handle;
-	struct acpi_dock_ops *ops;
+	const struct acpi_dock_ops *ops;
 	void *context;
 };
 
@@ -589,7 +589,7 @@ EXPORT_SYMBOL_GPL(unregister_dock_notifier);
  * the dock driver after _DCK is executed.
  */
 int
-register_hotplug_dock_device(acpi_handle handle, struct acpi_dock_ops *ops,
+register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops,
 			     void *context)
 {
 	struct dock_dependent_device *dd;
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index 05b44201a614..22f918bacd35 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -92,7 +92,7 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,
 	return count;
 }
 
-static struct file_operations acpi_ec_io_ops = {
+static const struct file_operations acpi_ec_io_ops = {
 	.owner = THIS_MODULE,
 	.open  = acpi_ec_open_io,
 	.read  = acpi_ec_read_io,
diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c
index 467479f07c1f..0f0356ca1a9e 100644
--- a/drivers/acpi/fan.c
+++ b/drivers/acpi/fan.c
@@ -110,7 +110,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state)
 	return result;
 }
 
-static struct thermal_cooling_device_ops fan_cooling_ops = {
+static const struct thermal_cooling_device_ops fan_cooling_ops = {
 	.get_max_state = fan_get_max_state,
 	.get_cur_state = fan_get_cur_state,
 	.set_cur_state = fan_set_cur_state,
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index 79cb65332894..870550d6a4bf 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -244,7 +244,7 @@ processor_set_cur_state(struct thermal_cooling_device *cdev,
 	return result;
 }
 
-struct thermal_cooling_device_ops processor_cooling_ops = {
+const struct thermal_cooling_device_ops processor_cooling_ops = {
 	.get_max_state = processor_get_max_state,
 	.get_cur_state = processor_get_cur_state,
 	.set_cur_state = processor_set_cur_state,
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index 77255f250dbb..c538d0ef10ff 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -149,12 +149,12 @@ static int param_get_debug_level(char *buffer, const struct kernel_param *kp)
 	return result;
 }
 
-static struct kernel_param_ops param_ops_debug_layer = {
+static const struct kernel_param_ops param_ops_debug_layer = {
 	.set = param_set_uint,
 	.get = param_get_debug_layer,
 };
 
-static struct kernel_param_ops param_ops_debug_level = {
+static const struct kernel_param_ops param_ops_debug_level = {
 	.set = param_set_uint,
 	.get = param_get_debug_level,
 };
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 2607e17b520f..48fbc647b178 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -812,7 +812,7 @@ acpi_thermal_unbind_cooling_device(struct thermal_zone_device *thermal,
 				thermal_zone_unbind_cooling_device);
 }
 
-static struct thermal_zone_device_ops acpi_thermal_zone_ops = {
+static const struct thermal_zone_device_ops acpi_thermal_zone_ops = {
 	.bind = acpi_thermal_bind_cooling_device,
 	.unbind	= acpi_thermal_unbind_cooling_device,
 	.get_temp = thermal_get_temp,
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index db39e9e607d8..c6f9ef8d9ccb 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -308,7 +308,7 @@ video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long st
 	return acpi_video_device_lcd_set_level(video, level);
 }
 
-static struct thermal_cooling_device_ops video_cooling_ops = {
+static const struct thermal_cooling_device_ops video_cooling_ops = {
 	.get_max_state = video_get_max_state,
 	.get_cur_state = video_get_cur_state,
 	.set_cur_state = video_set_cur_state,
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index a791b8ce6294..993d40620f91 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -218,12 +218,12 @@ static void ata_acpi_dev_uevent(acpi_handle handle, u32 event, void *data)
 	ata_acpi_uevent(dev->link->ap, dev, event);
 }
 
-static struct acpi_dock_ops ata_acpi_dev_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_dev_dock_ops = {
 	.handler = ata_acpi_dev_notify_dock,
 	.uevent = ata_acpi_dev_uevent,
 };
 
-static struct acpi_dock_ops ata_acpi_ap_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_ap_dock_ops = {
 	.handler = ata_acpi_ap_notify_dock,
 	.uevent = ata_acpi_ap_uevent,
 };
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index a70fa89f76fd..220285760b68 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -110,7 +110,7 @@ static int post_dock_fixups(struct notifier_block *nb, unsigned long val,
 }
 
 
-static struct acpi_dock_ops acpiphp_dock_ops = {
+static const struct acpi_dock_ops acpiphp_dock_ops = {
 	.handler = handle_hotplug_event_func,
 };
 
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 3090471b2a5e..e49c36d38d7e 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -128,7 +128,7 @@ extern int is_dock_device(acpi_handle handle);
 extern int register_dock_notifier(struct notifier_block *nb);
 extern void unregister_dock_notifier(struct notifier_block *nb);
 extern int register_hotplug_dock_device(acpi_handle handle,
-					struct acpi_dock_ops *ops,
+					const struct acpi_dock_ops *ops,
 					void *context);
 extern void unregister_hotplug_dock_device(acpi_handle handle);
 #else
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index ba4928cae473..67055f180330 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -337,7 +337,7 @@ extern struct cpuidle_driver acpi_idle_driver;
 
 /* in processor_thermal.c */
 int acpi_processor_get_limit_info(struct acpi_processor *pr);
-extern struct thermal_cooling_device_ops processor_cooling_ops;
+extern const struct thermal_cooling_device_ops processor_cooling_ops;
 #ifdef CONFIG_CPU_FREQ
 void acpi_thermal_cpufreq_init(void);
 void acpi_thermal_cpufreq_exit(void);
-- 
cgit v1.2.3


From a07c7964a29b6dc515b120f1e1c223ac2f8666f5 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Sat, 16 Jul 2011 22:22:20 +0000
Subject: include/linux/sdla.h: remove the prototype of sdla()

`make headers_check` complains that

linux-2.6/usr/include/linux/sdla.h:116: userspace cannot reference
function or variable defined in the kernel

this is due to that there is no such a kernel function,

void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);

I don't know why we have it in a kernel header, so remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sdla.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sdla.h b/include/linux/sdla.h
index 564acd3a71c1..9995c7fc3f60 100644
--- a/include/linux/sdla.h
+++ b/include/linux/sdla.h
@@ -112,11 +112,7 @@ struct sdla_dlci_conf {
    short Tb_max;
 };
 
-#ifndef __KERNEL__
-
-void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);
-
-#else
+#ifdef __KERNEL__
 
 /* important Z80 window addresses */
 #define SDLA_CONTROL_WND		0xE000
-- 
cgit v1.2.3


From 7765be2fec0f476fcd61812d5f9406b04c765020 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 14 Jul 2011 12:24:11 -0700
Subject: rcu: Fix RCU_BOOST race handling current->rcu_read_unlock_special

The RCU_BOOST commits for TREE_PREEMPT_RCU introduced an other-task
write to a new RCU_READ_UNLOCK_BOOSTED bit in the task_struct structure's
->rcu_read_unlock_special field, but, as noted by Steven Rostedt, without
correctly synchronizing all accesses to ->rcu_read_unlock_special.
This could result in bits in ->rcu_read_unlock_special being spuriously
set and cleared due to conflicting accesses, which in turn could result
in deadlocks between the rcu_node structure's ->lock and the scheduler's
rq and pi locks.  These deadlocks would result from RCU incorrectly
believing that the just-ended RCU read-side critical section had been
preempted and/or boosted.  If that RCU read-side critical section was
executed with either rq or pi locks held, RCU's ensuing (incorrect)
calls to the scheduler would cause the scheduler to attempt to once
again acquire the rq and pi locks, resulting in deadlock.  More complex
deadlock cycles are also possible, involving multiple rq and pi locks
as well as locks from multiple rcu_node structures.

This commit fixes synchronization by creating ->rcu_boosted field in
task_struct that is accessed and modified only when holding the ->lock
in the rcu_node structure on which the task is queued (on that rcu_node
structure's ->blkd_tasks list).  This results in tasks accessing only
their own current->rcu_read_unlock_special fields, making unsynchronized
access once again legal, and keeping the rcu_read_unlock() fastpath free
of atomic instructions and memory barriers.

The reason that the rcu_read_unlock() fastpath does not need to access
the new current->rcu_boosted field is that this new field cannot
be non-zero unless the RCU_READ_UNLOCK_BLOCKED bit is set in the
current->rcu_read_unlock_special field.  Therefore, rcu_read_unlock()
need only test current->rcu_read_unlock_special: if that is zero, then
current->rcu_boosted must also be zero.

This bug does not affect TINY_PREEMPT_RCU because this implementation
of RCU accesses current->rcu_read_unlock_special with irqs disabled,
thus preventing races on the !SMP systems that TINY_PREEMPT_RCU runs on.

Maybe-reported-by: Dave Jones <davej@redhat.com>
Maybe-reported-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h   | 3 +++
 kernel/rcutree_plugin.h | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..76676a407e4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,9 @@ struct task_struct {
 #ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
+#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
+	int rcu_boosted;
+#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
 	struct list_head rcu_node_entry;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 6abef3cfcbc1..3a0ae0355222 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -342,6 +342,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
+		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+		if (t->rcu_boosted) {
+			special |= RCU_READ_UNLOCK_BOOSTED;
+			t->rcu_boosted = 0;
+		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
 		t->rcu_blocked_node = NULL;
 
@@ -358,7 +363,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
 		if (special & RCU_READ_UNLOCK_BOOSTED) {
-			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
 			rt_mutex_unlock(t->rcu_boost_mutex);
 			t->rcu_boost_mutex = NULL;
 		}
@@ -1176,7 +1180,7 @@ static int rcu_boost(struct rcu_node *rnp)
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
 	t->rcu_boost_mutex = &mtx;
-	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+	t->rcu_boosted = 1;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
-- 
cgit v1.2.3


From 9c3f75cbd144014bea6af866a154cc2e73ab2287 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Jul 2011 13:00:06 +0200
Subject: sched: Break out cpu_power from the sched_group structure

In order to prepare for non-unique sched_groups per domain, we need to
carry the cpu_power elsewhere, so put a level of indirection in.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-qkho2byuhe4482fuknss40ad@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 14 +++++++++-----
 kernel/sched.c        | 32 ++++++++++++++++++++++++++------
 kernel/sched_fair.c   | 46 +++++++++++++++++++++++-----------------------
 3 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a96487..2e5b3c8e2d3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -893,16 +893,20 @@ static inline int sd_power_saving_flags(void)
 	return 0;
 }
 
-struct sched_group {
-	struct sched_group *next;	/* Must be a circular list */
-	atomic_t ref;
-
+struct sched_group_power {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
 	 */
-	unsigned int cpu_power, cpu_power_orig;
+	unsigned int power, power_orig;
+};
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
+
 	unsigned int group_weight;
+	struct sched_group_power *sgp;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8ad..36c10d25d4cd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6557,7 +6557,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6581,9 +6581,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
 		}
 
 		group = group->next;
@@ -6777,8 +6777,10 @@ static struct root_domain *alloc_rootdomain(void)
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+	if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }
 
@@ -6945,6 +6947,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
@@ -6981,8 +6984,10 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+	}
 
 	return cpu;
 }
@@ -7020,7 +7025,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7185,6 +7190,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	if (cpu == cpumask_first(sched_group_cpus(sg))) {
 		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 	}
 }
 
@@ -7234,9 +7240,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;
 
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;
 
 		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7262,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 
@@ -7268,9 +7286,11 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		for_each_cpu(j, cpu_map) {
 			kfree(*per_cpu_ptr(sdd->sd, j));
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..c768588e180b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_POWER_SHIFT;
 	}
 
-	sdg->cpu_power_orig = power;
+	sdg->sgp->power_orig = power;
 
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power = 1;
 
 	cpu_rq(cpu)->cpu_power = power;
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 	group = child->groups;
 	do {
-		power += group->cpu_power;
+		power += group->sgp->power;
 		group = group->next;
 	} while (group != child->groups);
 
-	sdg->cpu_power = power;
+	sdg->sgp->power = power;
 }
 
 /*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	/*
 	 * If ~90% of the cpu_power is still there, we're good.
 	 */
-	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
 		return 1;
 
 	return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->cpu_power;
+		sds->total_pwr += sg->sgp->power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
 	if (this_cpu > busiest_cpu)
 		return 0;
 
-	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
 				       SCHED_POWER_SCALE);
 	return 1;
 }
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
 					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+	scaled_busy_load_per_task /= sds->busiest->sgp->power;
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->cpu_power *
+	pwr_now += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->cpu_power *
+	pwr_now += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->cpu_power;
+		sds->busiest->sgp->power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->cpu_power *
+		pwr_move += sds->busiest->sgp->power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->cpu_power <
+	if (sds->max_load * sds->busiest->sgp->power <
 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->cpu_power) /
-			sds->this->cpu_power;
+		tmp = (sds->max_load * sds->busiest->sgp->power) /
+			sds->this->sgp->power;
 	else
 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->cpu_power;
-	pwr_move += sds->this->cpu_power *
+			sds->this->sgp->power;
+	pwr_move += sds->this->sgp->power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 
-		load_above_capacity /= sds->busiest->cpu_power;
+		load_above_capacity /= sds->busiest->sgp->power;
 	}
 
 	/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
+	*imbalance = min(max_pull * sds->busiest->sgp->power,
+		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
 			/ SCHED_POWER_SCALE;
 
 	/*
-- 
cgit v1.2.3


From e3589f6c81e4764d32a25d2a2a0afe54fa344f5c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 Jul 2011 10:35:52 +0200
Subject: sched: Allow for overlapping sched_domain spans

Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst
each-other.

This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the
16 nearest nodes without regard if these masks have any overlap.

Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If however
there is overlap, we would need to link the sched_group list in
different ways for each cpu, and hence sharing isn't possible.

In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |   2 +
 kernel/sched.c          | 157 +++++++++++++++++++++++++++++++++++++++---------
 kernel/sched_features.h |   2 +
 3 files changed, 132 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e5b3c8e2d3e..bde99d5358dc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,6 +844,7 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
@@ -894,6 +895,7 @@ static inline int sd_power_saving_flags(void)
 }
 
 struct sched_group_power {
+	atomic_t ref;
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
diff --git a/kernel/sched.c b/kernel/sched.c
index 36c10d25d4cd..921adf6f6fad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref)) {
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		kfree(sd->groups->sgp);
 		kfree(sd->groups);
 	}
@@ -6967,15 +6993,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int		    flags;
 	struct sd_data      data;
 };
 
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
 	}
 
 	return cpu;
 }
 
 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;
 
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
@@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }
 
 /*
@@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;
 
-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
+
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);
 
-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	if (cpu != group_first_cpu(sg))
+		return;
 
 	update_group_power(sd, cpu);
 }
@@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;
 
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
-	}
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
 			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
@@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+		}
 
 		while (sd->child)
 			sd = sd->child;
@@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
  * using the scheduler IPI. Reduces rq->lock contention/bounces.
  */
 SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
-- 
cgit v1.2.3


From 31f5396ad3bde23c8416e8d23ba425e27f413314 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 28 Jul 2011 13:48:42 -0700
Subject: thermal: make THERMAL_HWMON implementation fully internal

THERMAL_HWMON is implemented inside the thermal_sys driver and has no
effect on drivers implementing thermal zones, so they shouldn't see
anything related to it in <linux/thermal.h>.  Making the THERMAL_HWMON
implementation fully internal has two advantages beyond the cleaner
design:

* This avoids rebuilding all thermal drivers if the THERMAL_HWMON
  implementation changes, or if CONFIG_THERMAL_HWMON gets enabled or
  disabled.

* This avoids breaking the thermal kABI in these cases too, which should
  make distributions happy.

The only drawback I can see is slightly higher memory fragmentation, as
the number of kzalloc() calls will increase by one per thermal zone.  But
I doubt it will be a problem in practice, as I've never seen a system with
more than two thermal zones.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Rene Herman <rene.herman@gmail.com>
Acked-by: Guenter Roeck <guenter.roeck@ericsson.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/thermal/thermal_sys.c | 117 +++++++++++++++++++++++++++++++++---------
 include/linux/thermal.h       |  22 --------
 2 files changed, 92 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c
index b6353d61fac5..708f8e92771a 100644
--- a/drivers/thermal/thermal_sys.c
+++ b/drivers/thermal/thermal_sys.c
@@ -420,6 +420,29 @@ thermal_cooling_device_trip_point_show(struct device *dev,
 
 /* hwmon sys I/F */
 #include <linux/hwmon.h>
+
+/* thermal zone devices with the same type share one hwmon device */
+struct thermal_hwmon_device {
+	char type[THERMAL_NAME_LENGTH];
+	struct device *device;
+	int count;
+	struct list_head tz_list;
+	struct list_head node;
+};
+
+struct thermal_hwmon_attr {
+	struct device_attribute attr;
+	char name[16];
+};
+
+/* one temperature input for each thermal zone */
+struct thermal_hwmon_temp {
+	struct list_head hwmon_node;
+	struct thermal_zone_device *tz;
+	struct thermal_hwmon_attr temp_input;	/* hwmon sys attr */
+	struct thermal_hwmon_attr temp_crit;	/* hwmon sys attr */
+};
+
 static LIST_HEAD(thermal_hwmon_list);
 
 static ssize_t
@@ -437,9 +460,10 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
 	int ret;
 	struct thermal_hwmon_attr *hwmon_attr
 			= container_of(attr, struct thermal_hwmon_attr, attr);
-	struct thermal_zone_device *tz
-			= container_of(hwmon_attr, struct thermal_zone_device,
+	struct thermal_hwmon_temp *temp
+			= container_of(hwmon_attr, struct thermal_hwmon_temp,
 				       temp_input);
+	struct thermal_zone_device *tz = temp->tz;
 
 	ret = tz->ops->get_temp(tz, &temperature);
 
@@ -455,9 +479,10 @@ temp_crit_show(struct device *dev, struct device_attribute *attr,
 {
 	struct thermal_hwmon_attr *hwmon_attr
 			= container_of(attr, struct thermal_hwmon_attr, attr);
-	struct thermal_zone_device *tz
-			= container_of(hwmon_attr, struct thermal_zone_device,
+	struct thermal_hwmon_temp *temp
+			= container_of(hwmon_attr, struct thermal_hwmon_temp,
 				       temp_crit);
+	struct thermal_zone_device *tz = temp->tz;
 	long temperature;
 	int ret;
 
@@ -485,10 +510,29 @@ thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
 	return NULL;
 }
 
+/* Find the temperature input matching a given thermal zone */
+static struct thermal_hwmon_temp *
+thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
+			  const struct thermal_zone_device *tz)
+{
+	struct thermal_hwmon_temp *temp;
+
+	mutex_lock(&thermal_list_lock);
+	list_for_each_entry(temp, &hwmon->tz_list, hwmon_node)
+		if (temp->tz == tz) {
+			mutex_unlock(&thermal_list_lock);
+			return temp;
+		}
+	mutex_unlock(&thermal_list_lock);
+
+	return NULL;
+}
+
 static int
 thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 {
 	struct thermal_hwmon_device *hwmon;
+	struct thermal_hwmon_temp *temp;
 	int new_hwmon_device = 1;
 	int result;
 
@@ -515,30 +559,36 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 		goto free_mem;
 
  register_sys_interface:
-	tz->hwmon = hwmon;
+	temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL);
+	if (!temp) {
+		result = -ENOMEM;
+		goto unregister_name;
+	}
+
+	temp->tz = tz;
 	hwmon->count++;
 
-	snprintf(tz->temp_input.name, THERMAL_NAME_LENGTH,
+	snprintf(temp->temp_input.name, THERMAL_NAME_LENGTH,
 		 "temp%d_input", hwmon->count);
-	tz->temp_input.attr.attr.name = tz->temp_input.name;
-	tz->temp_input.attr.attr.mode = 0444;
-	tz->temp_input.attr.show = temp_input_show;
-	sysfs_attr_init(&tz->temp_input.attr.attr);
-	result = device_create_file(hwmon->device, &tz->temp_input.attr);
+	temp->temp_input.attr.attr.name = temp->temp_input.name;
+	temp->temp_input.attr.attr.mode = 0444;
+	temp->temp_input.attr.show = temp_input_show;
+	sysfs_attr_init(&temp->temp_input.attr.attr);
+	result = device_create_file(hwmon->device, &temp->temp_input.attr);
 	if (result)
-		goto unregister_name;
+		goto free_temp_mem;
 
 	if (tz->ops->get_crit_temp) {
 		unsigned long temperature;
 		if (!tz->ops->get_crit_temp(tz, &temperature)) {
-			snprintf(tz->temp_crit.name, THERMAL_NAME_LENGTH,
+			snprintf(temp->temp_crit.name, THERMAL_NAME_LENGTH,
 				"temp%d_crit", hwmon->count);
-			tz->temp_crit.attr.attr.name = tz->temp_crit.name;
-			tz->temp_crit.attr.attr.mode = 0444;
-			tz->temp_crit.attr.show = temp_crit_show;
-			sysfs_attr_init(&tz->temp_crit.attr.attr);
+			temp->temp_crit.attr.attr.name = temp->temp_crit.name;
+			temp->temp_crit.attr.attr.mode = 0444;
+			temp->temp_crit.attr.show = temp_crit_show;
+			sysfs_attr_init(&temp->temp_crit.attr.attr);
 			result = device_create_file(hwmon->device,
-						    &tz->temp_crit.attr);
+						    &temp->temp_crit.attr);
 			if (result)
 				goto unregister_input;
 		}
@@ -547,13 +597,15 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 	mutex_lock(&thermal_list_lock);
 	if (new_hwmon_device)
 		list_add_tail(&hwmon->node, &thermal_hwmon_list);
-	list_add_tail(&tz->hwmon_node, &hwmon->tz_list);
+	list_add_tail(&temp->hwmon_node, &hwmon->tz_list);
 	mutex_unlock(&thermal_list_lock);
 
 	return 0;
 
  unregister_input:
-	device_remove_file(hwmon->device, &tz->temp_input.attr);
+	device_remove_file(hwmon->device, &temp->temp_input.attr);
+ free_temp_mem:
+	kfree(temp);
  unregister_name:
 	if (new_hwmon_device) {
 		device_remove_file(hwmon->device, &dev_attr_name);
@@ -569,15 +621,30 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 static void
 thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
 {
-	struct thermal_hwmon_device *hwmon = tz->hwmon;
+	struct thermal_hwmon_device *hwmon;
+	struct thermal_hwmon_temp *temp;
+
+	hwmon = thermal_hwmon_lookup_by_type(tz);
+	if (unlikely(!hwmon)) {
+		/* Should never happen... */
+		dev_dbg(&tz->device, "hwmon device lookup failed!\n");
+		return;
+	}
+
+	temp = thermal_hwmon_lookup_temp(hwmon, tz);
+	if (unlikely(!temp)) {
+		/* Should never happen... */
+		dev_dbg(&tz->device, "temperature input lookup failed!\n");
+		return;
+	}
 
-	tz->hwmon = NULL;
-	device_remove_file(hwmon->device, &tz->temp_input.attr);
+	device_remove_file(hwmon->device, &temp->temp_input.attr);
 	if (tz->ops->get_crit_temp)
-		device_remove_file(hwmon->device, &tz->temp_crit.attr);
+		device_remove_file(hwmon->device, &temp->temp_crit.attr);
 
 	mutex_lock(&thermal_list_lock);
-	list_del(&tz->hwmon_node);
+	list_del(&temp->hwmon_node);
+	kfree(temp);
 	if (!list_empty(&hwmon->tz_list)) {
 		mutex_unlock(&thermal_list_lock);
 		return;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index d3ec89fb4122..47b4a27e6e97 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -85,22 +85,6 @@ struct thermal_cooling_device {
 				((long)t-2732+5)/10 : ((long)t-2732-5)/10)
 #define CELSIUS_TO_KELVIN(t)	((t)*10+2732)
 
-#if defined(CONFIG_THERMAL_HWMON)
-/* thermal zone devices with the same type share one hwmon device */
-struct thermal_hwmon_device {
-	char type[THERMAL_NAME_LENGTH];
-	struct device *device;
-	int count;
-	struct list_head tz_list;
-	struct list_head node;
-};
-
-struct thermal_hwmon_attr {
-	struct device_attribute attr;
-	char name[16];
-};
-#endif
-
 struct thermal_zone_device {
 	int id;
 	char type[THERMAL_NAME_LENGTH];
@@ -120,12 +104,6 @@ struct thermal_zone_device {
 	struct mutex lock;	/* protect cooling devices list */
 	struct list_head node;
 	struct delayed_work poll_queue;
-#if defined(CONFIG_THERMAL_HWMON)
-	struct list_head hwmon_node;
-	struct thermal_hwmon_device *hwmon;
-	struct thermal_hwmon_attr temp_input;	/* hwmon sys attr */
-	struct thermal_hwmon_attr temp_crit;	/* hwmon sys attr */
-#endif
 };
 /* Adding event notification support elements */
 #define THERMAL_GENL_FAMILY_NAME                "thermal_event"
-- 
cgit v1.2.3


From f52e00c668669c9c290e84adf859c76db6d92a5a Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 28 Jul 2011 13:48:43 -0700
Subject: ACPI: remove NID_INVAL

b552a8c56db8 ("ACPI: remove NID_INVAL") removed the left over uses of
NID_INVAL, but didn't actually remove the definition.  Remove it.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/acpi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 1deb2a73c2da..2312e850aab8 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -238,7 +238,6 @@ extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 extern int pnpacpi_disabled;
 
 #define PXM_INVAL	(-1)
-#define NID_INVAL	(-1)
 
 int acpi_check_resource_conflict(const struct resource *res);
 
-- 
cgit v1.2.3